In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler 

In [4]:
# Load the raw user table; the path is relative to the notebook's working directory.
train_data = pd.read_csv(
    filepath_or_buffer='../../airbnb_project_data/train_users_2.csv',
)
# Preview the first ten rows.
train_data.head(n=10)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US
5,osr2jwljor,2010-01-01,20100101215619,2010-01-02,-unknown-,,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,US
6,lsw9q7uk0j,2010-01-02,20100102012558,2010-01-05,FEMALE,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US
7,0d01nltbrs,2010-01-03,20100103191905,2010-01-13,FEMALE,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,US
8,a1vcnhxeij,2010-01-04,20100104004211,2010-07-29,FEMALE,50.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US
9,6uh8zyj2gn,2010-01-04,20100104023758,2010-01-04,-unknown-,46.0,basic,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox,US


In [17]:
# Bucket `age` into four labels, then one-hot encode them:
#   'NaN'               -> missing, or outside the accepted range [18, 95]
#   'young_people'      -> 18 <= age < 30
#   'middle_age_people' -> 30 <= age < 55
#   'old_people'        -> 55 <= age <= 95
# Vectorised with np.select instead of a Python-level loop over every row.
# Any comparison against a missing age is False, so missing values (like
# out-of-range ones) match no condition and receive the default label.
age_values = train_data.age
age_labels = np.select(
    [
        (age_values >= 18) & (age_values < 30),
        (age_values >= 30) & (age_values < 55),
        (age_values >= 55) & (age_values <= 95),
    ],
    ['young_people', 'middle_age_people', 'old_people'],
    default='NaN',
)
# drop_first=True drops the first level in sorted order; 'NaN' sorts ahead of the
# lowercase labels, so the "unknown age" bucket is the implicit baseline.
age = pd.get_dummies(pd.Series(age_labels), prefix='age', drop_first=True)

In [18]:
def _one_hot(column, prefix):
    """One-hot encode ``train_data[column]`` with ``prefix``; the first level is dropped as baseline."""
    return pd.get_dummies(train_data[column], prefix=prefix, drop_first=True)


# One indicator frame per categorical column. Note that `first_affiliate_tracked`
# is encoded under the shorter prefix 'first_tracked'.
signup_method = _one_hot('signup_method', 'signup_method')
# `language` keeps every level; a single "is English" flag was an alternative noted by the author.
language = _one_hot('language', 'language')
affiliate_channel = _one_hot('affiliate_channel', 'affiliate_channel')
first_tracked = _one_hot('first_affiliate_tracked', 'first_tracked')
signup_app = _one_hot('signup_app', 'signup_app')
signup_flow = _one_hot('signup_flow', 'signup_flow')
affiliate_provider = _one_hot('affiliate_provider', 'affiliate_provider')
first_browser = _one_hot('first_browser', 'first_browser')

In [19]:
# Fold the '-unknown-' placeholder into the 'OTHER' level before encoding.
# The result is assigned back to the column explicitly. Calling an in-place
# method on the attribute-accessed Series (train_data.gender.replace(..., inplace=True))
# is a chained in-place operation, which pandas does not guarantee will update
# the parent frame (it leaves the frame unchanged under copy-on-write).
train_data['gender'] = train_data['gender'].replace('-unknown-', 'OTHER')
gender = pd.get_dummies(train_data['gender'], prefix='gender', drop_first=True)

In [20]:
# Merge 'Desktop (Other)' and 'SmartPhone (Other)' into the 'Other/Unknown' level.
# The relabelling is written to train_data with an explicit .loc assignment.
# The previous version mutated the frame through an aliased Series and switched
# off pandas' chained-assignment warning globally to hide the resulting warning;
# with .loc there is nothing to suppress, so the global option is left untouched.
other_device_rows = train_data['first_device_type'].isin(
    ['Desktop (Other)', 'SmartPhone (Other)']
)
train_data.loc[other_device_rows, 'first_device_type'] = 'Other/Unknown'
first_device = pd.get_dummies(
    train_data['first_device_type'], prefix='first_device', drop_first=True
)

In [21]:
#time
#age
#affiliate_provider
#first_browser

In [22]:
# Parse 'YYYY-MM-DD' account-creation dates into integer year and month lists.
# Unpacking into exactly three parts keeps the failure on malformed dates.
created_year_month = [
    (int(year_text), int(month_text))
    for year_text, month_text, _day_text in (
        stamp.split('-') for stamp in train_data.date_account_created
    )
]
create_year = [year for year, _month in created_year_month]
create_month = [month for _year, month in created_year_month]

In [23]:
# Split 'YYYY-MM-DD' first-booking dates into year and month.
# Rows without a booking date are recorded as np.nan in both lists.
# Missing values are detected with pd.isnull (as in the age bucketing) rather
# than by testing `type(i) == float`, which only happens to work because NaN is a float.
# Unlike create_year/create_month, the parsed parts are kept as strings
# (e.g. '2010', '08'), so downstream dummy column names are unchanged.
booking_year = []
booking_month = []
for booked_on in train_data.date_first_booking:
    if pd.isnull(booked_on):
        year_part = month_part = np.nan
    else:
        year_part, month_part, _day_part = booked_on.split('-')
    booking_year.append(year_part)
    booking_month.append(month_part)

In [24]:
# `timestamp_first_active` is rendered as a digit string; the first four
# characters are read as the year and the next two as the month.
active_stamps = [str(raw_stamp) for raw_stamp in train_data.timestamp_first_active]
active_year = [int(stamp[:4]) for stamp in active_stamps]
active_month = [int(stamp[4:6]) for stamp in active_stamps]

In [34]:
# Whole calendar months between first activity and account creation, as floats.
#
# A single closed form covers every case:
#     gap = 12 * (create_year - active_year) + (create_month - active_month)
# This equals the earlier two-branch computation for year gaps >= 0
# (12*(g-1) + (12 - a) + c == 12*g + c - a). It also gives a well-defined
# (negative) value when the creation year precedes the activity year. Previously
# that case matched neither branch, so the prior row's gap was silently reused
# (or a NameError was raised if it occurred on the first row).
gap_month = pd.Series([
    float(12 * (c_year - a_year) + (c_month - a_month))
    for c_year, c_month, a_year, a_month in zip(
        create_year, create_month, active_year, active_month
    )
])

In [26]:
# One-hot encode the first-booking year and month lists (first level dropped as baseline).
first_booking_y, first_booking_m = (
    pd.get_dummies(parts, prefix=label, drop_first=True)
    for parts, label in (
        (booking_year, 'first_booking_y'),
        (booking_month, 'first_booking_m'),
    )
)

In [27]:
# Binary target for the first model: 1 where the destination is 'NDF', 0 otherwise.
first_model_label = train_data['country_destination'].eq('NDF').astype(int)

In [35]:
# Assemble the modelling table column-wise: feature blocks first, label last.
# Later cells slice the label off as the final column, so the order matters.
feature_blocks = [
    gap_month,
    gender,
    age,
    signup_method,
    signup_flow,
    language,
    affiliate_channel,
    affiliate_provider,
    first_tracked,
    signup_app,
    first_device,
    first_browser,
]
data_df_1 = pd.concat(feature_blocks + [first_model_label], axis=1)

In [29]:
# Sanity check: (rows, columns) of the assembled table, label column included.
data_df_1.shape

(213451, 139)

In [36]:
# Shuffle the rows once with a fixed seed, then cut at the 70% and 90% marks:
# 70% training, 20% validation, 10% test.
np.random.seed(101)
data = np.random.permutation(data_df_1)
n_rows = len(data)
train, validate, test = np.split(data, [int(.7 * n_rows), int(.9 * n_rows)])

# The last column is the label; everything before it is a feature.
# Naming note: the *_test arrays hold the validation partition, while the
# *_final arrays hold the held-out test partition.
X_train, Y_train = train[:, :-1], train[:, -1]
X_test, Y_test = validate[:, :-1], validate[:, -1]
X_final, Y_final = test[:, :-1], test[:, -1]

# Report shapes: features first, then labels.
for split_array in (X_train, X_test, X_final, Y_train, Y_test, Y_final):
    print(split_array.shape)

(149415, 138)
(42690, 138)
(21346, 138)
(149415,)
(42690,)
(21346,)


In [31]:
# Share of positive (label == 1) rows in the training and validation partitions.
print(np.mean(Y_train), np.mean(Y_test))

0.582485024931 0.584680252987


In [37]:
# Standardize features. The scaler is fitted on the training partition only,
# and the same fitted transform is then applied to all three partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test, X_final = (
    scaler.transform(partition) for partition in (X_train, X_test, X_final)
)
X_train

array([[-0.01918294, -0.58565845, -0.90505087, ..., -0.00258705,
        -0.00818121, -0.00517415],
       [-0.01918294, -0.58565845, -0.90505087, ..., -0.00258705,
        -0.00818121, -0.00517415],
       [-0.01918294,  1.70747985, -0.90505087, ..., -0.00258705,
        -0.00818121, -0.00517415],
       ..., 
       [-0.01918294, -0.58565845,  1.10491027, ..., -0.00258705,
        -0.00818121, -0.00517415],
       [-0.01918294, -0.58565845,  1.10491027, ..., -0.00258705,
        -0.00818121, -0.00517415],
       [-0.01918294, -0.58565845, -0.90505087, ..., -0.00258705,
        -0.00818121, -0.00517415]])