In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import LabelEncoder

### Load data

In [2]:
train_data = pd.read_csv('../../airbnb_project_data/train_users_2.csv')
train_data.head(10)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US
5,osr2jwljor,2010-01-01,20100101215619,2010-01-02,-unknown-,,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,US
6,lsw9q7uk0j,2010-01-02,20100102012558,2010-01-05,FEMALE,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US
7,0d01nltbrs,2010-01-03,20100103191905,2010-01-13,FEMALE,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,US
8,a1vcnhxeij,2010-01-04,20100104004211,2010-07-29,FEMALE,50.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US
9,6uh8zyj2gn,2010-01-04,20100104023758,2010-01-04,-unknown-,46.0,basic,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox,US


### Rearrange data

In [3]:
def rearrange_data(dataset):
    #rearrange age data and separate as four groups: nan, young people, middle age people and old people
    age = []
    for i in dataset.age:
        if (i > 95) | (i < 18) | (pd.isnull(i)) :
            category = 'NaN'
        elif (i >= 18) & (i < 30):
            category = 'young_people'
        elif (i >= 30) & (i < 55):
            category = 'middle_age_people'
        elif (i >= 55) & (i <= 95):
            category = 'old_people'
        age.append(category)
    age = pd.get_dummies(pd.Series(age), prefix='age', drop_first=True)
    
    #use dummy variable to separate different categories in each column
    signup_method = pd.get_dummies(dataset.signup_method, prefix='signup_method', drop_first=True)
    language = pd.get_dummies(dataset.language, prefix='language', drop_first=True) #(train.language == 'en').astype(int)
    affiliate_channel = pd.get_dummies(dataset.affiliate_channel, prefix='affiliate_channel', drop_first=True)
    first_tracked = pd.get_dummies(dataset.first_affiliate_tracked, prefix='first_tracked', drop_first=True)
    signup_app = pd.get_dummies(dataset.signup_app, prefix='signup_app', drop_first=True)
    signup_flow = pd.get_dummies(dataset.signup_flow, prefix='signup_flow', drop_first=True)
    affiliate_provider = pd.get_dummies(dataset.affiliate_provider, prefix='affiliate_provider', drop_first=True)
    first_browser = pd.get_dummies(dataset.first_browser, prefix='first_browser', drop_first=True)
    
    #for gender column, combine 'unknown' with 'other'
    dataset.gender.replace('-unknown-', 'OTHER', inplace=True)
    gender = pd.get_dummies(dataset.gender, prefix='gender', drop_first=True)
    
    #for first device type, combine 'desktop(other)', 'smartphone(other)' and 'other/unknown' as one group
    first_device = dataset.first_device_type
    pd.options.mode.chained_assignment = None
    first_device[(first_device == 'Desktop (Other)')|(first_device == 'SmartPhone (Other)')] = 'Other/Unknown'
    first_device = pd.get_dummies(first_device, prefix='first_device', drop_first=True)

    return gender, age, signup_method, signup_flow, language, affiliate_channel, affiliate_provider, first_tracked, signup_app, first_device, first_browser

### First model

In [4]:
#deal with time
create_time = []
for i in train_data.date_account_created:
    create_time.append(pd.Timestamp(i))
    
active_time = []
for i in train_data.timestamp_first_active:
    active_time.append(pd.Timestamp(str(i)[0:8]))
    
gap = []
for i in range(len(create_time)):
    gap_day = (create_time[i] - active_time[i]).days
    gap.append(float(gap_day))
gap = pd.Series(gap).rename('gap')

In [5]:
gender, age, signup_method, signup_flow, language, affiliate_channel, affiliate_provider, first_tracked, signup_app, first_device, first_browser = rearrange_data(train_data)

In [6]:
first_model_label = (train_data.country_destination == 'NDF').astype(int)

In [7]:
data_df_1 = pd.concat([gap, gender, age, signup_method, signup_flow, 
                       language, affiliate_channel, affiliate_provider, first_tracked, signup_app, first_device,
                       first_browser, first_model_label], axis=1)

In [8]:
data_df_1.head()

Unnamed: 0,gap,gender_MALE,gender_OTHER,age_middle_age_people,age_old_people,age_young_people,signup_method_facebook,signup_method_google,signup_flow_1,signup_flow_2,...,first_browser_Silk,first_browser_SiteKiosk,first_browser_SlimBrowser,first_browser_Sogou Explorer,first_browser_Stainless,first_browser_TenFourFox,first_browser_TheWorld Browser,first_browser_Yandex.Browser,first_browser_wOSBrowser,country_destination
0,466.0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,732.0,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,476.0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,765.0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,280.0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
data_df_1.shape

(213451, 139)

In [10]:
# Splitting the data to 70% training, 20% validating and 10% testing
np.random.seed(101)
data = np.random.permutation(data_df_1)
train, validate, test = np.split(data, [int(.7*len(data)), int(.9*len(data))])

X_train = train[:, :-1]
Y_train = train[:, -1]
X_test = validate[:, :-1]
Y_test = validate[:, -1]
X_final = test[:, :-1]
Y_final = test[:, -1]

print(X_train.shape)
print(X_test.shape)
print(X_final.shape)
print(Y_train.shape)
print(Y_test.shape)
print(Y_final.shape)

(149415, 138)
(42690, 138)
(21346, 138)
(149415,)
(42690,)
(21346,)


In [11]:
print(sum(Y_train)/len(Y_train), sum(Y_test)/len(Y_test))

0.582485024931 0.584680252987


In [12]:
# Standardize variables
scaler = StandardScaler()
scaler.fit(X_train) # use train data to fit
X_test = scaler.transform(X_test)
X_final = scaler.transform(X_final)
X_train = scaler.transform(X_train)
X_train

array([[-0.0192129 , -0.58565845, -0.90505087, ..., -0.00258705,
        -0.00818121, -0.00517415],
       [-0.0192129 , -0.58565845, -0.90505087, ..., -0.00258705,
        -0.00818121, -0.00517415],
       [-0.0192129 ,  1.70747985, -0.90505087, ..., -0.00258705,
        -0.00818121, -0.00517415],
       ..., 
       [-0.0192129 , -0.58565845,  1.10491027, ..., -0.00258705,
        -0.00818121, -0.00517415],
       [-0.0192129 , -0.58565845,  1.10491027, ..., -0.00258705,
        -0.00818121, -0.00517415],
       [-0.0192129 , -0.58565845, -0.90505087, ..., -0.00258705,
        -0.00818121, -0.00517415]])

In [16]:
# Try gridsearch for tuning parameters
#parameters = {'C':[0.1, 1, 10, 20], 'tol':[1e-05, 1e-04, 0.001]}
#clf = GridSearchCV(model1, parameters, cv=5, scoring='accuracy')
#clf.fit(X_train, Y_train)
#clf.best_estimator_, clf.best_params_, clf.cv_results_['params'], clf.cv_results_['mean_test_score']

In [13]:
model1 = LogisticRegression()
model1.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
print(classification_report(y_true=Y_test, y_pred=model1.predict(X_test), target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.66      0.53      0.59     17730
        yes       0.71      0.81      0.75     24960

avg / total       0.69      0.69      0.69     42690



### Second model

In [18]:
had_first_book = train_data[train_data.country_destination != 'NDF']
had_first_book.reset_index(drop=True, inplace=True)
had_first_book.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
1,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
2,87mebub9p4,2010-09-14,20091208061105,2010-02-18,OTHER,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US
3,osr2jwljor,2010-01-01,20100101215619,2010-01-02,OTHER,,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,US
4,lsw9q7uk0j,2010-01-02,20100102012558,2010-01-05,FEMALE,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US


In [19]:
# deal with time
booking_year = []
booking_month = []
for i in had_first_book.date_first_booking:
    y, m, d = i.split('-')
    booking_year.append(y)
    booking_month.append(m)
    
first_booking_y = pd.get_dummies(pd.Series(booking_year), prefix='first_booking_y', drop_first=True)
first_booking_m = pd.get_dummies(pd.Series(booking_month), prefix='first_booking_m', drop_first=True)

In [20]:
gender, age, signup_method, signup_flow, language, affiliate_channel, affiliate_provider, first_tracked, signup_app, first_device, first_browser = rearrange_data(had_first_book)

In [21]:
le = LabelEncoder()
le.fit(had_first_book.country_destination.unique())
second_model_label = pd.Series(le.transform(had_first_book.country_destination))

In [22]:
data_df_2 = pd.concat([first_booking_y, first_booking_m, gender, age, signup_method, signup_flow, 
                       language, affiliate_channel, affiliate_provider, first_tracked, signup_app, first_device,
                       first_browser, second_model_label], axis=1)

In [23]:
data_df_2.shape

(88908, 138)

In [24]:
# Splitting the data to 70% training, 20% validating and 10% testing
np.random.seed(101)
data = np.random.permutation(data_df_2)
train, validate, test = np.split(data, [int(.7*len(data)), int(.9*len(data))])

X_train = train[:, :-1]
Y_train = train[:, -1]
X_test = validate[:, :-1]
Y_test = validate[:, -1]
X_final = test[:, :-1]
Y_final = test[:, -1]

print(X_train.shape)
print(X_test.shape)
print(X_final.shape)
print(Y_train.shape)
print(Y_test.shape)
print(Y_final.shape)

(62235, 137)
(17782, 137)
(8891, 137)
(62235,)
(17782,)
(8891,)


In [26]:
# Standardize variables
scaler = StandardScaler()
scaler.fit(X_train) # use train data to fit
X_test = scaler.transform(X_test)
X_final = scaler.transform(X_final)
X_train = scaler.transform(X_train)
X_train

array([[-0.26320103, -0.47215656, -0.73685329, ..., -0.00566898,
        -0.00400854, -0.00566898],
       [-0.26320103, -0.47215656, -0.73685329, ..., -0.00566898,
        -0.00400854, -0.00566898],
       [-0.26320103,  2.11794157, -0.73685329, ..., -0.00566898,
        -0.00400854, -0.00566898],
       ..., 
       [-0.26320103,  2.11794157, -0.73685329, ..., -0.00566898,
        -0.00400854, -0.00566898],
       [-0.26320103,  2.11794157, -0.73685329, ..., -0.00566898,
        -0.00400854, -0.00566898],
       [-0.26320103, -0.47215656, -0.73685329, ..., -0.00566898,
        -0.00400854, -0.00566898]])

In [29]:
#pd.set_option('display.max_columns', None)

In [30]:
data_df_1.shape

(213451, 139)

In [31]:
data_df_2.shape

(88908, 138)