In [1]:
import os
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import feature_selection, linear_model, neighbors, ensemble, cross_validation, grid_search, metrics

# pd.set_option('display.max_rows', 10)
# pd.set_option('display.notebook_repr_html', True)
# pd.set_option('display.max_columns', 999)

%matplotlib inline
plt.style.use('ggplot')

In [2]:
users = pd.read_csv(os.path.join('datasets', 'airbnb_features.csv'))

In [3]:
users.head(2)

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,...,dest_ES,dest_FR,dest_GB,dest_IT,dest_NL,dest_PT,dest_US,dest_other,binary_dest_US,binary_dest_other
0,0,FEMALE,44.0,facebook,zero,en,direct,direct,linked,not-iOS,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1,MALE,47.0,basic,zero,en,other,other,tracked-other,not-iOS,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [4]:
users = users[ ['days', 'age_b', 'Gender_MALE', 'signup_basic', 'lang_fr', 'lang_ja', 'lang_pt', 
                'affiliate_channel_direct','affiliate_provider_direct', 'affiliate_provider_google', 'aff_linked', 
                'aff_other', 'aff_omg', 'Mac_Desktop','Safari' ]]

In [5]:
users.rename(columns={
'age_b': 'age', 'Gender_MALE': 'gender','signup_basic': 'signup','lang_fr': 'french',
'lang_ja': 'japanese','lang_pt': 'portuguese','affiliate_channel_direct': 'direct_affiliate_channel', 
'affiliate_provider_direct': 'direct_affiliate_provider', 'affiliate_provider_google': 'google_affiliate_provider',
'aff_linked':'linked_affiliate', 'aff_other': 'other_affiliate', 'aff_omg': 'omg_affiliate'}, inplace = True)

In [6]:
users.head(2)

Unnamed: 0,days,age,gender,signup,french,japanese,portuguese,direct_affiliate_channel,direct_affiliate_provider,google_affiliate_provider,linked_affiliate,other_affiliate,omg_affiliate,Mac_Desktop,Safari
0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
1,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [7]:
users.shape

(15470, 15)

1. days = booking date - first active date

2. age = 1.0 (18-29), 0.0 (30+)
3. gender = 1.0 (male), 0.0 (female)

4. signup = 1.0 (basic), 0.0 (facebook)
5. french = 1.0 (french), 0.0 (other language)
6. japanese = 1.0 (japanese), 0.0 (other language)
7. portuguese = 1.0 (portuguese), 0.0 (other language)

8. direct_affiliate_channel = 1.0 (direct), 0.0 (other affiliate)
9. direct_affiliate_provider = 1.0 (direct), 0.0 (other affiliate)
10. google_affiliate_provider = 1.0 (google), 0.0 (other affiliate)

11. linked_affiliate = 1.0 (linked), 0.0 (other)
12. other_affiliate = 1.0 (other), 0.0 (other)
13. omg_affiliate = 1.0 (omg), 0.0 (all non-other)

14. Mac_Desktop = 1.0 (Mac Desktop), 0.0 (other device)
15. Safari = 1.0 (Safari), 0.0 (other browser)

In [8]:
smf.ols(formula = 
'days ~ age + gender + signup + french + japanese + portuguese + direct_affiliate_channel + direct_affiliate_provider + google_affiliate_provider + linked_affiliate + other_affiliate + omg_affiliate + Mac_Desktop + Safari', 
data = users).fit().summary()

0,1,2,3
Dep. Variable:,days,R-squared:,0.016
Model:,OLS,Adj. R-squared:,0.015
Method:,Least Squares,F-statistic:,18.99
Date:,"Tue, 12 Jul 2016",Prob (F-statistic):,7.03e-45
Time:,05:06:50,Log-Likelihood:,-86479.0
No. Observations:,15470,AIC:,173000.0
Df Residuals:,15456,BIC:,173100.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,27.5626,4.695,5.871,0.000,18.360 36.766
age,5.9442,1.047,5.678,0.000,3.892 7.996
gender,-5.6844,1.052,-5.401,0.000,-7.747 -3.621
signup,-8.0096,1.079,-7.420,0.000,-10.125 -5.894
french,9.6176,6.546,1.469,0.142,-3.213 22.448
japanese,-30.0272,14.887,-2.017,0.044,-59.208 -0.847
portuguese,37.0913,17.341,2.139,0.032,3.101 71.081
direct_affiliate_channel,-2.6634,1.013,-2.630,0.009,-4.648 -0.678
direct_affiliate_provider,-2.6634,1.013,-2.630,0.009,-4.648 -0.678

0,1,2,3
Omnibus:,9285.454,Durbin-Watson:,1.894
Prob(Omnibus):,0.0,Jarque-Bera (JB):,70464.08
Skew:,2.925,Prob(JB):,0.0
Kurtosis:,11.665,Cond. No.,2.77e+16


In [9]:
bins = [0, 1, 2]

In [10]:
group_names = ['0', '1']

In [11]:
days_binary = pd.cut(users['days'], bins, labels=group_names)
users['days_binary'] = pd.cut(users['days'], bins, labels=group_names)
users.days_binary.value_counts()

0    2695
1    1241
dtype: int64

In [12]:
sum(users.days == 0)

3889

In [13]:
sum(users.days > 0)

11581

In [14]:
3889+11581

15470

In [15]:
users.days = np.where(users.days !=0, 1, 0)

In [16]:
users.days.dtype

dtype('int64')

In [17]:
train_users = users.sample(frac = .6, random_state = 0)
test_users = users.drop(train_users.index)

In [18]:
train_X = train_users[['age','gender','signup','french','japanese','portuguese','direct_affiliate_channel',
                       'direct_affiliate_provider','google_affiliate_provider','linked_affiliate','other_affiliate',
                       'omg_affiliate','Mac_Desktop','Safari']]
train_y = train_users.days


test_X = test_users[['age','gender','signup','french','japanese','portuguese','direct_affiliate_channel',
                       'direct_affiliate_provider','google_affiliate_provider','linked_affiliate','other_affiliate',
                       'omg_affiliate','Mac_Desktop','Safari']]
test_y = test_users.days

In [19]:
logmodel = linear_model.LogisticRegression()
logmodel.fit(train_X, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
logmodel.score(train_X, train_y)

0.74682180564533507

In [21]:
logmodel.score(test_X, test_y)

0.75129282482223658

In [22]:
knnmodel = neighbors.KNeighborsClassifier(n_neighbors = 5, weights = 'distance')
knnmodel.fit(train_X, train_y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='distance')

In [23]:
cross_validation.cross_val_score(knnmodel, train_X, train_y, cv = 5).mean()

0.69909522217889442

In [24]:
knnmodel.score(test_X, test_y)

0.70006464124111178

In [25]:
rfmodel = ensemble.RandomForestClassifier(n_estimators = 1000, oob_score = True)

rfmodel.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [26]:
rfmodel.oob_score_

0.74272786037491922

In [27]:
cross_validation.cross_val_score(rfmodel, train_X, train_y, cv = 5).mean()

0.7402493825785007

In [28]:
rfmodel.score(test_X, test_y)

0.74806076276664513