# Airbnb New User Bookings Prediction Using XGBoost




In [1]:
import os, sys
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.preprocessing import LabelEncoder

np.random.seed(0)



# Loading data

In [2]:
# Read the raw train / test user tables.
df_train = pd.read_csv('../input/train_users.csv')
df_test = pd.read_csv('../input/test_users.csv')

# Keep the test ids aside; they are needed again when writing the submission.
id_test = df_test['id']

# Separate the target column from the training features.
labels = df_train.country_destination.values
df_train = df_train.drop('country_destination', axis=1)

# Number of training rows: the boundary used later to split the combined
# train+test frame back into its two parts.
piv_train = len(df_train)

In [3]:
df_train.head(10)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
5,osr2jwljor,2010-01-01,20100101215619,2010-01-02,-unknown-,,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome
6,lsw9q7uk0j,2010-01-02,20100102012558,2010-01-05,FEMALE,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari
7,0d01nltbrs,2010-01-03,20100103191905,2010-01-13,FEMALE,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari
8,a1vcnhxeij,2010-01-04,20100104004211,2010-07-29,FEMALE,50.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari
9,6uh8zyj2gn,2010-01-04,20100104023758,2010-01-04,-unknown-,46.0,basic,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox


In [4]:
# Build one frame holding train followed by test (fresh 0..n-1 index), then
# drop the `id` and `date_first_booking` columns and replace every missing
# value with the sentinel -1.
df_all = (
    pd.concat([df_train, df_test], axis=0, ignore_index=True)
    .drop(['id', 'date_first_booking'], axis=1)
    .fillna(-1)
)


In [5]:
df_all.head(10)

Unnamed: 0,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,2010-06-28,20090319043255,-unknown-,-1.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
1,2011-05-25,20090523174809,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome
2,2010-09-28,20090609231247,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE
3,2011-12-05,20091031060129,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox
4,2010-09-14,20091208061105,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
5,2010-01-01,20100101215619,-unknown-,-1.0,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome
6,2010-01-02,20100102012558,FEMALE,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari
7,2010-01-03,20100103191905,FEMALE,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari
8,2010-01-04,20100104004211,FEMALE,50.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari
9,2010-01-04,20100104023758,-unknown-,46.0,basic,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox


# Feature engineering


In [6]:
# date_account_created: 'YYYY-MM-DD' strings -> integer year / month / day columns.
dac = np.vstack(df_all.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values)
df_all['dac_year'] = dac[:,0]
df_all['dac_month'] = dac[:,1]
df_all['dac_day'] = dac[:,2]
df_all = df_all.drop(['date_account_created'], axis=1)

# timestamp_first_active: 'YYYYMMDDhhmmss' digits -> integer year / month / day columns.
# Only the date part is kept as features, so the time-of-day fields are not parsed.
tfa = np.vstack(df_all.timestamp_first_active.astype(str).apply(lambda x: [int(x[:4]), int(x[4:6]), int(x[6:8])]).values)
df_all['tfa_year'] = tfa[:,0]
df_all['tfa_month'] = tfa[:,1]
df_all['tfa_day'] = tfa[:,2]
df_all = df_all.drop(['timestamp_first_active'], axis=1)

# Age: values outside [14, 100] are treated as missing and mapped to the same
# -1 sentinel that fillna() used for real NaNs.
av = df_all.age.values
df_all['age'] = np.where(np.logical_or(av<14, av>100), -1, av)

# One-hot-encoding features.
# A single get_dummies call drops each listed column and appends its indicator
# columns (prefixed with the column name) in list order, which is the same
# layout the former drop/concat loop produced without rebuilding the frame
# once per feature.
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
df_all = pd.get_dummies(df_all, columns=ohe_feats)

# Splitting train and test.
# Force a float matrix: on pandas >= 2.0 the indicator columns are bool, and
# mixing bool with numeric columns would make `.values` an object array.
# On older pandas the matrix is already float64, so this is a no-copy no-op.
vals = np.asarray(df_all.values, dtype=np.float64)
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)
X_test = vals[piv_train:]

# Parenthesised form prints the same text on Python 2 and also runs on Python 3.
print(X.shape)
print(y.shape)

(213451, 161)
(213451,)


In [7]:
df_all.head()

Unnamed: 0,age,dac_year,dac_month,dac_day,tfa_year,tfa_month,tfa_day,gender_-unknown-,gender_FEMALE,gender_MALE,...,first_browser_Silk,first_browser_SiteKiosk,first_browser_SlimBrowser,first_browser_Sogou Explorer,first_browser_Stainless,first_browser_TenFourFox,first_browser_TheWorld Browser,first_browser_UC Browser,first_browser_Yandex.Browser,first_browser_wOSBrowser
0,-1.0,2010,6,28,2009,3,19,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,38.0,2011,5,25,2009,5,23,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,56.0,2010,9,28,2009,6,9,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,42.0,2011,12,5,2009,10,31,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,41.0,2010,9,14,2009,12,8,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Xgboost setup and training 

In [8]:
# http://xgboost.readthedocs.io/en/latest/python/python_api.html
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

In [9]:
# timeit decorator
import functools
import time


def timeit(f):
    """Decorator that prints the wall-clock duration of every call to ``f``.

    Parameters
    ----------
    f : callable
        Function to be timed.

    Returns
    -------
    callable
        Wrapper that forwards all positional and keyword arguments to ``f``,
        prints ``func:<name>  took: <seconds> sec`` and returns ``f``'s result.
    """
    # wraps() keeps f's __name__ / __doc__ on the wrapper, so decorated
    # functions still identify themselves correctly.
    @functools.wraps(f)
    def timed(*args, **kw):
        ts = time.time()
        result = f(*args, **kw)
        te = time.time()

        # Single parenthesised argument: valid as a Python 2 print statement
        # and as a Python 3 print() call, with identical output.
        print('func:%r  took: %2.4f sec' % (f.__name__, te - ts))
        return result

    return timed

In [10]:
"""Metrics to compute the model performance."""

# https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings#evaluation
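# ndcg_k = dcg_k / idcg_k.
# Each sample has exactly one relevant class, so its ideal DCG is 1
# (i.e. idcg_k summed over all N samples is N); dividing the summed DCG by N
# therefore yields the mean NDCG@k.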



def dcg_score(y_true, y_score, k=5):
    """Mean discounted cumulative gain (DCG) at rank K.

    For every sample the classes are ranked by descending score; if the true
    class appears at 0-based position ``j`` within the first ``k`` ranks the
    sample contributes ``1 / log2(j + 2)``, otherwise it contributes 0. The
    contributions are averaged over all samples.

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels), given as class column indices.
    y_score : array-like, shape = [n_samples, n_classes]
        Predicted scores.
    k : int
        Rank. Values larger than ``n_classes`` are clamped to ``n_classes``.

    Returns
    -------
    score : float
        Mean per-sample gain; 0.0 when there are no samples.
    """
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    n = len(y_true)
    if n == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0.0

    # Never look past the available classes: a label that is not ranked at
    # all must score 0 instead of triggering an out-of-range index.
    k = max(0, min(k, y_score.shape[1]))

    # Same ranking as before: ascending argsort reversed along the class axis.
    top_k = np.argsort(y_score)[..., ::-1][:, :k]

    # Each row of top_k holds distinct class indices, so at most one position
    # per row can match the true label.
    hits = top_k == y_true[:, None]
    discounts = 1.0 / np.log2(np.arange(k) + 2)
    return float((hits * discounts).sum() / n)



# NDCG scorer: wraps dcg_score (with k=5) so it can be handed to GridSearchCV
# through its `scoring` argument.
# needs_proba=True is set because dcg_score ranks per-class scores rather than
# comparing hard label predictions.
# NOTE(review): recent scikit-learn releases deprecate `needs_proba` in favour
# of `response_method` — confirm against the installed version before upgrading.
ndcg_scorer = make_scorer(dcg_score, needs_proba=True, k=5)

In [11]:
# xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
#                     objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)                  
# xgb.fit(X, y)
# y_pred = xgb.predict_proba(X_test)  

# this will give us a result of 0.8655

### tuning parameters

class sklearn.model_selection.GridSearchCV(estimator, param_grid, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score='raise', return_train_score='warn')

In [12]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 4 learning rates x 4 tree counts x 3 column-sampling
# ratios = 48 candidates; with cv=5 that is 240 fits.
# The candidate lists are the single source of truth for the grid (previously
# two lists were defined but never used, and one disagreed with the grid).
learning_rate_list = [0.1, 0.3, 0.5, 0.7]
n_estimators_list = [25, 50, 75, 100]
colsample_bytree_list = [0.5, 0.7, 0.9]
params = {
    'learning_rate': learning_rate_list,
    'n_estimators': n_estimators_list,
    'colsample_bytree': colsample_bytree_list,
}

# Base estimator: every setting that is not part of the grid stays fixed.
xgb = XGBClassifier(max_depth=6, objective='multi:softprob', subsample=0.5, seed=0)

# Candidates are ranked by the custom NDCG scorer using 5-fold CV.
# NOTE: n_jobs is not passed, so GridSearchCV runs the fits one after another
# (the estimator repr and the fit log both show n_jobs=1). Any parallelism
# comes from XGBoost's own threading (nthread=-1 in the repr), not from
# GridSearchCV.
clf = GridSearchCV(estimator=xgb, param_grid=params, scoring=ndcg_scorer, cv=5,
                   return_train_score=True, verbose=2)

In [13]:
clf

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.5),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [25, 50, 75, 100], 'learning_rate': [0.1, 0.3, 0.5, 0.7], 'colsample_bytree': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(dcg_score, needs_proba=True, k=5), verbose=2)

In [14]:
# Run the grid search on the full training matrix.
# Cost: 421.3 min for the 240 fits (see the log below), so avoid re-running
# this cell unnecessarily.
cv_ret = clf.fit(X,y)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] n_estimators=25, learning_rate=0.1, colsample_bytree=0.5 ........
[CV]  n_estimators=25, learning_rate=0.1, colsample_bytree=0.5, total=  31.5s
[CV] n_estimators=25, learning_rate=0.1, colsample_bytree=0.5 ........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.4s remaining:    0.0s


[CV]  n_estimators=25, learning_rate=0.1, colsample_bytree=0.5, total=  28.8s
[CV] n_estimators=25, learning_rate=0.1, colsample_bytree=0.5 ........
[CV]  n_estimators=25, learning_rate=0.1, colsample_bytree=0.5, total=  28.9s
[CV] n_estimators=25, learning_rate=0.1, colsample_bytree=0.5 ........
[CV]  n_estimators=25, learning_rate=0.1, colsample_bytree=0.5, total=  28.6s
[CV] n_estimators=25, learning_rate=0.1, colsample_bytree=0.5 ........
[CV]  n_estimators=25, learning_rate=0.1, colsample_bytree=0.5, total=  28.5s
[CV] n_estimators=50, learning_rate=0.1, colsample_bytree=0.5 ........
[CV]  n_estimators=50, learning_rate=0.1, colsample_bytree=0.5, total=  58.6s
[CV] n_estimators=50, learning_rate=0.1, colsample_bytree=0.5 ........
[CV]  n_estimators=50, learning_rate=0.1, colsample_bytree=0.5, total=  58.4s
[CV] n_estimators=50, learning_rate=0.1, colsample_bytree=0.5 ........
[CV]  n_estimators=50, learning_rate=0.1, colsample_bytree=0.5, total=  58.9s
[CV] n_estimators=50, learni

[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 421.3min finished


In [15]:
# The search was configured with refit=True (see the GridSearchCV repr above),
# so the estimator for the best parameter combination is available as
# cv_ret.best_estimator_.
best_model = cv_ret.best_estimator_  # refit=True: best model is stored on cv_ret.best_estimator_

In [19]:
cv_ret.best_params_

{'colsample_bytree': 0.5, 'learning_rate': 0.1, 'n_estimators': 25}

In [16]:
import pickle
# save model to file
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way; the bare open() previously leaked the handle.
with open("../model/xgb-1126.dat", "wb") as model_file:
    pickle.dump(best_model, model_file)
 

In [26]:
# save cv_ret to file
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way; the bare open() previously leaked the handle.
with open("../output/xgb_cv_ret-1126.dat", "wb") as cv_ret_file:
    pickle.dump(cv_ret, cv_ret_file)

In [23]:
y_pred = best_model.predict_proba(X_test)

# Taking the 5 classes with highest probabilities
n_top = 5

# Per row: class indices ordered by descending probability, cut to the first
# n_top. Same ranking as argsort(row)[::-1][:5], computed for all rows at once.
top_idx = np.argsort(y_pred, axis=1)[:, ::-1][:, :n_top]

# Decode all selected class indices in a single call instead of once per row.
# Row-major flattening keeps each user's picks together, best first.
cts = le.inverse_transform(top_idx.ravel()).tolist()  # list of countries

# Repeat every test id once per selected class so ids and cts stay aligned
# (positional values, so this does not depend on id_test's index labels).
ids = np.repeat(id_test.values, top_idx.shape[1]).tolist()  # list of ids

In [24]:
# Write the submission file: one row per (test id, predicted country) pair,
# with columns in the order id, country and no index column.
sub = pd.DataFrame({'id': ids, 'country': cts}, columns=['id', 'country'])
sub.to_csv('sub.csv', index=False)

In [27]:
# The tuned model gives us a result of 0.86400 (the untuned baseline above gave 0.8655)

##### reference: https://www.kaggle.com/svpons/script-0-8655