In [2]:
import numpy as np
seed = 123
np.random.seed(seed)
import collections
from importlib import reload
import pandas as pd
import sklearn

In [3]:
import xgboost as xgb

import sklearn.ensemble
import sklearn.svm
import sklearn.tree
import sklearn.linear_model
import sklearn.neighbors

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import sklearn.metrics

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [4]:
# Consider the training dataset
# Split the data up in n subsets, based on `search_id` to prevent occurence of a `search_id` in multiple sets
# Resample each subset, now based on classes: select 1/3 booking, 1/3 click (but no booking), 1/3 none
#     The resampling is used the preserve the class sizes
# Use crossvalidation on the n sets to select hyperparams
# Finally train model on full training dataset and make a prediction of the (unseen) test dataset

In [5]:
import util.data
import util.model
import util.ndcg

In [6]:
data_all = pd.read_csv('data/training_set_VU_DM_clean.csv', sep=';', nrows=5*1000)
data_test_unlabelled = pd.read_csv('data/test_set_VU_DM_clean.csv', sep=';', nrows=1000)

for k in data_all.columns:
    if data_all[k].isna().sum() > 0:
#         print('rm %0.4f' % (data_all[k].isna().sum() / data_all.shape[0]), k)
        data_all.drop(columns=[k], inplace=True)

In [11]:
# set aside some labelled data for testing (based on srch_id)
ids = data_all.srch_id.unique()
ids_train, ids_test = train_test_split(ids, test_size=0.5, random_state=123)
data = data_all[data_all.srch_id.isin(ids_train)]
data_test = data_all[data_all.srch_id.isin(ids_test)]

In [12]:
# split cross validation folds
folds = util.data.cv_folds_for_sklearn(data, n_cv_folds=5, resampling_ratio=1)

# Use the full (pre-splitted) dataset because we use custom indices
x, y = util.data.split_xy(data_all)

In [13]:
seed = 123
models = {
#           'Logit': sklearn.linear_model.LogisticRegression(solver='liblinear',
#                                                            multi_class='ovr'),
# #           'SGD': sklearn.linear_model.SGDClassifier(loss="hinge", penalty="l2", max_iter=1000, tol=1e-3),
#           'SVM': sklearn.svm.SVR(kernel='linear'),
          'Decision Tree':  sklearn.tree.DecisionTreeClassifier(),
#           'KNN 10': sklearn.neighbors.KNeighborsClassifier(n_neighbors=10),
          'Ensemble Random Forest': sklearn.ensemble.RandomForestClassifier(n_estimators=100),
#             'Gradient Boost': sklearn.ensemble.GradientBoostingRegressor(loss='ls', learning_rate=0.1, 
#                             n_estimators=100, subsample=1.0, criterion='friedman_mse', 
#                             max_depth=3,random_state=seed, alpha=0.9, tol=0.0001)    
         'AdaBoost': sklearn.ensemble.AdaBoostRegressor(),
          'Gradient Boost': xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.01, 
                          max_depth = 4, alpha = 10, n_estimators = 500, n_jobs=2, seed=123)
         }

In [14]:
# reload(util.model)
n_folds_used = 5
results = {}
for k,m in models.items():
    print(k)
    util.model.cross_validation(m, x, y, folds[:n_folds_used], k, results, scoring='neg_mean_squared_error')

Decision Tree
[34m	 scoring: neg_mean_squared_error[0m
scores per fold  [-5.1012, -5.5702, -7.3737, -6.2065, -6.6842]
  mean score     -6.187172822263962
  standard dev.  0.8019118648503532
Ensemble Random Forest
[34m	 scoring: neg_mean_squared_error[0m
scores per fold  [-8.0119, -8.6667, -8.6667, -7.4418, -8.6667]
  mean score     -8.290744906886639
  standard dev.  0.494444581045631
AdaBoost
[34m	 scoring: neg_mean_squared_error[0m
scores per fold  [-3.0384, -3.8781, -3.6152, -2.8355, -3.4977]
  mean score     -3.372973944024543
  standard dev.  0.3821534223812928
Gradient Boost
[34m	 scoring: neg_mean_squared_error[0m
scores per fold  [-3.4909, -3.7724, -3.2861, -2.9599, -3.2712]
  mean score     -3.3560911864476894
  standard dev.  0.26854764609785087


In [15]:
k = util.model.scores_table(results)

Model & Mean & Std. dev. \\ 
\hline
Decision Tree & -6.1872 & 0.8019\\
Ensemble Random Forest & -8.2907 & 0.4944\\
AdaBoost & -3.3730 & 0.3822\\
Gradient Boost & -3.3561 & 0.2685\\
[32m
best score: Gradient Boost, with mean: -3.3561[0m


In [16]:
model = models[k]
# retrain model on train (+validation) data
x_test, y_test = util.data.split_xy(data_test)
model.fit(x,y)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, importance_type='gain',
       learning_rate=0.01, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=500, n_jobs=2,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=123, silent=True,
       subsample=1)

In [17]:
y_pred = model.predict(x_test)

In [21]:
reload(util.ndcg)
util.ndcg.ndcg(x_test, y_test, y_pred)

0.7113746059639432

# Predict test data

In [11]:
# TODO nrows
data = pd.read_csv('data/training_set_VU_DM_clean.csv', sep=';', nrows=10*1000)
data_test = pd.read_csv('data/test_set_VU_DM_clean.csv', sep=';', nrows=10*1000)

# data = pd.read_csv('data/training_set_VU_DM_clean.csv', sep=';')
# data_test = pd.read_csv('data/test_set_VU_DM_clean.csv', sep=';')

In [15]:
util.data.rm_na(data)
util.data.rm_na(data_test)

k = 'gross_bookings_usd'
gross_booking_scaler = preprocessing.RobustScaler()

data[k] = gross_booking_scaler.fit_transform(data[[k]].values)
data_test.loc[data_test.index, k] = gross_booking_scaler.transform(data_test[[k]].values)

# add noise to reduce dependency on this attr
data.loc[data.index, k] += np.random.normal(loc=0, scale=0.01, size=data[k].size)

# use a single fold, to allow oversampling of bookings or undersampling of clicks/others
folds = util.data.cv_folds_for_sklearn(data, n_cv_folds=1, resampling_ratio=0)
train_indices = folds[0][0]
x_train, y_train = util.data.split_xy(data, selection=train_indices)
x_test = data_test[x_train.columns]

In [16]:
x_train.shape, y_train.shape

((528, 82), (528,))

In [18]:
kwargs = {'random_state': 1234}
# model = xgb.XGBRegressor(criterion='friedman_mse',
#                          n_estimators=100, learning_rate=0.001, max_depth=1,
#                          loss='ls', base_score=0, tol=1e-4, n_jobs=4, **kwargs)
model = xgb.XGBRegressor(objective='rank:ndcg', criterion='friedman_mse', 
                         n_estimators=100, learning_rate=0.001, max_depth=1,
                        loss='ls', base_score=0, tol=1e-4, n_jobs=4, **kwargs)
# model = sklearn.ensemble.RandomForestClassifier(n_jobs=4, n_estimators=500)

model.fit(x_train, y_train)

XGBRegressor(base_score=0, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, criterion='friedman_mse', gamma=0,
       importance_type='gain', learning_rate=0.001, loss='ls',
       max_delta_step=0, max_depth=1, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=4, nthread=None, objective='rank:ndcg',
       random_state=1234, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1, tol=0.0001)

In [20]:
y_pred = model.predict(x_test)
Xy_pred = util.data.Xy_pred(x_test, y_pred, save=True, suffix='xgb-reg-undersampling')

saved to `data/y_pred_result_xgb-reg-undersampling.csv`


In [39]:
# pd.read_csv('data/y_pred_result_xgb-ndcg.csv', sep=',', nrows=10)

In [25]:
# model = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.01, 
#                           max_depth = 4, alpha = 10, n_estimators = 500, n_jobs=2, seed=123)
# model.fit(x_train, y_train)

In [36]:
pd.read_csv('data/y_pred_result_xgb_1b.csv', sep=',', nrows=10)

Unnamed: 0,srch_id,prop_id
0,1,54937
1,1,61934
2,1,78599
3,1,99484
4,1,123675
5,1,82231
6,1,94729
7,1,63894
8,1,73666
9,1,28181
