In [2]:
import numpy as np
seed = 123
np.random.seed(seed)
import collections
from importlib import reload
import pandas as pd
import sklearn

In [11]:
import xgboost as xgb

import sklearn.ensemble
import sklearn.svm
import sklearn.tree
import sklearn.linear_model
import sklearn.neighbors

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import sklearn.metrics

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [12]:
# Consider the training dataset
# Split the data up in n subsets, based on `search_id` to prevent occurence of a `search_id` in multiple sets
# Resample each subset, now based on classes: select 1/3 booking, 1/3 click (but no booking), 1/3 none
#     The resampling is used the preserve the class sizes
# Use crossvalidation on the n sets to select hyperparams
# Finally train model on full training dataset and make a prediction of the (unseen) test dataset

In [13]:
import util.data
import util.model
import util.ndcg

In [14]:
data_all = pd.read_csv('data/training_set_VU_DM_clean.csv', sep=';', nrows=5*1000)
data_test_unlabelled = pd.read_csv('data/test_set_VU_DM_clean.csv', sep=';', nrows=1000)
data_all.drop(columns=['position'], inplace=True)

for k in data_all.columns:
    if data_all[k].isna().sum() > 0:
#         print('rm %0.4f' % (data_all[k].isna().sum() / data_all.shape[0]), k)
        data_all.drop(columns=[k], inplace=True)

In [15]:
# set aside some labelled data for testing (based on srch_id)
ids = data_all.srch_id.unique()
ids_train, ids_test = train_test_split(ids, test_size=0.5, random_state=123)
data = data_all[data_all.srch_id.isin(ids_train)]
data_test = data_all[data_all.srch_id.isin(ids_test)]

In [16]:
# split cross validation folds
folds = util.data.cv_folds_for_sklearn(data, n_cv_folds=5, resampling_ratio=1)

# Use the full (pre-splitted) dataset because we use custom indices
x, y = util.data.split_xy(data_all)

In [17]:
seed = 123
models = {
#           'Logit': sklearn.linear_model.LogisticRegression(solver='liblinear',
#                                                            multi_class='ovr'),
# #           'SGD': sklearn.linear_model.SGDClassifier(loss="hinge", penalty="l2", max_iter=1000, tol=1e-3),
#           'SVM': sklearn.svm.SVR(kernel='linear'),
          'Decision Tree':  sklearn.tree.DecisionTreeClassifier(),
#           'KNN 10': sklearn.neighbors.KNeighborsClassifier(n_neighbors=10),
          'Ensemble Random Forest': sklearn.ensemble.RandomForestClassifier(n_estimators=100),
#             'Gradient Boost': sklearn.ensemble.GradientBoostingRegressor(loss='ls', learning_rate=0.1, 
#                             n_estimators=100, subsample=1.0, criterion='friedman_mse', 
#                             max_depth=3,random_state=seed, alpha=0.9, tol=0.0001)    
         'AdaBoost': sklearn.ensemble.AdaBoostRegressor(),
          'Gradient Boost': xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.01, 
                          max_depth = 4, alpha = 10, n_estimators = 500, n_jobs=2, seed=123)
         }

In [18]:
# reload(util.model)
n_folds_used = 5
results = {}
for k,m in models.items():
    print(k)
    util.model.cross_validation(m, x, y, folds[:n_folds_used], k, results, scoring='neg_mean_squared_error')

Decision Tree
[34m	 scoring: neg_mean_squared_error[0m
scores per fold  [-5.2226, -5.6173, -5.7808, -4.5556, -5.9567]
  mean score     -5.426596685086084
  standard dev.  0.4986318838216973
Ensemble Random Forest
[34m	 scoring: neg_mean_squared_error[0m
scores per fold  [-8.6667, -8.6667, -8.1378, -8.0105, -8.6667]
  mean score     -8.42966384009691
  standard dev.  0.29304708566917703
AdaBoost
[34m	 scoring: neg_mean_squared_error[0m
scores per fold  [-3.3738, -3.5364, -3.948, -2.6292, -2.9393]
  mean score     -3.2853553506171993
  standard dev.  0.46047158708654734
Gradient Boost
[34m	 scoring: neg_mean_squared_error[0m
scores per fold  [-3.4948, -3.6474, -3.2325, -2.8904, -3.2581]
  mean score     -3.3046289440703043
  standard dev.  0.25791790098037665


In [19]:
k = util.model.scores_table(results)

Model & Mean & Std. dev. \\ 
\hline
Decision Tree & -5.4266 & 0.4986\\
Ensemble Random Forest & -8.4297 & 0.2930\\
AdaBoost & -3.2854 & 0.4605\\
Gradient Boost & -3.3046 & 0.2579\\
[32m
best score: AdaBoost, with mean: -3.2854[0m


In [20]:
model = models[k]
# retrain model on train (+validation) data
x_test, y_test = util.data.split_xy(data_test)
model.fit(x,y)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None)

In [21]:
y_pred = model.predict(x_test)

In [49]:
# reload(util.ndcg)
# reload(util.data)

# y_pred_ = util.data.y_pred(x_test, y_pred)


# # ndcg_true = util.ndcg.y_true(data_test)
# # ndcg_pred = util.ndcg.y_pred(x_test, y_pred)

# # ndcg_true, ndcg_pred, ndcg_true / ndcg_pred

Index(['SearchId', 'PropertyId'], dtype='object')


# Predict test data

In [24]:
data_all = pd.read_csv('data/training_set_VU_DM_clean.csv', sep=';')
scores = pd.read_csv('data/scores_train.csv', sep=';')
data_all.drop(columns=['position'], inplace=True)

for k in data_all.columns:
    if data_all[k].isna().sum() > 0:
#         print('rm %0.4f' % (data_all[k].isna().sum() / data_all.shape[0]), k)
        data_all.drop(columns=[k], inplace=True)

In [25]:
reload(util.data)
# use a single fold, to allow oversampling of bookings or undersampling of clicks/others
folds = util.data.cv_folds_for_sklearn(data_all, n_cv_folds=1, resampling_ratio=0)
train_indices = folds[0][0]
x_train, y_train = util.data.split_xy(data_all, selection=train_indices)

In [26]:
# read the full dataset
data_test_unlabelled = pd.read_csv('data/test_set_VU_DM_clean.csv', sep=';')
# init x_test, neglect columns not present in x_train
x_test = data_test_unlabelled[x_train.columns]
assert x_train.shape[1] == x_test.shape[1]

In [27]:
x_test = data_test_unlabelled[x_train.columns]

In [28]:
# model = models[k]
model = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.01, 
                          max_depth = 4, alpha = 10, n_estimators = 500, n_jobs=2, seed=123)
model.fit(x_train, y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, importance_type='gain',
       learning_rate=0.01, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=500, n_jobs=2,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=123, silent=True,
       subsample=1)

In [29]:
y_pred = model.predict(x_test)
y_pred_ = util.data.y_pred(x_test, y_pred, save=True)

In [36]:
pd.read_csv('data/y_pred_result_xgb_1b.csv', sep=',', nrows=10)

Unnamed: 0,srch_id,prop_id
0,1,54937
1,1,61934
2,1,78599
3,1,99484
4,1,123675
5,1,82231
6,1,94729
7,1,63894
8,1,73666
9,1,28181


In [36]:
# some tests
reload(util.data)

n_cv_folds = 5
ids = sklearn.utils.shuffle(data.srch_id.unique(), random_state=123)
ids_per_fold  = np.array_split(ids, n_cv_folds)
data_splits = util.data.split_data_based_on_ids(data, ids_per_fold)
sum_ = sum([split.shape[0] for split in data_splits])
assert sum_ == data.shape[0], (sum_, data.shape[0])

for i in range(min(2,len(data_splits))):
    for j in range(min(4, len(data_splits))):
        # check index
        if i != j:
            for idx in data_splits[i].index:
                assert idx not in data_splits[j].index

        # check attr srch_id
        if i != j:
            for srch_id in data_splits[i].srch_id:
                assert data_splits[j].query('srch_id == @srch_id').srch_id.size == 0

In [17]:
bco_splits = [ util.data.split_bookings_clicks_others(data) for data in data_splits ]

for i in range(len(bco_splits)):
    for j in range(len(bco_splits)):
        if i != j:
            for srch_id in bco_splits[i][0].srch_id:
                assert bco_splits[j][0].query('srch_id == @srch_id').shape[0] == 0
                assert bco_splits[j][1].query('srch_id == @srch_id').shape[0] == 0
                assert bco_splits[j][2].query('srch_id == @srch_id').shape[0] == 0

In [20]:
# e.g. for every cv split i
i = 0
bco_split = bco_splits[i]
bookings, clicks, others = bco_split
size_per_sample = 2
assert len(util.data.sample([bookings, clicks, others], size_per_sample)) == 3 * size_per_sample