In [474]:
import numpy as np
seed = 123
np.random.seed(seed)
import collections, copy, pickle
from termcolor import colored
from importlib import reload
import gc
from dateutil.parser import parse
import scipy.linalg, scipy.stats
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
from matplotlib import rcParams
rcParams['font.family'] = 'serif'
rcParams['font.size'] = 14
# rcParams['text.usetex'] = True
import seaborn as sns
from IPython.display import HTML

In [475]:
import sklearn.ensemble
import sklearn.svm
import sklearn.tree
import sklearn.linear_model
import sklearn.neighbors

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import sklearn.metrics

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [476]:
# Consider the training dataset
# Split the data up in n sets, based on `search_id` to prevent occurence of a `search_id` in multiple sets
#   CF score-matrices can be computed here as well
# Use crossvalidation on the n sets to select hyperparams
# Finally train model on full training dataset and make a prediction of the (unseen) test dataset

In [477]:
import util.data

In [478]:
data = pd.read_csv('data/training_set_VU_DM_clean.csv', sep=';', nrows=1*1000)
data_test = pd.read_csv('data/test_set_VU_DM_clean.csv', sep=';', nrows=1000)
# scores = pd.read_csv('data/scores_train.csv', sep=';')

In [479]:
data.drop(columns=['position'], inplace=True)

In [480]:
for k in data.columns:
    if data[k].isna().sum() > 0:
        print('rm ', data[k].isna().sum() / data.shape[0], k)
        data.drop(columns=[k], inplace=True)

rm  1.0 comp1_rate
rm  1.0 comp1_inv
rm  1.0 comp1_rate_percent_diff
rm  0.55 comp2_rate
rm  0.523 comp2_inv
rm  0.882 comp2_rate_percent_diff
rm  0.6 comp3_rate
rm  0.586 comp3_inv
rm  0.904 comp3_rate_percent_diff
rm  0.913 comp4_rate
rm  0.906 comp4_inv
rm  0.967 comp4_rate_percent_diff
rm  0.482 comp5_rate
rm  0.453 comp5_inv
rm  0.827 comp5_rate_percent_diff
rm  0.919 comp6_rate
rm  0.913 comp6_inv
rm  0.967 comp6_rate_percent_diff
rm  0.895 comp7_rate
rm  0.883 comp7_inv
rm  0.955 comp7_rate_percent_diff
rm  0.562 comp8_rate
rm  0.554 comp8_inv
rm  0.889 comp8_rate_percent_diff


In [481]:
ids = sklearn.utils.shuffle(data.srch_id.unique(), random_state=123)
# ids = data.srch_id.unique()
N = ids.size
N # total number of samples

40

In [482]:
y_labels = [ k for k in data.columns if k not in data_test.columns ]
y_labels

['click_bool', 'booking_bool', 'score']

In [483]:
# y_data = data[['click_bool', 'booking_bool', 'score']].copy()
# x_data = data.drop(columns=y_labels)
# TODO split xy at the end

In [484]:
# # clear memory
# data = None
# gc.collect();

In [515]:
n = 5 # number of cv splits
selection_size = np.floor(N/n).astype(int)
a = selection_size
# note that the final splice may be smaller
ids_selections = [ ids[i*a: min((i+1)*a, N)] for i in range(n) ]
assert len(ids_selections) == n

for i in range(len(ids_selections)):
    for j in range(len(ids_selections)):
        if i != j: assert all([id_ not in ids_selections[j] for id_ in ids_selections[i]]), 'id\'s must be unique'

print(selection_size)

8


In [516]:
# data_splits_indices = [ data.loc[data.srch_id.isin(srch_ids)].index for srch_ids in ids_selections ]
data_splits = [ data.loc[data.srch_id.isin(srch_ids)] for srch_ids in ids_selections ]
# x_data_splits = [ x_data.loc[x_data.srch_id.isin(srch_ids)] for srch_ids in ids_selections ]
# y_data_splits = [ y_data.iloc[x.index] for x in x_data_splits ]
# y_data_splits = [ y_data.loc[y_data.srch_id.isin(srch_ids)] for srch_ids in ids_selections ]
# len(data_splits_indices)
len(data_splits)

5

In [517]:
for split in data_splits:
    print(split.shape)

data.shape[0], sum([split.shape[0] for split in data_splits])

(227, 85)
(196, 85)
(195, 85)
(213, 85)
(169, 85)


(1000, 1000)

In [500]:
assert sum([split.shape[0] for split in data_splits]) == data.shape[0]

In [558]:
for i in data_splits[0].index:
    assert i not in data_splits[1].index

for i in range(len(data_splits)):
    for j in range(len(data_splits)):
        # check index
        if i != j:
            for idx in data_splits[i].index:
                assert idx not in data_splits[j].index

        # check attr srch_id
        if i != j:
            for srch_id in data_splits[i].srch_id:
                assert data_splits[j].query('srch_id == @srch_id').srch_id.size == 0

In [559]:
# we can now use (n-1) of the splits to train and the one other split to validate
# Now we will sample from a split to prevent class imbalance

In [560]:
# split slices again, into classes: select 1/3 booking, 1/3 click (but no booking), 1/3 none

In [583]:
def split_bookings_clicks_others(data):
    bookings = data.query('booking_bool == 1')
    clicks = data.query('click_bool == 1 and booking_bool != 1')
    others = data.query('click_bool != 1')
    
    for i in bookings.index[:100]:
        assert i not in clicks.index
        assert i not in others.index
    for i in clicks.index[:100]:
        assert i not in bookings.index
        assert i not in others.index

    return bookings, clicks, others


def sample(datasets=[], size_per_sample=100):
    data = None
    sample_indices = [ np.random.choice(data.index, size_per_sample)
                      for data in datasets
                     ]
    for i in range(len(sample_indices)):
        for j in range(len(sample_indices)):
            if i != j: assert sample_indices[i][0] not in sample_indices[j]

    # TODO should concatenation be shuffled?
    # sklearn.utils.shuffle
    return np.concatenate(sample_indices)
#     samples = [ data.loc[np.random.choice(data.index, size_per_sample)] 
#                 for data in datasets
#               ]
#     ordered_dataset_samples = pd.concat(samples)
#     return ordered_dataset_samples.sample(frac=1)

def cv_folds(bco_splits):
    # generate an iterable that yields (train, test) incides
    # :bco_splits = list of (bookings, clicks, others)
    folds = []
    for bco in bco_splits:
        n_max = max([df.shape[0] for df in bco_split ])
        fold_indices = sample(bco, n_max)
        folds.append((cv_fold_indices, cv_fold_indices))
    return folds
    
def split_xy(data: pd.DataFrame):
    return data.drop(columns=y_labels).values, data['score'].values

In [584]:
bco_splits = [ split_bookings_clicks_others(data) for data in data_splits ]

for i in range(len(bco_splits)):
    for j in range(len(bco_splits)):
        if i != j:
            for srch_id in bco_splits[i][0].srch_id:
                assert bco_splits[j][0].query('srch_id == @srch_id').shape[0] == 0
                assert bco_splits[j][1].query('srch_id == @srch_id').shape[0] == 0
                assert bco_splits[j][2].query('srch_id == @srch_id').shape[0] == 0

In [585]:
# e.g. for every cv split i
i = 0
bco_split = bco_splits[i]
bookings, clicks, others = bco_split
n_max = max([xy.shape[0] for xy in bco_split ])
n_min = min([xy.shape[0] for xy in bco_split ])
n_max, n_min

(219, 4)

In [586]:
sample([bookings, clicks, others], size_per_sample=2)

array([274, 624, 168,  59, 642, 320])

In [587]:
# bco_splits = [ split_bookings_clicks_others(data) for data in data_splits ]
folds = cv_folds(bco_splits)
x_train, y_train = split_xy(data)
# model = sklearn.ensemble.RandomForestClassifier(n_estimators=100)
# cross_val_score(model, x_train, y_train, cv=folds, scoring='accuracy') # roc_auc accuracy

In [589]:
# for i in range(len(folds)):
#     print(i)
#     for j in range(len(folds)):
#         print('\t', j)
#         if i != j:
#             x = folds[i][0][0]
#             assert folds[i][0][0] not in folds[j][0], (i,j, folds[i][0][0])

In [590]:
def cross_validation(model_func, x_train, y_train, cv_folds, k=None, results=None, v=1):
    scores_acc = cross_val_score(model_func, x_train, y_train, cv=cv_folds, scoring='accuracy') # roc_auc accuracy
    if results is not None:
        results[k] = scores_acc
    if v:
        print('scores per fold ', scores_acc)
        print('  mean score    ', np.mean(scores_acc))
        print('  standard dev. ', np.std(scores_acc))

In [591]:
seed = 123
models = {
#           'Logit': sklearn.linear_model.LogisticRegression(solver='liblinear',
#                                                            multi_class='ovr'),
# #           'SGD': sklearn.linear_model.SGDClassifier(loss="hinge", penalty="l2", max_iter=1000, tol=1e-3),
# #           'SVC auto': sklearn.svm.SVC(gamma='auto'), 
#           'SVC': sklearn.svm.SVC(kernel='linear'), 
# #           'SVC polynomial': sklearn.svm.SVC(kernel='poly', gamma='auto', degree=4),    
          'Decision Tree':  sklearn.tree.DecisionTreeClassifier(),
#           'KNN 5': sklearn.neighbors.KNeighborsClassifier(n_neighbors=5),
# #           'KNN 10': sklearn.neighbors.KNeighborsClassifier(n_neighbors=10),
#           'Ensemble Random Forest': sklearn.ensemble.RandomForestClassifier(n_estimators=100),
# #           'Ensemble Bagging': sklearn.ensemble.BaggingClassifier(n_estimators=100)
#             'GradBoost': sklearn.ensemble.GradientBoostingRegressor(loss='ls', learning_rate=0.1, 
#                             n_estimators=100, subsample=1.0, criterion='friedman_mse', 
#                             max_depth=3,random_state=seed, alpha=0.9, tol=0.0001)    
# #          'AdaBoost': sklearn.ensemble.AdaBoostRegressor()
         }

results = {}
for k,m in models.items():
    print(k)
    cross_validation(m, x_train, y_train, folds, k, results)

Decision Tree
scores per fold  [1. 1. 1. 1. 1.]
  mean score     1.0
  standard dev.  0.0


In [592]:
# render latex table
print('Model & Mean & Std. dev. \\\\ \n\\hline')
best_k = ''
best_mean = 0
for k, scores_acc in results.items():
    if np.mean(scores_acc) > best_mean:
        best_mean = np.mean(scores_acc)
        best_k = k
    print('%s & %0.4f & %0.4f\\\\' % (k, np.mean(scores_acc), np.std(scores_acc)))
print('\nbest acc:', best_k, round(best_mean,4))

Model & Mean & Std. dev. \\ 
\hline
Decision Tree & 1.0000 & 0.0000\\

best acc: Decision Tree 1.0
