In [701]:
import numpy as np
seed = 123
np.random.seed(seed)
import collections, copy, pickle
from termcolor import colored
from importlib import reload
import gc
from dateutil.parser import parse
import scipy.linalg, scipy.stats
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
from matplotlib import rcParams
rcParams['font.family'] = 'serif'
rcParams['font.size'] = 14
# rcParams['text.usetex'] = True
import seaborn as sns
from IPython.display import HTML

In [702]:
import sklearn.ensemble
import sklearn.svm
import sklearn.tree
import sklearn.linear_model
import sklearn.neighbors

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import sklearn.metrics

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [703]:
# Consider the training dataset
# Split the data up in n subsets, based on `search_id` to prevent occurence of a `search_id` in multiple sets
# Resample each subset, now based on classes: select 1/3 booking, 1/3 click (but no booking), 1/3 none
#     The resampling is used the preserve the class sizes
# Use crossvalidation on the n sets to select hyperparams
# Finally train model on full training dataset and make a prediction of the (unseen) test dataset

In [704]:
import util.data

In [705]:
data = pd.read_csv('data/training_set_VU_DM_clean.csv', sep=';', nrows=2*1000)
data_test = pd.read_csv('data/test_set_VU_DM_clean.csv', sep=';', nrows=1*1000)
# scores = pd.read_csv('data/scores_train.csv', sep=';')

In [706]:
data.drop(columns=['position'], inplace=True)

In [707]:
for k in data.columns:
    if data[k].isna().sum() > 0:
        print('rm ', data[k].isna().sum() / data.shape[0], k)
        data.drop(columns=[k], inplace=True)

rm  0.9815 comp1_rate
rm  0.98 comp1_inv
rm  0.984 comp1_rate_percent_diff
rm  0.59 comp2_rate
rm  0.5705 comp2_inv
rm  0.8935 comp2_rate_percent_diff
rm  0.6495 comp3_rate
rm  0.6335 comp3_inv
rm  0.9185 comp3_rate_percent_diff
rm  0.94 comp4_rate
rm  0.9315 comp4_inv
rm  0.9785 comp4_rate_percent_diff
rm  0.5105 comp5_rate
rm  0.4855 comp5_inv
rm  0.817 comp5_rate_percent_diff
rm  0.9235 comp6_rate
rm  0.9175 comp6_inv
rm  0.9655 comp6_rate_percent_diff
rm  0.9075 comp7_rate
rm  0.8915 comp7_inv
rm  0.96 comp7_rate_percent_diff
rm  0.5755 comp8_rate
rm  0.556 comp8_inv
rm  0.876 comp8_rate_percent_diff


In [800]:
ids = sklearn.utils.shuffle(data.srch_id.unique(), random_state=123)
# ids = data.srch_id.unique()
N = ids.size
N # total number of samples

80

In [797]:
y_labels = [ k for k in data.columns if k not in data_test.columns ]
y_labels

['click_bool', 'booking_bool', 'score']

In [801]:
n_cv_folds = 5
ids_selections = np.array_split(ids, n_cv_folds)
# split data based on id folds
data_splits = [ data.loc[data.srch_id.isin(srch_ids)] for srch_ids in ids_selections ]
selection_size, len(data_splits)
# a = [i for i in ids_selections]
# a

(16, 5)

In [802]:
sum_ = sum([split.shape[0] for split in data_splits])
assert sum_ == data.shape[0], (sum_, data.shape[0])

In [804]:
for i in data_splits[0].index:
    assert i not in data_splits[1].index

for i in range(min(10,len(data_splits))):
    for j in range(min(100, len(data_splits))):
        # check index
        if i != j:
            for idx in data_splits[i].index:
                assert idx not in data_splits[j].index

        # check attr srch_id
        if i != j:
            for srch_id in data_splits[i].srch_id:
                assert data_splits[j].query('srch_id == @srch_id').srch_id.size == 0

In [805]:
# we can now use (n-1) of the splits to train and the one other split to validate
# Now we will sample from a split to prevent class imbalance

In [806]:
# split slices again, into classes: select 1/3 booking, 1/3 click (but no booking), 1/3 none

In [865]:
def split_bookings_clicks_others(data):
    bookings = data.query('booking_bool == 1')
    clicks = data.query('click_bool == 1 and booking_bool != 1')
    others = data.query('click_bool != 1')
    
    for i in bookings.index[:100]:
        assert i not in clicks.index
        assert i not in others.index
    for i in clicks.index[:100]:
        assert i not in bookings.index
        assert i not in others.index

    return bookings, clicks, others


def sample(datasets=[], size_per_sample=100):
    data = None
    sample_indices = [ np.random.choice(data.index, size_per_sample)
                      for data in datasets
                     ]
    for i in range(len(sample_indices)):
        for j in range(len(sample_indices)):
            if i != j: assert sample_indices[i][0] not in sample_indices[j]

    # TODO should concatenation be shuffled?
    # sklearn.utils.shuffle
    return np.concatenate(sample_indices)
#     return sklearn.utils.shuffle(np.concatenate(sample_indices), random_state=1234)
#     samples = [ data.loc[np.random.choice(data.index, size_per_sample)] 
#                 for data in datasets
#               ]
#     ordered_dataset_samples = pd.concat(samples)
#     return ordered_dataset_samples.sample(frac=1)

def resample_bco_splits(bco_splits):
    # Returns a list of of folds, where each fold contains indices of bookings, clicks, others
    # :bco_splits = list of tuple of dataframes: (bookings, clicks, others)    
    folds = []
    for bco in bco_splits:
        n_max = max([df.shape[0] for df in bco_split ])
        # TODO use the average of n_max, n_min, to reduce the amount of artificial resampling?
        fold_indices = sample(bco, n_max)
        folds.append(fold_indices)

    return folds

def cv_folds(bco_splits):
    # Return "An iterable yielding (train, test) splits as arrays of indices"
    # I.e. the arg for sklearn.model_selection.cross_val_score(_, cv=arg)
    # :bco_splits = list of tuple of dataframes: (bookings, clicks, others)    
    folds = resample_bco_splits(bco_splits)
    # for each step, choose (n-1) train folds and 1 test fold
    n_folds = len(folds)
    cv_folds = []
    for i in range(n_folds):
        fold_indices = np.delete(np.arange(n_folds), i)
        # select & concatenate folds[indices]
        indices_train = np.concatenate([ folds[j] for j in fold_indices ])
        indices_test= folds[i]
        cv_folds.append((indices_train, indices_test))

    return cv_folds

# def combine_folds(folds, indices):
#     return np.concatenate([ folds[i][0] for i in indices ])
    
def split_xy(data: pd.DataFrame):
    return data.drop(columns=y_labels).values, data['score'].values

In [866]:
bco_splits = [ split_bookings_clicks_others(data) for data in data_splits ]

for i in range(len(bco_splits)):
    for j in range(len(bco_splits)):
        if i != j:
            for srch_id in bco_splits[i][0].srch_id:
                assert bco_splits[j][0].query('srch_id == @srch_id').shape[0] == 0
                assert bco_splits[j][1].query('srch_id == @srch_id').shape[0] == 0
                assert bco_splits[j][2].query('srch_id == @srch_id').shape[0] == 0

In [867]:
# e.g. for every cv split i
i = 0
bco_split = bco_splits[i]
bookings, clicks, others = bco_split
n_max = max([xy.shape[0] for xy in bco_split ])
n_min = min([xy.shape[0] for xy in bco_split ])
n_max, n_min

(370, 7)

In [868]:
size_per_sample = 2
assert len(sample([bookings, clicks, others], size_per_sample)) == 3 * size_per_sample

In [869]:
folds = cv_folds(bco_splits)
x, y = split_xy(data)
# sklearn.model_selection.PredefinedSplit()

In [816]:
# for i in range(len(folds)):
#     print(i)
#     for j in range(len(folds)):
#         print('\t', j)
#         if i != j:
#             x = folds[i][0][0]
#             assert folds[i][0][0] not in folds[j][0], (i,j, folds[i][0][0])

In [870]:
def cross_validation(model_func, x_train, y_train, cv_folds, k=None, results=None, v=1):
    scores_acc = cross_val_score(model_func, x_train, y_train, cv=cv_folds, scoring='accuracy') # roc_auc accuracy
    if results is not None:
        results[k] = scores_acc
    if v:
        print('scores per fold ', scores_acc)
        print('  mean score    ', np.mean(scores_acc))
        print('  standard dev. ', np.std(scores_acc))

In [871]:
seed = 123
models = {
#           'Logit': sklearn.linear_model.LogisticRegression(solver='liblinear',
#                                                            multi_class='ovr'),
# #           'SGD': sklearn.linear_model.SGDClassifier(loss="hinge", penalty="l2", max_iter=1000, tol=1e-3),
# #           'SVC auto': sklearn.svm.SVC(gamma='auto'), 
#           'SVC': sklearn.svm.SVC(kernel='linear'), 
# #           'SVC polynomial': sklearn.svm.SVC(kernel='poly', gamma='auto', degree=4),    
          'Decision Tree':  sklearn.tree.DecisionTreeClassifier(),
#           'KNN 5': sklearn.neighbors.KNeighborsClassifier(n_neighbors=5),
# #           'KNN 10': sklearn.neighbors.KNeighborsClassifier(n_neighbors=10),
#           'Ensemble Random Forest': sklearn.ensemble.RandomForestClassifier(n_estimators=100),
# #           'Ensemble Bagging': sklearn.ensemble.BaggingClassifier(n_estimators=100)
#             'Gradient Boost': sklearn.ensemble.GradientBoostingRegressor(loss='ls', learning_rate=0.1, 
#                             n_estimators=100, subsample=1.0, criterion='friedman_mse', 
#                             max_depth=3,random_state=seed, alpha=0.9, tol=0.0001)    
# #          'AdaBoost': sklearn.ensemble.AdaBoostRegressor()
         }

In [872]:
results = {}
for k,m in models.items():
    print(k)
    cross_validation(m, x, y, folds, k, results)

Decision Tree
scores per fold  [0.44954955 0.42702703 0.35135135 0.3954955  0.52072072]
  mean score     0.4288288288288289
  standard dev.  0.05655189683781503


In [873]:
# render latex table
print('Model & Mean & Std. dev. \\\\ \n\\hline')
best_k = ''
best_mean = 0
for k, scores_acc in results.items():
    if np.mean(scores_acc) > best_mean:
        best_mean = np.mean(scores_acc)
        best_k = k
    print('%s & %0.4f & %0.4f\\\\' % (k, np.mean(scores_acc), np.std(scores_acc)))
print('\nbest acc:', best_k, round(best_mean,4))

Model & Mean & Std. dev. \\ 
\hline
Decision Tree & 0.4288 & 0.0566\\

best acc: Decision Tree 0.4288


In [874]:
y = y_train[folds[0][0]]
y[y == 5].shape, y[y == 0].shape, y[y == 1].shape

((1480,), (1480,), (1480,))