In [1]:
import numpy as np
seed = 123
np.random.seed(seed)
import collections
from importlib import reload
import pandas as pd
import sklearn
from termcolor import colored # if error: pip3 install termcolor; conda install termcolor 

In [2]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import sklearn.metrics

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
import sklearn.ensemble

In [3]:
import util.data
import util.model
import util.ndcg

# 1) Use all features

In [4]:
# TODO nrows
data_all = pd.read_csv('data/training_set_VU_DM_clean.csv', sep=';', nrows=10*1000)
# data_all = pd.read_csv('data/training_set_VU_DM_clean.csv', sep=';', nrows=500*1000)
# data_all = pd.read_csv('data/training_set_VU_DM_clean.csv', sep=';')

data_test_unlabelled = pd.read_csv('data/test_set_VU_DM_clean.csv', sep=';', nrows=1000)

In [5]:
for k in data_all.columns:
    if data_all[k].isna().sum() > 0:
#         print('rm %0.4f' % (data_all[k].isna().sum() / data_all.shape[0]), k)
        data_all.drop(columns=[k], inplace=True)

gross_booking_scaler = preprocessing.RobustScaler(copy=False)

# set aside some labelled data for testing (based on srch_id)
ids = data_all.srch_id.unique()
ids_train, ids_test = train_test_split(ids, test_size=0.5, random_state=123)
data = data_all[data_all.srch_id.isin(ids_train)]
data_test = data_all[data_all.srch_id.isin(ids_test)]

k = 'gross_bookings_usd'
data_all.loc[data.index, k] = gross_booking_scaler.fit_transform(data[[k]].values)
data_all.loc[data_test.index, k] = gross_booking_scaler.transform(data_test[[k]].values)
# add noise to reduce dependency on this attr
data_all.loc[data.index, k] += np.random.normal(loc=0, scale=0.01, size=data[k].size)

# split cross validation folds
folds = util.data.cv_folds_for_sklearn(data, n_cv_folds=5, resampling_ratio=0)

# Use the full (pre-splitted) dataset because we use custom indices
# x_train, y_train will depend on the folds
x, y = util.data.split_xy(data_all)

x_test, y_test = util.data.split_xy(data_test)

In [6]:
data.shape, data_test.shape, ids.size # 20086

((4884, 85), (5116, 85), 412)

In [8]:
# TODO
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV
    
kwargs = {'random_state': 1234}
models = {}
params = {}
models['GBoost reg'] = xgb.XGBRegressor(criterion='friedman_mse', loss='ls', base_score=0, tol=1e-4, n_jobs=2, **kwargs)
models['GBoost ndcg'] = xgb.XGBRegressor(objective='rank:ndcg', criterion='friedman_mse', 
                                         loss='ls', base_score=0, tol=1e-4, n_jobs=2, **kwargs)
params['GBoost ndcg'] = {'n_estimators': [100, 1000],
                    'learning_rate': [0.01, 0.001],
                    'max_depth': [1,4]
                   }
params['GBoost reg'] = params['GBoost ndcg']
models['Ensemble Random Forest'] = sklearn.ensemble.RandomForestClassifier(n_jobs=2)
params['Ensemble Random Forest'] = {'n_estimators': [1000]}
models['AdaBoost'] = sklearn.ensemble.AdaBoostRegressor()
params['AdaBoost'] = {'learning_rate':[0.1],
                     'n_estimators': [50, 100],
                      'loss': ['linear']
                     }


# models['new'] = xgb.XGBRegressor(criterion='friedman_mse', 
#                           learning_rate=0.001, max_depth=4,
#                         loss='ls', base_score=0, tol=1e-4, n_jobs=4, **kwargs)
# params['new'] = {'n_estimators': [1000]}


scores = ['neg_mean_squared_error'] # TODO add ndcg here
print('shape', data_all.shape)
for k, model in models.items():
    for score in scores:
        print("\n# %s\t (score: %s)" % (k, score))
        assert k in params.keys(), 'models and params should have the same keys'
        clf = sklearn.model_selection.GridSearchCV(model, params[k], cv=folds, scoring=score, iid=True, n_jobs=2)
        s = 5. # scalar for score
        clf.fit(x, y/s) # cv/folds is used to select folds
        print("\tBest params (train)", clf.best_params_)
        y_true, y_pred = y_test/s, clf.predict(x_test)
        mse =  sklearn.metrics.mean_squared_error(y_true, y_pred)
        print(colored('\tmse (^2): %0.4f (%0.4f)' % (mse, mse*s), 'green'))
#         ndcg = util.ndcg.ndcg(x_test, y_test, y_pred)
#         print(colored('\tndcg: %0.4f' % ndcg, 'blue'))
        print(np.median(y_pred), np.max(y_pred))

print('done')

shape (10000, 85)

# GBoost reg	 (score: neg_mean_squared_error)
	Best params (train) {'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 500}
[32m	mse (^2): 0.9041 (4.5203)[0m
0.97659487 1.2980322

# GBoost ndcg	 (score: neg_mean_squared_error)


KeyboardInterrupt: 

# 2) Use a subset of features

In [11]:
# TODO nrows
data_all = pd.read_csv('data/training_set_VU_DM_clean.csv', sep=';', nrows=1000*1000)
# data_all = pd.read_csv('data/training_set_VU_DM_clean.csv', sep=';')

data_test_unlabelled = pd.read_csv('data/test_set_VU_DM_clean.csv', sep=';', nrows=1000)

In [12]:
features = [k for k in data_all.columns if 
            'score' in k or 
            'srch_id' in k or 
            'prop_id' in k or 
            'booking_bool' in k or
            'click_bool' in k or
            'unavailable_comp' in k or
            'available_comp' in k or
            'visitor_hist_starrating' in k or
            'delta_starrating' in k or
            'visitor_hist_adr_usd_log' in k or
            'price_usd_log' in k or
            'month' in k or
            'starrating' in k or
            'promotion' in k or
            'srch_length' in k or
            'distance' in k or
            'gross_booking' in k
           ]
features

['srch_id',
 'visitor_hist_starrating',
 'prop_id',
 'prop_review_score',
 'prop_location_score1',
 'prop_location_score2',
 'promotion_flag',
 'srch_length_of_stay',
 'srch_query_affinity_score',
 'orig_destination_distance',
 'click_bool',
 'gross_bookings_usd',
 'booking_bool',
 'srch_person_per_room_score',
 'srch_adults_per_room_score',
 'delta_starrating',
 'visitor_hist_adr_usd_log',
 'price_usd_log',
 'unavailable_comp',
 'available_comp',
 'month',
 'score',
 'prop_starrating_label0',
 'prop_starrating_label1',
 'prop_starrating_label2',
 'prop_starrating_label3',
 'srch_length_of_stay_bin0',
 'srch_length_of_stay_bin1',
 'srch_length_of_stay_bin2',
 'orig_destination_distance_bin0',
 'orig_destination_distance_bin1',
 'orig_destination_distance_bin2']

In [14]:
for k in data_all.columns:
    if data_all[k].isna().sum() > 0:
#         print('rm %0.4f' % (data_all[k].isna().sum() / data_all.shape[0]), k)
        data_all.drop(columns=[k], inplace=True)

gross_booking_scaler = preprocessing.RobustScaler(copy=False)

# set aside some labelled data for testing (based on srch_id)
ids = data_all.srch_id.unique()
ids_train, ids_test = train_test_split(ids, test_size=0.5, random_state=123)
data = data_all[data_all.srch_id.isin(ids_train)]
data_test = data_all[data_all.srch_id.isin(ids_test)]

k = 'gross_bookings_usd'
data_all.loc[data.index, k] = gross_booking_scaler.fit_transform(data[[k]].values)
data_all.loc[data_test.index, k] = gross_booking_scaler.transform(data_test[[k]].values)
# add noise to reduce dependency on this attr
data_all.loc[data.index, k] += np.random.normal(loc=0, scale=0.01, size=data[k].size)

# split cross validation folds
folds = util.data.cv_folds_for_sklearn(data, n_cv_folds=5, resampling_ratio=0)

# Use the full (pre-splitted) dataset because we use custom indices
# x_train, y_train will depend on the folds
x, y = util.data.split_xy(data_all)

x_test, y_test = util.data.split_xy(data_test)

In [15]:
kwargs = {'random_state': 1234}
models = {}
params = {}
models['GBoost reg'] = xgb.XGBRegressor(criterion='friedman_mse', loss='ls', base_score=0, tol=1e-4, n_jobs=2, **kwargs)
models['GBoost ndcg'] = xgb.XGBRegressor(objective='rank:ndcg', criterion='friedman_mse', 
                                         loss='ls', base_score=0, tol=1e-4, n_jobs=2, **kwargs)
params['GBoost reg'] = {'n_estimators': [100, 500, 1000],
                    'learning_rate': [0.2, 0.1, 0.01, 0.001],
                    'max_depth': [1,2,3,4]
                   }
params['GBoost ndcg'] = params['GBoost reg']
# models['Ensemble Random Forest'] = sklearn.ensemble.RandomForestClassifier(n_jobs=2)
# params['Ensemble Random Forest'] = {'n_estimators': [100]}
# models['AdaBoost'] = sklearn.ensemble.AdaBoostRegressor()
# params['AdaBoost'] = {'learning_rate':[0.2, 0.1, 0.01],
#                      'n_estimators': [50, 100, 500],
#                       'loss': ['linear', 'square']
#                      }

scores = ['neg_mean_squared_error'] # TODO add ndcg here
for k, model in models.items():
    for score in scores:
        print("\n# %s\t (score: %s)" % (k, score))
        assert k in params.keys(), 'models and params should have the same keys'
        clf = sklearn.model_selection.GridSearchCV(model, params[k], cv=folds, scoring=score, iid=True, n_jobs=2)
        clf.fit(x, y) # cv/folds is used to select folds
        print("\tBest params (train)", clf.best_params_)
        y_true, y_pred = y_test, clf.predict(x_test)
        mse =  sklearn.metrics.mean_squared_error(y_true, y_pred)
        print(colored('\tmse: %0.4f' % mse, 'green'))
        ndcg = util.ndcg.ndcg(x_test, y_test, y_pred)
        print(colored('\tndcg: %0.4f' % ndcg, 'blue'))

print('done')


# GBoost reg	 (score: neg_mean_squared_error)
	Best params (train) {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100}
[32m	mse: 0.4282[0m
[34m	ndcg: 0.4022[0m

# GBoost ndcg	 (score: neg_mean_squared_error)
	Best params (train) {'learning_rate': 0.2, 'max_depth': 1, 'n_estimators': 500}
[32m	mse: 0.7070[0m
[34m	ndcg: 0.4022[0m
done
