In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import holidays
import pytz

from datetime import datetime
from scipy import stats

from catboost import *
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import ward, fcluster
from scipy.spatial.distance import pdist
from bayes_opt import BayesianOptimization

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.model_selection import train_test_split
train = pd.read_csv('data/train_v2.csv', parse_dates=['date'])
test = pd.read_csv('data/test_non_anomaly_v2.csv', parse_dates=['date'])
y_train = train[['online_hours']]
X_train = train.drop(['driver_id', 'day', 'date', 'online_hours', 
                      'holiday', 'next_holiday', 'prev_holiday'],axis=1)
X_test = test.drop(['driver_id', 'day', 'date', 'online_hours', 
                      'holiday', 'next_holiday', 'prev_holiday'],axis=1)
y_test = test[['online_hours']]

In [4]:
X_train_subset, X_test_subset, y_train_subset, y_test_subset = train_test_split(X_train, y_train, 
                                                                                test_size=0.2, random_state=0)

# XGBoost 

In [5]:
class xgboost_target :
    def __init__(self, x_train, y_train, x_test, y_test) :
        self.x_train = x_train.values
        self.y_train = y_train.values
        self.x_test = x_test.values
        self.y_test = y_test.values
        
    def clean_param(self, param) :
        booster_dict = {1:'gbtree', 2:'gblinear', 3:'dart'}
        params = {'random_state':0, 
                  'subsample':1, 
                  'colsample_bytree':1, 
                  'colsample_bylevel':1}
        
        params['objective'] = 'reg:squarederror'
        params['learning_rate'] = 0.1
        params['n_estimators'] = 100
        params['booster'] = booster_dict[int(param['booster'])]
        params['gamma'] = param['gamma']
        params['reg_alpha'] = param['reg_alpha']
        params['reg_lambda'] = param['reg_lambda']
        params['scale_pos_weight'] = param['scale_pos_weight']
        params['base_score'] = param['base_score']
        params['rate_drop'] = param['rate_drop']
        return params
        
    def evaluate(self,  
                 booster, 
                 gamma, 
                 reg_alpha, 
                 reg_lambda, 
                 scale_pos_weight, 
                 base_score, 
                 rate_drop):

        params = {}
        params['learning_rate'] = 0.1
        params['n_estimators'] = 100
        params['booster'] = booster
        params['gamma'] = gamma
        params['reg_alpha'] = reg_alpha
        params['reg_lambda'] = reg_lambda
        params['scale_pos_weight'] = scale_pos_weight
        params['base_score'] = base_score
        params['rate_drop'] = rate_drop
        params = self.clean_param(params)

        xgb_model = XGBRegressor(**params)
        xgb_model.fit(self.x_train, self.y_train)
        y_pred = xgb_model.predict(self.x_test)
        predictions = [value for value in y_pred]
        mse = mean_squared_error(self.y_test, predictions)
        rmse = np.sqrt(mse)
        return -1*rmse

In [6]:
xt = xgboost_target(X_train_subset, y_train_subset, X_test_subset, y_test_subset)
xgbBO = BayesianOptimization(xt.evaluate, {'booster' : (1, 3),
                                            'gamma' : (0, 50),
                                            'reg_alpha' : (0, 50),
                                            'reg_lambda' : (0, 50),
                                            'scale_pos_weight' : (0, 50),
                                            'base_score' : (0, 50),
                                            'rate_drop': (0, 1)},
                            random_state = 1)

xgbBO.maximize(init_points=15, n_iter=10)

|   iter    |  target   | base_s... |  booster  |   gamma   | rate_drop | reg_alpha | reg_la... | scale_... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-3.543   [0m | [0m 20.85   [0m | [0m 2.441   [0m | [0m 0.005719[0m | [0m 0.3023  [0m | [0m 7.338   [0m | [0m 4.617   [0m | [0m 9.313   [0m |
| [95m 2       [0m | [95m-1.926   [0m | [95m 17.28   [0m | [95m 1.794   [0m | [95m 26.94   [0m | [95m 0.4192  [0m | [95m 34.26   [0m | [95m 10.22   [0m | [95m 43.91   [0m |
| [0m 3       [0m | [0m-3.156   [0m | [0m 1.369   [0m | [0m 2.341   [0m | [0m 20.87   [0m | [0m 0.5587  [0m | [0m 7.019   [0m | [0m 9.905   [0m | [0m 40.04   [0m |
| [0m 4       [0m | [0m-1.932   [0m | [0m 48.41   [0m | [0m 1.627   [0m | [0m 34.62   [0m | [0m 0.8764  [0m | [0m 44.73   [0m | [0m 4.252   [0m | [0m 1.953   [0m |
| [0m 5       [0m | [0m-3.427   [0m | 

In [7]:
xgbBO.max

{'target': -1.914544397899434,
 'params': {'base_score': 6.501428605913883,
  'booster': 1.0387339157405941,
  'gamma': 33.94177664699455,
  'rate_drop': 0.21162811600005904,
  'reg_alpha': 13.277332968611312,
  'reg_lambda': 24.578657964016916,
  'scale_pos_weight': 2.668127255854019}}

# LGBM 

In [8]:
class lgbm_target :
    def __init__(self, x_train, y_train, x_test, y_test) :
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        
    def clean_param(self, param) :
        params = {'boosting_type':'gbdt', 'class_weight':None, 'colsample_bytree':1.0, 
                  'importance_type':'split', 'learning_rate':0.1,
                  'min_child_samples':20, 'min_split_gain':0.0, 'n_estimators':100, 'objective':None,
                  'random_state':0, 'reg_alpha':0.0, 'reg_lambda':0.0, 'silent':True,
                  'subsample':1.0, 'subsample_for_bin':200000, 'subsample_freq':0}
        params['num_leaves'] = int(param['num_leaves'])
        params['min_child_weight'] = int(param['min_child_weight'])
        params['max_depth'] = int(param['max_depth'])
        params['learning_rate'] = 0.1
        params['min_data_in_bin'] = 1
        params['min_data'] = 1
        
        params['min_child_samples'] = int(param['min_child_samples'])
        params['bagging_fraction'] = param['bagging_fraction']
        params['lambda_l1'] = param['lambda_l1']
        params['lambda_l2'] = param['lambda_l2']

        return params
        
    def evaluate(self, min_child_weight, max_depth, num_leaves,
                min_child_samples, bagging_fraction, lambda_l1, lambda_l2):
        params = {'num_leaves':num_leaves, 
                  'min_child_weight':min_child_weight, 
                  'max_depth':max_depth,
                 'min_child_samples':min_child_samples,
                 'bagging_fraction' : bagging_fraction,
                 'lambda_l1' : lambda_l1,
                 'lambda_l2' : lambda_l2}
        
        params = self.clean_param(params)

        lgbm_model = LGBMRegressor(**params)
        lgbm_model.fit(self.x_train, self.y_train)
        y_pred = lgbm_model.predict(self.x_test)
        predictions = [value for value in y_pred]
        mse = mean_squared_error(self.y_test, predictions)
        rmse = np.sqrt(mse)
        return -1*rmse

In [9]:
lt = lgbm_target(X_train_subset, y_train_subset, X_test_subset, y_test_subset)
lgbmBO = BayesianOptimization(lt.evaluate, {'min_child_weight': (0.01, 1),
                                              'max_depth': (1, 7),
                                              'num_leaves': (5, 50),
                                            'min_child_samples' :(10,50),
                                            'bagging_fraction' : (0.5,1),
                                            'lambda_l1' : (0,1),
                                            'lambda_l2' : (0,1)
                                           }, 
                             random_state=3)

lgbmBO.maximize(init_points=15, n_iter=10)

|   iter    |  target   | baggin... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_ch... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-1.898   [0m | [0m 0.7754  [0m | [0m 0.7081  [0m | [0m 0.2909  [0m | [0m 4.065   [0m | [0m 45.72   [0m | [0m 0.8973  [0m | [0m 10.65   [0m |
| [0m 2       [0m | [0m-2.056   [0m | [0m 0.6036  [0m | [0m 0.05147 [0m | [0m 0.4408  [0m | [0m 1.179   [0m | [0m 28.27   [0m | [0m 0.6527  [0m | [0m 17.53   [0m |
| [95m 3       [0m | [95m-1.893   [0m | [95m 0.8381  [0m | [95m 0.5909  [0m | [95m 0.02398 [0m | [95m 4.353   [0m | [95m 20.37   [0m | [95m 0.421   [0m | [95m 17.76   [0m |
| [0m 4       [0m | [0m-1.895   [0m | [0m 0.8466  [0m | [0m 0.4405  [0m | [0m 0.1569  [0m | [0m 4.268   [0m | [0m 41.21   [0m | [0m 0.3133  [0m | [0m 14.99   [0m |
| [0m 5       [0m | [0m-1.894   [0m | 

In [10]:
lgbmBO.max

{'target': -1.8888994232221157,
 'params': {'bagging_fraction': 0.5,
  'lambda_l1': 0.0,
  'lambda_l2': 1.0,
  'max_depth': 7.0,
  'min_child_samples': 17.795932537903322,
  'min_child_weight': 0.01,
  'num_leaves': 24.30000086877875}}

# Random Forest 

In [11]:
class rf_target :
    def __init__(self, x_train, y_train, x_test, y_test) :
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        
    def clean_param(self, param) :
        params = {
            'max_depth' : 5,
            'min_samples_split' : 2,
            'min_samples_leaf': 1,
            'min_weight_fraction_leaf': 0.0,
            'min_impurity_decrease': 0.0
            
        }
        params['max_depth'] = int(param['max_depth'])
        params['min_samples_split'] = int(param['min_samples_split'])
        params['min_samples_leaf'] = int(param['min_samples_leaf'])
        params['min_weight_fraction_leaf'] = param['min_weight_fraction_leaf']
        params['min_impurity_decrease'] = param['min_impurity_decrease']

        return params
        
    def evaluate(self, max_depth, min_samples_split, min_samples_leaf,
                min_weight_fraction_leaf, min_impurity_decrease):
        params = {'max_depth':max_depth, 
                  'min_samples_split':min_samples_split, 
                  'min_samples_leaf':min_samples_leaf,
                 'min_weight_fraction_leaf':min_weight_fraction_leaf,
                 'min_impurity_decrease' : min_impurity_decrease}
        
        params = self.clean_param(params)

        rf_model = RandomForestRegressor(**params)
        rf_model.fit(self.x_train, self.y_train)
        y_pred = rf_model.predict(self.x_test)
        predictions = [value for value in y_pred]
        mse = mean_squared_error(self.y_test, predictions)
        rmse = np.sqrt(mse)
        return -1*rmse

In [12]:
rf = rf_target(X_train_subset, y_train_subset, X_test_subset, y_test_subset)
rfBO = BayesianOptimization(rf.evaluate, {'max_depth': (1, 10),
                                              'min_samples_split': (10, 20),
                                              'min_samples_leaf': (5, 50),
                                            'min_weight_fraction_leaf' :(0,0.5),
                                            'min_impurity_decrease' : (0,1)
                                           }, 
                             random_state=3)

rfBO.maximize(init_points=15, n_iter=10)

|   iter    |  target   | max_depth | min_im... | min_sa... | min_sa... | min_we... |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.363   [0m | [0m 5.957   [0m | [0m 0.7081  [0m | [0m 18.09   [0m | [0m 15.11   [0m | [0m 0.4465  [0m |
| [95m 2       [0m | [95m-2.15    [0m | [95m 9.067   [0m | [95m 0.1256  [0m | [95m 14.33   [0m | [95m 10.51   [0m | [95m 0.2204  [0m |
| [0m 3       [0m | [0m-2.39    [0m | [0m 1.269   [0m | [0m 0.4568  [0m | [0m 34.21   [0m | [0m 12.78   [0m | [0m 0.3381  [0m |
| [0m 4       [0m | [0m-2.184   [0m | [0m 6.318   [0m | [0m 0.02398 [0m | [0m 30.15   [0m | [0m 12.59   [0m | [0m 0.2076  [0m |
| [0m 5       [0m | [0m-2.202   [0m | [0m 3.552   [0m | [0m 0.6931  [0m | [0m 24.82   [0m | [0m 11.57   [0m | [0m 0.2723  [0m |
| [0m 6       [0m | [0m-2.372   [0m | [0m 8.023   [0m | [0m 0.3064  [0m | [0m 14.99   [0m | [0m 13.8

In [13]:
rfBO.max

{'target': -1.8979323609243068,
 'params': {'max_depth': 10.0,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 27.643044588529033,
  'min_samples_split': 20.0,
  'min_weight_fraction_leaf': 0.0}}

# Extra Tree 

In [14]:
class et_target :
    def __init__(self, x_train, y_train, x_test, y_test) :
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        
    def clean_param(self, param) :
        params = {
            'max_depth' : 5,
            'min_samples_split' : 2,
            'min_samples_leaf': 1,
            'min_weight_fraction_leaf': 0.0,
            'min_impurity_decrease': 0.0
            
        }
        params['max_depth'] = int(param['max_depth'])
        params['min_samples_split'] = int(param['min_samples_split'])
        params['min_samples_leaf'] = int(param['min_samples_leaf'])
        params['min_weight_fraction_leaf'] = param['min_weight_fraction_leaf']
        params['min_impurity_decrease'] = param['min_impurity_decrease']

        return params
        
    def evaluate(self, max_depth, min_samples_split, min_samples_leaf,
                min_weight_fraction_leaf, min_impurity_decrease):
        params = {'max_depth':max_depth, 
                  'min_samples_split':min_samples_split, 
                  'min_samples_leaf':min_samples_leaf,
                 'min_weight_fraction_leaf':min_weight_fraction_leaf,
                 'min_impurity_decrease' : min_impurity_decrease}
        
        params = self.clean_param(params)

        et_model = ExtraTreesRegressor(**params)
        et_model.fit(self.x_train, self.y_train)
        y_pred = et_model.predict(self.x_test)
        predictions = [value for value in y_pred]
        mse = mean_squared_error(self.y_test, predictions)
        rmse = np.sqrt(mse)
        return -1*rmse

In [15]:
et = et_target(X_train_subset, y_train_subset, X_test_subset, y_test_subset)
etBO = BayesianOptimization(et.evaluate, {'max_depth': (1, 10),
                                              'min_samples_split': (10, 20),
                                              'min_samples_leaf': (5, 50),
                                            'min_weight_fraction_leaf' :(0,0.5),
                                            'min_impurity_decrease' : (0,1)
                                           }, 
                             random_state=3)

etBO.maximize(init_points=15, n_iter=10)

|   iter    |  target   | max_depth | min_im... | min_sa... | min_sa... | min_we... |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.564   [0m | [0m 5.957   [0m | [0m 0.7081  [0m | [0m 18.09   [0m | [0m 15.11   [0m | [0m 0.4465  [0m |
| [95m 2       [0m | [95m-2.259   [0m | [95m 9.067   [0m | [95m 0.1256  [0m | [95m 14.33   [0m | [95m 10.51   [0m | [95m 0.2204  [0m |
| [0m 3       [0m | [0m-2.376   [0m | [0m 1.269   [0m | [0m 0.4568  [0m | [0m 34.21   [0m | [0m 12.78   [0m | [0m 0.3381  [0m |
| [0m 4       [0m | [0m-2.283   [0m | [0m 6.318   [0m | [0m 0.02398 [0m | [0m 30.15   [0m | [0m 12.59   [0m | [0m 0.2076  [0m |
| [0m 5       [0m | [0m-2.337   [0m | [0m 3.552   [0m | [0m 0.6931  [0m | [0m 24.82   [0m | [0m 11.57   [0m | [0m 0.2723  [0m |
| [0m 6       [0m | [0m-2.602   [0m | [0m 8.023   [0m | [0m 0.3064  [0m | [0m 14.99   [0m | [0m 13.8

In [16]:
etBO.max

{'target': -1.8652185981184386,
 'params': {'max_depth': 10.0,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 5.0,
  'min_samples_split': 20.0,
  'min_weight_fraction_leaf': 0.0}}