## Training Model

In [127]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost
import lightgbm as lgb
from catboost import Pool, CatBoostRegressor
from scipy.stats import uniform
import pickle
import time
import math

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingClassifier

from vecstack import stacking
from bayes_opt import BayesianOptimization

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", None)

SEED = 123
start_0 = time.time()
%matplotlib inline

In [86]:
alldfs = [var for var in dir() if isinstance(eval(var), pd.core.frame.DataFrame)]
print(alldfs)

['_', '_17', '_19', '_20', '_23', '_25', '_3', '_30', '_32', '_34', '_46', '_5', '_51', '_53', '_57', '_6', '_61', '_62', '_65', '_67', '_68', '_74', '_80', '_84', '__', '___', 'all_data', 'all_data_1', 'all_data_2', 'correlation', 'data', 'df_cat_var', 'df_cities', 'df_numeric_var', 'df_numeric_var_scaled', 'group', 'item_categories', 'items', 'missing_value_df', 'newcols', 'sales', 'shop_life', 'shops', 'test', 'transactions', 'x_price']


In [87]:
for i in dir():
    if isinstance(globals()[i], pd.DataFrame):
        del globals()[i]

gc.collect()

121174

#### Load Data

In [88]:
DATA_FOLDER = "./"

data = pd.read_pickle('data.pkl')
test = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv.gz')) #test data

In [89]:
data.pop('ID')

4488710          0.0
4488711          0.0
4488712          0.0
4488713          0.0
4488714          0.0
              ...   
11127999    214195.0
11128000    214196.0
11128001    214197.0
11128002    214198.0
11128003    214199.0
Name: ID, Length: 6639294, dtype: float64

In [90]:
data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'target', 'city_code',
       'item_category_id', 'meta_category_code', 'subtype_code',
       'item_target_enc', 'shop_target_enc', 'item_category_target_enc',
       'city_code_target_enc', 'meta_category_code_target_enc',
       'subtype_code_target_enc', 'target_lag_1', 'target_lag_3',
       'target_lag_6', 'date_avg_item_cnt_lag_1',
       'date_item_avg_item_cnt_lag_1', 'date_item_avg_item_cnt_lag_3',
       'date_item_avg_item_cnt_lag_6', 'date_shop_avg_item_cnt_lag_1',
       'date_shop_avg_item_cnt_lag_3', 'date_shop_avg_item_cnt_lag_6',
       'date_category_avg_item_cnt_lag_1',
       'date_shop_category_avg_item_cnt_lag_1',
       'date_shop_meta_category_avg_item_cnt_lag_1',
       'date_shop_subtype_avg_item_cnt_lag_1', 'date_city_avg_item_cnt_lag_1',
       'date_item_city_avg_item_cnt_lag_1',
       'date_meta_category_avg_item_cnt_lag_1',
       'date_subtype_avg_item_cnt_lag_1', 'delta_price_lag', 'month', 'days'],
     

In [91]:
data.dtypes

date_block_num                                  int64
shop_id                                         int64
item_id                                         int64
target                                        float16
city_code                                       int64
item_category_id                                int64
meta_category_code                              int64
subtype_code                                    int64
item_target_enc                               float64
shop_target_enc                               float64
item_category_target_enc                      float64
city_code_target_enc                          float64
meta_category_code_target_enc                 float64
subtype_code_target_enc                       float64
target_lag_1                                  float64
target_lag_3                                  float64
target_lag_6                                  float64
date_avg_item_cnt_lag_1                       float16
date_item_avg_item_cnt_lag_1

In [92]:
#Function used to downcast
def downcast_dtypes(df):
    '''
          Changes column types in the dataframe: 
            `float64` type to `float32`
            `int64`   type to `int32`
      '''

    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]

    # Downcast
    df[float_cols] = df[float_cols].astype(np.float16)
    df[int_cols]   = df[int_cols].astype(np.int16)

    return df
downcast_dtypes(data)

Unnamed: 0,date_block_num,shop_id,item_id,target,city_code,item_category_id,meta_category_code,subtype_code,item_target_enc,shop_target_enc,item_category_target_enc,city_code_target_enc,meta_category_code_target_enc,subtype_code_target_enc,target_lag_1,target_lag_3,target_lag_6,date_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_3,date_item_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_1,date_shop_avg_item_cnt_lag_3,date_shop_avg_item_cnt_lag_6,date_category_avg_item_cnt_lag_1,date_shop_category_avg_item_cnt_lag_1,date_shop_meta_category_avg_item_cnt_lag_1,date_shop_subtype_avg_item_cnt_lag_1,date_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_1,date_meta_category_avg_item_cnt_lag_1,date_subtype_avg_item_cnt_lag_1,delta_price_lag,month,days
4488710,12,2,27,0.0,1,19,5,10,0.065796,0.101196,0.989746,0.101196,0.818848,0.897949,0.0,0.0,0.0,0.470459,0.086975,0.130493,0.065247,0.156006,0.098877,0.096008,1.181641,0.965820,0.814941,0.943359,0.156006,0.0,1.125000,1.163086,-0.282715,0,31
4488711,12,2,30,0.0,1,40,11,4,3.562500,0.101196,0.263672,0.101196,0.242188,0.263672,0.0,0.0,0.0,0.470459,1.021484,0.521973,0.891113,0.156006,0.098877,0.096008,0.309082,0.046234,0.051727,0.046234,0.156006,0.0,0.281006,0.309082,-0.483398,0,31
4488712,12,2,31,0.0,1,37,11,1,2.179688,0.101196,0.198242,0.101196,0.242188,0.199829,0.0,0.0,0.0,0.470459,0.543457,0.543457,0.304443,0.156006,0.098877,0.096008,0.234009,0.059448,0.051727,0.064697,0.156006,0.0,0.281006,0.235107,-0.137451,0,31
4488713,12,2,32,1.0,1,40,11,4,2.509766,0.101135,0.263672,0.101135,0.242188,0.263672,0.0,0.0,0.0,0.470459,1.934570,1.260742,1.891602,0.156006,0.098877,0.096008,0.309082,0.046234,0.051727,0.046234,0.156006,0.0,0.281006,0.309082,-0.407227,0,31
4488714,12,2,33,1.0,1,37,11,1,0.764160,0.101196,0.198242,0.101196,0.242188,0.199829,1.0,0.0,0.0,0.470459,0.913086,0.717285,1.000000,0.156006,0.098877,0.096008,0.234009,0.059448,0.051727,0.064697,0.156006,1.0,0.281006,0.235107,-0.225464,0,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11127999,34,45,18454,0.0,21,55,13,2,0.957520,0.175293,0.220215,0.181641,0.169678,0.220215,1.0,0.0,0.0,0.289307,0.045441,0.071411,0.590820,0.129639,0.139038,0.144287,0.197021,0.126831,0.089294,0.126831,0.136841,0.5,0.147095,0.197021,-0.475098,10,30
11128000,34,45,16188,0.0,21,64,14,42,0.031250,0.175293,0.298096,0.181641,0.396729,0.298096,0.0,0.0,0.0,0.289307,0.022720,0.000000,0.000000,0.129639,0.000000,0.000000,0.155640,0.094482,0.112976,0.094482,0.136841,0.0,0.313232,0.155640,0.081116,10,30
11128001,34,45,15757,0.0,21,55,13,2,0.227417,0.175293,0.220215,0.181641,0.169678,0.220215,0.0,0.0,0.0,0.289307,0.113647,0.095215,0.250000,0.129639,0.139038,0.144287,0.197021,0.126831,0.089294,0.126831,0.136841,0.0,0.147095,0.197021,0.155884,10,30
11128002,34,45,19648,0.0,21,40,11,4,0.103699,0.175293,0.250732,0.181641,0.220825,0.250732,0.0,0.0,0.0,0.289307,0.045441,0.166626,0.090881,0.129639,0.139038,0.144287,0.221558,0.083740,0.097046,0.083740,0.136841,0.0,0.226318,0.221558,-0.091736,10,30


In [93]:
data["target"] = data["target"].astype(np.float64)

In [94]:
data.dtypes

date_block_num                                  int16
shop_id                                         int16
item_id                                         int16
target                                        float64
city_code                                       int16
item_category_id                                int16
meta_category_code                              int16
subtype_code                                    int16
item_target_enc                               float16
shop_target_enc                               float16
item_category_target_enc                      float16
city_code_target_enc                          float16
meta_category_code_target_enc                 float16
subtype_code_target_enc                       float16
target_lag_1                                  float16
target_lag_3                                  float16
target_lag_6                                  float16
date_avg_item_cnt_lag_1                       float16
date_item_avg_item_cnt_lag_1

#### Split Data into Train Val and Test Set

In [95]:
%%time
X_train = data[data.date_block_num < 33].drop(['target'], axis=1)
y_train = data[data.date_block_num < 33]['target']
X_val = data[data.date_block_num == 33].drop(['target'], axis=1)
y_val = data[data.date_block_num == 33]['target']
X_test = data[data.date_block_num == 34].drop(['target'], axis=1)

CPU times: user 3.22 s, sys: 937 ms, total: 4.16 s
Wall time: 4.64 s


In [96]:
alldfs = [var for var in dir() if isinstance(eval(var), pd.core.frame.DataFrame)]
print(alldfs)

['X_test', 'X_train', 'X_val', '_92', '__', 'data', 'test']


#### Hyperparameter Tuning (XGBRegressor)

In [116]:
def xgboost_bo(n_estimators, eta):
    params_xgboost = {}
    
    params_xgboost['n_estimators'] = round(n_estimators)
    params_xgboost['eta'] = eta
    
    scores = cross_val_score(XGBRegressor(random_state=SEED, **params_xgboost),
                             X_train, y_train, scoring='neg_root_mean_squared_error', cv=2).mean()
    
    score = scores.mean()
    return score

In [117]:
%%time
# Run Bayesian Optimization

params_xgboost ={
    'n_estimators':(100, 1000),
    'eta': (0.1, 0.3)
}

xgboost_bo = BayesianOptimization(xgboost_bo, params_xgboost, random_state=SEED)
xgboost_bo.maximize(init_points=20, n_iter=4)

|   iter    |  target   |    eta    | n_esti... |
-------------------------------------------------
| [0m 1       [0m | [0m-0.8508  [0m | [0m 0.2393  [0m | [0m 357.5   [0m |
| [95m 2       [0m | [95m-0.8147  [0m | [95m 0.1454  [0m | [95m 596.2   [0m |
| [0m 3       [0m | [0m-0.8373  [0m | [0m 0.2439  [0m | [0m 480.8   [0m |
| [0m 4       [0m | [0m-0.9099  [0m | [0m 0.2962  [0m | [0m 716.3   [0m |
| [95m 5       [0m | [95m-0.8095  [0m | [95m 0.1962  [0m | [95m 452.9   [0m |
| [0m 6       [0m | [0m-0.8608  [0m | [0m 0.1686  [0m | [0m 756.1   [0m |
| [95m 7       [0m | [95m-0.8064  [0m | [95m 0.1877  [0m | [95m 153.7   [0m |
| [0m 8       [0m | [0m-0.8518  [0m | [0m 0.1796  [0m | [0m 764.2   [0m |
| [95m 9       [0m | [95m-0.7867  [0m | [95m 0.1365  [0m | [95m 257.9   [0m |
| [0m 10      [0m | [0m-0.8432  [0m | [0m 0.2063  [0m | [0m 578.6   [0m |
| [0m 11      [0m | [0m-0.9436  [0m | [0m 0.2269  [0m | [

In [123]:
params_xgboost = xgboost_bo.max['params']

params_xgboost['eta'] = round(params_xgboost['eta'],2)
params_xgboost['n_estimators'] = round(params_xgboost['n_estimators'])

params_xgboost

{'eta': 0.14, 'n_estimators': 258}

In [124]:
%%time
model = XGBRegressor(**params_xgboost)

model.fit(
    X_train, 
    y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, y_train), (X_val, y_val)], 
    verbose=True, 
    early_stopping_rounds = 5)

[0]	validation_0-rmse:1.12273	validation_1-rmse:1.09591
[1]	validation_0-rmse:1.05656	validation_1-rmse:1.04374
[2]	validation_0-rmse:1.00519	validation_1-rmse:1.00452
[3]	validation_0-rmse:0.96123	validation_1-rmse:0.97524
[4]	validation_0-rmse:0.92722	validation_1-rmse:0.95275
[5]	validation_0-rmse:0.90216	validation_1-rmse:0.93567
[6]	validation_0-rmse:0.87999	validation_1-rmse:0.92408
[7]	validation_0-rmse:0.86180	validation_1-rmse:0.91476
[8]	validation_0-rmse:0.84890	validation_1-rmse:0.90773
[9]	validation_0-rmse:0.83803	validation_1-rmse:0.90280
[10]	validation_0-rmse:0.82687	validation_1-rmse:0.89962
[11]	validation_0-rmse:0.81900	validation_1-rmse:0.89407
[12]	validation_0-rmse:0.81267	validation_1-rmse:0.89226
[13]	validation_0-rmse:0.80793	validation_1-rmse:0.89129
[14]	validation_0-rmse:0.80322	validation_1-rmse:0.88945
[15]	validation_0-rmse:0.79942	validation_1-rmse:0.88808
[16]	validation_0-rmse:0.79635	validation_1-rmse:0.88832
[17]	validation_0-rmse:0.79338	validation

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eta=0.14, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.140000001, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=258, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='approx', validate_parameters=1, verbosity=None)

In [125]:
y_pred = model.predict(X_val).clip(0, 20)
# Basic RMSE
print('The rmse of prediction is:', round(mean_squared_error(y_pred, y_val) ** 0.5, 5))

The rmse of prediction is: 0.88106


In [126]:
import pickle
y_pred = model.predict(X_val).clip(0, 20)
y_test = model.predict(X_test).clip(0, 20)

submission = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": y_test
})
submission.to_csv('Final_Project_xgboost_hyper_tune_submission_v1.csv', index=False)