In [1]:
import pandas as pd
import numpy as np
from numpy import absolute, mean, std, arange
import os
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import xgboost
from xgboost import XGBRegressor, plot_importance
import lightgbm as lgb
from catboost import Pool, CatBoostRegressor
from scipy.stats import uniform
from itertools import product
import pickle
import time
import math
import re
import gc

import joblib
from sklearn.svm import SVR
from sklearn import neighbors
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingClassifier
from mlxtend.preprocessing import minmax_scaling

from vecstack import stacking
from bayes_opt import BayesianOptimization

import warnings
# Silence every warning category for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Never elide columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Seed constant; no cell in the visible part of this notebook reads it.
SEED = 123
# Wall-clock start time, used by the final cell to report the total runtime.
start_0 = time.time()
%matplotlib inline

In [2]:
# List the names of all DataFrames currently bound in the notebook namespace.
# Objects are read straight from globals() instead of eval()-ing each name, and
# the items are snapshotted with list() so the scan cannot be disturbed by the
# namespace changing mid-iteration. sorted() keeps the alphabetical order that
# dir() used to provide.
alldfs = sorted(
    name
    for name, obj in list(globals().items())
    if isinstance(obj, pd.DataFrame)
)
print(alldfs)

[]


In [3]:
# Unbind every DataFrame still present in the notebook namespace, then run a
# garbage-collection pass. The names are gathered first and deleted afterwards.
stale_frames = [name for name in dir() if isinstance(globals()[name], pd.DataFrame)]
for name in stale_frames:
    del globals()[name]

gc.collect()

53

#### Load Data

In [4]:
# Folder that holds the input files; both reads below resolve against it so the
# data can be relocated by changing this one constant.
DATA_FOLDER = "./"

# Pre-built feature table. NOTE: unpickling can execute arbitrary code, so only
# load a data.pkl that was produced by a trusted pipeline.
data = pd.read_pickle(os.path.join(DATA_FOLDER, 'data.pkl'))
test = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv.gz')) #test data

In [5]:
# Show the column labels of the loaded feature table.
data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'target', 'ID', 'city_code',
       'item_category_id', 'meta_category_code', 'subtype_code',
       'item_target_enc', 'shop_target_enc', 'item_category_target_enc',
       'city_code_target_enc', 'meta_category_code_target_enc',
       'subtype_code_target_enc', 'target_lag_1', 'target_lag_3',
       'target_lag_6', 'date_avg_item_cnt_lag_1',
       'date_item_avg_item_cnt_lag_1', 'date_item_avg_item_cnt_lag_3',
       'date_item_avg_item_cnt_lag_6', 'date_shop_avg_item_cnt_lag_1',
       'date_shop_avg_item_cnt_lag_3', 'date_shop_avg_item_cnt_lag_6',
       'date_category_avg_item_cnt_lag_1',
       'date_shop_category_avg_item_cnt_lag_1',
       'date_shop_meta_category_avg_item_cnt_lag_1',
       'date_shop_subtype_avg_item_cnt_lag_1', 'date_city_avg_item_cnt_lag_1',
       'date_item_city_avg_item_cnt_lag_1',
       'date_meta_category_avg_item_cnt_lag_1',
       'date_subtype_avg_item_cnt_lag_1', 'delta_price_lag', 'month', 'days'],

In [6]:
# Show each column's dtype as loaded, before any downcasting.
data.dtypes

date_block_num                                  int64
shop_id                                         int64
item_id                                         int64
target                                        float16
ID                                            float64
city_code                                       int64
item_category_id                                int64
meta_category_code                              int64
subtype_code                                    int64
item_target_enc                               float64
shop_target_enc                               float64
item_category_target_enc                      float64
city_code_target_enc                          float64
meta_category_code_target_enc                 float64
subtype_code_target_enc                       float64
target_lag_1                                  float64
target_lag_3                                  float64
target_lag_6                                  float64
date_avg_item_cnt_lag_1     

In [7]:
#Function used to downcast
def downcast_dtypes(df, float_dtype=np.float16, int_dtype=np.int16):
    '''
    Downcast the 64-bit numeric columns of ``df`` in place to save memory.

    Columns whose dtype is exactly ``float64`` are cast to ``float_dtype`` and
    columns whose dtype is exactly ``int64`` are cast to ``int_dtype``; every
    other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downcast. It is modified in place.
    float_dtype : dtype, default ``np.float16``
        Target dtype for ``float64`` columns.
    int_dtype : dtype, default ``np.int16``
        Target dtype for ``int64`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The casts are unchecked, so values that do not fit the target dtype are
    not preserved (for example, large floats become ``inf`` in ``float16``
    and out-of-range integers may wrap). Pass wider target dtypes for columns
    holding such values.
    '''

    # Select columns to downcast (dtypes are read once from df.dtypes rather
    # than by materialising every column).
    float_cols = [col for col, dtype in df.dtypes.items() if dtype == "float64"]
    int_cols = [col for col, dtype in df.dtypes.items() if dtype == "int64"]

    # Downcast; empty selections are skipped so a frame lacking one of the two
    # kinds is a clean no-op.
    if float_cols:
        df[float_cols] = df[float_cols].astype(float_dtype)
    if int_cols:
        df[int_cols] = df[int_cols].astype(int_dtype)

    return df
# Downcast `data` in place; the call returns the same frame, which the notebook renders as this cell's output.
downcast_dtypes(data)

Unnamed: 0,date_block_num,shop_id,item_id,target,ID,city_code,item_category_id,meta_category_code,subtype_code,item_target_enc,shop_target_enc,item_category_target_enc,city_code_target_enc,meta_category_code_target_enc,subtype_code_target_enc,target_lag_1,target_lag_3,target_lag_6,date_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_3,date_item_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_1,date_shop_avg_item_cnt_lag_3,date_shop_avg_item_cnt_lag_6,date_category_avg_item_cnt_lag_1,date_shop_category_avg_item_cnt_lag_1,date_shop_meta_category_avg_item_cnt_lag_1,date_shop_subtype_avg_item_cnt_lag_1,date_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_1,date_meta_category_avg_item_cnt_lag_1,date_subtype_avg_item_cnt_lag_1,delta_price_lag,month,days
4488710,12,2,27,0.0,0.0,1,19,5,10,0.065796,0.101196,0.989746,0.101196,0.818848,0.897949,0.0,0.0,0.0,0.470459,0.086975,0.130493,0.065247,0.156006,0.098877,0.096008,1.181641,0.965820,0.814941,0.943359,0.156006,0.0,1.125000,1.163086,-0.282715,0,31
4488711,12,2,30,0.0,0.0,1,40,11,4,3.562500,0.101196,0.263672,0.101196,0.242188,0.263672,0.0,0.0,0.0,0.470459,1.021484,0.521973,0.891113,0.156006,0.098877,0.096008,0.309082,0.046234,0.051727,0.046234,0.156006,0.0,0.281006,0.309082,-0.483398,0,31
4488712,12,2,31,0.0,0.0,1,37,11,1,2.179688,0.101196,0.198242,0.101196,0.242188,0.199829,0.0,0.0,0.0,0.470459,0.543457,0.543457,0.304443,0.156006,0.098877,0.096008,0.234009,0.059448,0.051727,0.064697,0.156006,0.0,0.281006,0.235107,-0.137451,0,31
4488713,12,2,32,1.0,0.0,1,40,11,4,2.509766,0.101135,0.263672,0.101135,0.242188,0.263672,0.0,0.0,0.0,0.470459,1.934570,1.260742,1.891602,0.156006,0.098877,0.096008,0.309082,0.046234,0.051727,0.046234,0.156006,0.0,0.281006,0.309082,-0.407227,0,31
4488714,12,2,33,1.0,0.0,1,37,11,1,0.764160,0.101196,0.198242,0.101196,0.242188,0.199829,1.0,0.0,0.0,0.470459,0.913086,0.717285,1.000000,0.156006,0.098877,0.096008,0.234009,0.059448,0.051727,0.064697,0.156006,1.0,0.281006,0.235107,-0.225464,0,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11127999,34,45,18454,0.0,inf,21,55,13,2,0.957520,0.175293,0.220215,0.181641,0.169678,0.220215,1.0,0.0,0.0,0.289307,0.045441,0.071411,0.590820,0.129639,0.139038,0.144287,0.197021,0.126831,0.089294,0.126831,0.136841,0.5,0.147095,0.197021,-0.475098,10,30
11128000,34,45,16188,0.0,inf,21,64,14,42,0.031250,0.175293,0.298096,0.181641,0.396729,0.298096,0.0,0.0,0.0,0.289307,0.022720,0.000000,0.000000,0.129639,0.000000,0.000000,0.155640,0.094482,0.112976,0.094482,0.136841,0.0,0.313232,0.155640,0.081116,10,30
11128001,34,45,15757,0.0,inf,21,55,13,2,0.227417,0.175293,0.220215,0.181641,0.169678,0.220215,0.0,0.0,0.0,0.289307,0.113647,0.095215,0.250000,0.129639,0.139038,0.144287,0.197021,0.126831,0.089294,0.126831,0.136841,0.0,0.147095,0.197021,0.155884,10,30
11128002,34,45,19648,0.0,inf,21,40,11,4,0.103699,0.175293,0.250732,0.181641,0.220825,0.250732,0.0,0.0,0.0,0.289307,0.045441,0.166626,0.090881,0.129639,0.139038,0.144287,0.221558,0.083740,0.097046,0.083740,0.136841,0.0,0.226318,0.221558,-0.091736,10,30


In [8]:
# Promote the target from float16 (its dtype as loaded) to float64.
# NOTE(review): presumably so that model fitting and the RMSE run in full precision — confirm.
data["target"] = data["target"].astype(np.float64)

In [9]:
# Re-check the dtypes after the downcast and the target promotion.
data.dtypes

date_block_num                                  int16
shop_id                                         int16
item_id                                         int16
target                                        float64
ID                                            float16
city_code                                       int16
item_category_id                                int16
meta_category_code                              int16
subtype_code                                    int16
item_target_enc                               float16
shop_target_enc                               float16
item_category_target_enc                      float16
city_code_target_enc                          float16
meta_category_code_target_enc                 float16
subtype_code_target_enc                       float16
target_lag_1                                  float16
target_lag_3                                  float16
target_lag_6                                  float16
date_avg_item_cnt_lag_1     

### Create one-hot encoded features for linear models

In [10]:
# Of the int16 columns, keep only the calendar features that will be one-hot encoded.
int16_columns = data.select_dtypes(include=["int16"]).columns
categoricalcolumns = [name for name in int16_columns if name in ('month', 'days')]

categoricalcolumns

['month', 'days']

In [11]:
# Build the model input: the ID column plus one-hot indicators for every
# categorical column with few enough distinct values.
MAX_ONE_HOT_LEVELS = 100  # columns with more distinct values than this are skipped

# Start from a one-column frame (not a Series) so `.columns` below works even
# when no column ends up being encoded.
lm_parts = [data[['ID']]]
for col in categoricalcolumns:
    n_levels = data[col].nunique()  # computed once, reused for the message and the check
    print("Unique Vaule for {} is {}".format(col, n_levels))
    if n_levels <= MAX_ONE_HOT_LEVELS:
        # uint8 was the get_dummies default before pandas 2.0 (it is bool now);
        # pinning it keeps the indicators 0/1 regardless of the installed version.
        lm_parts.append(pd.get_dummies(data[col], prefix=col, dtype=np.uint8))

# Concatenate once instead of re-copying the growing frame on every iteration.
data_lm = pd.concat(lm_parts, axis=1)

print("Linear Model Data Input Columns {}".format(data_lm.columns))
data_lm.head()

Unique Vaule for month is 12
Unique Vaule for days is 3
Linear Model Data Input Columns Index(['ID', 'month_0', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5',
       'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
       'days_28', 'days_30', 'days_31'],
      dtype='object')


Unnamed: 0,ID,month_0,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,days_28,days_30,days_31
4488710,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4488711,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4488712,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4488713,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4488714,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


### MinMax Scaling features for linear models

In [12]:
# MinMax Scaling features for linear models
# Every float16 column except the row identifier is treated as a numeric feature.
numeric_var = [col for col in data.select_dtypes(include=['float16']).columns if col != 'ID']
print(numeric_var)

df_numeric_var = data[numeric_var]

# Min Max Scale
# NOTE(review): the min/max are taken over every row of `data`, i.e. including
# the months that are later held out for validation and test — confirm that
# this is acceptable.
df_numeric_var_scaled = minmax_scaling(df_numeric_var, columns=numeric_var)

# `ID` is only present while data_lm still holds the raw one-hot frame, so the
# guard makes re-running this cell a no-op instead of stacking duplicate
# columns onto data_lm. ID is dropped before the concat instead of popped after.
if 'ID' in data_lm.columns:
    data_lm = pd.concat(
        [data[['date_block_num', 'target']], data_lm.drop(columns='ID'), df_numeric_var_scaled],
        axis=1,
    )
data_lm.tail()

['item_target_enc', 'shop_target_enc', 'item_category_target_enc', 'city_code_target_enc', 'meta_category_code_target_enc', 'subtype_code_target_enc', 'target_lag_1', 'target_lag_3', 'target_lag_6', 'date_avg_item_cnt_lag_1', 'date_item_avg_item_cnt_lag_1', 'date_item_avg_item_cnt_lag_3', 'date_item_avg_item_cnt_lag_6', 'date_shop_avg_item_cnt_lag_1', 'date_shop_avg_item_cnt_lag_3', 'date_shop_avg_item_cnt_lag_6', 'date_category_avg_item_cnt_lag_1', 'date_shop_category_avg_item_cnt_lag_1', 'date_shop_meta_category_avg_item_cnt_lag_1', 'date_shop_subtype_avg_item_cnt_lag_1', 'date_city_avg_item_cnt_lag_1', 'date_item_city_avg_item_cnt_lag_1', 'date_meta_category_avg_item_cnt_lag_1', 'date_subtype_avg_item_cnt_lag_1', 'delta_price_lag']


Unnamed: 0,date_block_num,target,month_0,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,days_28,days_30,days_31,item_target_enc,shop_target_enc,item_category_target_enc,city_code_target_enc,meta_category_code_target_enc,subtype_code_target_enc,target_lag_1,target_lag_3,target_lag_6,date_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_3,date_item_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_1,date_shop_avg_item_cnt_lag_3,date_shop_avg_item_cnt_lag_6,date_category_avg_item_cnt_lag_1,date_shop_category_avg_item_cnt_lag_1,date_shop_meta_category_avg_item_cnt_lag_1,date_shop_subtype_avg_item_cnt_lag_1,date_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_1,date_meta_category_avg_item_cnt_lag_1,date_subtype_avg_item_cnt_lag_1,delta_price_lag
11127999,34,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0.004797,0.128582,0.002047,0.183704,0.012026,0.002047,0.002295,0.003056,0.003056,0.565903,0.001598,0.001693,0.003593,0.058583,0.062831,0.06528,0.000722,0.000863,0.000156,0.000316,0.10132,0.00201,0.003674,0.000722,0.130842
11128000,34,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0.000157,0.128582,0.002771,0.183704,0.028118,0.002771,0.00153,0.003056,0.003056,0.565903,0.001515,0.001431,0.001431,0.058583,0.0,8.3e-05,0.00057,0.000838,0.000197,0.000291,0.10132,0.001608,0.007825,0.00057,0.269489
11128001,34,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0.001139,0.128582,0.002047,0.183704,0.012026,0.002047,0.00153,0.003056,0.003056,0.565903,0.001847,0.00178,0.002346,0.058583,0.062831,0.06528,0.000722,0.000863,0.000156,0.000316,0.10132,0.001608,0.003674,0.000722,0.288127
11128002,34,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0.000519,0.128582,0.002331,0.183704,0.015651,0.002331,0.00153,0.003056,0.003056,0.565903,0.001598,0.002041,0.001764,0.058583,0.062831,0.06528,0.000812,0.00083,0.000169,0.000283,0.10132,0.001608,0.005654,0.000812,0.226403
11128003,34,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0.001809,0.128582,0.00163,0.183704,0.015651,0.001638,0.00153,0.003056,0.003056,0.565903,0.001681,0.001519,0.001681,0.058583,0.062831,0.06528,0.000963,0.000864,0.000169,0.000316,0.10132,0.00201,0.005654,0.000899,0.098223


### Split data into train, validation, and test sets (linear model)

In [13]:
%%time
X_train = data_lm[data_lm.date_block_num < 33].drop(['target'], axis=1)
y_train = data_lm[data_lm.date_block_num < 33]['target']
X_val = data_lm[data_lm.date_block_num == 33].drop(['target'], axis=1)
y_val = data_lm[data_lm.date_block_num == 33]['target']
X_test = data_lm[data_lm.date_block_num == 34].drop(['target'], axis=1)

CPU times: user 3.98 s, sys: 2.29 s, total: 6.27 s
Wall time: 6.47 s


In [14]:
%%time
# define model
n_neighbors = 5
model = neighbors.KNeighborsRegressor(n_neighbors, weights='uniform')

CPU times: user 40 µs, sys: 1e+03 ns, total: 41 µs
Wall time: 46.7 µs


In [15]:
%%time
model.fit(X_train, y_train)

# Save the model as a pickle in a file
joblib.dump(model, 'knn_model.pkl')

filename = 'knn_model.sav'
pickle.dump(model, open(filename, 'wb'))

CPU times: user 2.62 s, sys: 8.34 s, total: 11 s
Wall time: 14.4 s


In [16]:
%time
y_pred = model.predict(X_val).clip(0, 20)
# Basic RMSE
print('The rmse of prediction is:', round(mean_squared_error(y_pred, y_val) ** 0.5, 5))

# save predictions for an ensemble
pickle.dump(y_pred, open('knn_val.pickle', 'wb'))

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.25 µs


KeyboardInterrupt: 

In [None]:
%time
# Predict
y_test = model.predict(X_test).clip(0, 20)

# save predictions for submission
pickle.dump(y_test, open('knn_test.pickle', 'wb'))

In [None]:
# Report the total wall-clock runtime, measured from start_0, in minutes.
elapsed_minutes = (time.time() - start_0) / 60
print('It takes %s minutes' % elapsed_minutes)