In [92]:
#!kaggle competitions download -c competitive-data-science-predict-future-sales
#!pip install lightgbm
#!pip install xgboost
#!conda install catboost
#!pip install ipywidgets
#!jupyter nbextension enable --py widgetsnbextension

In [1]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
%matplotlib inline 

pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 50)

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from tqdm import tqdm_notebook

from itertools import product

import xgboost as xgb
import sklearn.svm as svm
from sklearn.linear_model import LinearRegression
from catboost import Pool, CatBoostRegressor


def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

## Brief data analysis

Here is recommendation from lections:

Try to carefully tune hyper parameters of your models, maybe there is a better set of parameters for your model out there. But don't spend too much time on it. Try ensembling. Start with simple averaging of linear model and gradient boosted trees like in programming assignment notebook. And then try to use stacking. Explore new features! There is a lot of useful information in the data: text descriptions, item categories, seasonal trends.

In [2]:
items = pd.read_csv('final_project_data/items.csv')
item_categories = pd.read_csv('final_project_data/item_categories.csv')
sales_train = pd.read_csv('final_project_data/sales_train.csv.gz')
sample_submission = pd.read_csv('final_project_data/sample_submission.csv.gz')
shops = pd.read_csv('final_project_data/shops.csv')
test = pd.read_csv('final_project_data/test.csv.gz')

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
items.head(3)
item_categories.head(3)
print(sales_train.shape)
sales_train.head(3)
print(test.shape)
test.head(3)
print(sample_submission.shape)
sample_submission.head(3)

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40


Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2


(2935849, 6)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0


(214200, 3)


Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233


(214200, 2)


Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5


I used separate notebook to draw histograms and scatter plots as well as prepare previous value benchmark prediction.

## Prepare some features
Let's prepare features and dataset

In [5]:
sales = sales_train

# Create "grid" with columns
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
# We can not use 
grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales.loc[sales['date_block_num'] == block_num, 'shop_id'].unique()
#     print(cur_shops)
    cur_items = sales.loc[sales['date_block_num'] == block_num, 'item_id'].unique()
#     print(cur_items)
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))
#     print(grid)

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)
# print(grid)

# Groupby data to get shop-item-month aggregates
gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':'sum'})
print(gb.head())
gb.rename(columns={'item_cnt_day':'item_cnt_month'}, inplace=True)

# Join it to the grid
trainset = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)
print(trainset.head())

   shop_id  item_id  date_block_num  item_cnt_day
0        0       30               1          31.0
1        0       31               1          11.0
2        0       32               0           6.0
3        0       32               1          10.0
4        0       33               0           3.0
   shop_id  item_id  date_block_num  item_cnt_month
0       59    22154               0             1.0
1       59     2552               0             0.0
2       59     2554               0             0.0
3       59     2555               0             0.0
4       59     2564               0             0.0


Let's check how many items (shop_id, item_id, date_block_num) have zero sales per month

In [6]:
trainset.shape
sales_train.shape
sales_train.shape[0] / trainset.shape[0]

(10913850, 4)

(2935849, 6)

0.2690021394833171

In [7]:
test.shape
test[~test.item_id.isin(trainset.item_id.unique())].shape

(214200, 3)

(15246, 3)

So, we have 15246 rows which contains new items in a test set for which we will not have lag information.

In [8]:
test.item_id.unique().shape
trainset.item_id.unique().shape
test.item_id.unique()[np.isin(test.item_id.unique(), trainset.item_id.unique(), invert=True)].shape
15246 / 363

(5100,)

(21807,)

(363,)

42.0

So, we have 214200 items in test set, from which 15246 contains new item_id, which is about 7% оf data and we will have to predict values for them based on item descriptions taken from item category.
But, first let's try to do something for items for which we have historical data...

In [9]:
#let's merge trainset and testset together
testset = test.drop(columns=['ID'])
testset['date_block_num'] = 34
testset['item_cnt_month'] = -1
train_test_set = pd.concat([testset, trainset], sort=False)
train_test_set.head(1)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month
0,5,5037,34,-1.0


Now, make some features

In [10]:
# Same as above but with shop-month aggregates
gb = sales.groupby(['shop_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'item_cnt_month_shop':'sum'}})
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
train_test_set = pd.merge(train_test_set, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

# Same as above but with item-month aggregates
gb = sales.groupby(['item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'item_cnt_month_item':'sum'}})
gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
train_test_set = pd.merge(train_test_set, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
train_test_set = downcast_dtypes(train_test_set)
del gb 
gc.collect();
print(train_test_set.head())

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


84

   shop_id  item_id  date_block_num  item_cnt_month  item_cnt_month_shop  \
0        5     5037              34            -1.0                  0.0   
1        5     5320              34            -1.0                  0.0   
2        5     5233              34            -1.0                  0.0   
3        5     5232              34            -1.0                  0.0   
4        5     5268              34            -1.0                  0.0   

   item_cnt_month_item  
0                  0.0  
1                  0.0  
2                  0.0  
3                  0.0  
4                  0.0  


In [11]:
# List of columns that we will use to create lags
cols_to_rename = list(train_test_set.columns.difference(index_cols)) 
print(cols_to_rename)

shift_range = [1, 2, 3, 4, 5, 6, 7, 12, 24]

['item_cnt_month', 'item_cnt_month_item', 'item_cnt_month_shop']


In [12]:
for shift in tqdm_notebook(shift_range):
    train_test_set_copy = train_test_set[index_cols + cols_to_rename].copy()
    train_test_set_copy['date_block_num'] += shift
    renamer = lambda x: '{}_lag_{}'.format(x, shift) if x in cols_to_rename else x
    train_test_set_copy = train_test_set_copy.rename(columns=renamer)
    train_test_set = train_test_set.merge(train_test_set_copy, on=index_cols, how='left').fillna(0)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




In [13]:
train_test_set.head(3)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_cnt_month_shop,item_cnt_month_item,item_cnt_month_lag_1,item_cnt_month_item_lag_1,item_cnt_month_shop_lag_1,item_cnt_month_lag_2,item_cnt_month_item_lag_2,item_cnt_month_shop_lag_2,item_cnt_month_lag_3,item_cnt_month_item_lag_3,item_cnt_month_shop_lag_3,item_cnt_month_lag_4,item_cnt_month_item_lag_4,item_cnt_month_shop_lag_4,item_cnt_month_lag_5,item_cnt_month_item_lag_5,item_cnt_month_shop_lag_5,item_cnt_month_lag_6,item_cnt_month_item_lag_6,item_cnt_month_shop_lag_6,item_cnt_month_lag_7,item_cnt_month_item_lag_7,item_cnt_month_shop_lag_7,item_cnt_month_lag_12,item_cnt_month_item_lag_12,item_cnt_month_shop_lag_12,item_cnt_month_lag_24,item_cnt_month_item_lag_24,item_cnt_month_shop_lag_24
0,5,5037,34,-1.0,0.0,0.0,0.0,25.0,1052.0,1.0,110.0,1092.0,3.0,119.0,1294.0,1.0,54.0,991.0,1.0,105.0,954.0,1.0,87.0,1012.0,0.0,28.0,1054.0,1.0,65.0,1445.0,0.0,0.0,0.0
1,5,5320,34,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,5233,34,-1.0,0.0,0.0,1.0,42.0,1052.0,3.0,80.0,1092.0,1.0,150.0,1294.0,0.0,37.0,991.0,2.0,119.0,954.0,3.0,71.0,1012.0,0.0,8.0,1054.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
train_test_set = train_test_set.merge(items[['item_id', 'item_category_id']], how='left', on='item_id')
train_test_set = downcast_dtypes(train_test_set)
gc.collect();

In [15]:
train_test_set.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_cnt_month_shop,item_cnt_month_item,item_cnt_month_lag_1,item_cnt_month_item_lag_1,item_cnt_month_shop_lag_1,item_cnt_month_lag_2,item_cnt_month_item_lag_2,item_cnt_month_shop_lag_2,item_cnt_month_lag_3,item_cnt_month_item_lag_3,item_cnt_month_shop_lag_3,item_cnt_month_lag_4,item_cnt_month_item_lag_4,item_cnt_month_shop_lag_4,item_cnt_month_lag_5,item_cnt_month_item_lag_5,item_cnt_month_shop_lag_5,item_cnt_month_lag_6,item_cnt_month_item_lag_6,item_cnt_month_shop_lag_6,item_cnt_month_lag_7,item_cnt_month_item_lag_7,item_cnt_month_shop_lag_7,item_cnt_month_lag_12,item_cnt_month_item_lag_12,item_cnt_month_shop_lag_12,item_cnt_month_lag_24,item_cnt_month_item_lag_24,item_cnt_month_shop_lag_24,item_category_id
0,5,5037,34,-1.0,0.0,0.0,0.0,25.0,1052.0,1.0,110.0,1092.0,3.0,119.0,1294.0,1.0,54.0,991.0,1.0,105.0,954.0,1.0,87.0,1012.0,0.0,28.0,1054.0,1.0,65.0,1445.0,0.0,0.0,0.0,19
1,5,5320,34,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55
2,5,5233,34,-1.0,0.0,0.0,1.0,42.0,1052.0,3.0,80.0,1092.0,1.0,150.0,1294.0,0.0,37.0,991.0,2.0,119.0,954.0,3.0,71.0,1012.0,0.0,8.0,1054.0,0.0,0.0,0.0,0.0,0.0,0.0,19
3,5,5232,34,-1.0,0.0,0.0,0.0,28.0,1052.0,0.0,48.0,1092.0,1.0,65.0,1294.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23
4,5,5268,34,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20


In [16]:
#Number of month from last sale of shop/item (Use info from past)
#Number of month from last sale of item(Use info from past)¶
#Item name (Tfidf text feature)
train_test_set.item_cnt_month = train_test_set.item_cnt_month.fillna(0).clip(0,20)

In [17]:
train_test_set.to_csv('train_test_set.csv')

In [19]:
train_test_set = pd.read_csv('train_test_set.csv')

# Fitting model
Try to use different models, do hyperparameters turning using train and validation set.
Submit prediction for the model trained on full train set.

Try the following:
- XGBoost
- CatBoost
- SVM
- LinearRegression

TODO: SVM (too long to wait), GBDT, Neural Network, kNN
This set contains models from 4 different classes, so it is the most diverse set and it should be in general case be the best for stacking

In [84]:
train_test_set.columns

Index(['Unnamed: 0', 'shop_id', 'item_id', 'date_block_num', 'item_cnt_month',
       'item_cnt_month_shop', 'item_cnt_month_item', 'item_cnt_month_lag_1',
       'item_cnt_month_item_lag_1', 'item_cnt_month_shop_lag_1',
       'item_cnt_month_lag_2', 'item_cnt_month_item_lag_2',
       'item_cnt_month_shop_lag_2', 'item_cnt_month_lag_3',
       'item_cnt_month_item_lag_3', 'item_cnt_month_shop_lag_3',
       'item_cnt_month_lag_4', 'item_cnt_month_item_lag_4',
       'item_cnt_month_shop_lag_4', 'item_cnt_month_lag_5',
       'item_cnt_month_item_lag_5', 'item_cnt_month_shop_lag_5',
       'item_cnt_month_lag_6', 'item_cnt_month_item_lag_6',
       'item_cnt_month_shop_lag_6', 'item_cnt_month_lag_7',
       'item_cnt_month_item_lag_7', 'item_cnt_month_shop_lag_7',
       'item_cnt_month_lag_12', 'item_cnt_month_item_lag_12',
       'item_cnt_month_shop_lag_12', 'item_cnt_month_lag_24',
       'item_cnt_month_item_lag_24', 'item_cnt_month_shop_lag_24',
       'item_category_id'],
     

In [21]:
train_time_min = 0
train_time_max = 32
train_time_val = 33
train_time_test = 34

train_set = train_test_set[(train_test_set.date_block_num >= train_time_min) & 
                           (train_test_set.date_block_num <= train_time_max)]
drop_columns = ['item_cnt_month', 'item_cnt_month_shop', 'item_cnt_month_item']
train_set_y = train_set.item_cnt_month
train_set_x = train_set.drop(columns=drop_columns)


val_set = train_test_set[(train_test_set.date_block_num == train_time_val)]
val_set_y = val_set.item_cnt_month
val_set_x = val_set.drop(columns=drop_columns)


test_set = train_test_set[(train_test_set.date_block_num == train_time_test)]
test_set_x = test_set.drop(columns=drop_columns)
test_set_y = test_set.item_cnt_month

full_train_set = train_test_set[(train_test_set.date_block_num >= train_time_min) &
                                (train_test_set.date_block_num <= train_time_val)]
full_train_set_x = full_train_set.drop(columns=drop_columns)
full_train_set_y = full_train_set.item_cnt_month

## XGBRegression

In [63]:
# Use train_set_x/train_set_y and val_set_x/val_set_y to find optimal hyperparameters
xgb_model = xgb.XGBRegressor()

eval_set = [(val_set_x.values, val_set_y.values)]
xgb_model.fit(X=train_set_x.values, y=train_set_y.values, eval_metric='rmse',
          eval_set=eval_set)

[0]	validation_0-rmse:1.12993
[1]	validation_0-rmse:1.10356
[2]	validation_0-rmse:1.08196
[3]	validation_0-rmse:1.06437
[4]	validation_0-rmse:1.04903
[5]	validation_0-rmse:1.03615
[6]	validation_0-rmse:1.02533
[7]	validation_0-rmse:1.01648
[8]	validation_0-rmse:1.00893
[9]	validation_0-rmse:1.00268
[10]	validation_0-rmse:0.997432
[11]	validation_0-rmse:0.993085
[12]	validation_0-rmse:0.988875
[13]	validation_0-rmse:0.985875
[14]	validation_0-rmse:0.983124
[15]	validation_0-rmse:0.980945
[16]	validation_0-rmse:0.979304
[17]	validation_0-rmse:0.977178
[18]	validation_0-rmse:0.975725
[19]	validation_0-rmse:0.974137
[20]	validation_0-rmse:0.97299
[21]	validation_0-rmse:0.972265
[22]	validation_0-rmse:0.971785
[23]	validation_0-rmse:0.970655
[24]	validation_0-rmse:0.969756
[25]	validation_0-rmse:0.969307
[26]	validation_0-rmse:0.968531
[27]	validation_0-rmse:0.968134
[28]	validation_0-rmse:0.967677
[29]	validation_0-rmse:0.967131
[30]	validation_0-rmse:0.966345
[31]	validation_0-rmse:0.9656

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [98]:
# Use full_train_set_x/full_train_set_y and test_set_x to predict final values

model = xgb.XGBRegressor()

model.fit(X=full_train_set_x.values, y=full_train_set_y.values, eval_metric='rmse', nthread = 8)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [101]:
pred = np.clip(model.predict(test_set_x.values), 0, 20)
df = pd.DataFrame(pred, columns = ['item_cnt_month'])
df['ID'] = df.index
df = df.set_index('ID')
df.to_csv('xgb_pred.csv')

In [102]:
!kaggle competitions submit -c competitive-data-science-final-project -f xgb_pred.csv -m "Predict using XGBoost model"

100%|███████████████████████████████████████| 3.46M/3.46M [00:06<00:00, 562kB/s]
Successfully submitted to Final project: predict future sales

In [103]:
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f xgb_pred.csv -m "Predict using XGBoost model"

100%|███████████████████████████████████████| 3.46M/3.46M [00:05<00:00, 700kB/s]
Successfully submitted to Predict Future Sales

## LinearRegression

In [23]:
# Now train SVM model and make linear combination of SVM and XGB models
lr_model = LinearRegression()
lr_model.fit(X=full_train_set_x.values, y=full_train_set_y.values)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [24]:
lr_pred = np.clip(lr_model.predict(test_set_x.values), 0, 20)
lr_df = pd.DataFrame(lr_pred, columns = ['item_cnt_month'])
lr_df['ID'] = lr_df.index
lr_df = lr_df.set_index('ID')
lr_df.to_csv('lr_pred.csv')

In [25]:
!kaggle competitions submit -c competitive-data-science-final-project -f lr_pred.csv -m "Predict using LinearRegression model"

100%|███████████████████████████████████████| 5.16M/5.16M [00:08<00:00, 626kB/s]
Successfully submitted to Final project: predict future sales

In [26]:
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f lr_pred.csv -m "Predict using LinearRegression model"

100%|███████████████████████████████████████| 5.16M/5.16M [00:08<00:00, 667kB/s]
Successfully submitted to Predict Future Sales

## CatBoost

In [41]:
train_set_x.columns

Index(['Unnamed: 0', 'shop_id', 'item_id', 'date_block_num',
       'item_cnt_month_lag_1', 'item_cnt_month_item_lag_1',
       'item_cnt_month_shop_lag_1', 'item_cnt_month_lag_2',
       'item_cnt_month_item_lag_2', 'item_cnt_month_shop_lag_2',
       'item_cnt_month_lag_3', 'item_cnt_month_item_lag_3',
       'item_cnt_month_shop_lag_3', 'item_cnt_month_lag_4',
       'item_cnt_month_item_lag_4', 'item_cnt_month_shop_lag_4',
       'item_cnt_month_lag_5', 'item_cnt_month_item_lag_5',
       'item_cnt_month_shop_lag_5', 'item_cnt_month_lag_6',
       'item_cnt_month_item_lag_6', 'item_cnt_month_shop_lag_6',
       'item_cnt_month_lag_7', 'item_cnt_month_item_lag_7',
       'item_cnt_month_shop_lag_7', 'item_cnt_month_lag_12',
       'item_cnt_month_item_lag_12', 'item_cnt_month_shop_lag_12',
       'item_cnt_month_lag_24', 'item_cnt_month_item_lag_24',
       'item_cnt_month_shop_lag_24', 'item_category_id'],
      dtype='object')

In [42]:
cat_features = [1, 2, 31]

train_pool = Pool(train_set_x.values, 
                  train_set_y.values, cat_features=cat_features)
eval_pool = Pool(val_set_x.values, 
                 val_set_y.values, cat_features=cat_features)

test_pool = Pool(test_set_x.values, 
                 test_set_y.values, cat_features=cat_features)
full_train_pool = Pool(full_train_set_x.values, 
                 full_train_set_y.values, cat_features=cat_features)

In [51]:
cb_model = CatBoostRegressor(iterations=50,
                             #depth=5,
                             learning_rate=0.1,
                             loss_function='RMSE')

In [52]:
#train the model
cb_model.fit(train_pool, verbose=True, eval_set=eval_pool)

0:	learn: 1.2153801	test: 1.1314660	best: 1.1314660 (0)	total: 2.6s	remaining: 2m 7s
1:	learn: 1.1771990	test: 1.1038188	best: 1.1038188 (1)	total: 4.97s	remaining: 1m 59s
2:	learn: 1.1447868	test: 1.0806592	best: 1.0806592 (2)	total: 7.21s	remaining: 1m 53s
3:	learn: 1.1174379	test: 1.0606022	best: 1.0606022 (3)	total: 9.38s	remaining: 1m 47s
4:	learn: 1.0940679	test: 1.0437758	best: 1.0437758 (4)	total: 11.6s	remaining: 1m 44s
5:	learn: 1.0742150	test: 1.0300732	best: 1.0300732 (5)	total: 13.7s	remaining: 1m 40s
6:	learn: 1.0572393	test: 1.0187123	best: 1.0187123 (6)	total: 16.1s	remaining: 1m 38s
7:	learn: 1.0416459	test: 1.0070225	best: 1.0070225 (7)	total: 18.3s	remaining: 1m 35s
8:	learn: 1.0294643	test: 0.9987969	best: 0.9987969 (8)	total: 20.5s	remaining: 1m 33s
9:	learn: 1.0157625	test: 0.9929415	best: 0.9929415 (9)	total: 23.2s	remaining: 1m 32s
10:	learn: 1.0042177	test: 0.9859275	best: 0.9859275 (10)	total: 25.7s	remaining: 1m 30s
11:	learn: 0.9858239	test: 0.9835597	best: 

<catboost.core.CatBoostRegressor at 0x7efd835f3748>

In [60]:
# make the prediction using the resulting model
cb_pred_val = cb_model.predict(eval_pool)
print(cb_pred_val)

[0.06885867 0.58769883 0.29647026 ... 0.08327507 0.08327507 0.06885867]


In [61]:
from sklearn.metrics import mean_squared_error

In [62]:
mean_squared_error(val_set_y.values, np.clip(cb_pred_val, 0 , 20))

0.967389707378649

In [57]:
cb_pred = np.clip(cb_model.predict(test_pool), 0, 20)
cb_df = pd.DataFrame(cb_pred, columns = ['item_cnt_month'])
cb_df['ID'] = cb_df.index
cb_df = cb_df.set_index('ID')
cb_df.to_csv('cb_pred.csv')

In [59]:
!kaggle competitions submit -c competitive-data-science-final-project -f cb_pred.csv -m "Predict using CatBoost model"
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f cb_pred.csv -m "Predict using CatBoost model"

100%|██████████████████████████████████████| 5.35M/5.35M [00:03<00:00, 1.64MB/s]
100%|██████████████████████████████████████| 5.35M/5.35M [00:02<00:00, 1.93MB/s]
Successfully submitted to Predict Future Sales

## Ensembling

Try to use linear combination of LinearRegression, CatBoost and XGBoost

In [64]:
lr_model = LinearRegression()
lr_model.fit(X=train_set_x.values, y=train_set_y.values)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [65]:
xb_pred = np.clip(xgb_model.predict(val_set_x.values), 0, 20)
lr_pred = np.clip(lr_model.predict(val_set_x.values), 0, 20)
cb_pred = np.clip(cb_model.predict(val_set_x.values), 0, 20)

In [66]:
mean_squared_error(xb_pred, val_set_y.values)
mean_squared_error(lr_pred, val_set_y.values)
mean_squared_error(cb_pred, val_set_y.values)

0.9120986292023764

1.0974785287366815

0.967389707378649

In [73]:
alphas_to_try = np.linspace(0, 1, 1001)

# YOUR CODE GOES HERE
best_alpha = 0# YOUR CODE GOES HERE
r2_train_simple_mix = 100# YOUR CODE GOES HERE

for alpha in alphas_to_try:
    y_pred = alpha * (0.915 * xb_pred + (1 - 0.915) * cb_pred) + (1 - alpha) * lr_pred
    score = mean_squared_error(val_set_y.values, y_pred)
    if score < r2_train_simple_mix:
        best_alpha = alpha
        r2_train_simple_mix = score

print('Best alpha: %f; Corresponding rmse on train: %f' % (best_alpha, r2_train_simple_mix))

Best alpha: 0.988000; Corresponding rmse on train: 0.911596


Train LinearRegression, CatBoost and XGBoost models on full train set and submit predictions

In [74]:
lr_full_model = LinearRegression()
lr_full_model.fit(X=full_train_set_x.values, y=full_train_set_y.values)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [None]:
# Use full_train_set_x/full_train_set_y and test_set_x to predict final values

xgb_full_model = xgb.XGBRegressor(nthread = 8)
xgb_full_model.fit(X=full_train_set_x.values, y=full_train_set_y.values, eval_metric='rmse')

In [77]:
xgb_full_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=8, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [79]:
cb_full_model = CatBoostRegressor(iterations=15,
                             #depth=5,
                             learning_rate=0.1,
                             loss_function='RMSE')
cb_full_model.fit(full_train_pool, verbose=True)

0:	learn: 1.2136207	total: 2.51s	remaining: 35.1s
1:	learn: 1.1757179	total: 4.88s	remaining: 31.7s
2:	learn: 1.1434749	total: 7.15s	remaining: 28.6s
3:	learn: 1.1161515	total: 9.52s	remaining: 26.2s
4:	learn: 1.0897444	total: 12s	remaining: 24s
5:	learn: 1.0634160	total: 14.3s	remaining: 21.4s
6:	learn: 1.0394609	total: 16.1s	remaining: 18.4s
7:	learn: 1.0206620	total: 18.1s	remaining: 15.8s
8:	learn: 1.0030630	total: 20.9s	remaining: 14s
9:	learn: 0.9864289	total: 23.5s	remaining: 11.8s
10:	learn: 0.9748635	total: 25.8s	remaining: 9.39s
11:	learn: 0.9632853	total: 29.2s	remaining: 7.29s
12:	learn: 0.9547901	total: 31.8s	remaining: 4.9s
13:	learn: 0.9458146	total: 34.3s	remaining: 2.45s
14:	learn: 0.9373982	total: 37.1s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7efd835f09e8>

In [80]:
xb_test_pred = np.clip(xgb_full_model.predict(test_set_x.values), 0, 20)
lr_test_pred = np.clip(lr_full_model.predict(test_set_x.values), 0, 20)
cb_test_pred = np.clip(cb_full_model.predict(test_set_x.values), 0, 20)

In [81]:
y_pred = best_alpha * (0.915 * xb_test_pred + (1 - 0.915) * cb_test_pred) + (1 - best_alpha) * lr_test_pred

In [82]:
y_pred = np.clip(y_pred, 0, 20)
test_df = pd.DataFrame(y_pred, columns = ['item_cnt_month'])
test_df['ID'] = test_df.index
test_df = test_df.set_index('ID')
test_df.to_csv('test_pred.csv')

In [83]:
!kaggle competitions submit -c competitive-data-science-final-project -f test_pred.csv -m "Predict using CatBoost,LinearRegresssion and XGBoost model"
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f test_pred.csv -m "Predict using CatBoost,LinearRegresssion and XGBoost model"

100%|███████████████████████████████████████| 5.36M/5.36M [00:11<00:00, 508kB/s]
100%|██████████████████████████████████████| 5.36M/5.36M [00:05<00:00, 1.09MB/s]
Successfully submitted to Predict Future Sales

I was able to improve score from  0.99042 to 0.98989, which is only slight improvement.
So, introducing more features seems to be more promissing way to improve score.
Will try to do next submission by using only XGBoost and introduce more features.