In [99]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from itertools import product

In [2]:
item_categories = pd.read_csv('data/item_categories.csv', index_col='item_category_id')
items = pd.read_csv('data/items.csv', index_col='item_id')
shops = pd.read_csv('data/shops.csv', index_col='shop_id')
train_df = pd.read_csv('data/train_dataset.csv')
test_df = pd.read_csv('data/test_dataset.csv')

In [54]:
train_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,total_income,days,weeks
0,2013-01-01,0,18,5823,2500.0,1.0,35,2500.0,0,0
1,2013-01-01,0,27,5573,849.0,1.0,2,849.0,0,0
2,2013-01-01,0,7,1006,399.0,1.0,67,399.0,0,0
3,2013-01-01,0,19,17707,899.0,1.0,19,899.0,0,0
4,2013-01-01,0,14,19548,149.0,1.0,40,149.0,0,0


In [3]:
train_sales = train_df.groupby(['shop_id', 'item_id', 'date_block_num']).agg({'item_cnt_day': 'sum', 'item_category_id': 'mean'}).reset_index()
test_sales = test_df.groupby(['shop_id', 'item_id', 'date_block_num']).agg({'item_cnt_day': 'sum', 'item_category_id': 'mean'}).reset_index()

In [43]:
train_df.groupby(['shop_id', 'item_id', 'date_block_num']).agg({'item_cnt_day': 'sum', 'item_category_id': 'mean'}).reset_index()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_day,item_category_id
0,0,30,1,31.0,40
1,0,31,1,11.0,37
2,0,32,0,6.0,40
3,0,32,1,10.0,40
4,0,33,0,3.0,37
...,...,...,...,...,...
1577588,59,22164,27,2.0,37
1577589,59,22164,30,1.0,37
1577590,59,22167,9,1.0,49
1577591,59,22167,11,2.0,49


In [71]:
train_sales = pd.read_csv('data/sales_train.csv')

In [72]:
train_sales = train_sales.groupby(['shop_id', 'item_id', 'date_block_num']).agg({'item_cnt_day': 'sum'}).reset_index()

In [39]:
train_sales.set_index(['shop_id', 'item_id', 'date_block_num'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_cnt_day,item_name,item_category_id
shop_id,item_id,date_block_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,30,1,31.0,007: КООРДИНАТЫ «СКАЙФОЛЛ»,40
0,31,1,11.0,007: КООРДИНАТЫ «СКАЙФОЛЛ» (BD),37
0,32,0,6.0,1+1,40
0,32,1,10.0,1+1,40
0,33,0,3.0,1+1 (BD),37
...,...,...,...,...,...
59,22164,27,2.0,ЯРОСТЬ (BD),37
59,22164,30,1.0,ЯРОСТЬ (BD),37
59,22167,9,1.0,Язык запросов 1С:Предприятия 8 (+CD). Хрустале...,49
59,22167,11,2.0,Язык запросов 1С:Предприятия 8 (+CD). Хрустале...,49


In [41]:
59*22169*34

44471014

In [46]:
train = []

for i in train_sales['date_block_num'].unique():
    all_shop = train_sales.loc[train_sales['date_block_num']==i, 'shop_id'].unique()
    all_item = train_sales.loc[train_sales['date_block_num']==i, 'item_id'].unique()
    train.append(np.array(list(product([i], all_shop, all_item))))

idx_features = ['date_block_num', 'shop_id', 'item_id']
train = pd.DataFrame(np.vstack(train), columns=idx_features)

train

Unnamed: 0,date_block_num,shop_id,item_id
0,1,0,30
1,1,0,31
2,1,0,32
3,1,0,33
4,1,0,35
...,...,...,...
10913845,29,59,5395
10913846,29,59,5865
10913847,29,59,7806
10913848,29,59,11876


In [75]:
all_sales = train.merge(train_sales, how='left', on=idx_features)

In [76]:
all_sales.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_day
0,1,0,30,31.0
1,1,0,31,11.0
2,1,0,32,10.0
3,1,0,33,3.0
4,1,0,35,14.0


In [77]:
all_sales.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10913850 entries, 0 to 10913849
Data columns (total 4 columns):
 #   Column          Non-Null Count     Dtype  
---  ------          --------------     -----  
 0   date_block_num  10913850 non-null  int64  
 1   shop_id         10913850 non-null  int64  
 2   item_id         10913850 non-null  int64  
 3   item_cnt_day    1609124 non-null   float64
dtypes: float64(1), int64(3)
memory usage: 416.3 MB


In [79]:
all_sales = all_sales.merge(items.drop(columns=['item_name']), how='left', on='item_id')

In [80]:
all_sales.fillna(value=0, inplace=True)

In [82]:
all_sales.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10913850 entries, 0 to 10913849
Data columns (total 5 columns):
 #   Column            Non-Null Count     Dtype  
---  ------            --------------     -----  
 0   date_block_num    10913850 non-null  int64  
 1   shop_id           10913850 non-null  int64  
 2   item_id           10913850 non-null  int64  
 3   item_cnt_day      10913850 non-null  float64
 4   item_category_id  10913850 non-null  int64  
dtypes: float64(1), int64(4)
memory usage: 499.6 MB


In [83]:
train_X = all_sales[all_sales.date_block_num < 33].copy()
train_y = train_X.item_cnt_day
train_X.drop(columns=['item_cnt_day'], inplace=True)

In [84]:
test_X = all_sales[all_sales.date_block_num == 33].copy()
test_y = test_X.item_cnt_day
test_X.drop(columns=['item_cnt_day'], inplace=True)

In [85]:
boosting = GradientBoostingRegressor(n_estimators=200)

In [86]:
boosting.fit(train_X, train_y)

GradientBoostingRegressor(n_estimators=200)

In [87]:
predicted = boosting.predict(test_X)

In [88]:
def MSE(series_true, series_predicted):
    return sum((series_true - series_predicted)**2 / len(series_true))**0.5

In [89]:
MSE(test_y, predicted)

5.1730721708771865

In [96]:
mses = [MSE(test_y, est[0].predict(test_X)) for est in boosting.estimators_]

In [98]:
min(mses)

5.200919507987963

In [100]:
tree = DecisionTreeRegressor(max_depth=20)

In [101]:
tree.fit(train_X, train_y)

DecisionTreeRegressor(max_depth=20)

In [102]:
MSE(test_y, tree.predict(test_X))

4.971666290284159

In [103]:
mses = {}
max_depths = list(range(2, 21))
for max_depth in max_depths:
    tree = DecisionTreeRegressor(max_depth=max_depth)
    tree.fit(train_X, train_y)
    mses[max_depth] = MSE(test_y, tree.predict(test_X))
    print('max_depth:', max_depth)
    print('MSE:', mses[max_depth])
    print('--------------------------------------------')

max_depth: 2
MSE: 5.340807043723677
--------------------------------------------
max_depth: 3
MSE: 5.338903378052625
--------------------------------------------
max_depth: 4
MSE: 5.226081905338992
--------------------------------------------
max_depth: 5
MSE: 5.229955366147048
--------------------------------------------
max_depth: 6
MSE: 5.287071566530162
--------------------------------------------
max_depth: 7
MSE: 5.153591204994831
--------------------------------------------
max_depth: 8
MSE: 5.132349287491448
--------------------------------------------
max_depth: 9
MSE: 5.0967532830413536
--------------------------------------------
max_depth: 10
MSE: 5.063229204642131
--------------------------------------------
max_depth: 11
MSE: 5.193188084465128
--------------------------------------------
max_depth: 12
MSE: 5.271393432667224
--------------------------------------------
max_depth: 13
MSE: 5.092865718227058
--------------------------------------------
max_depth: 14
MSE: 5.10