In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
import warnings

from category_encoders import TargetEncoder
from itertools import product

warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [2]:
items = pd.read_csv('competitive-data-science-predict-future-sales/items.csv')
item_categories = pd.read_csv('competitive-data-science-predict-future-sales/item_categories.csv')
shops = pd.read_csv('competitive-data-science-predict-future-sales/shops.csv')
sales = pd.read_csv('competitive-data-science-predict-future-sales/sales_train.csv')
test = pd.read_csv('competitive-data-science-predict-future-sales/test.csv')

In [3]:
train = sales.merge(items, on=['item_id'], how='left').merge(item_categories, on=['item_category_id'], how='left').merge(shops, on=['shop_id'], how='left')

In [4]:
test_shop_ids = test['shop_id'].unique()
test_item_ids = test['item_id'].unique()

train_ = train[(train['shop_id'].isin(test_shop_ids)) | (train['item_id'].isin(test_item_ids))]

In [16]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,item_category_name,shop_name
0,02.01.2013,0,59,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир"""
1,03.01.2013,0,25,2552,899.0,1.0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,"Москва ТРК ""Атриум"""
2,05.01.2013,0,25,2552,899.0,-1.0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,"Москва ТРК ""Атриум"""
3,06.01.2013,0,25,2554,1709.05,1.0,DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил,"Москва ТРК ""Атриум"""
4,15.01.2013,0,25,2555,1099.0,1.0,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства,"Москва ТРК ""Атриум"""


In [17]:
grouped = train_.sort_values('date').groupby(['date_block_num', 'item_category_id', 'shop_id', 'item_id'], as_index=False)
grouped = grouped.agg({'item_price': ['sum', 'mean'], 'item_cnt_day': ['sum', 'mean', 'count']})
grouped.columns = ['date_block_num', 'item_category_id', 'shop_id', 'item_id', 'item_price', 'item_price_mean', 'item_cnt_month', 'item_cnt_mean', 'item_cnt_count']


In [28]:
empty_df = []
for i in range(34):
    for shop_id in test_shop_ids:
        for item_id in test_item_ids:
            empty_df.append([i, shop_id, item_id])
empty_df = pd.DataFrame(empty_df, columns=['date_block_num', 'shop_id', 'item_id'])

In [42]:
train_df = empty_df.merge(grouped, on=['date_block_num', 'shop_id', 'item_id'], how='left')

In [43]:
train_df.fillna(0, inplace=True)
train_df['year'] = train_df['date_block_num'].apply(lambda x: x // 12 + 2013)
train_df['month'] = train_df['date_block_num'].apply(lambda x: x % 12)

In [44]:
group_cate_sum = train_df.groupby('item_category_id', as_index=False)['item_cnt_month'].sum().rename(columns={'item_cnt_month': 'group_cat_month'})
group_cate_mean = train_df.groupby('item_category_id', as_index=False)['item_cnt_month'].mean().rename(columns={'item_cnt_month': 'group_mean_cat_month'})
group_item_sum = train_df.groupby('item_id', as_index=False)['item_cnt_month'].sum().rename(columns={'item_cnt_month': 'group_item_month'})
group_item_mean = train_df.groupby('item_id', as_index=False)['item_cnt_month'].mean().rename(columns={'item_cnt_month': 'group_mean_item_month'})
group_shop_sum = train_df.groupby('shop_id', as_index=False)['item_cnt_month'].mean().rename(columns={'item_cnt_month': 'group_shop_month'})
group_shop_mean = train_df.groupby('shop_id', as_index=False)['item_cnt_month'].mean().rename(columns={'item_cnt_month': 'group_mean_shop_month'})

group_cate_price_mean = train_df.groupby('item_category_id', as_index=False)['item_price'].mean().rename(columns={'item_price': 'group_mean_cat_price'})
group_item_price_mean = train_df.groupby('item_id', as_index=False)['item_price'].mean().rename(columns={'item_price': 'group_mean_item_price'})
group_shop_price_mean = train_df.groupby('shop_id', as_index=False)['item_price'].mean().rename(columns={'item_price': 'group_mean_shop_price'})

In [45]:
group_df = train_df.copy()
group_df = group_df[['date_block_num', 'shop_id', 'item_id', 'item_category_id']]

In [46]:
group_df = group_df.merge(group_cate_sum, on=['item_category_id'], how='left').merge(group_cate_mean, on=['item_category_id'], how='left')
group_df = group_df.merge(group_item_sum, on=['item_id'], how='left').merge(group_item_mean, on=['item_id'], how='left')
group_df = group_df.merge(group_shop_sum, on=['shop_id'], how='left').merge(group_shop_mean, on=['shop_id'], how='left')

group_df = group_df.merge(group_cate_price_mean, on=['item_category_id'], how='left').merge(group_item_price_mean, on=['item_id'], how='left').merge(group_shop_price_mean, on=['shop_id'], how='left')

In [47]:
train_df = train_df.merge(group_df, on=['date_block_num', 'shop_id', 'item_id', 'item_category_id'], how='left').reset_index(drop=True)

In [48]:
train_df.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_price,item_price_mean,item_cnt_month,item_cnt_mean,item_cnt_count,year,month,group_cat_month,group_mean_cat_month,group_item_month,group_mean_item_month,group_shop_month,group_mean_shop_month,group_mean_cat_price,group_mean_item_price,group_mean_shop_price
0,0,5,5037,0.0,0.0,0.0,0.0,0.0,0.0,2013,0,0.0,0.0,1137.0,0.796218,0.142099,0.142099,0.0,1287.197791,117.965097
1,0,5,5320,0.0,0.0,0.0,0.0,0.0,0.0,2013,0,0.0,0.0,0.0,0.0,0.142099,0.142099,0.0,0.0,117.965097
2,0,5,5233,0.0,0.0,0.0,0.0,0.0,0.0,2013,0,0.0,0.0,488.0,0.341737,0.142099,0.142099,0.0,244.425718,117.965097
3,0,5,5232,0.0,0.0,0.0,0.0,0.0,0.0,2013,0,0.0,0.0,140.0,0.098039,0.142099,0.142099,0.0,71.356856,117.965097
4,0,5,5268,0.0,0.0,0.0,0.0,0.0,0.0,2013,0,0.0,0.0,0.0,0.0,0.142099,0.142099,0.0,0.0,117.965097


In [49]:
train_df = train_df[(train_df['item_cnt_month'] >= 0) & (train_df['item_cnt_month'] <= 20) & (train_df['item_price'] <= 50000)]
train_df.rename(columns={'item_cnt_month': 'item_cnt'}, inplace=True)

In [50]:
# 整体向后移动一个单位
train_df['item_cnt_month'] = train_df.sort_values('date_block_num').groupby(['shop_id', 'item_id'])['item_cnt'].shift(-1)

In [53]:
gp_price = train_df.sort_values('date_block_num').groupby('item_id', as_index=False)
gp_price = gp_price.agg({'item_price': [np.max, np.min]})
gp_price.columns = ['item_id', 'hist_max_price', 'hist_min_price']

train_df = train_df.merge(gp_price, on=['item_id'], how='left')
train_df['price_inc'] = train_df['item_price'] - train_df['hist_min_price']
train_df['price_dec'] = train_df['hist_max_price'] - train_df['item_price']

In [56]:
f_min = lambda x: x.rolling(window=3, min_periods=1).min()
f_max = lambda x: x.rolling(window=3, min_periods=1).max()
f_mean = lambda x: x.rolling(window=3, min_periods=1).mean()
f_std = lambda x: x.rolling(window=3, min_periods=1).std()

function_list = [f_min, f_max, f_mean, f_std]
function_name = ['min', 'max', 'mean', 'std']

for i in range(len(function_list)):
    train_df['item_cnt_%s' % function_name[i]] = train_df.sort_values('date_block_num').groupby(['shop_id', 'item_category_id', 'item_id'])['item_cnt'].apply(function_list[i])

In [57]:
train_set = train_df.query('date_block_num >= 3 and date_block_num < 28').copy()
validation_set = train_df.query('date_block_num >= 28 and date_block_num < 33').copy()
test_set = train_df.query('date_block_num == 33').copy()

train_set.dropna(subset=['item_cnt_month'], inplace=True)
validation_set.dropna(subset=['item_cnt_month'], inplace=True)

train_set.dropna(inplace=True)
validation_set.dropna(inplace=True)

In [58]:
X_train = train_set.drop(['item_cnt_month', 'date_block_num'], axis=1)
Y_train = train_set['item_cnt_month'].astype(int)
X_validation = validation_set.drop(['item_cnt_month', 'date_block_num'], axis=1)
Y_validation = validation_set['item_cnt_month'].astype(int)

In [59]:
latest_records = pd.concat([train_set, validation_set]).drop_duplicates(subset=['shop_id', 'item_id'], keep='last')
X_test = test.merge(latest_records, on=['shop_id', 'item_id'], how='left').drop('ID', axis=1)

In [62]:
X_test = X_test.drop(['item_cnt_month', 'date_block_num'], axis=1)

In [64]:
sets = [X_train, X_validation, X_test]

for dataset in sets:
    for shop_id in dataset['shop_id'].unique():
        for column in dataset.columns:
            shop_median = dataset[(dataset['shop_id'] == shop_id)][column].median()
            dataset.loc[(dataset[column].isnull()) & (dataset['shop_id'] == shop_id), column] = shop_median

In [66]:
X_test['year'] = 2015
X_test['month'] = 11

X_train.drop(['item_category_id'], axis=1, inplace=True)
X_validation.drop(['item_category_id'], axis=1, inplace=True)
X_test.drop(['item_category_id'], axis=1, inplace=True)

In [68]:
import lightgbm as lgb

In [69]:
train_data = lgb.Dataset(data=X_train, label=Y_train)
valid_data = lgb.Dataset(data=X_validation, label=Y_validation)
    
params = {"objective" : "regression", "metric" : "rmse", 'num_rounds':3000, 'early_stopping_rounds':100,
              "num_leaves" : 50, "learning_rate" : 0.01, "bagging_fraction" : 0.9, 'max_depth': 10, 
              "feature_fraction" : 0.3, "bagging_seed" : 0}
    
lgb_model = lgb.train(params, train_data, valid_sets=[train_data, valid_data], verbose_eval=200) 

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2944
[LightGBM] [Info] Number of data points in the train set: 5276791, number of used features: 24
[LightGBM] [Info] Start training from score 0.156223
Training until validation scores don't improve for 100 rounds
[200]	training's rmse: 0.637131	valid_1's rmse: 0.698453
[400]	training's rmse: 0.62026	valid_1's rmse: 0.692179
[600]	training's rmse: 0.612202	valid_1's rmse: 0.691327
[800]	training's rmse: 0.607222	valid_1's rmse: 0.691094
Early stopping, best iteration is:
[820]	training's rmse: 0.606713	valid_1's rmse: 0.691053
