In [1]:
#library imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
#read data
data_folder = ('../Data/')
train = pd.read_csv(data_folder+'monthly_train.csv') 
test = pd.read_csv(data_folder+'test.csv')

In [8]:
train = train.drop(['Unnamed: 0'],axis=1)

In [9]:
train.head()

Unnamed: 0,date_block_num,shop_id,item_category_id,item_id,mean_item_price,item_cnt,mean_item_cnt,num_transactions,item_cnt_month
0,0,0,2,5572,1322.0,10.0,1.666667,6,11.0
1,0,0,2,5573,560.0,1.0,1.0,1,1.0
2,0,0,2,5575,806.0,4.0,1.333333,3,0.0
3,0,0,2,5576,2231.0,5.0,1.0,5,2.0
4,0,0,2,5609,2381.0,1.0,1.0,1,1.0


In [17]:
# features for iten price
grouped_price = train.sort_values('date_block_num').groupby(['item_id'], as_index=False).agg({'mean_item_price':[np.min, np.max]})
# min max over time
grouped_price.columns = ['item_id', 'item_price_min', 'item_price_max']
train = pd.merge(train, grouped_price, on='item_id', how='left')
# price increase decrease in value over whole data 
train['price_inc'] = train['mean_item_price'] - train['item_price_min']
train['price_dec'] = train['item_price_max'] - train['mean_item_price']

In [20]:
# create lag features

for i in range(1,3):
    #new feature name
    feature = ('item_cnt_lag'+str(i))
    train[feature] = train.sort_values('date_block_num').groupby(['shop_id', 'item_category_id', 'item_id'])['item_cnt'].shift(i)
    # fill nan values with 0
    train[feature] = train[feature].fillna(0)

In [21]:
train['trend'] = train['item_cnt']

for i in range(1,3):
    feature = ('item_cnt_lag'+str(i))
    train['trend'] = train['trend'] - train[feature]

train['trend'] = train['trend'] / 4

In [22]:
train.head()

Unnamed: 0,date_block_num,shop_id,item_category_id,item_id,mean_item_price,item_cnt,mean_item_cnt,num_transactions,item_cnt_month,item_price_min_x,tem_price_max_x,item_price_min_y,tem_price_max_y,item_price_min,item_price_max,price_inc,price_dec,item_cnt_lag1,item_cnt_lag2,trend
0,0,0,2,5572,1322.0,10.0,1.666667,6,11.0,1300.0,2190.0,1300.0,2190.0,1300.0,2190.0,22.0,868.0,0.0,0.0,2.5
1,0,0,2,5573,560.0,1.0,1.0,1,1.0,248.0,999.0,248.0,999.0,248.0,999.0,312.0,439.0,0.0,0.0,0.25
2,0,0,2,5575,806.0,4.0,1.333333,3,0.0,298.0,1199.0,298.0,1199.0,298.0,1199.0,508.0,393.0,0.0,0.0,1.0
3,0,0,2,5576,2231.0,5.0,1.0,5,2.0,1119.0,2590.0,1119.0,2590.0,1119.0,2590.0,1112.0,359.0,0.0,0.0,1.25
4,0,0,2,5609,2381.0,1.0,1.0,1,1.0,1398.0,3090.0,1398.0,3090.0,1398.0,3090.0,983.0,709.0,0.0,0.0,0.25


In [24]:
training = train.query('date_block_num >= 3 and date_block_num < 27').copy()
training = training.dropna()
validation = train.query('date_block_num >= 27 and date_block_num < 32').copy()
validation = validation.dropna()
testing = train.query('date_block_num == 33').copy()
testing = testing.dropna()


In [None]:
# add mean encoding for categrical features so as to capture them better
# one hot encong will take too much space

# Shop mean encoding.
gp_shop_mean = train_set.groupby(['shop_id']).agg({'item_cnt_month': ['mean']})
gp_shop_mean.columns = ['shop_mean']
gp_shop_mean.reset_index(inplace=True)
# Item mean encoding.
gp_item_mean = train_set.groupby(['item_id']).agg({'item_cnt_month': ['mean']})
gp_item_mean.columns = ['item_mean']
gp_item_mean.reset_index(inplace=True)
# Shop with item mean encoding.
gp_shop_item_mean = train_set.groupby(['shop_id', 'item_id']).agg({'item_cnt_month': ['mean']})
gp_shop_item_mean.columns = ['shop_item_mean']
gp_shop_item_mean.reset_index(inplace=True)
# Year mean encoding.
gp_year_mean = train_set.groupby(['year']).agg({'item_cnt_month': ['mean']})
gp_year_mean.columns = ['year_mean']
gp_year_mean.reset_index(inplace=True)
# Month mean encoding.
gp_month_mean = train_set.groupby(['month']).agg({'item_cnt_month': ['mean']})
gp_month_mean.columns = ['month_mean']
gp_month_mean.reset_index(inplace=True)

# Add meand encoding features to train set.
train_set = pd.merge(train_set, gp_shop_mean, on=['shop_id'], how='left')
train_set = pd.merge(train_set, gp_item_mean, on=['item_id'], how='left')
train_set = pd.merge(train_set, gp_shop_item_mean, on=['shop_id', 'item_id'], how='left')
train_set = pd.merge(train_set, gp_year_mean, on=['year'], how='left')
train_set = pd.merge(train_set, gp_month_mean, on=['month'], how='left')
# Add meand encoding features to validation set.
validation_set = pd.merge(validation_set, gp_shop_mean, on=['shop_id'], how='left')
validation_set = pd.merge(validation_set, gp_item_mean, on=['item_id'], how='left')
validation_set = pd.merge(validation_set, gp_shop_item_mean, on=['shop_id', 'item_id'], how='left')
validation_set = pd.merge(validation_set, gp_year_mean, on=['year'], how='left')
validation_set = pd.merge(validation_set, gp_month_mean, on=['month'], how='left')

test_set = pd.merge(test_set, gp_shop_mean, on=['shop_id'], how='left')
test_set = pd.merge(test_set, gp_item_mean, on=['item_id'], how='left')
test_set = pd.merge(test_set, gp_shop_item_mean, on=['shop_id', 'item_id'], how='left')
test_set = pd.merge(test_set, gp_year_mean, on=['year'], how='left')
test_set = pd.merge(test_set, gp_month_mean, on=['month'], how='left')


In [None]:
# Create train and validation sets and labels. 
X_train = train_set.drop(['item_cnt_month', 'date_block_num'], axis=1)
Y_train = train_set['item_cnt_month'].astype(int)
X_validation = validation_set.drop(['item_cnt_month', 'date_block_num'], axis=1)
Y_validation = validation_set['item_cnt_month'].astype(int)
X_test = test_set.drop(['item_cnt_month', 'date_block_num'], axis=1)
Y_test = test_set['item_cnt_month'].astype(int)

In [None]:
X_train.to_csv("X_train.csv",index=False)
Y_train.to_csv("Y_train.csv",index=False)
X_validation.to_csv("X_validation.csv",index=False)
Y_validation.to_csv("Y_validation.csv",index=False)
X_test.to_csv("X_test.csv",index=False)
Y_test.to_csv("Y_test.csv",index=False)

In [None]:
rf_model = RandomForestRegressor(n_estimators=50, max_depth=7, random_state=0, n_jobs=-1)
rf_model.fit(xgb_train, Y_train)
rf_train_pred = rf_model.predict(rf_train)
rf_val_pred = rf_model.predict(rf_val)
rf_test_pred = rf_model.predict(rf_test)
print('Train rmse:', np.sqrt(mean_squared_error(Y_train, rf_train_pred)))
print('Validation rmse:', np.sqrt(mean_squared_error(Y_validation, rf_val_pred)))

In [None]:
xgb_model = XGBRegressor(max_depth=8, 
                         n_estimators=500, 
                         min_child_weight=1000,  
                         colsample_bytree=0.7, 
                         subsample=0.7, 
                         eta=0.3, 
                         seed=0,
                        tree_method = 'exact')
xgb_model.fit(X_train, 
              Y_train, 
              eval_metric="rmse", 
              eval_set=[(X_train, Y_train), (X_validation, Y_validation)], 
              verbose=20, 
              early_stopping_rounds=10)

In [None]:
xgb_model.best_estimator_

In [None]:
plt.rcParams["figure.figsize"] = (15, 6)
plot_importance(xgb_model)
plt.show()

In [None]:
xgb_train_pred = xgb_model.predict(xgb_train)
xgb_val_pred = xgb_model.predict(xgb_val)
xgb_test_pred = xgb_model.predict(xgb_test

In [None]:
print('Train rmse:', np.sqrt(mean_squared_error(Y_train, xgb_train_pred)))
print('Validation rmse:', np.sqrt(mean_squared_error(Y_validation, xgb_val_pred)))