In [1]:
#library imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
#read data
data_folder = ('../Data/')
train = pd.read_csv(data_folder+'monthly_train.csv') 
test = pd.read_csv(data_folder+'test.csv')

In [8]:
train = train.drop(['Unnamed: 0'],axis=1)

In [9]:
train.head()

Unnamed: 0,date_block_num,shop_id,item_category_id,item_id,mean_item_price,item_cnt,mean_item_cnt,num_transactions,item_cnt_month
0,0,0,2,5572,1322.0,10.0,1.666667,6,11.0
1,0,0,2,5573,560.0,1.0,1.0,1,1.0
2,0,0,2,5575,806.0,4.0,1.333333,3,0.0
3,0,0,2,5576,2231.0,5.0,1.0,5,2.0
4,0,0,2,5609,2381.0,1.0,1.0,1,1.0


In [17]:
# features for iten price
grouped_price = train.sort_values('date_block_num').groupby(['item_id'], as_index=False).agg({'mean_item_price':[np.min, np.max]})
# min max over time
grouped_price.columns = ['item_id', 'item_price_min', 'item_price_max']
train = pd.merge(train, grouped_price, on='item_id', how='left')
# price increase decrease in value over whole data 
train['price_inc'] = train['mean_item_price'] - train['item_price_min']
train['price_dec'] = train['item_price_max'] - train['mean_item_price']

In [20]:
# create lag features

for i in range(1,3):
    #new feature name
    feature = ('item_cnt_lag'+str(i))
    train[feature] = train.sort_values('date_block_num').groupby(['shop_id', 'item_category_id', 'item_id'])['item_cnt'].shift(i)
    # fill nan values with 0
    train[feature] = train[feature].fillna(0)

In [21]:
train['trend'] = train['item_cnt']

for i in range(1,3):
    feature = ('item_cnt_lag'+str(i))
    train['trend'] = train['trend'] - train[feature]

train['trend'] = train['trend'] / 4

In [34]:
# Add seasonality features
train['year'] = train['date_block_num'].apply(lambda x: ((x//12) + 2013))
train['month'] = train['date_block_num'].apply(lambda x: (x % 12 + 1))
train['no-of-days-in-month'] = pd.to_datetime(train['month'], 
                                                          format='%m').dt.days_in_month

In [38]:
train.head()

Unnamed: 0,date_block_num,shop_id,item_category_id,item_id,mean_item_price,item_cnt,mean_item_cnt,num_transactions,item_cnt_month,item_price_min_x,...,item_price_min,item_price_max,price_inc,price_dec,item_cnt_lag1,item_cnt_lag2,trend,year,month,no-of-days-in-month
0,0,0,2,5572,1322.0,10.0,1.666667,6,11.0,1300.0,...,1300.0,2190.0,22.0,868.0,0.0,0.0,2.5,2013,1,31
1,0,0,2,5573,560.0,1.0,1.0,1,1.0,248.0,...,248.0,999.0,312.0,439.0,0.0,0.0,0.25,2013,1,31
2,0,0,2,5575,806.0,4.0,1.333333,3,0.0,298.0,...,298.0,1199.0,508.0,393.0,0.0,0.0,1.0,2013,1,31
3,0,0,2,5576,2231.0,5.0,1.0,5,2.0,1119.0,...,1119.0,2590.0,1112.0,359.0,0.0,0.0,1.25,2013,1,31
4,0,0,2,5609,2381.0,1.0,1.0,1,1.0,1398.0,...,1398.0,3090.0,983.0,709.0,0.0,0.0,0.25,2013,1,31


In [39]:
training = train.query('date_block_num >= 3 and date_block_num < 27').copy()
training = training.dropna()
validation = train.query('date_block_num >= 27 and date_block_num < 32').copy()
validation = validation.dropna()
testing = train.query('date_block_num == 33').copy()
testing = testing.dropna()


In [42]:
# add mean encoding for categrical features so as to capture them better
# one hot encong will take too much space

# add mean encoding for shop id , item id , year and month
mean_col_list = ['shop_id','item_id','year','month']
for i in mean_col_list:
    groupedmean= training.groupby([i]).agg({'item_cnt_month': ['mean']})
    groupedmean.columns = [i+'_mean']
    groupedmean.reset_index(inplace=True)
    training = pd.merge(training, groupedmean, on=[i], how='left')
    validation = pd.merge(validation, groupedmean, on=[i], how='left')
    testing = pd.merge(testing, groupedmean, on=[i], how='left')
    
# add mean encoding on item id and shop id level
groupedmean = training.groupby(['item_id','shop_id']).agg({'item_cnt_month': ['mean']})
groupedmean.columns = ['itemshop_mean']
groupedmean.reset_index(inplace=True)
training = pd.merge(training, groupedmean, on=[i], how='left')
validation = pd.merge(validation, groupedmean, on=[i], how='left')
testing = pd.merge(testing, groupedmean, on=[i], how='left')

    

In [43]:
# make train, validation, test to run models. 
X_train = training.drop(['item_cnt_month', 'date_block_num'], axis=1)
Y_train = training['item_cnt_month'].astype(int)
X_validation = validation.drop(['item_cnt_month', 'date_block_num'], axis=1)
Y_validation = validation['item_cnt_month'].astype(int)
X_test = testing.drop(['item_cnt_month', 'date_block_num'], axis=1)
Y_test = testing['item_cnt_month'].astype(int)

In [44]:
# save files
X_train.to_csv("X_train.csv",index=False)
Y_train.to_csv("Y_train.csv",index=False)
X_validation.to_csv("X_validation.csv",index=False)
Y_validation.to_csv("Y_validation.csv",index=False)
X_test.to_csv("X_test.csv",index=False)
Y_test.to_csv("Y_test.csv",index=False)

KeyboardInterrupt: 

In [49]:
from matplotlib.pylab import rcParams
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [50]:
def cal_rmse(actual,pred):
    return np.sqrt(mean_squared_error(actual, pred))

In [48]:
# apply random forest
rf = RandomForestRegressor(n_estimators=100, max_depth=8, n_jobs=-1)
rf.fit(X_train, Y_train)
rf_trainpred = rf.predict(X_train)
rf_validationpred = rf.predict(X_validation)
rf_testpred = rf.predict(X_test)
print('RMSE Train:', cal_rmse(Y_train, rf_trainpred))
print('RMSE Validation :', cal_rmse(Y_validation, rf_validationpred))
print('RMSE Test :', cal_rmse(Y_test, rf_test))

NameError: name 'RandomForestRegressor' is not defined

In [None]:
xgb_model = XGBRegressor(max_depth=8, 
                         n_estimators=500, 
                         min_child_weight=1000,  
                         colsample_bytree=0.7, 
                         subsample=0.7, 
                         eta=0.3, 
                         seed=0,
                        tree_method = 'exact')
xgb_model.fit(X_train, 
              Y_train, 
              eval_metric="rmse", 
              eval_set=[(X_train, Y_train), (X_validation, Y_validation)], 
              verbose=20, 
              early_stopping_rounds=10)

In [None]:
params = {
        'min_child_weight': [300, 500, 1000],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.7, 0.8],
        'colsample_bytree': [0.6, 0.7, 0.8],
        'max_depth': [5, 7, 8]
        }

In [None]:
cv_results = xgb.cv(
    params,
    X_train,Y_train,
    eval_set=[(X_train, Y_train), (X_validation, Y_validation)]
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'rmse'},
    early_stopping_rounds=10
)
cv_results


In [None]:
xgb_model.best_estimator_

In [None]:
plt.rcParams["figure.figsize"] = (20, 10)
plot_importance(xgb_model)
plt.show()

In [None]:
xgbmodel_trainpred = xgbmodel.predict(X_train)
xgbmodel_valpred = xgbmodel.predict(X_validation)
xgbmodel_testpred = xgbmodel.predict(X_test)

In [None]:
print('RMSE Train :', cal_rmse(Y_train, xgbmodel_trainpred))
print('RMSE Validation :', cal_rmse(Y_validation, xgbmodel_validaionpred)))\
print('RMSE Test :', cal_rmse(Y_test, xgbmodel_testpred))