# Russian retail store sales prediction

This notebook shows the work of Udacity machine learning final project

# Data and input

In [None]:
#import necessary libraries
import pandas as pd
import os
import psutil
import numpy as np
import itertools
import gc
import sys 
import matplotlib.pyplot as plt

In [None]:
items = pd.read_csv('./items.csv')
cat = pd.read_csv('./item_categories.csv')
shop = pd.read_csv('./shops.csv')
sales = pd.read_csv('./sales_train.csv')
test = pd.read_csv('./test.csv')

In [None]:
# showing part of the dataset
from IPython.display import display
for tab in [items,cat,shop,sales,test]:
    display(tab.head())

In [None]:
# show the shape of the dataset
print(items.shape)
print(sales.shape)
print(cat.shape)
print(shop.shape)
print(test.shape)

In [None]:
# check memory usage
def memcheck():
    process = psutil.Process(os.getpid())
    print(process.memory_info().rss)
# check new product launched in new period
def newproduct(datebk):
    return np.setdiff1d(np.unique(alldat.loc[alldat.date_block_num==datebk,'item_id']),
                        np.unique(alldat.loc[alldat.date_block_num<datebk,'item_id']))
# plot new graph
def plot(ts, xl,yl, title=''):
    plt.figure(figsize=(12,8))
    plt.xlabel(xl)
    plt.ylabel(yl)
   
    plt.title(title)
    plt.plot(ts)

# Exploratory data analysis

In [None]:
sales.isnull().sum()

In [None]:
import seaborn as sns

In [None]:

plt.hist(sales['item_cnt_day'], color = 'blue', edgecolor = 'black'
        )
plt.title('distribution of daily sales'
         )
plt.xlabel('daily sales')
plt.ylabel('count')

In [None]:
neg = sum(sales['item_cnt_day']<0)
zero = sum(sales['item_cnt_day']==0)
one_percent = sum(sales['item_cnt_day']==1)/len(sales)
print('the number of instance with negative sales is {} '.format(neg))
print('the number of instance with zero sales is {} '.format(zero))
print('the proportion of instance with sales =1  is {} '.format(one_percent))

In [None]:
stat = pd.DataFrame(sales['item_price'].describe())
stat['item_price'] = np.round(stat['item_price'])
stat

In [None]:
stat = pd.DataFrame(sales['item_cnt_day'].describe())
stat['item_cnt_day'] = np.round(stat['item_cnt_day'])
stat

In [None]:
#refund_count_item = sales.loc[sales.item_cnt_day<-0].groupby(['item_id','date_block_num'])['item_cnt_day'].count().sort_values(ascending=False)

In [None]:
%matplotlib inline
salestrend = sales.groupby(['date_block_num'])['item_cnt_day'].sum()
plot(salestrend,'date_block_num','sales','sales trend by month')

In [None]:
fig = plt.figure(figsize=(30,40))
for i in range(0,60):
    trend =  sales.loc[sales.shop_id==i].groupby(['date_block_num'])['item_cnt_day'].sum()
    ax = fig.add_subplot(10,6,i+1)
    ax.set_title('shop %s'%i)
    ax.plot(trend)
    ax.set_ylabel('monthly_sales')

By decomposing the total sales into sales for each of the store, it can be seen that most of the sales data show a similar pattern to the sales data. 

# Data preprocessing


## expansion of the sales dataset

In [None]:
item_num =len(np.unique(items['item_id']))
shop_num = len(np.unique(shop['shop_id']))
print('total number of shop: {}'.format(shop_num))
print('total number of item: {}'.format(item_num))

In [None]:
date_block = sales['date_block_num'].unique()
grid=[]
for d in date_block:
    #number of unique shops and items
    allshops = sales.loc[sales.date_block_num==d,'shop_id'].unique()
    allitems = sales.loc[sales.date_block_num==d, 'item_id'].unique()
    
    grid.append(list(itertools.product(*[allshops,allitems,[d]])))

grids = np.vstack(grid)
# combine the shop-item pairing with sales dataset  
grids_df = pd.DataFrame(grids,columns=['shop_id','item_id','date_block_num'])
sales_month = sales.groupby(['shop_id','item_id','date_block_num'],as_index=False).agg({'item_cnt_day':['sum']})
sales_month.columns = ['shop_id','item_id','date_block_num','monthly_sales']

train_data = pd.merge(grids_df, sales_month, on=['shop_id','item_id','date_block_num'],how='left' ).fillna(0)
sales = pd.merge(sales, items[['item_id','item_category_id']], how='left', on='item_id')

In [None]:
del grids_df

In [None]:
print('the shape of expanded dataset is {}'.format(train_data.shape))

Test data has a month period of 34

In [None]:
id_test = test['ID']
test.drop('ID',axis=1, inplace=True)
test['date_block_num']=np.nan
test['date_block_num'] = test['date_block_num'].fillna(34)



## combine train and test

In [None]:
alldat = train_data.append(test)
train_l = train_data.shape[0]
test_l = test.shape[0]

In [None]:
alldat.shape

# feature engineering


## construct variables(city, item category , year, month)

In [None]:
shop['city'] = shop.shop_name.str.split(expand=True)[0]

alldat = pd.merge(alldat,shop, on='shop_id', how='left')
sales = pd.merge(sales,shop, on='shop_id', how='left')


In [None]:
alldat = alldat.drop('shop_name',axis=1)


In [None]:
sales['date'] = pd.to_datetime(sales['date'], format='%d.%m.%Y')
sales['month']= sales['date'].dt.month

date_df = sales[['date','date_block_num','month']].copy()
date_df = date_df.drop('date',axis=1)
date_df = date_df.drop_duplicates(keep='first')


In [None]:
alldat = pd.merge(alldat, date_df, how='left', on='date_block_num')
alldat = pd.merge(alldat, items[['item_id','item_category_id']], how='left', on='item_id')

In [None]:
alldat.loc[train_l:,'month'] = alldat.loc[train_l:,'month'].fillna(11)

In [None]:
alldat['year'] = alldat['date_block_num']//12
sales['year'] = sales['date_block_num']//12

In [None]:
sales_month_item = sales.groupby(['item_id','date_block_num'],as_index=False).agg({'item_cnt_day':['sum']})
sales_month_item.columns = ['item_id','date_block_num','monthly_sales_item']

sales_month_shop = sales.groupby(['shop_id','date_block_num'],as_index=False).agg({'item_cnt_day':['sum']})
sales_month_shop.columns = ['shop_id','date_block_num','monthly_sales_shop']
'''
sales_month_cat = sales.groupby(['item_category_id','date_block_num'],as_index=False).agg({'item_cnt_day':['sum']})
sales_month_cat.columns = ['item_category_id','date_block_num','monthly_sales_cat']

sales_month_city = sales.groupby(['city','date_block_num'],as_index=False).agg({'item_cnt_day':['sum']})
sales_month_city.columns = ['city','date_block_num','monthly_sales_city']
'''

In [None]:
alldat = pd.merge(alldat, sales_month_item, how='left', on=['item_id','date_block_num'])
alldat = pd.merge(alldat, sales_month_shop, how='left', on=['shop_id','date_block_num'])


## price statistics

In [None]:
price_stat_item = sales.groupby(['item_id'],as_index=False).agg({'item_price':['mean','std']}).fillna(0)
price_stat_item.columns = ['item_id','item_price_mean','item_price_std']


In [None]:
price_stat_shop = sales.groupby(['shop_id'],as_index=False).agg({'item_price':['mean','std']}).fillna(0)
price_stat_shop.columns = ['shop_id','shop_price_mean','shop_price_std']

In [None]:
alldat = pd.merge(alldat, price_stat_item,on=['item_id'], how ='left')

In [None]:
alldat = pd.merge(alldat, price_stat_shop ,on=['shop_id'], how ='left')

In [None]:
del price_stat_item
del price_stat_shop
del grids
del grid
del sales_month
del sales_month_item
del sales_month_shop

del date_df
del date_block

gc.collect()

In [None]:
alldat[['item_price_mean','item_price_std']] = alldat[['item_price_mean','item_price_std']].fillna(0)

## label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()
lb.fit(alldat['city'])

alldat['city']=lb.fit_transform(alldat['city'])



In [None]:
alldat

## outliers removal

In [None]:
alldat.loc[:train_l-1,'monthly_sales'] = alldat.loc[:train_l-1,'monthly_sales'].clip(0,30)
# clipping them between 0 and 90 percentile
for f in ['monthly_sales_shop','monthly_sales_item']:#
    alldat.loc[:train_l,f] = alldat.loc[:train_l,f].clip(0,alldat.loc[:train_l,f].quantile(0.90))

In [None]:
train_data =  alldat.loc[:train_l-1]

## discretize target variable


In [None]:
'''
bin_array = [0,1,2,3,5,30.1]
# count the number of instance in each bin
train_data['discretized_sales'] = np.digitize(train_data['monthly_sales'],bin_array)

# generate discretized feature grouped by shop
count_shop = train_data.groupby(['shop_id','discretized_sales','year']).monthly_sales.count()
count_shop = count_shop.reset_index().pivot_table(index=['shop_id','year'],columns=['discretized_sales'],values='monthly_sales').fillna(0)

count_shop.columns = ['shop_discrete_'+str(c) for c in count_shop.columns]


alldat = pd.merge(alldat, count_shop, on=['shop_id','year'], how='left')



alldat['shop_count'] = alldat['shop_discrete_1']+alldat['shop_discrete_5']+alldat['shop_discrete_2']+alldat['shop_discrete_3']+alldat['shop_discrete_4']

for i in [1,2,3,4,5]:
    alldat['shop_discrete_'+str(i)] = alldat['shop_discrete_'+str(i)] / alldat['shop_count']

alldat.drop(['shop_count'],axis=1, inplace=True)
'''

In [None]:
#group by item category and year
'''
train_data['discretized_sales'] = np.digitize(train_data['monthly_sales'],bin_array)


count_shop = train_data.groupby(['item_category_id','discretized_sales','year']).monthly_sales.count()
count_shop = count_shop.reset_index().pivot_table(index=['item_category_id','year'],columns=['discretized_sales'],values='monthly_sales').fillna(0)
count_shop.columns = ['cat_discrete_'+str(c) for c in count_shop.columns]


alldat = pd.merge(alldat, count_shop, on=['item_category_id','year'], how='left')

alldat['cat_count'] = alldat['cat_discrete_1']+alldat['cat_discrete_5']+alldat['cat_discrete_2']+alldat['cat_discrete_3']+alldat['cat_discrete_4']

for i in [1,2,3,4,5]:
    alldat['cat_discrete_'+str(i)] = alldat['cat_discrete_'+str(i)] / alldat['cat_count']
    

alldat.drop(['cat_count'],axis=1, inplace=True)
'''

In [None]:
#group by shop and month
'''
train_data['discretized_sales'] = np.digitize(train_data['monthly_sales'],bin_array)


count_shop = train_data.groupby(['shop_id','discretized_sales','month']).monthly_sales.count()
count_shop = count_shop.reset_index().pivot_table(index=['shop_id','month'],columns=['discretized_sales'],values='monthly_sales').fillna(0)
count_shop.columns = ['shop_discrete_month_'+str(c) for c in count_shop.columns]


alldat = pd.merge(alldat, count_shop, on=['shop_id','month'], how='left')


alldat['shop_count'] = alldat['shop_discrete_month_1']+alldat['shop_discrete_month_5']+alldat['shop_discrete_month_2']+alldat['shop_discrete_month_3']+alldat['shop_discrete_month_4']

for i in [1,2,3,4,5]:
    alldat['shop_discrete_month_'+str(i)] = alldat['shop_discrete_month_'+str(i)] / alldat['shop_count']

alldat.drop(['shop_count'],axis=1, inplace=True)
'''

## Target encoding


In [None]:
from tqdm import tqdm_notebook
def mean_encoding_bydate(df,feature):
    name=feature
    if(type(feature) ==list):
        name='-'.join(feature)
    
    df[name+'_mean_encoded']=np.nan
    for d in tqdm_notebook(df['date_block_num'].unique()):
        
        past_date_mask = (df['date_block_num']<d) 
        current_date_mask = (df['date_block_num']==d) 
        
        
        
        mean = df.loc[past_date_mask].groupby(feature).monthly_sales.mean()
        mean = mean.reset_index()
        temp = pd.merge(df.loc[current_date_mask],mean, how='left', on=feature)
        temp.set_index(df.loc[current_date_mask].index,inplace=True)
        
        df.loc[current_date_mask, name+'_mean_encoded'] = temp.monthly_sales_y
        
        max_ = df.loc[past_date_mask].groupby(feature).monthly_sales.max()
        max_ = max_.reset_index()
        temp2 = pd.merge(df.loc[current_date_mask],max_, how='left', on=feature)
        temp2.set_index(df.loc[current_date_mask].index,inplace=True)
        
        df.loc[current_date_mask, name+'_max_encoded'] = temp2.monthly_sales_y
        
        df = df.fillna(0.3343)
    
        del mean
        del max_
        gc.collect()
    
        
    #print("The coorelation between target and {} is {}".format(name, np.corrcoef(df[name+'_mean_encoded'],df.monthly_sales)[0][1]))
    return df

In [49]:
f_arr = ['shop_id','item_id','city','item_category_id']
for f in f_arr:
    alldat = mean_encoding_bydate(alldat,f)




HBox(children=(IntProgress(value=0, max=35), HTML(value='')))




In [50]:
print('the global average of monthly sales is {}'.format(np.mean(alldat['monthly_sales'])))

the global average of monthly sales is 0.3067073800034363


In [51]:
gc.collect()

32

In [52]:
alldat.loc[:,'shop_id_mean_encoded':] = alldat.loc[:,'shop_id_mean_encoded':].fillna(0.3343)

## Lag features

In [53]:
%%time
%matplotlib inline
from tqdm import tqdm_notebook
# increase date_block_num by the number of lags and merge it to the training set
shift_range=[1,2,3,6,12]
for lag in tqdm_notebook(shift_range):
    feat_tolag = alldat[['date_block_num','item_id','shop_id','monthly_sales_item',
       'monthly_sales_shop','item_price_mean','monthly_sales']].copy()
     
    # increase date_block_num by the number of lags and merge it to the training set
    feat_tolag['date_block_num'] = feat_tolag['date_block_num']+lag
    feat_tolag.columns =[c+'_lag_'+str(lag) if c.startswith('month') ==True or c.startswith('item_price') == True  else c for c in feat_tolag.columns]

    alldat = pd.merge(alldat, feat_tolag, how='left', on=['date_block_num','item_id','shop_id'], suffixes=['','_y'])
    del feat_tolag
    print('{} months lag features created'.format(lag)) 
    gc.collect()

gc.collect()

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

1 months lag features created
2 months lag features created
3 months lag features created
6 months lag features created
12 months lag features created

CPU times: user 1min 8s, sys: 25.5 s, total: 1min 34s
Wall time: 1min 4s


In [54]:
alldat.loc[:,'monthly_sales_item_lag_1':] = alldat.loc[:,'monthly_sales_item_lag_1':].fillna(0)

## downcast 

In [55]:
# convert the numeric types to reduce memory usage
int_feats = alldat.select_dtypes(include=[np.int64]).columns
float_feats = alldat.select_dtypes(include=[np.float64]).columns

alldat[int_feats] = alldat[int_feats].astype(np.int32)
alldat[float_feats] = alldat[float_feats].astype(np.float32)

## Trend

In [56]:
# difference between lag features
for f in ['monthly_sales_item',
       'monthly_sales_shop','item_price_mean','monthly_sales']:
    for lag1, lag2 in zip(shift_range[0:3],shift_range[1:]):
        alldat[f+'_diff_'+str(lag1)+str(lag2)] = alldat[f+'_lag_'+str(lag1)] -  alldat[f+'_lag_'+str(lag2)]

        

In [57]:
alldat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11128050 entries, 0 to 11128049
Data columns (total 54 columns):
date_block_num                   float32
item_id                          int32
monthly_sales                    float32
shop_id                          int32
city                             int32
month                            float32
item_category_id                 int32
year                             float32
monthly_sales_item               float32
monthly_sales_shop               float32
item_price_mean                  float32
item_price_std                   float32
shop_price_mean                  float32
shop_price_std                   float32
shop_id_mean_encoded             float32
shop_id_max_encoded              float32
item_id_mean_encoded             float32
item_id_max_encoded              float32
city_mean_encoded                float32
city_max_encoded                 float32
item_category_id_mean_encoded    float32
item_category_id_max_encoded    

In [58]:
gc.collect()

111

In [59]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet,SGDRegressor
import xgboost as xgb
import lightgbm

In [60]:
alldat.drop(['monthly_sales_item',
       'monthly_sales_shop'],axis=1, inplace=True)



In [61]:
train_data = alldat.loc[:train_l-1].copy()
test = alldat.loc[train_l:].copy()

x_test = test.drop('monthly_sales',axis=1)
gc.collect()

14

# Validation set



In [63]:
from hyperopt import fmin, tpe, hp


In [64]:
# divide the data into train and validation
# the item-related  mean encoding of the data in validation set is assumed to null and filled with 0.3343
# t1: latest period in training data, t2: validation period
def train_val_split(t1,t2):
    train = train_data.loc[train_data.date_block_num <= t1]
    val = train_data.loc[train_data.date_block_num == t2]
    
    train_x = train.drop('monthly_sales',axis=1)
    train_y = train['monthly_sales']
    
    val_x = val.drop('monthly_sales',axis=1)
    val_y = val['monthly_sales']
    
    #adjust val set
    #newp = newproduct(t2)
    #val_x.loc[val_x.item_id.isin(newp),['item_id_mean_encoded','item_id_max_encoded','item_id-shop_id_mean_encoded',
    #                                    'item_id-shop_id_max_encoded','month-item_id_mean_encoded','month-item_id_max_encoded']]=0.3343
    
    #train_x.drop(['city','item_id','shop_id','item_category_id'],axis=1,inplace=True)
    #val_x.drop(['city','item_id','shop_id','item_category_id'],axis=1,inplace=True)

    del train
    del val
    gc.collect()
    
    return train_x, train_y, val_x, val_y.clip(0,20)
#'month-item_id_mean_encoded','item_id-city_mean_encoded'

In [65]:
from sklearn.metrics import mean_squared_error

# Modeliing
# 1. SGD(benchmark)

In [None]:
#3-fold cross validation
from sklearn.preprocessing import RobustScaler, StandardScaler,MinMaxScaler
sgddata_arr=[]
for t in [33,28,23]:
    x, y, x_val, y_val = train_val_split(t-1,t)
    
    mm = MinMaxScaler()
    
    x = mm.fit_transform(x)
    x_val = mm.fit_transform(x_val)
    y = np.log1p(y)
    y_val = y_val.clip(0,20)
    
    data={}
    data['x']=x 
    data['x_val'] = x_val
    data['y'] = y
    data['y_val'] = y_val
    
    sgddata_arr.append(data)
    

In [None]:
gc.collect()

In [None]:
scores=[]
# unoptimzied model
for i in [0,1,2]:
    sgd = SGDRegressor(l1_ratio=0.2, alpha=0.0005, penalty='elasticnet')
    sgd.fit(sgddata_arr[i]['x'], sgddata_arr[i]['y'])
        
    y_pred =sgd.predict(sgddata_arr[i]['x_val'])
    rmse = np.sqrt(mean_squared_error(np.expm1(y_pred).clip(0,20),sgddata_arr[i]['y_val']))
    print('rmse in fold {}: {}'.format(i+1,rmse))
    scores.append(rmse)
     
print('cv error is {}'.format(np.mean(scores)))

## hyperparmeter tuning


In [None]:
# search space

sgd_space={'l1_ratio':hp.uniform('l1_ratio',0.1,0.9),
           'alpha':hp.uniform('alpha',0.00001,0.5)}
#define the ojective function to be minimized
#objective chosen to be cv errors
def sgd_objective(params):
    p = {'l1_ratio': params['l1_ratio'],
        'alpha': params['alpha'],
        'penalty':'elasticnet'
       
        }
    score_arr=[]
    print('start..............')
    for i in [0,1,2]:
        sgd = SGDRegressor(**p)
        sgd.fit(sgddata_arr[i]['x'], sgddata_arr[i]['y'])
        
        y_pred =sgd.predict(sgddata_arr[i]['x_val'])
        rmse = np.sqrt(mean_squared_error(np.expm1(y_pred).clip(0,20),sgddata_arr[i]['y_val']))
        print('rmse in fold {}: {}'.format(i+1,rmse))
        score_arr.append(rmse)
     
    print('cv error is {}'.format(np.mean(score_arr)))
    print('-------------------------------------------------------\n')
    gc.collect() 
    return np.mean(score_arr)
# tpe suggest : algorithm for find the best hyperparaters, max_evals =4
sgd_best = fmin(fn = sgd_objective, space=sgd_space, algo=tpe.suggest, max_evals=15)
print('best sgd elasticnet parameters is {}'.format(sgd_best)) 

# 2. XGB

In [None]:

gc.collect()

In [None]:
# unoptimzied model
best_xgb = {'colsample_bytree': 1, 'gamma': 1.333080152263989, 'lambda': 3.4345783000200765, 'learning_rate': 0.25699972492763873, 'max_depth': 10, 'min_child_weight': 130.99821047754926, 
  'booster':'gbtree',
    'objective':'reg:linear',
    'eval_metric':'rmse','n_jobs':16,'subsample': 0.9}
default =  {'booster': 'gbtree',
          'eta': .1,
          'min_child_weight': 100,
          'max_depth': 10,
          'objective': 'reg:linear',
          'eval_metric': 'rmse',
          'silent': False,
          'nthread': 16}

pred_arr=[]
p={}
t=33

x,y,x_val,y_val = train_val_split(t-1,t)
train = xgb.DMatrix(x,y)
val = xgb.DMatrix(x_val,y_val)
    
model = xgb.train(default,train, 40, [(train, 'Train'), (val, 'Val')], early_stopping_rounds=5, verbose_eval=1 )
y_pred = model.predict(val)
pred_arr.append(np.sqrt(mean_squared_error(y_pred.clip(0,20),y_val)))
    
score = np.mean(pred_arr)
print('validation error is {}'.format(score))

In [None]:
xgb.plot_importance(booster=model,max_num_features=10)

In [None]:
x.columns

## hyperparameter tuning

In [None]:


space = {'max_depth':hp.quniform('max_depth',6,12,1),
        'learning_rate':hp.uniform('learning_rate',0.009,0.25),
        'gamma':hp.uniform('gamma',0,5),
        'min_child_weight':hp.uniform('min_child_weight',5,200),
        'subsample':hp.uniform('subsample',0.4,1),
        'colsample_bytree':hp.uniform('colsample_bytree',0.4,1),
             'lambda':hp.uniform('lambda',0,5)
            }

def objective(params):
    p ={'max_depth':int(params['max_depth']),
    'learning_rate':params['learning_rate'],
    'gamma': params['gamma'],
    'min_child_weight': params['min_child_weight'],
    'subsample': params['subsample'],
    'colsample_bytree': params['colsample_bytree'],
    'n_jobs':16,
    'lambda':params['lambda'],
    'booster':'gbtree',
    'objective':'reg:linear',
    'eval_metric':'rmse'}

    pred_arr=[]
    for t in [33,28,23]:
        x,y,x_val,y_val = train_val_split(t-1,t)
        train = xgb.DMatrix(x,y)
        val = xgb.DMatrix(x_val,y_val)
    
        model = xgb.train(p, train, 25, [(train, 'Train'), (val, 'Val')], early_stopping_rounds=5, verbose_eval=25 )
        y_pred = model.predict(val)
        pred_arr.append(np.sqrt(mean_squared_error(y_pred.clip(0,20),y_val)))

        

    score = np.mean(pred_arr)
    print('cross validation error is {}'.format(score))
    print('parameter: {}'.format(p))
    return score






In [None]:
best = fmin(fn = objective , space = space , algo=tpe.suggest, max_evals=8)

print(best)

In [None]:

best_xgb = {'colsample_bytree': 0.830688370955194, 'gamma': 1.333080152263989, 'lambda': 3.4345783000200765, 'learning_rate': 0.22699972492763873, 'max_depth': 8, 'min_child_weight': 130.99821047754926, 
  'booster':'gbtree',
    'objective':'reg:linear',
    'eval_metric':'rmse','n_jobs':16,'subsample': 0.4960204984048794}

# 3. LGB

In [None]:
pred_lgb=[]
de_p={ 'metric': 'l2','objective':'regression'}
for t in [33,28,23]:
    x,y,x_val,y_val = train_val_split(t-1,t)
    train_lgb = lightgbm.Dataset(x,y)
    val_lgb = lightgbm.Dataset(x_val,y_val)
    model = lightgbm.train(de_p,train_lgb,150,valid_sets=[train_lgb,val_lgb],early_stopping_rounds=100,verbose_eval=50)
    
    pred_y = model.predict(x_val, num_iteration=model.best_iteration or 150)
    
    score = np.sqrt(mean_squared_error(pred_y.clip(0,20),y_val))
    pred_lgb.append(score)
print('cv error: {}'.format(np.mean(pred_lgb))) 

## hyperparameter tuning


In [None]:


lgb_params = {
     'num_leaves': 200,
    'objective': 'regression',
    'min_data_in_leaf': 100,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 16
}




space = {'num_leaves':hp.quniform('num_leaves',50,1000,50),
    'learning_rate':hp.uniform('learning_rate',0.01,0.5),
    'min_data_in_leaf':hp.quniform('min_data_in_leaf',50,800,50),
    
    'feature_fraction':hp.uniform('feature_fraction',0.4,1),
    'bagging_fraction':hp.uniform('bagging_fraction',0.4,1),
             'bagging_freq':hp.quniform('bagging_freq',1,5,1)
    }

def objective(params):
    p ={'num_leaves':int(params['num_leaves']),
    'learning_rate':params['learning_rate'],
    'min_data_in_leaf': int(params['min_data_in_leaf']),

    'feature_fraction': params['feature_fraction'],
    'bagging_fraction': params['bagging_fraction'],
    'num_threads':16,
    'bagging_freq':int(params['bagging_freq']),
        
    'objective':'regression',
    'metric':'l2'}
    pred_lgb = []
    for t in [33,28,23]:
        x,y,x_val,y_val = train_val_split(t-1,t)
        train_lgb = lightgbm.Dataset(x,y)
        val_lgb = lightgbm.Dataset(x_val,y_val)
        model = lightgbm.train(p, train_lgb,1500,valid_sets=[train_lgb,val_lgb],early_stopping_rounds=100,verbose_eval=250)
    
        pred_y = model.predict(x_val, num_iteration=model.best_iteration or 2500)
    
        score = np.sqrt(mean_squared_error(pred_y.clip(0,20),y_val))
        pred_lgb.append(score)
        
        
    print('parameters: {}'.format(p))    
    print('cv error: {}'.format(np.mean(pred_lgb)))    
    return np.mean(pred_lgb)

best_lgb = fmin(fn = objective , space = space , algo=tpe.suggest, max_evals=10)

print('best parameters for lgb is: {}'.format(best_lgb))



In [None]:
best_lgb = {'bagging_fraction': 0.8870127757104167, 'bagging_freq': 5, 'feature_fraction': 0.8554121488511988, 'learning_rate': 0.03365956768833352, 'min_data_in_leaf': 450, 
'num_leaves': 150, 'num_threads':16,'objective':'regression','metric':'l2'}


In [None]:
gc.collect()

# 4. NN

In [None]:
train_x = alldat.loc[alldat.date_block_num<33]
val_x = alldat.loc[alldat.date_block_num==33]

In [None]:
# label encoding
train_list=[]
val_list=[]
for f in ['shop_id','item_id','date_block_num','city','item_category_id']:
    unique_val = train_data[f].unique()
    label_map={}
    print(len(unique_val))
    for val in range(len(unique_val)):
        label_map[unique_val[val]] = val
        
    train_list.append(train_x[f].map(label_map).values)
    val_list.append(val_x[f].map(label_map).values)

In [None]:
y = train_x['monthly_sales']
y_val = val_x['monthly_sales']

In [None]:
train_x.drop('monthly_sales',axis=1, inplace=True)
val_x.drop('monthly_sales',axis=1, inplace=True)

In [None]:
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.models import Model
from keras.layers import Dense,Reshape, Concatenate,Input,Dropout
from keras.layers.embeddings import Embedding
from keras.wrappers.scikit_learn import KerasRegressor
from keras import metrics


inputs=[]
embeddings=[]
# embedding layers  
shop = Input(shape=(1,))
embedding = Embedding(60,15,input_length=1)(shop)
embedding = Reshape(target_shape=(15,))(embedding)
inputs.append(shop)
embeddings.append(embedding)
    
item = Input(shape=(1,))
embedding = Embedding(21807,200,input_length=1)(item)
embedding = Reshape(target_shape=(200,))(embedding)
inputs.append(item)
embeddings.append(embedding)
    
date = Input(shape=(1,))
embedding = Embedding(34,15,input_length=1)(date)
embedding = Reshape(target_shape=(15,))(embedding)
inputs.append(date)
embeddings.append(embedding)
    
city = Input(shape=(1,))
embedding = Embedding(32,10,input_length=1)(city)
embedding = Reshape(target_shape=(10,))(embedding)
inputs.append(city)
embeddings.append(embedding)
    
category = Input(shape=(1,))
embedding = Embedding(84,20,input_length=1)(category)
embedding = Reshape(target_shape=(20,))(embedding)
inputs.append(category)
embeddings.append(embedding)
    
numeric = Input(shape=(98,))
embedding = Dense(30)(numeric)
inputs.append(numeric)
embeddings.append(embedding)

x = Concatenate()(embeddings)
x = Dense(350, kernel_initializer='uniform')(x)
# parametrized relu
x = PReLU()(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
    
x = Dense(50, kernel_initializer='uniform')(x)
x = PReLU()(x)
x = BatchNormalization()(x)
x = Dropout(0.1)(x)
    
output = Dense(1,activation='sigmoid')(x)
    
model = Model(inputs, output)
    
model.compile(loss='mean_squared_error', optimizer='adam')
    


In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# min max sclaing the numerical features
normalized_f = alldat.columns.difference(['item_category_id','city','item_id','shop_id','date_block_num','monthly_sales']) 

mm = MinMaxScaler()
mm.fit(train_data[normalized_f])

train_nn = mm.transform(train_x[normalized_f])
val_nn = mm.transform(val_x[normalized_f])

train_list.append(train_nn)
val_list.append(val_nn)

In [None]:
from keras.callbacks import ModelCheckpoint
np.random.seed(7)
val_rmse={}
n=0
count=0
checkpoint = ModelCheckpoint('nn_weights.hdf5', verbose=1)

In [None]:
#kerasModel = KerasRegressor(build_fn=baseline_model, epochs=5, batch_size=2048,verbose=1)
#stop training after validation error increase for a specific number of rounds
while(True):
    print('round {}'.format(count+1))
    model.fit(train_list,np.log1p(y),epochs=1, batch_size=2048,verbose=1,callbacks=[checkpoint])
    y_pred = model.predict(val_list)
    rmse = np.sqrt(mean_squared_error(y_val.clip(0,20), np.expm1(y_pred).clip(0,20)))
    print(rmse)
    
    val_rmse[count] =rmse
    if(count>0):
        if(val_rmse[count]>val_rmse[count-1]):
            n=n+1
    count= count+1
    if(n>=3):
        break

In [None]:
model.fit(train_list,np.log1p(y),epochs=3, batch_size=2048,verbose=1)
y_pred = model.predict(val_list)
rmse = np.sqrt(mean_squared_error(y_val.clip(0,20), np.round(np.expm1(y_pred)).clip(0,20)))
print('rmse of optimized NN model is {}'.format(rmse))

In [None]:
gc.collect()

# Ensemble

In [None]:
pred_xgb=[]
pred_lgb=[]

In [None]:
# generating meta features for xgb
for t in [29,30,31,32]:
    x,y,x_val,y_val  = train_val_split(t-1,t)
    train = xgb.DMatrix(x,y)
    val = xgb.DMatrix(x_val,y_val)
    
    model = xgb.train(best_xgb, train, 60, [(train, 'Train'), (val, 'Val')], early_stopping_rounds=20, verbose_eval=10 )
    y_pred = model.predict(val)
    
    pred_xgb.append(y_pred.clip(0,20))
    gc.collect()

In [None]:
pred_xgb = np.concatenate(pred_xgb,axis=0)


In [None]:
# generating meta features for xgb
for t in [29,30,31,32]:
    x,y,x_val,y_val  = train_val_split(t-1,t)
    train_lgb = lightgbm.Dataset(x,y)
    val_lgb = lightgbm.Dataset(x_val,y_val)
    model = lightgbm.train(best_lgb, train_lgb,2500,valid_sets=[train_lgb,val_lgb],early_stopping_rounds=500,verbose_eval=250)
    
    pred_y = model.predict(x_val, num_iteration=model.best_iteration or 2500)
    
    pred_lgb.append(pred_y.clip(0,20))
    gc.collect()

In [None]:
pred_lgb = np.concatenate(pred_lgb,axis=0)

In [None]:
y_train_level2 = train_data.loc[train_data.date_block_num.isin([29,30,31,32]),'monthly_sales']

## finding best weight

In [None]:
weight = np.linspace(0,1,10000)
rmse = 100
best_weight=0 
for i in weight:
    pred = i * pred_xgb + (1-i) * pred_lgb
    score = np.sqrt(mean_squared_error(pred, y_train_level2.clip(0,20)))
    if(score<rmse):
        rmse = score
        best_weight=i
print('the best score is {}'.format(rmse))
print('the best weight is {}'.format(best_weight))

In [None]:
pd.DataFrame({'a':pred_xgb}).to_pickle('./pred_xgb.pkl')
pd.DataFrame({'a':pred_lgb}).to_pickle('./pred_lgb.pkl')

In [None]:
x,y,x_val,y_val  = train_val_split(32,33)
train = xgb.DMatrix(x,y)
val = xgb.DMatrix(x_val,y_val)
 

In [None]:
model_xgb = xgb.train(best_xgb, train, 60, [(train, 'Train'), (val, 'Val')], early_stopping_rounds=20, verbose_eval=10 )

In [None]:
pred_xgb33 = model_xgb.predict(val)

In [None]:
best_lgb = {'bagging_fraction': 0.8870127757104167, 'bagging_freq': 5, 'feature_fraction': 0.8554121488511988, 'learning_rate': 0.03365956768833352, 'min_data_in_leaf': 450, 
'num_leaves': 150, 'num_threads':16,'objective':'regression','metric':'l2'}
basic = {'num_threads':16,'objective':'regression','metric':'l2'}
x,y,x_val,y_val  = train_val_split(32,33)
train_lgb = lightgbm.Dataset(x,y)
val_lgb = lightgbm.Dataset(x_val,y_val)
model = lightgbm.train(basic, train_lgb,3000,valid_sets=[train_lgb,val_lgb],early_stopping_rounds=500,verbose_eval=100)



In [None]:
pred_lgb33 = model.predict(x_val, num_iteration=model.best_iteration or 3000)

## validation

In [None]:
score = np.sqrt(mean_squared_error(pred_lgb33.clip(0,20),y_val))
print('after ensemble, the rmse is {}'.format(score))

## submission

In [None]:
test_pred_lgb = model.predict(x_test[x.columns])

In [None]:
test_pred_xgb = model_xgb.predict(xgb.DMatrix(x_test[x.columns]))

In [None]:
final_pred = test_pred_lgb.clip(0,20)*(1-best_weight) + test_pred_xgb.clip(0,20)*(best_weight)

In [None]:
submit=pd.DataFrame({'ID':id_test,'item_cnt_month':test_pred_lgb }).set_index('ID')

In [None]:
submit.to_csv('submit.csv')