In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("dark_background")
import gc
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
import lightgbm as lgb
from hyperopt import hp, tpe, fmin
from sklearn.model_selection import cross_val_score

In [None]:
# Show every column when a wide frame is displayed (None removes the column cap).
# Uses the public pd.set_option entry point instead of the incidental pd.pandas alias.
pd.set_option('display.max_columns', None)

# Load Data

In [None]:
# Read the three competition CSVs (sales, calendar, sell prices) in one pass.
train, calendar, sell_prices = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./m5-forecasting-accuracy/sales_train_evaluation.csv",
        "./m5-forecasting-accuracy/calendar.csv",
        "./m5-forecasting-accuracy/sell_prices.csv",
    )
)

In [None]:
# Preview the first five rows of the sales frame.
train.head(n=5)

In [None]:
# Preview the first five rows of the calendar frame.
calendar.head(n=5)

In [None]:
# Preview the first five rows of the price frame.
sell_prices.head(n=5)

# Check Null Values

In [None]:
# Missing-value count per column of the sales frame, largest first.
train.isna().sum().sort_values(ascending=False)

In [None]:
# Missing-value count per column of the calendar frame, largest first.
calendar.isna().sum().sort_values(ascending=False)

In [None]:
# Append placeholder columns d_1942 .. d_1969, each filled with zero sales.
for day in range(1942, 1970):
    train["d_{}".format(day)] = 0

# Memory Usage Reduction

In [None]:
# Downcast in order to save memory
def _smallest_fitting_dtype(series, candidates, limits, fallback):
    """Return the first dtype in ``candidates`` whose range strictly contains the series, else ``fallback``."""
    lowest, highest = series.min(), series.max()
    for candidate in candidates:
        bounds = limits(candidate)
        # Strict comparisons: a value sitting exactly on a bound moves to the next wider type.
        if lowest > bounds.min and highest < bounds.max:
            return candidate
    return fallback


def downcast(df):
    """Shrink the column dtypes of ``df`` in place and return the same frame.

    * Integer columns are cast to the narrowest of int8/int16/int32 whose range
      strictly contains the column's min and max, otherwise int64.
    * Float columns are cast to float16 or float32 by the same rule, otherwise
      float64. float16 keeps only about three significant decimal digits.
    * Text columns (object dtype, or pandas' dedicated string dtype) become
      ``category``, except a column named ``'date'``, which is parsed as
      ``%Y-%m-%d`` datetimes.

    Columns of any other dtype are left untouched.
    """
    cols = df.dtypes.index.tolist()
    types = df.dtypes.values.tolist()
    for col, dtype in zip(cols, types):
        dtype_name = str(dtype)
        if 'int' in dtype_name:
            target = _smallest_fitting_dtype(
                df[col], (np.int8, np.int16, np.int32), np.iinfo, np.int64)
            df[col] = df[col].astype(target)
        elif 'float' in dtype_name:
            target = _smallest_fitting_dtype(
                df[col], (np.float16, np.float32), np.finfo, np.float64)
            df[col] = df[col].astype(target)
        # np.object was removed in NumPy 1.24; test the dtype through pandas instead.
        # is_string_dtype also covers the dedicated string dtype newer pandas may infer.
        elif pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            if col == 'date':
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')
    return df

In [None]:
# Shrink the dtypes of all three tables, in the same order as before.
train, sell_prices, calendar = (
    downcast(frame) for frame in (train, sell_prices, calendar)
)

# Melt the Dataset

In [None]:
# Wide -> long: one row per (series, day column), with the day label in `d`
# and the cell value in `sales`.
id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
sale_train = train.melt(id_vars=id_columns, var_name='d', value_name="sales")

In [None]:
# Preview the first five rows of the melted frame.
sale_train.head(n=5)

In [None]:
# The wide frame has already been melted into sale_train; drop the name and
# ask the garbage collector to reclaim the memory.
del train
gc.collect()

# Merge the datasets

In [None]:
# Attach the calendar columns to every sales row via the shared day label `d`
# (inner join, which is the merge default).
combine = sale_train.merge(calendar, on="d")
combine.head(n=5)

In [None]:
# Both inputs of the previous merge now live on inside `combine`; release them.
del calendar,sale_train
gc.collect()

In [None]:
# Left join keeps every row of `combine`; rows without a match get NaN in the
# columns that come from sell_prices.
fulldata = combine.merge(
    sell_prices,
    how='left',
    on=['store_id','item_id','wm_yr_wk'],
)
fulldata.head(n=5)

In [None]:
# sell_prices has been merged into fulldata; release it.
del sell_prices
gc.collect()

# Feature Engineering

In [None]:
# Lookup tables from integer category code back to the original label.
# A categorical's code is the position of its label in `.cat.categories`, so
# enumerating the categories yields the same code -> label mapping as zipping
# codes with values, without iterating over every row in Python.
new_id = dict(enumerate(fulldata.id.cat.categories))
new_item_id = dict(enumerate(fulldata.item_id.cat.categories))
new_dept_id = dict(enumerate(fulldata.dept_id.cat.categories))
new_cat_id = dict(enumerate(fulldata.cat_id.cat.categories))
new_store_id = dict(enumerate(fulldata.store_id.cat.categories))
new_d_state_id = dict(enumerate(fulldata.state_id.cat.categories))

In [None]:
# Turn day labels such as 'd_1' into the integer after the underscore.
day_number = fulldata['d'].str.split('_').str[1]
fulldata['d'] = day_number.astype(np.int16)
fulldata.head(n=5)

In [None]:
# convert numeric variables into categorical variables
cols = fulldata.dtypes.index.tolist()
types = fulldata.dtypes.values.tolist()
for i,type in enumerate(types):
    if type.name == 'category':
        fulldata[cols[i]] = fulldata[cols[i]].cat.codes

In [None]:
# Missing prices become 0. Assign the result back to the column: calling
# fillna(..., inplace=True) on a column selection is chained assignment and is
# not guaranteed to modify fulldata (it does not under pandas Copy-on-Write).
fulldata['sell_price'] = fulldata['sell_price'].fillna(0)

In [None]:
# Preview the first five rows after encoding and filling.
fulldata.head(n=5)

In [None]:
# Remove the `date` column. The axis is given by keyword (`columns=`) because
# pandas 2.x no longer accepts it positionally as drop('date', 1).
fulldata.drop(columns='date', inplace=True)

In [None]:
# Work on an independent (deep) copy so the columns added below do not touch fulldata.
lagdata = fulldata.copy(deep=True)

In [None]:
#Introduce lags
lags = [1,7,14,28]
for lag in lags:
    lagdata['lag_'+str(lag)] = lagdata.groupby('id')['sales'].shift(lag).astype(np.float16)

In [None]:
# Rolling means of the lagged sales, computed separately for every id.
# The shift AND the rolling window both run inside transform(): calling
# .rolling() on the result of groupby(...).shift() would roll over a plain
# Series, so a window would mix adjacent rows that belong to different ids.
windows = [7,14]
sales_per_id = lagdata.groupby('id')['sales']
for lag in lags:
    for w in windows:
        lagdata['lag_'+str(lag)+'_rolling_mean_'+str(w)] = sales_per_id.transform(
            lambda s: s.shift(lag).rolling(w).mean()
        ).astype(np.float16)
        

In [None]:
# Preview the first five rows with the lag features attached.
lagdata.head(n=5)

# Model Training

In [None]:
# Rows of the validation window (1914 <= d < 1942) and of the evaluation
# window (d >= 1942), reduced to the columns needed for the submission.
# Explicit copies, so assigning into them later is not chained assignment.
valid = lagdata.loc[(lagdata['d']>=1914) & (lagdata['d']<1942), ['id','d','sales']].copy()
test = lagdata.loc[lagdata['d']>=1942, ['id','d','sales']].copy()

# Prediction buffers, indexed like valid/test. They are independent float
# series so that float model outputs can be written into them without relying
# on implicit upcasting of an integer column.
eval_preds = test['sales'].astype(np.float64)
valid_preds = valid['sales'].astype(np.float64)

In [None]:
# Search space sampled by hyperopt; every dimension is quantised by quniform.
valgrid = {'n_estimators':hp.quniform('n_estimators', 800, 1200, 100),
           'learning_rate':hp.quniform('learning_rate', 0.1, 0.4, 0.1),
           'max_depth':hp.quniform('max_depth', 8,12,1),
           'num_leaves':hp.quniform('num_leaves', 50,100,25),
           'subsample':hp.quniform('subsample', 0.5, 0.9, 0.1),
           'colsample_bytree':hp.quniform('colsample_bytree', 0.5, 0.9, 0.1),
           'min_child_weight':hp.quniform('min_child_weight', 100, 500, 100) 
          }

# Tuning data: every row before the validation window (d < 1914). It is built
# here so the search runs on a fresh kernel instead of depending on
# X_train / y_train, which are only created by a later cell.
# NOTE(review): this is the full training period; subsample it if the search
# is too slow or memory-hungry.
tune_rows = lagdata['d'] < 1914
X_tune = lagdata.loc[tune_rows].drop('sales', axis=1)
y_tune = lagdata.loc[tune_rows, 'sales']

def objective(params):
    """Return the loss hyperopt minimises for one sampled configuration.

    ``params`` is the dict sampled from ``valgrid``; the integer-valued
    LightGBM arguments are cast from the floats quniform produces. The model
    is scored with 2-fold cross-validation on the tuning data.
    """
    params = {'n_estimators': int(params['n_estimators']),
              'learning_rate': params['learning_rate'],
              'max_depth': int(params['max_depth']),
              'num_leaves': int(params['num_leaves']),
              'subsample': params['subsample'],
              'colsample_bytree': params['colsample_bytree'],
              'min_child_weight': params['min_child_weight']}
    
    lgb_a = LGBMRegressor(**params)
    score = cross_val_score(lgb_a, X_tune, y_tune, cv=2, n_jobs=-1).mean()
    # cross_val_score returns the estimator's default score, where higher is
    # better, while fmin minimises; negate so the best model has the lowest loss.
    return -score

# NOTE(review): newer hyperopt releases appear to expect a
# np.random.Generator (np.random.default_rng(123)) for rstate; confirm against
# the installed version.
bestP = fmin(fn= objective, space= valgrid, max_evals=20, rstate=np.random.RandomState(123), algo=tpe.suggest)

In [None]:
# Train one LightGBM model per product category and store that category's
# predictions before moving on to the next one.
# cat_id already holds integer codes at this point, so its unique values are
# the categories to loop over.
cats = lagdata['cat_id'].unique().tolist()
for cat in cats:
    df = lagdata[lagdata['cat_id']==cat]
    
    # split the data into train, validation and test by day number
    is_train = df['d'] < 1914
    is_valid = (df['d'] >= 1914) & (df['d'] < 1942)
    is_test = df['d'] >= 1942
    X_train, y_train = df[is_train].drop('sales',axis=1), df.loc[is_train, 'sales']
    X_valid, y_valid = df[is_valid].drop('sales',axis=1), df.loc[is_valid, 'sales']
    X_test = df[is_test].drop('sales',axis=1)
    
    # model, configured with the best hyperparameters found by the search
    model = LGBMRegressor(
        n_estimators = int(bestP['n_estimators']),
        learning_rate = bestP['learning_rate'],
        subsample = bestP['subsample'],
        colsample_bytree = bestP['colsample_bytree'],
        max_depth = int(bestP['max_depth']),
        num_leaves = int(bestP['num_leaves']),
        min_child_weight = int(bestP['min_child_weight'])
    )
    print('Category: {} prediction result'.format(new_cat_id[cat]))
    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() arguments are
    # believed to be gone in recent LightGBM releases (callbacks replace
    # them); confirm against the installed version.
    lgbmmodel = model.fit(X_train, y_train, eval_set=[(X_train,y_train),(X_valid,y_valid)],
             eval_metric='rmse', verbose=20, early_stopping_rounds=20)
    
    lgb.plot_importance(model, importance_type="gain", precision=0, figsize=(6, 13))

    # Write this category's predictions while its model is still in scope.
    # Predicting only after the loop would cover just the last category and
    # leave every other category's rows at their placeholder values.
    valid_preds[X_valid.index] = model.predict(X_valid)
    eval_preds[X_test.index] = model.predict(X_test)

In [None]:
# Write model predictions into the validation / evaluation buffers.
# NOTE(review): model, X_valid and X_test are whatever the per-category
# training loop left behind, i.e. the values from its final iteration, so this
# cell only writes predictions for the last category processed.
valid_preds[X_valid.index] = model.predict(X_valid)
eval_preds[X_test.index] = model.predict(X_test)
#del model, X_train, y_train, X_valid, y_valid
#gc.collect()

# Submission File

In [None]:
# Both halves of the submission share one layout: `id` followed by F1 .. F28.
forecast_columns = ['id'] + ['F' + str(step) for step in range(1, 29)]

# Validation half: one row per id, one column per day of the validation window.
valid['sales'] = valid_preds
validation = (
    valid[['id','d','sales']]
    .pivot(index='id', columns='d', values='sales')
    .reset_index()
)
validation.columns = forecast_columns
# Map the integer codes back to the original ids and relabel them as validation rows.
validation['id'] = validation['id'].map(new_id).str.replace('evaluation','validation')

# Evaluation half: same reshaping, ids keep their original label.
test['sales'] = eval_preds
evaluation = (
    test[['id','d','sales']]
    .pivot(index='id', columns='d', values='sales')
    .reset_index()
)
evaluation.columns = forecast_columns
evaluation['id'] = evaluation['id'].map(new_id)

# Stack validation on top of evaluation and write the submission file.
submit = pd.concat([validation,evaluation]).reset_index(drop=True)
submit.to_csv('submission4.csv',index=False)