# Imports

In [None]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [None]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [None]:
# Base directory of the data set. It must end with "/" because the cells below build
# file names by plain string interpolation (f'{PATH}train.csv', f'{PATH}test.csv',
# f'{PATH}tmp/RF_v3.csv').
PATH = "data/demand-forecasting/"

# Helper Functions

In [None]:
def add_datepart(df, fldname, drop=True, time=False):
    """Expand the date column `fldname` of `df` into date-part feature columns, in place.

    New columns are named `<prefix><Part>`, where `<prefix>` is `fldname` with a
    trailing "date"/"Date" removed (so 'date' gives 'Year', 'saledate' gives
    'saleYear').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    fldname : str
        Name of the column holding dates. If it is not already a datetime column
        it is converted with `pd.to_datetime` and written back to `df`.
    drop : bool, default True
        Remove the original `fldname` column after the parts have been added.
    time : bool, default False
        Also add 'Hour', 'Minute' and 'Second'.

    Returns
    -------
    None
        `df` is mutated. Added parts: Year, Month, Week (ISO week number), Day,
        Dayofweek, Dayofyear, the six Is_* boundary flags and Elapsed (whole
        seconds since 1970-01-01 00:00 UTC).
    """
    fld = df[fldname]
    # is_datetime64_any_dtype covers naive and tz-aware datetimes and, unlike
    # np.issubdtype, does not raise on pandas extension dtypes (string, category, ...).
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week was removed in pandas 2.0; isocalendar().week is the
            # replacement. Cast it to the dtype of the other integer parts.
            part = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
        else:
            part = getattr(fld.dt, n.lower())
        df[targ_pre + n] = part
    # Compute epoch seconds through timedelta arithmetic so the result does not depend
    # on the column's storage resolution (ns, us, s, ...). Tz-aware values are first
    # converted to naive UTC so they can be compared against the naive epoch.
    utc = fld.dt.tz_convert(None) if fld.dt.tz is not None else fld
    df[targ_pre + 'Elapsed'] = (utc - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
    if drop: df.drop(fldname, axis=1, inplace=True)

In [None]:
def apply_cats(df, trn):
    """Re-encode columns of `df` with the category sets used in `trn`, in place.

    Every column of `df` that also exists in `trn` with a 'category' dtype is
    replaced by an ordered categorical built on `trn`'s categories. Values that
    are not among those categories become missing. All other columns are left
    untouched. Returns None.
    """
    for col_name, col_values in df.items():
        if col_name not in trn.columns:
            continue
        reference = trn[col_name]
        if reference.dtype.name != 'category':
            continue
        df[col_name] = pd.Categorical(col_values, categories=reference.cat.categories, ordered=True)

In [None]:
def smape(y_pred,y_true):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Each element contributes `200 * |y_true - y_pred| / (|y_true| + |y_pred|)`.
    Elements where both values are zero contribute 0. NaN elements are ignored
    by the final mean.

    Parameters
    ----------
    y_pred, y_true : array-like or scalar
        Predicted and actual values of matching (or broadcastable) shape. Lists,
        NumPy arrays and pandas Series are accepted and compared by position.

    Returns
    -------
    float
        Mean SMAPE over all elements.
    """
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.asarray(y_true, dtype=float)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)
    # Divide only where the denominator is non-zero. The pre-filled zeros are the
    # result for 0/0 elements, so no divide-by-zero warning is raised.
    diff = np.zeros_like(denominator)
    np.divide(abs_error, denominator, out=diff, where=denominator != 0)
    return np.nanmean(diff)

In [None]:
def print_scores(pred,label):
    """Print RMSE, MAE and SMAPE of `pred` against `label`, one metric per line.

    All three metrics are computed before anything is printed. The scikit-learn
    metrics document their arguments as (y_true, y_pred); passing them the other
    way round, as here, gives the same values because both errors are symmetric.
    """
    scores = (
        ('RMSE: ', np.sqrt(mean_squared_error(pred,label))),
        ('MAE: ', mean_absolute_error(pred,label)),
        ('SMAPE: ', smape(pred,label)),
    )
    for prefix, value in scores:
        print(prefix + str(value))

In [None]:
def plot_preds(pred,label):
    """Scatter predictions against actual values, with a red y = x reference line.

    Actual values (`label`) go on the x-axis and predictions (`pred`) on the
    y-axis. The reference diagonal is drawn over the fixed range 0-200.
    """
    plt.xlabel('Actual Y ')
    plt.ylabel('Predicted Y')

    # Perfect predictions would fall exactly on this line.
    diagonal = np.linspace(0.0, 200.0)
    plt.plot(diagonal, diagonal, 'r')

    plt.scatter(x=label, y=pred, alpha=0.2)
    plt.show()

# Pre-Process Data

In [None]:
# Read both competition files with the 'date' column parsed as datetimes.
train, test = (
    pd.read_csv(csv_path, parse_dates=['date'])
    for csv_path in (f'{PATH}train.csv', f'{PATH}test.csv')
)

In [None]:
# Add the date-part columns to both frames; the original 'date' column is kept (drop=False).
for frame in (train, test):
    add_datepart(frame, 'date', drop=False)

In [None]:
# Model inputs, split by how they are encoded further down.
cat_vars = [
    'store', 'item',
    'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
]
contin_vars = ['Year', 'Elapsed']

# Elapsed = Unix timestamp (0 = 1 Jan 1970, 12:00 am; negative for earlier dates)
# Discarded date flags (created by add_datepart but not used as features):
# 'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start','Is_year_end', 'Is_year_start'

In [None]:
# Keep only the modelling columns, in the same order for train and test.
dependent = 'sales'
train = train.loc[:, cat_vars+contin_vars+[dependent, 'date']].copy()

# test has no target column: add an all-zero placeholder so both frames share a layout
test = test.assign(**{dependent: 0})
test = test.loc[:, cat_vars+contin_vars+[dependent, 'date', 'id']].copy()

In [None]:
# Turn every categorical input of train into an ordered pandas categorical.
for col_name in cat_vars:
    train[col_name] = train[col_name].astype('category').cat.as_ordered()

In [None]:
# Encode test with exactly the category sets learned from train.
apply_cats(df=test, trn=train)

In [None]:
# Continuous inputs: fill missing values with 0 and store as float32 in both frames.
for frame in (train, test):
    for v in contin_vars:
        frame[v] = frame[v].fillna(0).astype('float32')

In [None]:
# Separate the target from the features, then remove every non-feature column.
sales = train['sales'].copy()
train = train.drop(columns=['date', 'sales'])
test = test.drop(columns=['date', 'sales', 'id'])

# Feature Engineering

We expect sales for each store + item combination to resemble what they were at the same time of year in previous years.

In [None]:
# NOTE(review): `df` was never defined in this notebook, so this cell raised NameError
# on a fresh kernel. It is rebuilt here from the `train` features plus the `sales`
# target that were separated above; confirm this is the frame the features were meant
# to be computed on. The new columns live only on `df`. The models below are fit on
# `train`, so they do not see them. Because the means are taken over all rows (including
# the validation period and each row's own target), joining them onto `train` as-is
# would leak the target.
df = train.assign(sales=sales)

#df["median-store_item-month"] = df.groupby(['Month',"item","store"])["sales"].transform("median") # median sales for particular item-store combo
#df["median-store_item-week"] = df.groupby(['Week',"item","store"])["sales"].transform("median") # median sales for particular item-store combo
#df["median-store_item-dayofweek"] = df.groupby(['Dayofweek',"item","store"])["sales"].transform("median") # median sales for particular item-store combo

# observed=True: the group keys are categoricals, so only key combinations that actually
# occur are grouped. The transformed values are the same as with the default.
df["mean-store_item-month"] = df.groupby(['Month',"item","store"], observed=True)["sales"].transform("mean") # mean sales for particular item-store combo
df["mean-store_item-week"] = df.groupby(['Week',"item","store"], observed=True)["sales"].transform("mean") # mean sales for particular item-store combo
df["mean-store_item-dayofweek-month"] = df.groupby(['Dayofweek','Month',"item","store"], observed=True)["sales"].transform("mean") # mean sales for particular item-store combo

#df["item-month-sum"] = df.groupby(['Month',"item"])["sales"].transform("sum") # total sales of that item  for all stores
#df["store-month-sum"] = df.groupby(['Month',"store"])["sales"].transform("sum") # total sales of that store  for all items

#df["item-week-sum"] = df.groupby(['Week',"item"])["sales"].transform("sum") # total sales of that item  for all stores
#df["store-week-sum"] = df.groupby(['Week',"store"])["sales"].transform("sum") # total sales of that store  for all items

In [None]:
# Remove the date flags and the raw timestamp from the feature frame.
# errors='ignore' skips columns that are already absent, and rebinding instead of
# dropping in place lets the cell be re-run without raising KeyError.
df = df.drop(columns=['Is_month_end', 'Is_month_start','Is_quarter_end',
                      'Is_quarter_start','Is_year_end','Is_year_start', 'Elapsed'],
             errors='ignore')

In [None]:
# Disabled experiments: lagged per-item / per-store aggregates. Nothing in this cell runs.
# NOTE(review): inside groupby(...).transform, shift(12) moves values by 12 *rows* within
# each (Week, item) or (Week, store) group, not by 12 calendar weeks, so these would not
# yield "sales 3 months ago" as the trailing comments say. Confirm intent before re-enabling.
# df["item-week_shifted-90"] = df.groupby(['Week',"item"])["sales"].transform(lambda x:x.shift(12).sum()) # shifted total sales for that item 12 weeks (3 months) ago
# df["store-week_shifted-90"] = df.groupby(['Week',"store"])["sales"].transform(lambda x:x.shift(12).sum()) # shifted total sales for that store 12 weeks (3 months) ago
# df["item-week_shifted-90"] = df.groupby(['Week',"item"])["sales"].transform(lambda x:x.shift(12).mean()) # shifted mean sales for that item 12 weeks (3 months) ago
# df["store-week_shifted-90"] = df.groupby(['Week',"store"])["sales"].transform(lambda x:x.shift(12).mean()) # shifted mean sales for that store 12 weeks (3 months) ago

# Split Training-Validation Data

In [None]:
# Sanity check: features, target and test should agree on row / column counts.
tuple(part.shape for part in (train, sales, test))

In [None]:
# closest same period (diff year) as test set
X_valid = train.loc[(train.Year==2017) & ((train.Month==1) | (train.Month==2) | (train.Month==3))].copy() 
y_valid = sales[X_valid.index].copy()

X_train = train.drop(X_valid.index).copy()
y_train = sales.drop(X_valid.index).copy()

In [None]:
# Sanity check: each feature frame should have as many rows as its target.
tuple(part.shape for part in (X_train, y_train, X_valid, y_valid))

# Random Forest

## With Validation Set

In [None]:
%%time
rf = RandomForestRegressor(n_estimators=200, n_jobs=-1)
rf.fit(X_train, y_train)

In [None]:
# In-sample scores: how well the forest reproduces the rows it was trained on.
predictions_train = rf.predict(X=X_train)
print_scores(pred=predictions_train, label=y_train)

In [None]:
# Scores on the held-out validation period.
predictions_valid = rf.predict(X=X_valid)
print_scores(pred=predictions_valid, label=y_valid)

In [None]:
# Predicted vs. actual sales on the validation period.
plot_preds(pred=predictions_valid, label=y_valid)

## All In

In [None]:
%%time
rf = RandomForestRegressor(n_estimators=200, n_jobs=-1)
rf.fit(train, sales)

In [None]:
# Forecast sales for the test rows with the forest fitted on all labelled data.
y_pred = rf.predict(X=test)

# Submission

In [None]:
# Re-read the raw test file, indexed by 'id', as the skeleton of the submission.
submission = pd.read_csv(f'{PATH}test.csv').set_index('id')

In [None]:
# Attach the forecasts; y_pred is matched to the rows of test.csv by position.
submission = submission.assign(sales=y_pred)

In [None]:
# Output file for the submission, inside the tmp/ sub-directory of PATH.
csv_fn = f'{PATH}tmp/RF_v3.csv'

In [None]:
# to_csv does not create missing directories, so make sure the target folder exists.
out_dir = os.path.dirname(csv_fn)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Write only the index ('id') and the predicted 'sales' column.
submission[['sales']].to_csv(csv_fn)

# Feature Importance

In [None]:
# Feature importances of the most recently fitted forest, one horizontal bar per feature.
# X_train is a row subset of train, so its columns match the columns the forest was fit on.
plt.barh(X_train.columns, rf.feature_importances_)
plt.xlabel('Feature importance')
plt.ylabel('Feature')
plt.title('Random forest feature importances')
plt.show()