# 1. Imports

In [216]:
import numpy as n
import pandas as pd 
import os
import seaborn as sns
import shap

from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error

# Notebook-wide configuration: remove pandas' column/row display limits and
# fix the seed that the split, the forest and the search all reuse.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

seed = 42


# 2. Loading Data

In [217]:
# Folder holding the competition files.
path = '../input/walmart-recruiting-store-sales-forecasting/'

# Training-side tables: weekly sales, store attributes and weekly features.
train, stores, features = (
    pd.read_csv(path + file_name)
    for file_name in ('train.csv.zip', 'stores.csv', 'features.csv.zip')
)

# Test-side table.
test = pd.read_csv(path + 'test.csv.zip')

In [218]:
# Preview the first five rows of the training table.
train.iloc[:5]

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


# 3. Joining Tables

## 3.1. Filling missing dates


Some store departments have no data for some weeks, so we impute those missing weeks with zero sales.

In [219]:
# Build the complete (Date, Store, Dept) grid: every date found in train
# crossed with every store and every department found in train.
dates = train.Date.sort_values().unique()
depts = train.Dept.sort_values().unique()
strs = train.Store.sort_values().unique()

fill_index = pd.MultiIndex.from_product([dates, strs, depts],
                                        names=['Date', 'Store', 'Dept'])

# Date -> IsHoliday lookup with one row per date. It is taken from all rows of
# train rather than from a single Store/Dept pair, so a week missing for that
# one pair cannot leave a date without its holiday flag after the merge below.
holidays = train[['Date', 'IsHoliday']].drop_duplicates(subset='Date')

# Reindex the sales onto the full grid; combinations absent from train
# (including Store/Dept pairs that never appear together) get 0 sales.
train_new = (
    train.set_index(['Date', 'Store', 'Dept'])
         .Weekly_Sales
         .reindex(fill_index, fill_value=0)
         .to_frame()
         .reset_index()
)

# The reindex kept only Weekly_Sales, so re-attach the holiday flag by date.
train_new = train_new.merge(holidays, on='Date', how='left')


KeyboardInterrupt: 

## 3.2. Merge

Merging the new train dataset with Stores and Features 

In [None]:
# Attach the store attributes to every sales row.
data = train_new.merge(stores, how='left', on='Store')

# Attach the per-store, per-date features. Their IsHoliday column is dropped
# first because train_new already carries one.
features_without_holiday = features.drop(columns=['IsHoliday'])
data = data.merge(features_without_holiday, how='left', on=['Store', 'Date'])

# Persist the joined table.
data.to_csv("data.csv")

# 4. Initial Exploration

In [None]:
# Summary statistics, transposed so that each described column is shown as a row.
data.describe().transpose()

In [None]:
# First rows of the joined table, followed by the dtype of every column.
preview_rows = data.head()
column_dtypes = data.dtypes
print(preview_rows, '\n\n', column_dtypes)  # datatypes are all good

In [None]:
# Non-null count per column, followed by the null count per column.
non_null_counts = data.count()
null_counts = data.isna().sum()
print(non_null_counts, '\n\n', null_counts)  # null values only on markdown columns

In [None]:
# Stores entries
# data['Store'].value_counts()  # some stores have slightly more entries than others...

# this was fixed using imputation 

In [None]:
# Number of dated rows for every (Store, Dept) pair.
data.groupby(['Store', 'Dept'])['Date'].count().to_frame('dates')

# all departments now have the same number of entries

In [None]:
# Analysis Ideas

# Total Sales by department
# Total Sales by store
# Pairplot

# 5. Feature Engineering

- OK - Parse Dates to label encoding (months and weeks of the year)
- OK - Encode variable "Type" (one hot)
- Create variables #weeks before holiday
- OK - Impute missing Dept instances as zero sales (treating missing data as zero sales)
- OK - What are negative sales? (1285 entries in the test database = 0.3%, with a max of -4988.94 and a total of -88161.56, adding up to -0,000013086% of the total sales.) Replace with zero?
- OK - Fill Nulls in markdown variables



## Features to create for the test dataset - Out of Time (if proven to be important to the prediction):
- Forecast Temperatures (seasonal ) as function of week of the year
- Forecast Fuel Prices as an average of the last year price
- Forecast CPI (linear regression) as function of week of the year
- Forecast Unemployment (rectified linear regression with minimum bound as 2%)
- See what to do with markdowns (if important to sales prediction)

In [None]:
# Replacing Negative Sales with 0
# The write goes through a single .loc call so it lands on `data` itself;
# chained indexing (data[mask].Weekly_Sales = 0) only modifies a temporary
# copy and leaves the negative values in place.
data.loc[data.Weekly_Sales < 0, 'Weekly_Sales'] = 0

In [None]:
# Replace every NaN left in `data` with the sentinel value -9999
# (the null check above found them only in the markdown columns).
data = data.fillna(-9999)

In [None]:
# One-hot encode the store type: 'Type' is replaced by one indicator
# column per distinct value.
data = pd.get_dummies(
    data,
    columns=['Type'],
)

In [None]:
# Derive calendar features from the 'YYYY-MM-DD' date values: the month
# number and the ISO week number. Each date is parsed once and reused.
parsed_dates = [datetime.strptime(str(raw_date), '%Y-%m-%d') for raw_date in data.Date]
data['Month'] = [parsed.month for parsed in parsed_dates]
data['WeekofYear'] = [parsed.isocalendar()[1] for parsed in parsed_dates]


In [None]:
# Distinct week-of-year values among the rows flagged as holiday.
data.loc[data.IsHoliday == True, 'WeekofYear'].unique()

In [None]:
# Replace the boolean holiday flag with the week number on holiday rows
# (NaN elsewhere), and add IsHoliday_1 holding the week number for weeks
# 5, 35, 46 and 51 (NaN elsewhere) - presumably the weeks just before the
# holiday weeks listed by the previous cell; confirm against its output.
week_of_year = data.WeekofYear
data['IsHoliday'] = week_of_year.where(data.IsHoliday == True)
data['IsHoliday_1'] = week_of_year.where(week_of_year.isin([5, 35, 46, 51]))

In [None]:
# One-hot encode the two week-number columns; the indicator columns are
# prefixed 'Holiday' and 'Week_Before_Holiday' respectively.
data = pd.get_dummies(
    data,
    columns=['IsHoliday', 'IsHoliday_1'],
    prefix=['Holiday', 'Week_Before_Holiday'],
)

In [None]:
# Preview the first five rows after feature engineering.
data.iloc[:5]

# 6. Modeling

In [None]:
# Target is the weekly sales; predictors are every other column except the
# raw date string.
target_column = 'Weekly_Sales'
y = data[target_column]
X = data.drop(columns=['Date', target_column])

# Hold out 33% of the rows for evaluation, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=seed
)

In [None]:
# Preview the first five training rows.
X_train.iloc[:5]

In [None]:
# Shapes of the training predictors and of the training target.
(X_train.shape, y_train.shape)

In [None]:
# Base estimator for the search; the sampled hyper-parameters override its
# defaults for every candidate.
rf = RandomForestRegressor(random_state=seed, n_jobs=-1)

# Candidate values sampled by the randomized search.
# max_features uses 1.0 (all features) instead of the string 'auto': for a
# regressor 'auto' meant "use all features", and newer scikit-learn releases
# no longer accept that string.
param_distributions = {'bootstrap': [True, False],
                       'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                       'max_features': [1.0, 'sqrt'],
                       'min_samples_leaf': [1, 2, 4],
                       'min_samples_split': [2, 5, 10],
                       'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

# 100 sampled configurations, each scored with 3-fold cross-validation.
random_grid = RandomizedSearchCV(estimator=rf,
                                 param_distributions=param_distributions,
                                 n_iter=100,
                                 cv=3,
                                 random_state=seed,
                                 n_jobs=-1)

# Fit the random search model
random_grid.fit(X_train, y_train)

print(random_grid.best_params_)

# With the default refit=True the search has already refitted the best
# configuration on the whole of X_train, so best_estimator_ is used as-is
# rather than being trained a second time.
rand_reg = random_grid.best_estimator_

In [None]:
# Predictions of the tuned forest for the first five training rows.
rand_reg.predict(X_train.iloc[:5])

In [None]:
# First 40 training targets.
y_train.iloc[:40]

In [None]:
# Score the tuned forest on both splits. Each split is predicted once and the
# predictions are shared by the MAE and RMSE lines.
train_predictions = rand_reg.predict(X_train)
test_predictions = rand_reg.predict(X_test)

print("train MAE: {:.2f}".format(mean_absolute_error(y_train, train_predictions)))
print("test MAE: {:.2f}".format(mean_absolute_error(y_test, test_predictions)))
# RMSE is the square root of the mean squared error.
print("train RMSE: {:.2f}".format(mean_squared_error(y_train, train_predictions)**0.5))
print("test RMSE: {:.2f}".format(mean_squared_error(y_test, test_predictions)**0.5))

In [None]:
# Forest feature importances, labelled with the training column names and
# ordered from most to least important, as a one-column frame.
imp_rf = (
    pd.Series(rand_reg.feature_importances_, index=X_train.columns)
      .sort_values(ascending=False)
      .to_frame('rf_importance')
)

imp_rf.plot.barh()

In [None]:
# Explain the tuned forest on the first 1000 held-out rows only; the slice is
# taken once and shared by the SHAP computation and the plot.
shap_sample = X_test.iloc[0:1000, :]

explainer = shap.TreeExplainer(rand_reg, approximate = True)
# `approximate=True` is also passed to shap_values() itself, which takes its
# own `approximate` argument.
# NOTE(review): a direct shap_values() call presumably does not read the
# constructor flag - confirm against the installed shap version.
rf_shap_values = explainer.shap_values(shap_sample, approximate=True)
shap.summary_plot(rf_shap_values, shap_sample)