##### Import of the required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

%matplotlib inline

##### Definition of helper funtions

In [2]:
def error_table(model_name, y_real, y_pred):
    mae = mean_absolute_error(y_real, y_pred)
    mape = mean_absolute_percentage_error(y_real, y_pred)
    rmse = np.sqrt(mean_squared_error(y_real, y_pred))

    return pd.DataFrame({'Model Name': model_name,
                        'MAE': mae,
                        'MAPE': mape,
                        'RMSE': rmse}, index = [0])

##### Data Preparation

In [3]:
df = pd.read_csv("../data/sales_stores.csv", parse_dates=['Date'])
df.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,WeekOfYear,StoreType,Assortment,CompetitionDistance,CompetitionOpenSince,Promo2,Promo2Since,PromoInterval,CompetitionTimeDays,CompetitionTimeMonths,PromoTimeWeeks
0,1,5,2015-07-31,5263,555,1,1,no_holiday,1,31,c,basic,1270.0,2008-09-01,0,,,2524,82,0
1,2,5,2015-07-31,6064,625,1,1,no_holiday,1,31,a,basic,570.0,2007-11-01,1,2010-03-29,"Jan,Apr,Jul,Oct",2829,93,278
2,3,5,2015-07-31,8314,821,1,1,no_holiday,1,31,a,basic,14130.0,2006-12-01,1,2011-04-04,"Jan,Apr,Jul,Oct",3164,104,225
3,4,5,2015-07-31,13995,1498,1,1,no_holiday,1,31,c,extended,620.0,2009-09-01,0,,,2159,70,0
4,5,5,2015-07-31,4822,559,1,1,no_holiday,1,31,a,basic,29910.0,2015-04-01,0,,,121,3,0


As we will aggregate the data on a weekly basis, we no longer need `DayOfWeek`.

In [4]:
df.drop('DayOfWeek', axis=1, inplace=True)

During the exploratory data analysis, we realized that some variables would not be of much use to us:

In [5]:
df.drop(['Promo2', 'Promo2Since', 'PromoInterval', 'CompetitionDistance', 'CompetitionOpenSince', 'CompetitionTimeDays', 'CompetitionTimeMonths'], axis=1, inplace=True)

One-hot encoding for categorical variables:

In [6]:
df = pd.get_dummies(df, dtype='int')
# No holiday = all zeroes on the StateHoliday columns
df.drop('StateHoliday_no_holiday', axis=1, inplace=True)
df.columns

Index(['Store', 'Date', 'Sales', 'Customers', 'Open', 'Promo', 'SchoolHoliday',
       'WeekOfYear', 'PromoTimeWeeks', 'StateHoliday_christmas',
       'StateHoliday_easter', 'StateHoliday_public_holiday', 'StoreType_a',
       'StoreType_b', 'StoreType_c', 'StoreType_d', 'Assortment_basic',
       'Assortment_extended', 'Assortment_extra'],
      dtype='object')

##### Weekly aggregation

We make `Date` the index:

In [7]:
df.set_index('Date', inplace=True)
# Now we have a DataFrame for each date:
#df.loc['2013-01-01']

Let's aggregate the data on a weekly basis:

In [8]:
# The anchored offset W would give sunday weekly frequency
by_week_store = df.groupby([pd.Grouper(freq='W-Mon'), 'Store'])

sum_cols = by_week_store[['Sales', 'Customers', 'Open', 'Promo', 'SchoolHoliday', 'StateHoliday_christmas', 
                          'StateHoliday_easter', 'StateHoliday_public_holiday']].sum()

fix_cols = by_week_store[['StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d', 'Assortment_basic', 
                          'Assortment_extended', 'Assortment_extra','WeekOfYear', 'PromoTimeWeeks']].first()

week_df = pd.concat([sum_cols, fix_cols], axis=1)

week_df.head()

# With code like this we can check that it has worked fine:
#foo = df.loc['2013-01-01':'2013-01-07']
#foo[foo['Store']  == 1]['Sales'].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,Customers,Open,Promo,SchoolHoliday,StateHoliday_christmas,StateHoliday_easter,StateHoliday_public_holiday,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_basic,Assortment_extended,Assortment_extra,WeekOfYear,PromoTimeWeeks
Date,Store,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2013-01-07,1,26516,3285,5,1,5,0,0,0,0,0,1,0,1,0,0,2,0
2013-01-07,2,22182,2866,5,1,3,0,0,0,1,0,0,0,1,0,0,2,145
2013-01-07,3,35564,3820,5,1,3,0,0,0,1,0,0,0,1,0,0,2,92
2013-01-07,4,48928,6985,5,1,3,0,0,0,0,0,1,0,0,1,0,2,0
2013-01-07,5,20742,2520,5,1,1,0,0,0,1,0,0,0,1,0,0,2,0


In [9]:
# Return temporarily to a sequential index
week_df = week_df.reset_index()
week_df['Date'] = week_df['Date'] - pd.Timedelta(days=7)
week_df.head()

Unnamed: 0,Date,Store,Sales,Customers,Open,Promo,SchoolHoliday,StateHoliday_christmas,StateHoliday_easter,StateHoliday_public_holiday,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_basic,Assortment_extended,Assortment_extra,WeekOfYear,PromoTimeWeeks
0,2012-12-31,1,26516,3285,5,1,5,0,0,0,0,0,1,0,1,0,0,2,0
1,2012-12-31,2,22182,2866,5,1,3,0,0,0,1,0,0,0,1,0,0,2,145
2,2012-12-31,3,35564,3820,5,1,3,0,0,0,1,0,0,0,1,0,0,2,92
3,2012-12-31,4,48928,6985,5,1,3,0,0,0,0,0,1,0,0,1,0,2,0
4,2012-12-31,5,20742,2520,5,1,1,0,0,0,1,0,0,0,1,0,0,2,0


In [10]:
# Create a time series for the number of observations per date
obs_by_date = week_df.groupby('Date').size()

total_weeks = 0
data_points = 0
for no_stores in obs_by_date.unique():
    no_weeks = obs_by_date[obs_by_date == no_stores].size
    total_weeks += no_weeks
    data_points += no_weeks*no_stores
    print("There are {} Week(s) with {} observed stores.".format(no_weeks, no_stores))

print("There are {} Weeks in total.".format(total_weeks))
print("And there are {} data points in total.".format(data_points))

There are 7 Week(s) with 1111 observed stores.
There are 1 Week(s) with 1105 observed stores.
There are 1 Week(s) with 1107 observed stores.
There are 5 Week(s) with 1109 observed stores.
There are 2 Week(s) with 1110 observed stores.
There are 8 Week(s) with 1112 observed stores.
There are 23 Week(s) with 1113 observed stores.
There are 17 Week(s) with 1114 observed stores.
There are 45 Week(s) with 1115 observed stores.
There are 5 Week(s) with 934 observed stores.
There are 3 Week(s) with 932 observed stores.
There are 1 Week(s) with 928 observed stores.
There are 2 Week(s) with 933 observed stores.
There are 2 Week(s) with 931 observed stores.
There are 13 Week(s) with 935 observed stores.
There are 135 Weeks in total.
And there are 145639 data points in total.


We log-scale 'Sales' because it has a large range of values and it's skewed towards long values.:

In [11]:
week_df['Sales'] = np.log1p(week_df['Sales'])

##### Split dataframe into training and test

In [12]:
week_df.sort_values(by=['Date', 'Store'], ascending=[False, True], inplace=True)
week_df.set_index(['Store', 'Date'], inplace=True)

In [13]:
# 8 weeks before the last date
test_date = week_df.index[0][1]- datetime.timedelta(weeks=8)

X_test =week_df[week_df.index.get_level_values(1) >= test_date].drop('Sales', axis=1)
X_train =week_df[week_df.index.get_level_values(1) < test_date].drop('Sales', axis=1)

y_test =week_df['Sales'][week_df.index.get_level_values(1) >= test_date]
y_train =week_df['Sales'][week_df.index.get_level_values(1) < test_date]

##### Averages Model

In [14]:
stores_mean_sales = pd.DataFrame(week_df['Sales'].groupby(level=['Store']).mean().rename('Predictions'))
avg_predictions = pd.DataFrame(y_test).join(stores_mean_sales)['Predictions']

error_table( 'Averages Model', np.expm1(y_test), np.expm1(avg_predictions))

Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,Averages Model,4099.007178,0.103323,5642.39733


##### Pooled Regression Model