Trying to replicate models developed by David Thaler:

https://www.kaggle.com/c/walmart-recruiting-store-sales-forecasting/forums/t/8125/first-place-entry/56111#post56111

https://bitbucket.org/dthal/kaggle_walmart

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestRegressor

In [17]:
# 10 stores to test models on: [769, 1097, 85, 562, 262, 733, 494, 682, 335, 423]
id_list = [769, 1097, 85, 562, 262, 733, 494, 682, 335, 423]

data = pd.read_csv('train.csv', parse_dates = ['Date'])
#data = data[data['Open'] != 0]
#store = pd.read_csv('store.csv')  

#df = data.merge(store, on = 'Store', copy = False)

test_raw = pd.read_csv('test.csv', parse_dates = ['Date'])

In [75]:
train = data[data.Store.isin(set(id_list))][['Store','DayOfWeek','Date','Sales','Promo',
                                             'StateHoliday','SchoolHoliday']]
train['woy'] = train
#train.set_index('Date', inplace=True)
#train.head()

test = test_raw[test_raw.Store.isin(set(id_list))][['Store','DayOfWeek','Date','Promo',
                                             'StateHoliday','SchoolHoliday']]
#test.set_index('Date', inplace=True)
#print test.head()
#print test.tail()

In [215]:
def naive(train, test):
    '''
    Computes naive forecasts
    
    INPUT:
    train - dataframe of daily sales values [1 row for each (date, Store) duple]
    test - dataframe of daily store information [1 row for each (date, Store) duple]
    
    OUTPUT:
    out - dataframe of test data with columns: date, Store, Forecasted Sales
    '''
    # Make a copy of the test dataframe to use for forecasting
    out = test[['Store','Date']]
    out.set_index('Date', inplace=True)
    
    # Subset only the last observation from the train dataframe
    # The sales on this day will be used for the naive forecast
    tr = train.loc[train.Date==train.Date.max(),:]
    tr.set_index('Store', inplace=True)
    
    # Apply the last know sales value to each forecast date in the
    # test data
    out.loc[:,'SalesForecast'] = out.Store.map(tr.Sales).values
    return out

def seasonal_naive(train, test):
    '''
    Computes seasonal naive forecasts
    
    INPUT:
    train - dataframe of daily sales values [1 row for each (date, Store) duple]
    test - dataframe of daily store information [1 row for each (date, Store) duple]
    
    OUTPUT:
    out - dataframe of test data with columns: date, Store, Forecasted Sales
    '''
    lag = 364 # lag by 364 days (won't work in leap year)
    
    # Make a copy of the test dataframe to use for forecasting
    out = test[['Store','Date','Promo']]
    out.set_index(['Date','Store'], inplace=True)
    
    '''
    # Make a pivot table of Sales values with  Date x Store
    tr = pd.pivot_table(train, values='Sales', index='Date', 
                            columns='Store', aggfunc=np.mean)
    
    # lag the pivot table by 364 days and adjust the date
    # this step insures that the indices will line up with the test data
    tr_lag = tr.iloc[-lag:,:]
    tr_lag.index = tr_lag.index + np.timedelta64(lag,'D')

    # unstack and reindex the pivot table so that it's in the same format
    # as the test data
    tr_lag = tr_lag.unstack().reset_index().set_index(['Date','Store'])
    '''
    # Reset date index to lag data by +364 days. This shift ensures that
    # the test data is set to the same weekday from 1 year ago
    # (does not take into account leap years)
    tr_lag = train[['Sales','Date','Store']]
    tr_lag.loc[:,'Date'] = tr_lag.Date + np.timedelta64(lag,'D')
    tr_lag = tr_lag.set_index(['Date','Store'])
    
    # Merge data, keeping only dates from test data
    out = out.merge(pd.DataFrame(tr_lag), how='left', left_index=True, right_index=True).drop('Promo',axis=1)
    out.columns = ['SalesForecast']
    
    return out

def product(train, test):
    '''
    Computes forecasts with the product model. This model predicts the mean
    value by store times the mean value by week divided by the mean value
    '''

In [216]:
out = seasonal_naive(train, test)

In [217]:
out.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SalesForecast
Date,Store,Unnamed: 2_level_1
2015-09-17,262,16660
2015-09-17,335,16747
2015-09-17,562,16289
2015-09-17,733,15415
2015-09-17,769,11083


In [195]:
out.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SalesForecast
Date,Store,Unnamed: 2_level_1
2015-09-17,262,16660
2015-09-17,335,16747
2015-09-17,562,16289
2015-09-17,733,15415
2015-09-17,769,11083


In [116]:
panel_dict = dict.fromkeys(id_list)

train_copy = train.set_index('Date')
for store in id_list:
    panel_dict[store] = train_copy.query('Store == @store')
    
panel_train = pd.Panel(panel_dict)
panel_train


<class 'pandas.core.panel.Panel'>
Dimensions: 10 (items) x 942 (major_axis) x 6 (minor_axis)
Items axis: 85 to 1097
Major_axis axis: 2015-07-31 00:00:00 to 2013-01-01 00:00:00
Minor_axis axis: Store to SchoolHoliday

In [120]:
ct = pd.pivot_table(train, values='Sales', index='Date', 
                            columns='Store', aggfunc=np.mean)
ct.head()

Store,85,262,335,423,494,562,682,733,769,1097
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2013-01-01,4220,17267,2401,9643,3113,8498,3375,10765,5035,5961
2013-01-02,6069,16964,11542,9570,6300,15472,10526,12477,7276,6688
2013-01-03,5246,16616,10686,8254,6209,14807,11041,12639,6972,7053
2013-01-04,5339,16849,10420,9285,6772,15357,11376,12078,7374,6430
2013-01-05,5774,14868,7765,7095,4891,12554,7593,10674,6877,5460


In [121]:
ct_lag = ct.iloc[-364:,:]
ct_lag.head()

Store,85,262,335,423,494,562,682,733,769,1097
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2014-08-02,5845,19920,9115,9331,5922,14427,6985,15521,10893,7112
2014-08-03,12265,30097,8721,14633,8759,21206,6726,16139,11572,12606
2014-08-04,8779,20103,19878,11439,10004,20483,12031,16785,10955,9826
2014-08-05,7055,19051,17000,10889,8687,17560,11256,15835,10117,8937
2014-08-06,7098,18572,15869,10014,8248,17776,10841,15501,10024,8448


In [142]:
ct_copy = ct_lag.unstack().reset_index()
ct_copy.Date = ct_copy.Date + np.timedelta64(364,'D')
ct_copy.set_index(['Date','Store'], inplace=True)
ct_copy.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Date,Store,Unnamed: 2_level_1
2015-08-01,85,5845
2015-08-02,85,12265
2015-08-03,85,8779
2015-08-04,85,7055
2015-08-05,85,7098


In [209]:
train.Date.iloc[3630]

Timestamp('2014-08-02 00:00:00')

In [211]:
(train.Date + np.timedelta64(364,'D')).iloc[3630]

Timestamp('2015-08-01 00:00:00')