In [2]:
import os
import pandas as pd
from ipywidgets.widgets import Dropdown, interact
from ipywidgets import Button, HBox, VBox, interactive_output

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (10,8)

In [16]:
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess, Fourier
from sklearn.linear_model import Ridge

In [27]:
all_data = {}
for file in os.listdir('data'):
    filename = file.split('.')[0]
    all_data[filename] = pd.read_parquet('data/{}'.format(file),)
    
all_data.keys()

dict_keys(['holidays_events', 'oil', 'sample_submission', 'stores', 'test', 'train', 'transactions'])

In [28]:
train, stores, holidays_events, oil = (all_data['train'], all_data['stores'], 
         all_data['holidays_events'], all_data['oil'])
for data in (train, holidays_events, oil):
    data['date'] = data['date'].astype('datetime64[ns]')

In [29]:
train.drop(['id', 'onpromotion'], axis=1, inplace=True)
train.date = train.date.dt.to_period('D')
train = train.set_index(['store_nbr', 'family', 'date']).sort_index()
train

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
store_nbr,family,date,Unnamed: 3_level_1
1,AUTOMOTIVE,2013-01-01,0.0
1,AUTOMOTIVE,2013-01-02,2.0
1,AUTOMOTIVE,2013-01-03,3.0
1,AUTOMOTIVE,2013-01-04,3.0
1,AUTOMOTIVE,2013-01-05,5.0
...,...,...,...
54,SEAFOOD,2017-08-11,0.0
54,SEAFOOD,2017-08-12,1.0
54,SEAFOOD,2017-08-13,2.0
54,SEAFOOD,2017-08-14,0.0


In [30]:
sdate = '2017-04-01'
edate = '2017-08-15'

In [31]:
y = train.unstack(['store_nbr', 'family']).loc[sdate:edate]
y

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales
store_nbr,1,1,1,1,1,1,1,1,1,1,...,54,54,54,54,54,54,54,54,54,54
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-04-01,9.0,0.0,1.0,3229.0,0.0,526.24900,14.0,858.0,1151.0,243.272,...,1.0,65.718000,339.0,0.0,12.0,87.915000,63.0,875.138,8.0,3.0
2017-04-02,4.0,0.0,1.0,1210.0,0.0,180.33900,3.0,281.0,446.0,86.642,...,0.0,51.650000,306.0,0.0,2.0,77.569000,75.0,821.363,7.0,2.0
2017-04-03,11.0,0.0,2.0,2097.0,0.0,444.85700,11.0,801.0,794.0,166.120,...,0.0,67.687004,327.0,0.0,6.0,33.741000,58.0,525.763,16.0,2.0
2017-04-04,3.0,0.0,4.0,2249.0,1.0,403.81900,19.0,673.0,725.0,149.078,...,0.0,40.891000,207.0,0.0,2.0,58.918003,54.0,859.102,13.0,0.0
2017-04-05,5.0,0.0,1.0,2687.0,2.0,499.38500,18.0,1057.0,1074.0,216.743,...,0.0,39.349000,200.0,1.0,2.0,37.943000,53.0,610.636,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,1.0,0.0,1.0,1006.0,0.0,145.60700,4.0,341.0,343.0,64.302,...,0.0,50.756000,155.0,0.0,0.0,80.759000,54.0,546.250,0.0,0.0
2017-08-12,6.0,0.0,3.0,1659.0,0.0,243.22000,3.0,351.0,526.0,99.488,...,1.0,53.079002,169.0,0.0,4.0,91.671000,81.0,696.920,0.0,1.0
2017-08-13,1.0,0.0,1.0,803.0,0.0,136.67900,1.0,169.0,266.0,47.770,...,3.0,67.435000,244.0,0.0,2.0,79.062996,91.0,877.304,0.0,2.0
2017-08-14,1.0,0.0,6.0,2201.0,0.0,346.03800,4.0,571.0,699.0,154.578,...,1.0,64.224000,200.0,0.0,1.0,56.155000,147.0,585.615,0.0,0.0


In [32]:
fourier = CalendarFourier(freq='W', order=4)

dp = DeterministicProcess(index=y.index,
                          constant=False,
                          order=1,
                          seasonal=False,
                          additional_terms=[fourier],
                          drop=True)
X = dp.in_sample()

In [33]:
X

Unnamed: 0_level_0,trend,"sin(1,freq=W-SUN)","cos(1,freq=W-SUN)","sin(2,freq=W-SUN)","cos(2,freq=W-SUN)","sin(3,freq=W-SUN)","cos(3,freq=W-SUN)"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-04-01,1.0,-0.974928,-0.222521,0.433884,-0.900969,0.781831,0.623490
2017-04-02,2.0,-0.781831,0.623490,-0.974928,-0.222521,-0.433884,-0.900969
2017-04-03,3.0,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000
2017-04-04,4.0,0.781831,0.623490,0.974928,-0.222521,0.433884,-0.900969
2017-04-05,5.0,0.974928,-0.222521,-0.433884,-0.900969,-0.781831,0.623490
...,...,...,...,...,...,...,...
2017-08-11,133.0,-0.433884,-0.900969,0.781831,0.623490,-0.974928,-0.222521
2017-08-12,134.0,-0.974928,-0.222521,0.433884,-0.900969,0.781831,0.623490
2017-08-13,135.0,-0.781831,0.623490,-0.974928,-0.222521,-0.433884,-0.900969
2017-08-14,136.0,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000


In [34]:
model = Ridge(fit_intercept=True, solver='auto', alpha=0.4,  random_state=23)
model.fit(X, y)
y_pred = pd.DataFrame(model.predict(X), index=X.index, columns=y.columns)

In [35]:
y_pred

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales
store_nbr,1,1,1,1,1,1,1,1,1,1,...,54,54,54,54,54,54,54,54,54,54
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-04-01,5.156488,0.0,2.959326,2470.763162,0.308981,408.531738,13.904916,648.653987,841.078754,150.466932,...,1.778977,56.783466,253.912788,0.151904,5.427123,61.027270,63.169213,746.578200,11.693334,2.672102
2017-04-02,2.024402,0.0,1.318760,1106.953787,0.408395,189.496922,5.453092,265.690515,409.986831,74.840333,...,1.033244,60.806268,340.317049,0.102188,5.377307,71.336752,102.346394,857.550396,13.731007,2.622431
2017-04-03,4.112486,0.0,3.009160,2305.154355,0.557525,406.642944,16.241326,686.931588,804.725873,151.785296,...,0.834386,56.554602,237.154548,0.102188,4.084592,50.022839,78.732382,604.586128,12.785727,2.224748
2017-04-04,3.814206,0.0,2.810355,2394.640435,0.607223,392.366780,18.378991,783.229480,782.795882,146.459627,...,0.734960,49.400870,220.796025,0.052472,4.233639,54.623891,73.513257,922.740725,10.050673,3.020248
2017-04-05,3.824184,0.0,3.134068,2504.858518,0.878835,440.508829,17.468545,885.011482,917.313079,155.111091,...,0.937298,39.692702,178.429278,0.054989,3.651159,38.162263,71.995419,525.054855,12.099097,2.082262
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,6.430221,0.0,2.972917,2247.100196,0.119228,336.303390,20.799554,699.425566,704.257991,153.107655,...,1.010506,53.055100,127.456317,0.155501,2.822714,74.943406,79.259610,572.796947,-4.780340,1.664458
2017-08-12,5.532663,0.0,4.335345,2410.914082,-0.106223,358.641729,11.125779,605.951465,717.589127,135.306888,...,1.911463,62.349107,207.346679,0.147554,3.061847,79.489252,87.622559,752.915697,-4.189525,3.722019
2017-08-13,2.400577,0.0,2.694779,1047.104707,-0.006809,139.606913,2.673956,222.987993,286.497204,59.680289,...,1.165730,66.371909,293.750940,0.097837,3.012030,89.798734,126.799740,863.887893,-2.151851,3.672348
2017-08-14,4.488662,0.0,4.385179,2245.305275,0.142321,356.752935,13.462189,644.229066,681.236246,136.625252,...,0.966872,62.120243,190.588439,0.097837,1.719315,68.484821,103.185728,610.923625,-3.097132,3.274665


In [36]:
y_pred   = y_pred.stack(['store_nbr', 'family']).reset_index()
y_target = y.stack(['store_nbr', 'family']).reset_index().copy()

y_target['sales_pred'] = y_pred['sales'].clip(0.) # Sales should be >= 0
y_target

Unnamed: 0,date,store_nbr,family,sales,sales_pred
0,2017-04-01,1,AUTOMOTIVE,9.000,5.156488
1,2017-04-01,1,BABY CARE,0.000,0.000000
2,2017-04-01,1,BEAUTY,1.000,2.959326
3,2017-04-01,1,BEVERAGES,3229.000,2470.763162
4,2017-04-01,1,BOOKS,0.000,0.308981
...,...,...,...,...,...
244129,2017-08-15,54,POULTRY,59.619,73.085873
244130,2017-08-15,54,PREPARED FOODS,94.000,97.966602
244131,2017-08-15,54,PRODUCE,915.371,929.078222
244132,2017-08-15,54,SCHOOL AND OFFICE SUPPLIES,0.000,0.000000


In [37]:
# Test predictions

stest = '2017-08-16'
etest = '2017-08-31'

X_test = dp.out_of_sample(steps=16)
X_test

Unnamed: 0,trend,"sin(1,freq=W-SUN)","cos(1,freq=W-SUN)","sin(2,freq=W-SUN)","cos(2,freq=W-SUN)","sin(3,freq=W-SUN)","cos(3,freq=W-SUN)"
2017-08-16,138.0,0.974928,-0.222521,-0.433884,-0.900969,-0.781831,0.62349
2017-08-17,139.0,0.433884,-0.900969,-0.781831,0.62349,0.974928,-0.222521
2017-08-18,140.0,-0.433884,-0.900969,0.781831,0.62349,-0.974928,-0.222521
2017-08-19,141.0,-0.974928,-0.222521,0.433884,-0.900969,0.781831,0.62349
2017-08-20,142.0,-0.781831,0.62349,-0.974928,-0.222521,-0.433884,-0.900969
2017-08-21,143.0,0.0,1.0,0.0,1.0,0.0,1.0
2017-08-22,144.0,0.781831,0.62349,0.974928,-0.222521,0.433884,-0.900969
2017-08-23,145.0,0.974928,-0.222521,-0.433884,-0.900969,-0.781831,0.62349
2017-08-24,146.0,0.433884,-0.900969,-0.781831,0.62349,0.974928,-0.222521
2017-08-25,147.0,-0.433884,-0.900969,0.781831,0.62349,-0.974928,-0.222521


In [38]:

sales_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=y.columns)
sales_pred = sales_pred.stack(['store_nbr', 'family'])

In [39]:
sales_pred

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
Unnamed: 0_level_1,store_nbr,family,Unnamed: 3_level_1
2017-08-16,1,AUTOMOTIVE,4.200359
2017-08-16,1,BABY CARE,0.000000
2017-08-16,1,BEAUTY,4.510088
2017-08-16,1,BEVERAGES,2445.009437
2017-08-16,1,BOOKS,0.463630
...,...,...,...
2017-08-31,54,POULTRY,64.917828
2017-08-31,54,PREPARED FOODS,83.433466
2017-08-31,54,PRODUCE,506.966416
2017-08-31,54,SCHOOL AND OFFICE SUPPLIES,-6.502693
