In [50]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 8]
pd.set_option('display.max_columns', None)

In [2]:
os.listdir('../data')

['cluster_1.csv',
 'combined.csv',
 'df.xlsx',
 'holidays_events.csv',
 'items.csv',
 'oil.csv',
 'sample_submission.csv',
 'stores.csv',
 'test.csv',
 'train+oil.xlsx',
 'train.csv',
 'transactions.csv']

## Load the data into DataFrame

In [3]:
train_df = pd.read_csv('../data/combined.csv', parse_dates = ['date'])

In [4]:
train_df.shape

(86902776, 13)

In [5]:
train_df.head()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,family,class,perishable,dcoilwtico,day_of_year,day_of_week,week_of_year,onpromotion_cat
0,101688779,2017-01-01,25,99197,1.0,GROCERY I,1067,0,53.75,1,6,52,0
1,101688780,2017-01-01,25,103665,7.0,BREAD/BAKERY,2712,1,53.75,1,6,52,0
2,101688781,2017-01-01,25,105574,1.0,GROCERY I,1045,0,53.75,1,6,52,0
3,101688782,2017-01-01,25,105857,4.0,GROCERY I,1092,0,53.75,1,6,52,0
4,101688783,2017-01-01,25,106716,2.0,GROCERY I,1032,0,53.75,1,6,52,0


## Filter by store

In [6]:
store_df = train_df[train_df['store_nbr'] == 44]

## Filter by category

In [7]:
# categories = ['GROCERY I', 'CLEANING', 'PERSONAL CARE', 'GROCERY II', 'FROZEN FOODS', 'PREPARED FOODS']
categories = ['FROZEN FOODS']

store_df = store_df[store_df['family'].isin(categories)]

In [8]:
store_df.shape

(32264, 13)

In [9]:
store_df.head(50)

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,family,class,perishable,dcoilwtico,day_of_year,day_of_week,week_of_year,onpromotion_cat
89368,101778147,2017-01-02,44,208530,7.31,FROZEN FOODS,2226,0,,2,0,1,0
89428,101778207,2017-01-02,44,258411,32.0,FROZEN FOODS,2218,0,,2,0,1,0
89441,101778220,2017-01-02,44,264576,13.0,FROZEN FOODS,2220,0,,2,0,1,0
89686,101778465,2017-01-02,44,457574,35.361,FROZEN FOODS,2226,0,,2,0,1,0
89742,101778521,2017-01-02,44,507457,16.0,FROZEN FOODS,2218,0,,2,0,1,0
89976,101778755,2017-01-02,44,700609,19.0,FROZEN FOODS,2220,0,,2,0,1,0
89977,101778756,2017-01-02,44,700610,9.0,FROZEN FOODS,2220,0,,2,0,1,1
90004,101778783,2017-01-02,44,743496,44.0,FROZEN FOODS,2222,0,,2,0,1,0
90005,101778784,2017-01-02,44,743497,26.0,FROZEN FOODS,2222,0,,2,0,1,0
90042,101778821,2017-01-02,44,781794,51.0,FROZEN FOODS,2228,0,,2,0,1,0


## Reshape the df with date as index and unit_sales of each item as a column

In [10]:
store_df = store_df.pivot('date', 'item_nbr', 'unit_sales')

In [11]:
store_df.head()

item_nbr,208530,258411,264576,457574,507457,557241,700609,700610,743496,743497,...,1152350,1152355,1412204,2010755,2010837,2011032,2011054,2016716,2016737,2016738
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,,31.0,11.0,,13.0,,9.0,8.0,11.0,17.0,...,20.0,7.0,,,,,,,,
2015-01-03,30.721,42.0,6.0,,17.0,,13.0,6.0,20.0,11.0,...,17.0,4.0,,,,,,,,
2015-01-04,,38.0,13.0,,20.0,,23.0,5.0,17.0,12.0,...,14.0,9.0,,,,,,,,
2015-01-05,46.617,22.0,7.0,,19.0,,11.0,9.0,14.0,10.0,...,10.0,6.0,,,,,,,,
2015-01-06,43.383,14.0,6.0,,17.0,,17.0,5.0,15.0,9.0,...,3.0,7.0,,,,,,,,


In [14]:
store_df.fillna(0, inplace = True)

In [18]:
store_df.head()

item_nbr,208530,258411,264576,457574,507457,557241,700609,700610,743496,743497,...,1152350,1152355,1412204,2010755,2010837,2011032,2011054,2016716,2016737,2016738
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,0.0,31.0,11.0,0.0,13.0,0.0,9.0,8.0,11.0,17.0,...,20.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-03,30.721,42.0,6.0,0.0,17.0,0.0,13.0,6.0,20.0,11.0,...,17.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-04,0.0,38.0,13.0,0.0,20.0,0.0,23.0,5.0,17.0,12.0,...,14.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-05,46.617,22.0,7.0,0.0,19.0,0.0,11.0,9.0,14.0,10.0,...,10.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-06,43.383,14.0,6.0,0.0,17.0,0.0,17.0,5.0,15.0,9.0,...,3.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
store_df.shape

(953, 45)

## Test if stationary

In [16]:
import statsmodels.api as sm  
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.stattools import acf  
from statsmodels.tsa.stattools import pacf
from statsmodels.tsa.seasonal import seasonal_decompose

In [17]:
def adfuller_test(series, signif=0.05, name='', verbose=False):
    """Perform ADFuller to test for Stationarity of given series and print report"""
    r = adfuller(series, autolag='AIC')
    output = {'test_statistic':round(r[0], 4), 'pvalue':round(r[1], 4), 'n_lags':round(r[2], 4), 'n_obs':r[3]}
    p_value = output['pvalue'] 
    def adjust(val, length= 6): return str(val).ljust(length)

    # Print Summary
    print(f'    Augmented Dickey-Fuller Test on item {name}', "\n   ", '-'*47)
    print(f' Null Hypothesis: Data has unit root. Non-Stationary.')
    print(f' Significance Level    = {signif}')
    print(f' Test Statistic        = {output["test_statistic"]}')
    print(f' No. Lags Chosen       = {output["n_lags"]}')

    for key,val in r[4].items():
        print(f' Critical value {adjust(key)} = {round(val, 3)}')

    if p_value <= signif:
        print(f" => P-Value = {p_value}. Rejecting Null Hypothesis.")
        print(f" => Series is Stationary.")
        return True
    else:
        print(f" => P-Value = {p_value}. Weak evidence to reject the Null Hypothesis.")
        print(f" => Series is Non-Stationary.")
        return False

In [23]:
for idx, column in store_df.iteritems():
    adfuller_test(column, name = column.name)
    print('\n')

    Augmented Dickey-Fuller Test on item 208530 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -6.5675
 No. Lags Chosen       = 12
 Critical value 1%     = -3.437
 Critical value 5%     = -2.865
 Critical value 10%    = -2.568
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on item 258411 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -3.9356
 No. Lags Chosen       = 22
 Critical value 1%     = -3.437
 Critical value 5%     = -2.865
 Critical value 10%    = -2.568
 => P-Value = 0.0018. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on item 264576 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significa

    Augmented Dickey-Fuller Test on item 850389 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -5.9307
 No. Lags Chosen       = 17
 Critical value 1%     = -3.437
 Critical value 5%     = -2.865
 Critical value 10%    = -2.568
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on item 852934 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -5.6058
 No. Lags Chosen       = 13
 Critical value 1%     = -3.437
 Critical value 5%     = -2.865
 Critical value 10%    = -2.568
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on item 852937 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance

    Augmented Dickey-Fuller Test on item 2016716 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -1.9548
 No. Lags Chosen       = 14
 Critical value 1%     = -3.437
 Critical value 5%     = -2.865
 Critical value 10%    = -2.568
 => P-Value = 0.3067. Weak evidence to reject the Null Hypothesis.
 => Series is Non-Stationary.


    Augmented Dickey-Fuller Test on item 2016737 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -2.8164
 No. Lags Chosen       = 21
 Critical value 1%     = -3.437
 Critical value 5%     = -2.865
 Critical value 10%    = -2.568
 => P-Value = 0.056. Weak evidence to reject the Null Hypothesis.
 => Series is Non-Stationary.


    Augmented Dickey-Fuller Test on item 2016738 
    -----------------------------------------------
 Null Hypothesi

In [29]:
store_diff_df = store_df.diff()
store_diff_df = store_diff_df.iloc[1:]

In [30]:
for idx, column in store_diff_df.iteritems():
    adfuller_test(column, name = column.name)
    print('\n')

    Augmented Dickey-Fuller Test on item 208530 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -11.7707
 No. Lags Chosen       = 15
 Critical value 1%     = -3.437
 Critical value 5%     = -2.865
 Critical value 10%    = -2.568
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on item 258411 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -9.927
 No. Lags Chosen       = 21
 Critical value 1%     = -3.437
 Critical value 5%     = -2.865
 Critical value 10%    = -2.568
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on item 264576 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance

    Augmented Dickey-Fuller Test on item 852937 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -12.0815
 No. Lags Chosen       = 19
 Critical value 1%     = -3.437
 Critical value 5%     = -2.865
 Critical value 10%    = -2.568
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on item 852938 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -11.8056
 No. Lags Chosen       = 19
 Critical value 1%     = -3.437
 Critical value 5%     = -2.865
 Critical value 10%    = -2.568
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on item 888630 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significan

    Augmented Dickey-Fuller Test on item 2016716 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -15.8293
 No. Lags Chosen       = 13
 Critical value 1%     = -3.437
 Critical value 5%     = -2.865
 Critical value 10%    = -2.568
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on item 2016737 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -10.0361
 No. Lags Chosen       = 21
 Critical value 1%     = -3.437
 Critical value 5%     = -2.865
 Critical value 10%    = -2.568
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on item 2016738 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Signifi

In [44]:
store_diff_df.head()

item_nbr,208530,258411,264576,457574,507457,557241,700609,700610,743496,743497,...,1152350,1152355,1412204,2010755,2010837,2011032,2011054,2016716,2016737,2016738
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-03,30.721,11.0,-5.0,0.0,4.0,0.0,4.0,-2.0,9.0,-6.0,...,-3.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-04,-30.721,-4.0,7.0,0.0,3.0,0.0,10.0,-1.0,-3.0,1.0,...,-3.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-05,46.617,-16.0,-6.0,0.0,-1.0,0.0,-12.0,4.0,-3.0,-2.0,...,-4.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-06,-3.234,-8.0,-1.0,0.0,-2.0,0.0,6.0,-4.0,1.0,-1.0,...,-7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-07,-43.383,6.0,3.0,0.0,-9.0,0.0,4.0,18.0,0.0,7.0,...,-3.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature selection using lasso regression

In [None]:
# from sklearn.linear_model import Lasso
# import math

In [None]:
# def lasso_regression(data, predictors, alpha):
#     #Fit the model
#     lassoreg = Lasso(alpha=alpha,normalize=True, max_iter=1e5)
#     lassoreg.fit(data[predictors],data[105577])
#     y_pred = lassoreg.predict(data[predictors])
        
#     #Return the result in pre-defined format
#     rss = math.sqrt(sum((y_pred-data[105577])**2))
#     print(rss)
#     ret = []
#     ret.extend(lassoreg.coef_)
#     return ret

In [None]:
# alpha_lasso = [1e-15, 1e-10, 1e-8, 1e-5,1e-4, 1e-3,1e-2, 1, 5, 10]

In [None]:
# predictors = list(store_df.columns)
# predictors.remove(105577)

# for i in range(10):
#     print(f'{alpha_lasso[i]}: \n')
#     lasso_regression(store_df, predictors, alpha_lasso[i])
#     print('\n')

In [None]:
# coef = lasso_regression(store_df, predictors, 0.0001)

In [None]:
# coef = [[x[0], x[1]] for x in enumerate(coef)]
# coef = sorted(coef, key = lambda x: x[1], reverse = True)

In [None]:
# l = []
# for x in coef:
#     if x[1] < 0.5:
#         break
#     l.append(list(store_df.columns)[x[0]])

In [None]:
# l

In [None]:
# l.append(105577)

## Decide the best lag value

In [33]:
from statsmodels.tsa.api import VAR

In [72]:
model = VAR(store_diff_df)
for i in [1,2,3,4,5,6,7,8,9]:
    result = model.fit(i)
    print('Lag Order =', i)
    print('AIC : ', result.aic)
    print('BIC : ', result.bic)
    print('FPE : ', result.fpe)
    print('HQIC: ', result.hqic, '\n')



Lag Order = 1
AIC :  205.37097632160925
BIC :  215.94411418645566
FPE :  1.5594027947478644e+89
HQIC:  209.39929207057403 

Lag Order = 2
AIC :  194.2354098176666
BIC :  215.16931700389276
FPE :  2.3274563430284115e+84
HQIC:  202.21153497666276 

Lag Order = 3
AIC :  188.8954667755012
BIC :  220.207481367169
FPE :  1.1887342015618679e+82
HQIC:  200.82640930780232 

Lag Order = 4
AIC :  185.47216142569874
BIC :  227.17966925361767
FPE :  4.387870403092458e+80
HQIC:  201.36494909114182 

Lag Order = 5
AIC :  182.12858429667142
BIC :  234.24901912001081
FPE :  1.9091178352643356e+79
HQIC:  201.99026472801523 

Lag Order = 6
AIC :  180.10204629576927
BIC :  242.65288998455117
FPE :  3.46454434769969e+78
HQIC:  203.93968707542578 

Lag Order = 7
AIC :  178.31049880555273
BIC :  251.30928152402998
FPE :  9.161565925810065e+77
HQIC:  206.13118754269402 

Lag Order = 8
AIC :  177.1357796302266
BIC :  260.6000800211521
FPE :  5.360491087096376e+77
HQIC:  208.94662403826914 

Lag Order = 9
AIC :

In [73]:
x = model.select_order(maxlags=12)
x.summary()

0,1,2,3,4
,AIC,BIC,FPE,HQIC
0.0,218.7,219.0,9.913e+94,218.8
1.0,205.4,216.1,1.672e+89,209.5
2.0,194.4,215.5*,2.653e+84,202.4
3.0,189.0,220.6,1.365e+82,201.1*
4.0,185.6,227.6,4.891e+80,201.6
5.0,182.2,234.6,2.035e+79,202.2
6.0,180.1,243.0,3.486e+78,204.1
7.0,178.3,251.6,9.120e+77,206.2
8.0,177.0,260.8,5.042e+77,209.0


## Train the model

In [96]:
nobs = 2
df_train, df_test = store_diff_df[0:-nobs], store_diff_df[-nobs:]

# Check size
print(df_train.shape)
print(df_test.shape)

(950, 45)
(2, 45)


In [97]:
model = VAR(df_train)
model_fitted = model.fit(2)
model_fitted.summary()



  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Thu, 10, Oct, 2019
Time:                     21:53:01
--------------------------------------------------------------------
No. of Equations:         45.0000    BIC:                    215.270
Nobs:                     948.000    HQIC:                   202.291
Log likelihood:          -148535.    FPE:                2.48450e+84
AIC:                      194.301    Det(Omega_mle):     4.01712e+82
--------------------------------------------------------------------
Results for equation 208530
                coefficient       std. error           t-stat            prob
-----------------------------------------------------------------------------
const              0.013337         0.967455            0.014           0.989
L1.208530         -0.543544         0.023322          -23.306           0.000
L1.258411          0.059771         0.158619            0.377          

In [100]:
# Get the lag order
lag_order = model_fitted.k_ar
print(lag_order)  #> 4

# Input data for forecasting
forecast_input = df_train.values[-lag_order:]
forecast_input

2


array([[  0.   ,   2.   ,  11.   , -21.234,   0.   ,   0.   ,   0.   ,
          0.   ,  -8.   ,  -1.   ,  -2.   , -15.   , -19.   ,  -1.   ,
          0.   ,  16.   ,  -5.   ,   7.   ,   5.   ,   0.   ,   0.   ,
         -1.   ,   2.   ,   5.   ,  -2.   ,   1.   ,   4.   ,   8.   ,
          0.   ,  -8.   ,   0.   ,  -7.   , -35.   ,   2.   ,   5.   ,
         -3.   ,   0.   ,  -6.   ,   0.   ,   0.   ,   0.   ,   0.   ,
         11.   ,  -5.   ,   2.   ],
       [  0.   ,  10.   ,  -3.   ,   9.325,  -5.   ,   0.   ,   0.   ,
          1.   ,   7.   ,  -5.   ,  11.   ,  14.   ,   2.   ,   3.   ,
          4.973,  -3.   ,   2.   ,  -4.   ,   0.   ,   0.   ,   0.   ,
          3.   ,  -3.   ,   0.   ,  -4.   ,  14.   ,   1.   ,  -8.   ,
         -1.   ,   0.   ,   0.   ,   8.   , -61.   ,   1.   ,  -4.   ,
         -7.   ,   0.   ,  -8.   ,   0.   ,   0.   ,   0.   ,   0.   ,
         -6.   ,   1.   ,  -2.   ]])

In [101]:
fc = model_fitted.forecast(y=forecast_input, steps=lag_order)
df_forecast = pd.DataFrame(fc, index=store_diff_df.index[-int(lag_order):], columns=[str(x) + '_1d' for x in store_diff_df.columns])
df_forecast

Unnamed: 0_level_0,208530_1d,258411_1d,264576_1d,457574_1d,507457_1d,557241_1d,700609_1d,700610_1d,743496_1d,743497_1d,781794_1d,781796_1d,781797_1d,781798_1d,788708_1d,795610_1d,795611_1d,795612_1d,813167_1d,850388_1d,850389_1d,852934_1d,852937_1d,852938_1d,888630_1d,946277_1d,979553_1d,979554_1d,1037838_1d,1040170_1d,1109235_1d,1127862_1d,1146496_1d,1152346_1d,1152348_1d,1152350_1d,1152355_1d,1412204_1d,2010755_1d,2010837_1d,2011032_1d,2011054_1d,2016716_1d,2016737_1d,2016738_1d
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
2017-08-14,11.95872,-13.835413,-6.566621,-20.591458,-1.47152,-4.704906,-1.02515,-0.091891,-10.03933,-0.505525,-10.354485,-10.451455,-2.05672,-7.932985,-3.659249,-12.058382,-2.750443,0.289413,1.001116,13.142638,-3.326395,-2.638792,0.745081,-3.129468,2.479149,-18.757608,-2.829798,0.614609,-1.856716,0.03082,-5.314564,-6.102653,23.104667,-1.179486,-2.62578,0.275861,-1.475701,7.614937,-4.9147,-37.844517,-10.133874,-30.330834,-0.485128,0.369241,-1.545818
2017-08-15,-10.763118,5.043739,5.528784,-5.9286,3.414439,4.407568,-0.459495,-0.928599,6.260627,-0.395305,0.316434,-5.800323,-4.39639,4.060415,1.331725,6.308177,1.003619,-0.76204,-4.167962,-1.356994,-3.063672,-1.157142,0.790806,0.885671,-2.201388,6.601665,0.797357,2.040054,2.339475,-0.247258,-8.186435,0.64622,28.658362,0.801385,2.641529,2.553156,0.237341,0.003671,4.350972,66.494282,21.020652,12.951614,1.367413,-1.764733,1.014867


In [86]:
def invert_transformation(df_train, df_forecast):
    """Revert back the differencing to get the forecast to original scale."""
    df_fc = df_forecast.copy()
    columns = df_train.columns
    for col in columns:        
        # Roll back 1st Diff
        df_fc[str(col)+'_forecast'] = df_train[col].iloc[-1] + df_fc[str(col)+'_1d'].cumsum()
    return df_fc

In [104]:
df_results = invert_transformation(df_train, df_forecast)

In [105]:
df_results

Unnamed: 0_level_0,208530_1d,258411_1d,264576_1d,457574_1d,507457_1d,557241_1d,700609_1d,700610_1d,743496_1d,743497_1d,781794_1d,781796_1d,781797_1d,781798_1d,788708_1d,795610_1d,795611_1d,795612_1d,813167_1d,850388_1d,850389_1d,852934_1d,852937_1d,852938_1d,888630_1d,946277_1d,979553_1d,979554_1d,1037838_1d,1040170_1d,1109235_1d,1127862_1d,1146496_1d,1152346_1d,1152348_1d,1152350_1d,1152355_1d,1412204_1d,2010755_1d,2010837_1d,2011032_1d,2011054_1d,2016716_1d,2016737_1d,2016738_1d,208530_forecast,258411_forecast,264576_forecast,457574_forecast,507457_forecast,557241_forecast,700609_forecast,700610_forecast,743496_forecast,743497_forecast,781794_forecast,781796_forecast,781797_forecast,781798_forecast,788708_forecast,795610_forecast,795611_forecast,795612_forecast,813167_forecast,850388_forecast,850389_forecast,852934_forecast,852937_forecast,852938_forecast,888630_forecast,946277_forecast,979553_forecast,979554_forecast,1037838_forecast,1040170_forecast,1109235_forecast,1127862_forecast,1146496_forecast,1152346_forecast,1152348_forecast,1152350_forecast,1152355_forecast,1412204_forecast,2010755_forecast,2010837_forecast,2011032_forecast,2011054_forecast,2016716_forecast,2016737_forecast,2016738_forecast
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1
2017-08-14,11.95872,-13.835413,-6.566621,-20.591458,-1.47152,-4.704906,-1.02515,-0.091891,-10.03933,-0.505525,-10.354485,-10.451455,-2.05672,-7.932985,-3.659249,-12.058382,-2.750443,0.289413,1.001116,13.142638,-3.326395,-2.638792,0.745081,-3.129468,2.479149,-18.757608,-2.829798,0.614609,-1.856716,0.03082,-5.314564,-6.102653,23.104667,-1.179486,-2.62578,0.275861,-1.475701,7.614937,-4.9147,-37.844517,-10.133874,-30.330834,-0.485128,0.369241,-1.545818,11.95872,-3.835413,-9.566621,-11.266458,-6.47152,-4.704906,-1.02515,0.908109,-3.03933,-5.505525,0.645515,3.548545,-0.05672,-4.932985,1.313751,-15.058382,-0.750443,-3.710587,1.001116,13.142638,-3.326395,0.361208,-2.254919,-3.129468,-1.520851,-4.757608,-1.829798,-7.385391,-2.856716,0.03082,-5.314564,1.897347,-37.895333,-0.179486,-6.62578,-6.724139,-1.475701,-0.385063,-4.9147,-37.844517,-10.133874,-30.330834,-6.485128,1.369241,-3.545818
2017-08-15,-10.763118,5.043739,5.528784,-5.9286,3.414439,4.407568,-0.459495,-0.928599,6.260627,-0.395305,0.316434,-5.800323,-4.39639,4.060415,1.331725,6.308177,1.003619,-0.76204,-4.167962,-1.356994,-3.063672,-1.157142,0.790806,0.885671,-2.201388,6.601665,0.797357,2.040054,2.339475,-0.247258,-8.186435,0.64622,28.658362,0.801385,2.641529,2.553156,0.237341,0.003671,4.350972,66.494282,21.020652,12.951614,1.367413,-1.764733,1.014867,1.195602,1.208326,-4.037837,-17.195058,-3.057081,-0.297338,-1.484645,-0.02049,3.221298,-5.90083,0.961949,-2.251778,-4.453111,-0.87257,2.645476,-8.750205,0.253176,-4.472627,-3.166846,11.785644,-6.390067,-0.795934,-1.464113,-2.243797,-3.722239,1.844057,-1.032441,-5.345336,-0.517241,-0.216438,-13.500999,2.543566,-9.236971,0.621899,-3.984251,-4.170984,-1.23836,-0.381392,-0.563728,28.649765,10.886778,-17.37922,-5.117714,-0.395493,-2.530951


In [106]:
from statsmodels.tsa.stattools import acf

In [107]:
def forecast_accuracy(forecast, actual):
    mape = np.mean(np.abs(forecast - actual)/np.abs(actual))  # MAPE
    me = np.mean(forecast - actual)             # ME
    mae = np.mean(np.abs(forecast - actual))    # MAE
    mpe = np.mean((forecast - actual)/actual)   # MPE
    rmse = np.mean((forecast - actual)**2)**.5  # RMSE
    corr = np.corrcoef(forecast, actual)[0,1]   # corr
    mins = np.amin(np.hstack([forecast[:,None], 
                              actual[:,None]]), axis=1)
    maxs = np.amax(np.hstack([forecast[:,None], 
                              actual[:,None]]), axis=1)
    minmax = 1 - np.mean(mins/maxs)             # minmax
    return({'mape':mape, 'me':me, 'mae': mae, 
            'mpe': mpe, 'rmse':rmse, 'corr':corr, 'minmax':minmax})

In [117]:
rmse, count = 0, 0
for col in df_test.columns:
    print('col:')
    accuracy_prod = forecast_accuracy(df_results[str(col) + '_forecast'].values, df_test[col])
    rmse += accuracy_prod['rmse']
    count += 1
    print([k + ': ' + str(round(v,4)) for k,v in accuracy_prod.items()])
    print('\n')

col:
['mape: inf', 'me: 6.5772', 'mae: 6.5772', 'mpe: inf', 'rmse: 8.4982', 'corr: nan', 'minmax: 1.0']


col:
['mape: 0.5381', 'me: 3.6865', 'mae: 4.4781', 'mpe: -0.5381', 'rmse: 5.8003', 'corr: 1.0', 'minmax: -0.8665']


col:
['mape: 0.3563', 'me: -1.8022', 'mae: 1.8022', 'mpe: 0.3563', 'rmse: 1.9576', 'corr: 1.0', 'minmax: -0.3563']


col:
['mape: 5.2976', 'me: -9.5683', 'mae: 9.5683', 'mpe: -4.6725', 'rmse: 10.0139', 'corr: 1.0', 'minmax: 4.6725']


col:
['mape: 0.9718', 'me: -6.7643', 'mae: 6.7643', 'mpe: -0.9527', 'rmse: 9.5259', 'corr: -1.0', 'minmax: 0.9527']


col:
['mape: 1.3682', 'me: -2.5011', 'mae: 8.1898', 'mpe: -1.3682', 'rmse: 8.5632', 'corr: -1.0', 'minmax: -8.673']


col:
['mape: inf', 'me: -1.2549', 'mae: 1.2549', 'mpe: -inf', 'rmse: 1.2758', 'corr: nan', 'minmax: inf']


col:
['mape: inf', 'me: 0.9438', 'mae: 0.9643', 'mpe: -inf', 'rmse: 1.3493', 'corr: -1.0', 'minmax: inf']


col:
['mape: 1.6672', 'me: 6.591', 'mae: 6.591', 'mpe: -1.6672', 'rmse: 6.7318', 'corr: 1.

  if sys.path[0] == '':


In [118]:
total_rmse = rmse/count
total_rmse

7.739341856555768