In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import cufflinks as cf

# Put cufflinks in offline mode before any of the .iplot()/.ta_plot() calls further down are made
# NOTE(review): presumably this makes the Plotly figures render locally in the notebook — confirm against the cufflinks docs
cf.go_offline()

To explore time series models, we will continue with the Rossmann sales data. This dataset contains sales data for every Rossmann store over a 3-year period, as well as indicators of holidays and basic store information.

In the last class, we saw that we could plot the sales data at a particular store to identify how the sales changed over time. Additionally, we computed the autocorrelation of the data at varying lag periods. This helps us identify whether previous time points are predictive of future data and which time points are most important - the previous day? week? month?

In [None]:
import pandas as pd
import numpy as np

# Read the Rossmann CSV, parse the 'Date' column and use it as the row index
data = pd.read_csv('../../lessons/lesson-15/assets/dataset/rossmann.csv', skipinitialspace=True)
data['Date'] = pd.to_datetime(data['Date'])
data = data.set_index('Date')

# Rows belonging to Store 1 only
store1_data = data.loc[data['Store'] == 1]

# Of those, only the days on which the store was open
store1_open_data = store1_data.loc[store1_data['Open'] == 1]

# Interactive line chart of Store 1 sales over time
store1_open_data[['Sales']].iplot()

In [None]:
# Simple Moving Averages (SMA): one study with three window lengths (13, 21 and 55 periods)
store1_open_data[['Sales']].ta_plot(study='sma',periods=[13,21,55])
# Bollinger Bands (BOLL) using a 14-period window
store1_open_data[['Sales']].ta_plot(study='boll',periods=14)
# Moving Average Convergence Divergence (MACD): 12-period fast average, 26-period slow
# average and a 9-period signal line (the parameters described in the text below)
store1_open_data[['Sales']].ta_plot(study='macd',fast_period=12,slow_period=26, signal_period=9)

### What is the 'Moving Average Convergence Divergence - MACD'?
Moving average convergence divergence (`MACD`) is a trend-following momentum indicator that shows the relationship between two moving averages of prices. The `MACD` is calculated by subtracting the 26-day exponential moving average (EMA) from the 12-day EMA. A nine-day EMA of the `MACD`, called the "signal line", is then plotted on top of the `MACD`, functioning as a trigger for buy and sell signals.

Read more: Moving Average Convergence Divergence (MACD) http://www.investopedia.com/terms/m/macd.asp#ixzz4scoiCXeQ 


**Check** Compute the autocorrelation of Sales in Store 1 for lag 1 and 2. Will we be able to use a predictive model - particularly an autoregressive one?

In [None]:
# Lag-1 autocorrelation of Store 1 sales (closed days included); the author recorded -0.12
store1_data['Sales'].autocorr(lag=1)

In [None]:
# Same series compared with itself shifted by 7 rows
store1_data['Sales'].autocorr(lag=7)

Pandas and statsmodels both provide convenience plots for autocorrelations.

In [None]:
from pandas.plotting import autocorrelation_plot

# pandas' built-in autocorrelation plot for Store 1 sales (closed days included).
# The trailing semicolon stops the notebook from echoing the returned Axes repr under the figure.
autocorrelation_plot(store1_data['Sales']);

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

# statsmodels autocorrelation plot of Store 1 sales for lags up to 30
plot_acf(store1_data['Sales'], lags=30)
plt.show()

In [None]:
from pandas.compat import lmap
def cf_auto_plot(series):
    """Plot and return the sample autocorrelation of ``series`` at every lag.

    For each lag ``h`` from 1 to ``len(series)`` the autocorrelation is

        r(h) = sum((x[t] - mean) * (x[t + h] - mean)) / n / c0

    where ``c0`` is the variance of the series computed with divisor ``n``.

    Parameters
    ----------
    series : array-like
        One-dimensional sequence of numeric observations.

    Returns
    -------
    pandas.DataFrame
        Columns ``lag`` (1..n) and ``autocorr``; the same frame that is drawn
        as a bar chart through cufflinks' ``iplot``.

    Notes
    -----
    A constant or empty series has ``c0 == 0`` and therefore yields NaN values.
    """
    n = len(series)
    values = np.asarray(series)
    # Centre once up front; every lag reuses the same deviations from the mean.
    centered = values - np.mean(values)
    c0 = np.sum(centered ** 2) / float(n)

    def r(h):
        # Pair each deviation with the one h steps later; lag n has no pairs and sums to 0.
        return (centered[:n - h] * centered[h:]).sum() / float(n) / c0

    lags = np.arange(n) + 1
    # A plain list comprehension replaces pandas.compat.lmap, which current pandas no longer provides.
    autocorr = [r(h) for h in lags]
    df = pd.DataFrame({'lag': lags, 'autocorr': autocorr},
                      columns=['lag', 'autocorr'])
    df.iplot(kind='bar', x='lag', y='autocorr',
             yTitle='Autocorrelation', xTitle='Lag', title='Autocorrelation vs lag',
             bargap=0.9, filename='cufflinks/categorical-bar-chart')
    return df

In [None]:
# Autocorrelation of Store 1 sales at every possible lag; the resulting frame is kept in `a`
a = cf_auto_plot(store1_data['Sales'])

In [None]:
def cf_lag_plot(series, lag=1):
    """Scatter each observation against the observation ``lag`` steps later.

    Parameters
    ----------
    series : pandas.Series
        Observations in time order (read through ``series.values``).
    lag : int, default 1
        Non-negative number of steps separating the paired observations.

    Returns
    -------
    pandas.DataFrame
        Column ``y1`` holds y(t) and column ``y2`` holds y(t + lag); this is the
        frame drawn as a scatter chart through cufflinks' ``iplot``.

    Raises
    ------
    ValueError
        If ``lag`` is negative.
    """
    if lag < 0:
        raise ValueError("lag must be a non-negative integer, got %r" % (lag,))

    values = series.values
    # Slice by explicit length: ``values[:-lag]`` would be empty for lag == 0 and
    # leave the two columns with different lengths.
    overlap = max(len(values) - lag, 0)
    df = pd.DataFrame({
      "y1": values[:overlap],
      "y2": values[lag:]})

    df.iplot(kind='scatter', x='y1', y='y2', mode='markers', size=3,
             yTitle="y(t + %s)" % lag, xTitle='y(t)', title='y(t) vs y(t + %s)' % lag,
             filename='cufflinks/scatter-chart')
    return df

In [None]:
# Each row of Store 1 sales plotted against the following row; the paired values are kept in `a`
a = cf_lag_plot(store1_data['Sales'], lag=1)

In [None]:
def cf_plot_acf(data, lags=30):
    """Plot and return the autocorrelation of ``data`` for lags 0 through ``lags``.

    Parameters
    ----------
    data : pandas.Series
        Series whose ``autocorr`` method is evaluated at each lag.
    lags : int, default 30
        Largest lag to include; lag 0 is always included.

    Returns
    -------
    pandas.DataFrame
        Columns ``lag`` (integers 0..lags) and ``autocorr``, indexed 0..lags;
        the same frame drawn as a bar chart through cufflinks' ``iplot``.
    """
    lag_values = list(range(lags + 1))
    # Build the frame in one step rather than appending a row per lag, which
    # re-allocates on every insert and leaves the columns without a numeric dtype.
    df = pd.DataFrame({
        'lag': lag_values,
        'autocorr': [data.autocorr(lag=lag) for lag in lag_values],
    }, columns=['lag', 'autocorr'])
    df.iplot(kind='bar', x='lag', y='autocorr',
             yTitle='Autocorrelation', xTitle='Lag', title='Autocorrelation vs lag',
             bargap=0.9, filename='cufflinks/categorical-bar-chart')
    return df

In [None]:
# Autocorrelation of Store 1 sales for lags 0 through 30; the resulting frame is kept in `a`
a = cf_plot_acf(store1_data['Sales'], lags=30)

**Check**: What caused the spike at 7?

# ARMA Model

Recall that `ARMA(p, q)` models are a sum of an `AR(p)` and a `MA(q)` model. So if we want just an `AR(p)` model, we use an `ARMA(p, 0)` model. 

In [None]:
from statsmodels.tsa.arima_model import ARMA

# Open-day sales for Store 1, cast to float before being handed to the model
store1_sales_data = store1_open_data[['Sales']].astype(float)
# NOTE(review): `statsmodels.tsa.arima_model` appears to have been removed in statsmodels 0.13 —
# confirm the installed version still provides the ARMA class imported for this cell.
# ARMA(1, 0) is a pure AR(1) model: one autoregressive term, no moving-average terms
model = ARMA(store1_sales_data, (1, 0)).fit()
# Function-call form of print runs on both Python 2 and Python 3
print(model.summary())

In [None]:
# ARMA(2, 0) is a pure AR(2) model: two autoregressive terms, no moving-average terms
model = ARMA(store1_sales_data, (2, 0)).fit()
# Function-call form of print runs on both Python 2 and Python 3
print(model.summary())

Just like with other types of regression, we can compute the model residuals.

**Check**: What are residuals? In linear regression, what did we expect of residuals?

In [None]:
# Interactive plot of the residuals of the most recently fitted model
model.resid.iplot()

In [None]:
# Autocorrelation of the fitted model's residuals for lags up to 50
plot_acf(model.resid, lags=50)
plt.show()

Because of the errors, it doesn't look like an AR model is good enough -- the data isn't stationary. So let's expand to an `ARMA` model.

In [None]:
# ARMA(1, 2): one autoregressive term plus two moving-average terms
model = ARMA(store1_sales_data, (1, 2)).fit()
# Function-call form of print runs on both Python 2 and Python 3
print(model.summary())

In [None]:
from statsmodels.tsa.arima_model import ARIMA

# ARIMA order is (p, d, q); with d = 0 no differencing is applied, so this is an ARMA(2, 2) fit
model = ARIMA(store1_sales_data, (2, 0, 2)).fit()
# Function-call form of print runs on both Python 2 and Python 3
print(model.summary())

In [None]:
# ARIMA(2, 1, 2): same AR and MA orders as before, fitted to the once-differenced series (d = 1)
model = ARIMA(store1_sales_data, (2, 1, 2)).fit()
# Function-call form of print runs on both Python 2 and Python 3
print(model.summary())

In [None]:
# ARIMA(2, 1, 0): two autoregressive terms on the once-differenced series, no moving-average terms
model = ARIMA(store1_sales_data, (2, 1, 0)).fit()
# Function-call form of print runs on both Python 2 and Python 3
print(model.summary())

In [None]:
# Lag-1 autocorrelation of the first-differenced open-day sales; the author recorded -0.181
store1_sales_data['Sales'].diff(periods=1).autocorr(lag=1)

In [None]:
# Interactive plot of the first-differenced open-day sales
store1_sales_data['Sales'].diff(periods=1).iplot()

In [None]:
# Plot the most recently fitted model's predictions from start index 1 to end index 50
model.plot_predict(1, 50)
# End on plt.show(), as the plot_acf cells do: otherwise the returned Figure is the cell's
# output value and the inline backend draws the same plot a second time
plt.show()

In [None]:
fig, ax = plt.subplots()
# Draw the 2014 sales on the shared axes. Rows are selected with .loc because recent pandas
# releases no longer accept a partial date string in plain [] row lookups on a DataFrame
ax = store1_sales_data.loc['2014'].plot(ax=ax)

# Add the model's predictions for indices 1-200 to the same axes; plot_insample=False keeps
# plot_predict from drawing the observed series again
fig = model.plot_predict(1, 200, ax=ax, plot_insample=False)

In [None]:
# Refit ARIMA(2, 1, 2) and inspect it
model = ARIMA(store1_sales_data, (2, 1, 2)).fit()
# Only a cell's final expression is echoed by the notebook, so the summary and the
# `bse` values must be printed explicitly or they are silently discarded
print(model.summary())
print(model.bse)
# Autocorrelation of the residuals for lags up to 50
plot_acf(model.resid, lags=50)
plt.show()