# Class 16 - Starter Code

Exploring Walmart Sales Data

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's "whitegrid" style with a 1.5x font scale to the plotting setup
sns.set(style="whitegrid", font_scale=1.5)
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

### Load Dataset and Pre-Process

Walmart Sales Data

For this exercise, we will analyze the weekly sales data from Walmart over a two year period from 2010 to 2012.

The data is again separated by store and by department, but we will focus on analyzing one store for simplicity.

The data includes:

- Store - the store number
- Dept - the department number
- Date - the week
- Weekly_Sales -  sales for the given department in the given store
- IsHoliday - whether the week is a special holiday week


In [None]:
# Read the Walmart weekly sales CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='../../assets/dataset/train.csv')

# Print the column / dtype / non-null summary
data.info()

In [None]:
# Parse `Date` into datetimes and promote it to the index.
# The guard keeps this cell re-runnable: once `Date` has become the index it is
# no longer a column, so an unguarded second run would raise a KeyError.
if 'Date' in data.columns:
    data = data.assign(Date=pd.to_datetime(data['Date'])).set_index('Date')

# Check sample
data.sample(5)

### 1.1 Filter the dataframe to Store 1 sales and aggregate over departments to compute the total sales per store.

In [None]:
# Keep only the Store 1 rows and the sales column, then sum every
# department's sales into a single total per weekly ('W') bin
store1_sales = (
    data.loc[data['Store'] == 1, ['Weekly_Sales']]
    .resample('W')
    .sum()
)

# Peek at the first weekly totals
store1_sales.head()

### 1.2 Plot the rolling_mean for `Weekly_Sales`. What general trends do you observe?

In [None]:
# Plot the 3-period rolling mean of the weekly totals
(
    store1_sales[['Weekly_Sales']]
    .rolling(window=3)
    .mean()
    .plot()
)

### 1.3 Compute the lag-1, lag-2, and lag-52 autocorrelations for `Weekly_Sales` and/or create an autocorrelation plot.

In [None]:
# Report the autocorrelation of the weekly totals at lags of 1, 2 and 52 periods.
# A single-argument print() call is used so the cell runs (and prints the same
# text) under both Python 2 and Python 3.
for lag in (1, 2, 52):
    label = 'Autocorrelation {}:'.format(lag)
    # Pad the label to 20 characters so the values line up in one column
    print('{:<20} {}'.format(label, store1_sales['Weekly_Sales'].autocorr(lag)))

### 1.4 Create an autocorrelation plot. What does the autocorrelation plot say about the type of model you want to build?

In [None]:
# `autocorrelation_plot` lives in `pandas.plotting`; the old
# `pandas.tools.plotting` location is kept only as a fallback for
# installations that predate the move.
try:
    from pandas.plotting import autocorrelation_plot
except ImportError:
    from pandas.tools.plotting import autocorrelation_plot

# Plot autocorrelation plot using Pandas
autocorrelation_plot(store1_sales['Weekly_Sales'])

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

# Statsmodels autocorrelation plot of the weekly totals out to lag 140;
# the returned value is bound to `p` rather than left as the cell result
p = plot_acf(store1_sales.Weekly_Sales, lags=140)

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

# Same autocorrelation plot, restricted to the first 30 lags
p = plot_acf(store1_sales.Weekly_Sales, lags=30)

# Observations:
# - lags 1 and 2 look most useful for autoregression, perhaps up to lag 4
# - the longer-lag plot above spikes at around 52, implying a yearly pattern
# - there are no random spikes, so a moving average model is probably of little use

### 1.5 Split the weekly sales data into a training and a test set, using 75% of the data for training

In [None]:
# Positional 75% / 25% split in index order: the first three quarters of the
# rows are used for training and the remaining rows are held out for testing
n = len(store1_sales.Weekly_Sales)
split_at = int(.75 * n)  # computed once so both slices share the same boundary

train = store1_sales.Weekly_Sales.iloc[:split_at]
test = store1_sales.Weekly_Sales.iloc[split_at:]

# Smallest and largest index label of the held-out set
# (print() calls so the cell also runs under Python 3)
print(test.index.min())
print(test.index.max())

### 1.6 Create an AR(1) model on the training data and compute the mean absolute error of the predictions.

In [None]:
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error

In [None]:
# AR(1): ARIMA with order (p=1, d=0, q=0) fitted on the training set.
# `order` is passed by keyword because the second positional parameter is not
# `order` in every statsmodels release (the current ARIMA class takes `exog`
# there), whereas the keyword form is unambiguous.
model = sm.tsa.ARIMA(train, order=(1, 0, 0)).fit()

# Predict over exactly the held-out span. Taking the bounds from `test` rather
# than hard-coding date strings keeps the predictions aligned with the series
# they are scored against, whatever labels the index actually carries.
predictions = model.predict(
    start=test.index[0],
    end=test.index[-1],
    dynamic=False,
)

print("Mean absolute error: ", mean_absolute_error(test, predictions))
model.summary()

### 1.7 Plot the residuals - where are there significant errors?

In [None]:
# Line plot of the fitted model's residuals
model.resid.plot(kind='line')

In [None]:
# Autocorrelation of the model residuals out to lag 100.
# The result is bound to `p`, as in the other plot_acf cells, so the returned
# object is not also echoed as the cell's output.
p = plot_acf(model.resid, lags=100)

### 1.8 Compute an AR(2) model and an ARMA(2, 2) model - does this improve your mean absolute error on the held-out set?

In [None]:
# AR(2): ARIMA with order (p=2, d=0, q=0) fitted on the training set.
# `order` is passed by keyword because the second positional parameter is not
# `order` in every statsmodels release (the current ARIMA class takes `exog`
# there), whereas the keyword form is unambiguous.
model = sm.tsa.ARIMA(train, order=(2, 0, 0)).fit()

# Predict over exactly the held-out span. Taking the bounds from `test` rather
# than hard-coding date strings keeps the predictions aligned with the series
# they are scored against, whatever labels the index actually carries.
predictions = model.predict(
    start=test.index[0],
    end=test.index[-1],
    dynamic=True,
)

print("Mean absolute error: ", mean_absolute_error(test, predictions))
model.summary()

In [None]:
# ARMA(2, 2): ARIMA with order (p=2, d=0, q=2) fitted on the training set.
# `order` is passed by keyword because the second positional parameter is not
# `order` in every statsmodels release (the current ARIMA class takes `exog`
# there), whereas the keyword form is unambiguous.
model = sm.tsa.ARIMA(train, order=(2, 0, 2)).fit()

# Predict over exactly the held-out span. Taking the bounds from `test` rather
# than hard-coding date strings keeps the predictions aligned with the series
# they are scored against, whatever labels the index actually carries.
predictions = model.predict(
    start=test.index[0],
    end=test.index[-1],
    dynamic=True,
)

print("Mean absolute error: ", mean_absolute_error(test, predictions))
model.summary()

### 1.9 Finally, compute an ARIMA model to improve your prediction error - iterate on the p, d, and q parameters, comparing the models' performance.

In [None]:
# ARIMA with order (p=2, d=1, q=3) fitted on the training set.
# `order` is passed by keyword because the second positional parameter is not
# `order` in every statsmodels release (the current ARIMA class takes `exog`
# there), whereas the keyword form is unambiguous.
model = sm.tsa.ARIMA(train, order=(2, 1, 3)).fit()

# Predict over exactly the held-out span. Taking the bounds from `test` rather
# than hard-coding date strings keeps the predictions aligned with the series
# they are scored against, whatever labels the index actually carries.
# NOTE(review): `typ='levels'` is kept from the original call; presumably it
# requests predictions on the original scale rather than the differenced one -
# confirm against the installed statsmodels version.
predictions = model.predict(
    start=test.index[0],
    end=test.index[-1],
    dynamic=False,
    typ='levels'
)

print("Mean absolute error: ", mean_absolute_error(test, predictions))
model.summary()

In [None]:
# Line plot of the fitted model's residuals
model.resid.plot(kind='line')