In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/train.csv')
df = df[df['item'] == 1]
df['date'] = pd.to_datetime(df['date']) - pd.to_timedelta(7, unit='d')
df = df.filter(['date', 'sales']).groupby([pd.Grouper(key='date', freq='W-MON')]).sum().reset_index()

In [3]:
df['shift_sale'] = df['sales'].shift(1)
df = df.iloc[1:]

In [4]:
print(df.head())

        date  sales  shift_sale
1 2013-01-07    863       894.0
2 2013-01-14    867       863.0
3 2013-01-21    816       867.0
4 2013-01-28    969       816.0
5 2013-02-04    920       969.0


In [5]:
def four_week_avg(sales):
    sum = 0
    week_avg = []
    for i in range(3, -1, -1):
        for j in range(i):
            sum += sales[j]
        if(i!=0):
            week_avg.append(sum/i)
        sum = 0
    week_avg.append(sales[0])
    week_avg.reverse()
    for row in range(len(sales) - 4):
        for row in range(row, row + 4):
            sum += sales[row]
        week_avg.append(sum / 4)
        sum = 0
    return week_avg

In [6]:
df['week_avg'] = four_week_avg(df['sales'].tolist())
print(df.head())

        date  sales  shift_sale    week_avg
1 2013-01-07    863       894.0  863.000000
2 2013-01-14    867       863.0  863.000000
3 2013-01-21    816       867.0  865.000000
4 2013-01-28    969       816.0  848.666667
5 2013-02-04    920       969.0  878.750000


Model and Evaluation

In [20]:
import datetime as dt
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, median_absolute_error, explained_variance_score, max_error

In [32]:
test = df.iloc[-52:]
df = df.iloc[:-52]

Using scikits train_test_split we are going to split the data for training and validation. After we trained our model we first check how it did with the data it trained on.

In [33]:
X = df.drop('sales', axis=1)
y = df['sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = svm.SVR(C=1, kernel='linear', degree=8, gamma='scale', coef0=10)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(f'Model fit results:\n'
      f'r2_score {r2_score(y_test, predictions)} \t MSE {mean_squared_error(y_test, predictions)}'
      f'\tEVS {explained_variance_score(y_test, predictions)} \n MAE {mean_absolute_error(y_test, predictions)}'
      f'\tMAD {median_absolute_error(y_test, predictions)}\t ME {max_error(y_test, predictions)}')

Model fit results:
r2_score -5034.824026185133 	 MSE 232589146.49522856	EVS -4714.180438254682 
 MAE 11904.194971074943	MAD 9053.493520772085	 ME 29094.34086033143


In [35]:
predictions = clf.predict(test.drop('sales', axis=1))
print(f'Model test results:\n'
      f'r2_score {r2_score(test["sales"], predictions)} \t MSE {mean_squared_error(test["sales"], predictions)}'
      f'\tEVS {explained_variance_score(test["sales"], predictions)} \n MAE {mean_absolute_error(test["sales"], predictions)}'
      f'\tMAD {median_absolute_error(test["sales"], predictions)}\t ME {max_error(test["sales"], predictions)}')

Model test results:
r2_score -7042.506505757563 	 MSE 539671809.1835542	EVS -5901.113755525139 
 MAE 20056.899007223976	MAD 20273.123977661133	 ME 43620.30727199279
