# Validate work against Jason Brownlee's implementation

In [1]:
# interactive figures
%matplotlib widget 

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# ml training code
from one_dimensional_time_series_forecasting import time_series_prediction
from one_dimensional_time_series_forecasting import hit_rate

# model evaluation metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# data preprocessing
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler

import random

## Import car dataset used by Brownlee in: Deep Learning for Time Series Forecasting

In [2]:
# Load the monthly car sales series.
# Alternative datasets noted by the author: sp_500 = GSPC.csv, airplane = AirPassengers.csv
df = pd.read_csv('./test_data/monthly-car-sales.csv')

# Plot the raw series on an explicit axis so the figure size is under our control.
fig, ax = plt.subplots(figsize=(15, 4))
df.plot(x='Month', y='Sales', marker='o', ax=ax, rot=30)
plt.tight_layout()

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
df.info()
display(df)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Month   108 non-null    object
 1   Sales   108 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.8+ KB


None

Unnamed: 0,Month,Sales
0,1960-01,6550
1,1960-02,8728
2,1960-03,12026
3,1960-04,14395
4,1960-05,14587
...,...,...
103,1968-08,16722
104,1968-09,14385
105,1968-10,21342
106,1968-11,17180


In [2]:
# Reload the monthly car sales series so this cell does not depend on the cell above.
df = pd.read_csv('./test_data/monthly-car-sales.csv') # sp_500 = GSPC.csv, # airplane = AirPassengers.csv

# modeling
window_length = 24  # lag window length handed to time_series_prediction below
split = 12  # forwarded as train_test_split(split=...); presumably the number of held-out observations — TODO confirm

# scale input data
# NOTE(review): scaled_data is not used anywhere else in this cell; the model
# below is constructed from the unscaled df['Sales'].
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df['Sales'].to_numpy().reshape(-1,1)).flatten()

# initialize class object
normal = time_series_prediction(df['Month'],df['Sales'],window_length,1)# pass: time series dates, univariate time series, lag window length, a number of steps ahead to predict
normal.sliding_window_1(verbose=0) # time series to supervised ML problem
normal.train_test_split(split=split) # testing and training dataset split
normal.test_train_plot()    # visualize training split

# perform some prediction tasks
normal.naive_model()
# normal.linear_regression()
# normal.support_vector_machine(model_tunning=True)
# With model_tunning=True the recorded output of this cell shows a 5-fold
# search over 81 candidate configurations.
normal.neural_net_mlp(model_tunning=True)

# visualize results
normal.vis_results_time_series(second_plot='error')

# tabulate results
tabulated_results_0 = normal.results()
tabulated_results_0.plot()
display(tabulated_results_0)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …


Naive model results:
MAPE: 0.16990394422632252
RMSE:  3783.9662392785694
MAE:  3235.6666666666665

Training neural network: 
Fitting 5 folds for each of 81 candidates, totalling 405 fits
best_score:  -4194734.484060521
best_model:  MLPRegressor(learning_rate='invscaling', learning_rate_init=0.01, max_iter=1000,
             shuffle=False)
best_params:  {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'invscaling', 'learning_rate_init': 0.01}
MAPE: 0.09056330106229304
RMSE:  2019.926895085841
MAE:  1635.0664134395408


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Unnamed: 0,date,Value,Linear,SVM,NN,Naive
0,1960-01,6550,,,,
1,1960-02,8728,,,,
2,1960-03,12026,,,,
3,1960-04,14395,,,,
4,1960-05,14587,,,,
...,...,...,...,...,...,...
103,1968-08,16722,,,18055.259886,18024
104,1968-09,14385,,,14267.208925,16722
105,1968-10,21342,,,18248.353785,14385
106,1968-11,17180,,,19180.855767,21342


# Brownlee's implementation

In [34]:
# evaluate mlp for monthly car sales dataset
from math import sqrt
from numpy import array
from numpy import mean
from numpy import std
from pandas import DataFrame
from pandas import concat
from pandas import read_csv
from sklearn.metrics import mean_squared_error
from keras.models import Sequential

from keras.layers import Dense
from matplotlib import pyplot

# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    """Split a sequence into a leading training part and a trailing test part.

    Parameters
    ----------
    data : sliceable sequence (list, numpy array, ...)
        Observations in time order.
    n_test : int
        Number of trailing observations to hold out. ``0`` keeps everything
        in the training part; a value larger than ``len(data)`` puts
        everything in the test part.

    Returns
    -------
    tuple
        ``(train, test)`` slices of ``data``.

    Raises
    ------
    ValueError
        If ``n_test`` is negative.
    """
    if n_test < 0:
        raise ValueError('n_test must be non-negative, got %r' % (n_test,))

    # Slice from an explicit index: data[:-0] is empty and data[-0:] is the
    # whole sequence, so the negative-index form swaps the two parts when
    # n_test == 0.
    split_at = max(len(data) - n_test, 0)
    return data[:split_at], data[split_at:]

# transform list into supervised learning format
def series_to_supervised(data, n_in, n_out=1):
    """Frame a series as supervised-learning rows of lagged inputs and targets.

    Parameters
    ----------
    data : array-like
        Observations in time order; wrapped in a DataFrame internally.
    n_in : int
        Number of lagged steps (t-n_in ... t-1) placed before the targets.
    n_out : int, default 1
        Number of forward steps (t ... t+n_out-1) placed after the lags.

    Returns
    -------
    numpy.ndarray
        One row per complete window; rows containing a NaN introduced by
        shifting are dropped.
    """
    frame = DataFrame(data)

    # lagged inputs, oldest first: t-n_in, ..., t-1
    lagged = [frame.shift(lag) for lag in range(n_in, 0, -1)]

    # forecast targets: t, t+1, ..., t+n_out-1
    leads = [frame.shift(-step) for step in range(n_out)]

    # join side by side and keep only the windows that are fully populated
    combined = concat(lagged + leads, axis=1).dropna()
    return combined.values

# root mean squared error or rmse
def measure_rmse(actual, predicted):
    """Return the root mean squared error between ``actual`` and ``predicted``."""
    mse = mean_squared_error(actual, predicted)
    return sqrt(mse)

# fit a model
def model_fit(train, config):
    """Fit a one-hidden-layer dense network on a lag-window framing of ``train``.

    Parameters
    ----------
    train : array-like
        Training observations in time order.
    config : sequence of four values
        ``(n_input, n_nodes, n_epochs, n_batch)``: lag window length, hidden
        layer width, number of training epochs, and batch size.

    Returns
    -------
    The fitted Keras ``Sequential`` model.
    """
    n_input, n_nodes, n_epochs, n_batch = config

    # frame the series as rows of n_input lags followed by one target value
    supervised = series_to_supervised(train, n_input)
    train_x = supervised[:, :-1]
    train_y = supervised[:, -1]

    # one ReLU hidden layer feeding a single linear output unit
    network = Sequential()
    network.add(Dense(n_nodes, activation='relu', input_dim=n_input))
    network.add(Dense(1))
    network.compile(loss='mse', optimizer='adam')

    # train silently (verbose=0)
    network.fit(train_x, train_y, epochs=n_epochs, batch_size=n_batch, verbose=0)

    return network

# forecast with a pre-fit model
def model_predict(model, history, config):
    """Make a one-step forecast from the most recent observations in ``history``.

    Parameters
    ----------
    model : object with a ``predict(x, verbose=0)`` method
        A pre-fit model.
    history : sequence
        Observations seen so far, oldest first.
    config : sequence of four values
        Only the first entry (``n_input``, the lag window length) is used.

    Returns
    -------
    The first row of ``model.predict`` for the single input window.
    """
    n_input, _n_nodes, _n_epochs, _n_batch = config

    # take the last n_input observations and lay them out as one input row
    window = array(history[-n_input:])
    x_input = window.reshape(1, n_input)

    forecast = model.predict(x_input, verbose=0)
    return forecast[0]

# # walk-forward validation for univariate data
# def walk_forward_validation(data, n_test, cfg):
#     predictions = list()

#     # split dataset
#     train, test = train_test_split(data, n_test)

#     # fit model
#     model = model_fit(train, cfg)

#     # seed history with training dataset
#     history = [x for x in train]

#     # step over each time-step in the test set
#     for i in range(len(test)):
#         # fit model and make forecast for history
#         yhat = model_predict(model, history, cfg)

#         # store forecast in list of predictions
#         predictions.append(yhat)
        
#         # add actual observation to history for the next loop
#         history.append(test[i])

#     # estimate prediction error
#     error = measure_rmse(test, predictions)
#     print(' > %.3f' % error)

#     return error

# # repeat evaluation of a config
# def repeat_evaluate(data, config, n_test, n_repeats=1):
#     # fit and evaluate the model n times
#     scores = [walk_forward_validation(data, n_test, config) for _ in range(n_repeats)]
#     return scores

# # summarize model performance
# def summarize_scores(name, scores):
#     # print a summary
#     scores_m, score_std = mean(scores), std(scores)
#     print('%s: %.3f RMSE (+/- %.3f)' % (name, scores_m, score_std))
#     # box and whisker plot
#     plt.figure()
#     pyplot.boxplot(scores)
#     pyplot.show()
######################################################################################################################################
# start modelling
######################################################################################################################################

# load the monthly car sales series; the first CSV column becomes the index
series = read_csv('./test_data/monthly-car-sales.csv', header=0, index_col=0)
data = series.values

# number of trailing observations to predict
n_test = 12

# model configuration: lag variables, hidden layer nodes, training epochs, batch size
config = [24, 500, 100, 100]

# walk-forward validation written out inline instead of through a helper
train, test = train_test_split(data, n_test)

# the model is fitted once on the training portion
model = model_fit(train, config)

# history starts as the training observations and grows by one actual value per step
history = list(train)
predictions = []

for i in range(len(test)):
    # one-step forecast from the most recent observations in history
    yhat = model_predict(model, history, config)
    predictions.append(yhat)

    # reveal the actual observation before forecasting the next step
    history.append(test[i])

# prediction error over the held-out observations
error = measure_rmse(test, predictions)
print(' > %.3f' % error)

 > 1645.944


In [31]:
# size of the training split returned by train_test_split(data, n_test)
train.shape

(96, 1)

In [35]:
# supervised framing of the training split: 24 lagged inputs plus 1 target per row
blah = series_to_supervised(train, 24, n_out=1)
blah.shape

# Brownlee splits the data first (96 observations for training, 12 for testing)
# and only then windows the training part: with a lag of 24, the 96 training
# observations yield 96 - 24 = 72 input patterns.

(72, 25)

In [26]:
# held-out observations produced by train_test_split(data, n_test)
test

array([[13210],
       [14251],
       [20139],
       [21725],
       [26099],
       [21084],
       [18024],
       [16722],
       [14385],
       [21342],
       [17180],
       [14577]], dtype=int64)

In [27]:
# last 12 values of column 1 (Sales) of the raw frame, to compare against the held-out test values
df.iloc[-12:,1]

96     13210
97     14251
98     20139
99     21725
100    26099
101    21084
102    18024
103    16722
104    14385
105    21342
106    17180
107    14577
Name: Sales, dtype: int64

108