# Validate work against Jason Brownlee 

In [1]:
# interactive figures
%matplotlib widget 

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# ml training code
from one_dimensional_time_series_forecasting import time_series_prediction
from one_dimensional_time_series_forecasting import hit_rate

# model evalution metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# data preprocessing
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler

import random


## Import car dataset used by Brownlee in: Deep Learning for Time Series Forecasting

In [2]:
df = pd.read_csv('./test_data/monthly-car-sales.csv') # sp_500 = GSPC.csv, # airplaine = AirPassengers.csv
fig,ax = plt.subplots(figsize=(15,4))
df.plot(x='Month',y='Sales',marker='o',ax=ax,rot=30)
plt.tight_layout()
display(df.info())
display(df)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Month   108 non-null    object
 1   Sales   108 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.8+ KB


None

Unnamed: 0,Month,Sales
0,1960-01,6550
1,1960-02,8728
2,1960-03,12026
3,1960-04,14395
4,1960-05,14587
...,...,...
103,1968-08,16722
104,1968-09,14385
105,1968-10,21342
106,1968-11,17180


In [7]:
df = pd.read_csv('./test_data/monthly-car-sales.csv') # sp_500 = GSPC.csv, # airplaine = AirPassengers.csv

# modeling
window_length = 24
split = 12

# scale input data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df['Sales'].to_numpy().reshape(-1,1)).flatten()

# initialize class object
normal = time_series_prediction(df['Month'],df['Sales'],window_length,1)# pass: time series dates, univariate time series, lag window length, a number of steps ahead to predict
normal.sliding_window_1(verbose=0) # time series to supervised ML problem
normal.train_test_split(split=split) # testing and training dataset split
normal.test_train_plot()    # visualize training split

# perform some prediction tasks
normal.naive_model()
# normal.linear_regression()
# normal.support_vector_machine(model_tunning=True)
normal.neural_net_mlp(model_tunning=True)

# visualize results
normal.vis_results_time_series(second_plot='error')

# tabulate results
tabulated_results_0 = normal.results()
tabulated_results_0.plot()
display(tabulated_results_0)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …


Naive model results:
MAPE: 0.16990394422632252
RMSE:  3783.9662392785694
MAE:  3235.6666666666665

Training neural network: 
Fitting 5 folds for each of 108 candidates, totalling 540 fits
best_score:  -1989.3265674363797
best_model:  MLPRegressor(hidden_layer_sizes=(500,), learning_rate='adaptive',
             learning_rate_init=0.01, max_iter=5000, shuffle=False)
best_params:  {'activation': 'relu', 'hidden_layer_sizes': (500,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.01}
MAPE: 0.08839383810942124
RMSE:  1770.1722048970073
MAE:  1596.2092976222577


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Unnamed: 0,date,Value,Linear,SVM,NN,Naive
0,1960-01,6550,,,,
1,1960-02,8728,,,,
2,1960-03,12026,,,,
3,1960-04,14395,,,,
4,1960-05,14587,,,,
...,...,...,...,...,...,...
103,1968-08,16722,,,17681.487147,18024
104,1968-09,14385,,,16035.45195,16722
105,1968-10,21342,,,19824.44414,14385
106,1968-11,17180,,,19709.66986,21342


In [5]:
# what is the accuracy of price movements for these predictions

# data to feed to hit_rate function:
dates = tabulated_results_0['date'].iloc[split+window_length:]
original_values = tabulated_results_0['Value'].iloc[split+window_length:]
# lin_predictions = tabulated_results_0['Linear'].iloc[split+window_length:]
# svm_predictions = tabulated_results_0['SVM'].iloc[split+window_length:]
nn_predictions =  tabulated_results_0['NN'].iloc[split+window_length:]
naive_predictions =  tabulated_results_0['Naive'].iloc[split+window_length:]

# hit rate calculations
# print('Linear Regression:')
# df_lin = hit_rate(dates,original_values,lin_predictions)

# print('SVM:')
# df_svm = hit_rate(dates,original_values,svm_predictions)

print('NN:')
df_nn = hit_rate(dates,original_values,nn_predictions)

print('Naive:')
df_naive = hit_rate(dates,original_values,naive_predictions)

NN:
Movement prediction accuracy: 58.33 %
Confusion matrix:
[[38  2]
 [28  4]]
Naive:
Movement prediction accuracy: 56.94 %
Confusion matrix:
[[38  2]
 [29  3]]


In [8]:
normal.nn_grid_params.sort_values(by = 'rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_hidden_layer_sizes,param_learning_rate,param_learning_rate_init,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
97,0.293648,0.075493,0.001202,0.000399,relu,"(500,)",adaptive,0.01,"{'activation': 'relu', 'hidden_layer_sizes': (...",-1835.107312,-2.544563e+03,-1197.602396,-1758.462729,-2610.897329,-1989.326567,528.875532,1
90,0.602880,0.326969,0.001201,0.000401,relu,"(500,)",constant,0.001,"{'activation': 'relu', 'hidden_layer_sizes': (...",-1408.069703,-2.149891e+03,-1452.706890,-2660.216436,-2731.676067,-2080.512091,567.662447,2
100,0.986758,0.461883,0.001604,0.000487,relu,"(1000,)",constant,0.01,"{'activation': 'relu', 'hidden_layer_sizes': (...",-1729.314472,-2.418883e+03,-1214.843047,-2641.108436,-2567.657582,-2114.361311,553.726952,3
105,1.381499,0.486203,0.001001,0.001266,relu,"(1000,)",adaptive,0.001,"{'activation': 'relu', 'hidden_layer_sizes': (...",-1407.051278,-2.336520e+03,-1448.746427,-2715.115925,-2711.655087,-2123.817821,584.791270,4
93,1.288506,1.221910,0.002202,0.001169,relu,"(500,)",invscaling,0.001,"{'activation': 'relu', 'hidden_layer_sizes': (...",-1534.265318,-2.284163e+03,-1552.629274,-2559.258939,-2823.612414,-2150.785792,524.447171,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,0.074584,0.030882,0.002804,0.001600,relu,"(1000,)",constant,1,"{'activation': 'relu', 'hidden_layer_sizes': (...",-14376.051106,-1.566654e+04,-17577.458121,-17028.993034,-17062.058337,-16342.219398,1169.444835,104
92,0.020537,0.011471,0.001003,0.000004,relu,"(500,)",constant,1,"{'activation': 'relu', 'hidden_layer_sizes': (...",-14375.451683,-1.605069e+05,-17577.233113,-17027.600864,-17062.261292,-45309.895591,57609.399224,105
86,0.006807,0.001602,0.000600,0.000490,relu,"(100,)",invscaling,1,"{'activation': 'relu', 'hidden_layer_sizes': (...",-14376.681934,-1.566564e+04,-17576.235322,-381195.175880,-17060.845386,-89174.916335,146014.404392,106
89,0.005406,0.001498,0.000815,0.000409,relu,"(100,)",adaptive,1,"{'activation': 'relu', 'hidden_layer_sizes': (...",-14375.318688,-1.566524e+04,-379364.747411,-17029.241091,-37718.829941,-92830.675796,143523.304385,107


# His implementation

In [9]:
# evaluate mlp for monthly car sales dataset
from math import sqrt
from numpy import array
from numpy import mean
from numpy import std
from pandas import DataFrame
from pandas import concat
from pandas import read_csv
from sklearn.metrics import mean_squared_error
from keras.models import Sequential

from keras.layers import Dense
from matplotlib import pyplot

# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    return data[:-n_test], data[-n_test:]

# transform list into supervised learning format
def series_to_supervised(data, n_in, n_out=1):
    df = DataFrame(data)
    cols = list()

    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))

    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))

    # put it all together
    agg = concat(cols, axis=1)

    # drop rows with NaN values
    agg.dropna(inplace=True)
    display(agg)
    return agg.values

# root mean squared error or rmse
def measure_rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))

# fit a model
def model_fit(train, config):
    # unpack config
    n_input, n_nodes, n_epochs, n_batch = config

    # prepare data
    data = series_to_supervised(train, n_input)
    train_x, train_y = data[:, :-1], data[:, -1]

    # define model
    model = Sequential()
    model.add(Dense(n_nodes, activation='relu', input_dim=n_input))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam')

    # fit
    model.fit(train_x, train_y, epochs=n_epochs, batch_size=n_batch, verbose=0)

    return model

# forecast with a pre-fit model
def model_predict(model, history, config):
    # unpack config
    n_input, _, _, _ = config

    # prepare data
    x_input = array(history[-n_input:]).reshape(1, n_input)

    # forecast
    yhat = model.predict(x_input, verbose=0)

    return yhat[0]

# # walk-forward validation for univariate data
# def walk_forward_validation(data, n_test, cfg):
#     predictions = list()

#     # split dataset
#     train, test = train_test_split(data, n_test)

#     # fit model
#     model = model_fit(train, cfg)

#     # seed history with training dataset
#     history = [x for x in train]

#     # step over each time-step in the test set
#     for i in range(len(test)):
#         # fit model and make forecast for history
#         yhat = model_predict(model, history, cfg)

#         # store forecast in list of predictions
#         predictions.append(yhat)
        
#         # add actual observation to history for the next loop
#         history.append(test[i])

#     # estimate prediction error
#     error = measure_rmse(test, predictions)
#     print(' > %.3f' % error)

#     return error

# # repeat evaluation of a config
# def repeat_evaluate(data, config, n_test, n_repeats=1):
#     # fit and evaluate the model n times
#     scores = [walk_forward_validation(data, n_test, config) for _ in range(n_repeats)]
#     return scores

# # summarize model performance
# def summarize_scores(name, scores):
#     # print a summary
#     scores_m, score_std = mean(scores), std(scores)
#     print('%s: %.3f RMSE (+/- %.3f)' % (name, scores_m, score_std))
#     # box and whisker plot
#     plt.figure()
#     pyplot.boxplot(scores)
#     pyplot.show()
######################################################################################################################################
# start modelling
######################################################################################################################################

series = read_csv('./test_data/monthly-car-sales.csv', header=0, index_col=0)
data = series.values
# data split
n_test = 12 # which to predict

# define config
config = [24, 500, 100, 100] # lag variables, hidden layer_nodes, training epochs n_batch

# walk forward validation without using the function
predictions = list()

# split dataset
train, test = train_test_split(data, n_test)

# fit model
model = model_fit(train, config)

# seed history with training dataset
history = [x for x in train]

# step over each time-step in the test set
for i in range(len(test)):
    # fit model and make forecast for history
    yhat = model_predict(model, history, config)

    # store forecast in list of predictions
    predictions.append(yhat)
    
    # add actual observation to history for the next loop
    history.append(test[i])

# estimate prediction error
error = measure_rmse(test, predictions)
print(' > %.3f' % error)

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.10,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19
24,6550.0,8728.0,12026.0,14395.0,14587.0,13791.0,9498.0,8251.0,7049.0,9545.0,...,13784.0,15926.0,13821.0,11143.0,7975.0,7610.0,10015.0,12759.0,8816.0,10677
25,8728.0,12026.0,14395.0,14587.0,13791.0,9498.0,8251.0,7049.0,9545.0,9364.0,...,15926.0,13821.0,11143.0,7975.0,7610.0,10015.0,12759.0,8816.0,10677.0,10947
26,12026.0,14395.0,14587.0,13791.0,9498.0,8251.0,7049.0,9545.0,9364.0,8456.0,...,13821.0,11143.0,7975.0,7610.0,10015.0,12759.0,8816.0,10677.0,10947.0,15200
27,14395.0,14587.0,13791.0,9498.0,8251.0,7049.0,9545.0,9364.0,8456.0,7237.0,...,11143.0,7975.0,7610.0,10015.0,12759.0,8816.0,10677.0,10947.0,15200.0,17010
28,14587.0,13791.0,9498.0,8251.0,7049.0,9545.0,9364.0,8456.0,7237.0,9374.0,...,7975.0,7610.0,10015.0,12759.0,8816.0,10677.0,10947.0,15200.0,17010.0,20900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,14767.0,10895.0,17130.0,17697.0,16611.0,12674.0,12760.0,20249.0,22135.0,20677.0,...,17562.0,14720.0,12225.0,11608.0,20985.0,19692.0,24081.0,22114.0,14220.0,13434
92,10895.0,17130.0,17697.0,16611.0,12674.0,12760.0,20249.0,22135.0,20677.0,19933.0,...,14720.0,12225.0,11608.0,20985.0,19692.0,24081.0,22114.0,14220.0,13434.0,13598
93,17130.0,17697.0,16611.0,12674.0,12760.0,20249.0,22135.0,20677.0,19933.0,15388.0,...,12225.0,11608.0,20985.0,19692.0,24081.0,22114.0,14220.0,13434.0,13598.0,17187
94,17697.0,16611.0,12674.0,12760.0,20249.0,22135.0,20677.0,19933.0,15388.0,15113.0,...,11608.0,20985.0,19692.0,24081.0,22114.0,14220.0,13434.0,13598.0,17187.0,16119


 > 1513.734


In [27]:
normal.X_train

array([[ 6550.,  8728., 12026., ..., 10015., 12759.,  8816.],
       [ 8728., 12026., 14395., ..., 12759.,  8816., 10677.],
       [12026., 14395., 14587., ...,  8816., 10677., 10947.],
       ...,
       [17130., 17697., 16611., ..., 14220., 13434., 13598.],
       [17697., 16611., 12674., ..., 13434., 13598., 17187.],
       [16611., 12674., 12760., ..., 13598., 17187., 16119.]])

In [33]:
columns = ['t-'+str(i) for i in range(24,0,-1)]
df_check = pd.DataFrame(columns=columns,data=normal.X_train)
df_check['t'] = normal.y_train
df_check

Unnamed: 0,t-24,t-23,t-22,t-21,t-20,t-19,t-18,t-17,t-16,t-15,...,t-9,t-8,t-7,t-6,t-5,t-4,t-3,t-2,t-1,t
0,6550.0,8728.0,12026.0,14395.0,14587.0,13791.0,9498.0,8251.0,7049.0,9545.0,...,13784.0,15926.0,13821.0,11143.0,7975.0,7610.0,10015.0,12759.0,8816.0,10677.0
1,8728.0,12026.0,14395.0,14587.0,13791.0,9498.0,8251.0,7049.0,9545.0,9364.0,...,15926.0,13821.0,11143.0,7975.0,7610.0,10015.0,12759.0,8816.0,10677.0,10947.0
2,12026.0,14395.0,14587.0,13791.0,9498.0,8251.0,7049.0,9545.0,9364.0,8456.0,...,13821.0,11143.0,7975.0,7610.0,10015.0,12759.0,8816.0,10677.0,10947.0,15200.0
3,14395.0,14587.0,13791.0,9498.0,8251.0,7049.0,9545.0,9364.0,8456.0,7237.0,...,11143.0,7975.0,7610.0,10015.0,12759.0,8816.0,10677.0,10947.0,15200.0,17010.0
4,14587.0,13791.0,9498.0,8251.0,7049.0,9545.0,9364.0,8456.0,7237.0,9374.0,...,7975.0,7610.0,10015.0,12759.0,8816.0,10677.0,10947.0,15200.0,17010.0,20900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,14767.0,10895.0,17130.0,17697.0,16611.0,12674.0,12760.0,20249.0,22135.0,20677.0,...,17562.0,14720.0,12225.0,11608.0,20985.0,19692.0,24081.0,22114.0,14220.0,13434.0
68,10895.0,17130.0,17697.0,16611.0,12674.0,12760.0,20249.0,22135.0,20677.0,19933.0,...,14720.0,12225.0,11608.0,20985.0,19692.0,24081.0,22114.0,14220.0,13434.0,13598.0
69,17130.0,17697.0,16611.0,12674.0,12760.0,20249.0,22135.0,20677.0,19933.0,15388.0,...,12225.0,11608.0,20985.0,19692.0,24081.0,22114.0,14220.0,13434.0,13598.0,17187.0
70,17697.0,16611.0,12674.0,12760.0,20249.0,22135.0,20677.0,19933.0,15388.0,15113.0,...,11608.0,20985.0,19692.0,24081.0,22114.0,14220.0,13434.0,13598.0,17187.0,16119.0


In [6]:
blah = series_to_supervised(train, 24, n_out=1)
blah.shape

# okay so he splits the data where he wants ie 96 for train and 12 for test. then with lag of 24 there are 72 input patterns

(72, 25)

In [34]:
test

array([[13210],
       [14251],
       [20139],
       [21725],
       [26099],
       [21084],
       [18024],
       [16722],
       [14385],
       [21342],
       [17180],
       [14577]], dtype=int64)

In [35]:
normal.y_test

array([13210., 14251., 20139., 21725., 26099., 21084., 18024., 16722.,
       14385., 21342., 17180., 14577.])