# Implement basic smoothening 

Implement moving average smoothing, exponential moving average 

In [1]:
# interactive figures
%matplotlib widget 

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# ml training code and other analysis
from one_dimensional_time_series_forecasting import time_series_prediction
from one_dimensional_time_series_forecasting import hit_rate

# model evalution metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# data preprocessing
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler

# Looking at Airplane dataset

In [2]:
# import some data
df = pd.read_csv('./test_data/AirPassengers.csv') 
df

Unnamed: 0,Month,#Passengers
0,1949-01,112
1,1949-02,118
2,1949-03,132
3,1949-04,129
4,1949-05,121
...,...,...
139,1960-08,606
140,1960-09,508
141,1960-10,461
142,1960-11,390


In [3]:
# implement simple moving average
df['MA_5'] = df['#Passengers'].rolling(window=5).mean()
df['MA_10'] = df['#Passengers'].rolling(window=10).mean()
df['MA_15'] = df['#Passengers'].rolling(window=15).mean()

df

Unnamed: 0,Month,#Passengers,MA_5,MA_10,MA_15
0,1949-01,112,,,
1,1949-02,118,,,
2,1949-03,132,,,
3,1949-04,129,,,
4,1949-05,121,122.4,,
...,...,...,...,...,...
139,1960-08,606,539.2,469.0,475.933333
140,1960-09,508,548.6,483.6,478.333333
141,1960-10,461,546.4,489.2,472.533333
142,1960-11,390,517.4,486.5,461.266667


In [4]:
# visualize differences
df.plot(subplots=True)
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

The lag created by a moving average is easily visible above.

In [5]:
# set global forecasting variables:
ma_window = 5
window_length = 15
split = 100

# Forecast normal model ie no feature engineering

In [6]:

# initialize class object
normal = time_series_prediction(df['Month'],df['#Passengers'],window_length,1)
normal.sliding_window_1(verbose=0) # time series to supervised ML problem
normal.train_test_split(split=split) # testing and training dataset split
normal.test_train_plot()    # visualize training split

# perform some prediction tasks
normal.linear_regression()
normal.support_vector_machine(model_tunning=True)
normal.neural_net_mlp(model_tunning=True)
normal.naive_model()

# visualize results
normal.vis_results_time_series(second_plot='error')

tabulated_results_0 = normal.results()
tabulated_results_0.plot()
display(tabulated_results_0)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.17068564 -0.43619066 -0.52110009  0.85529166  0.21998151 -0.16326028
  0.15272419 -0.13551272  0.0870167  -0.15020303  0.17336489 -0.15420544
 -0.00593946  0.20685781  0.70047134]
RMSE:  16.37160479513395
MAE:  12.641974322064023

Training support vector machine:
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done  77 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-3)]: Done 240 out of 240 | elapsed:  1.1min finished


best_score:  -223.00536321094188
best_model:  SVR(C=1, kernel='linear')
best_params:  {'C': 1, 'epsilon': 0.1, 'kernel': 'linear'}
RMSE:  17.889176273260063
MAE:  13.473920953157533

Training neural network: 
Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done 140 tasks      | elapsed:    2.0s


best_score:  -284.513310457943
best_model:  MLPRegressor(hidden_layer_sizes=(1000,), learning_rate='adaptive',
             learning_rate_init=0.01, max_iter=1000, shuffle=False)
best_params:  {'activation': 'relu', 'hidden_layer_sizes': (1000,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.01}
RMSE:  71.70874595089586
MAE:  52.41641782447754

Naive model results:
RMSE:  52.4913786024544
MAE:  44.724137931034484


[Parallel(n_jobs=-3)]: Done 405 out of 405 | elapsed:    7.9s finished


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Unnamed: 0,date,Value,Linear,SVM,NN,Naive
0,1949-01,112,,,,
1,1949-02,118,,,,
2,1949-03,132,,,,
3,1949-04,129,,,,
4,1949-05,121,,,,
...,...,...,...,...,...,...
139,1960-08,606,621.999,615.884,463.219,622
140,1960-09,508,515.385,509.973,474.302,606
141,1960-10,461,429.972,428.478,473.137,508
142,1960-11,390,410.561,415.206,463.996,461


In [7]:
# what is the accuracy of price movements for these predictions

# data to feed to hit_rate function:
dates = tabulated_results_0['date'].iloc[split+window_length:]
original_values = tabulated_results_0['Value'].iloc[split+window_length:]
lin_predictions = tabulated_results_0['Linear'].iloc[split+window_length:]
svm_predictions = tabulated_results_0['SVM'].iloc[split+window_length:]
nn_predictions =  tabulated_results_0['NN'].iloc[split+window_length:]
naive_predictions =  tabulated_results_0['Naive'].iloc[split+window_length:]

# hit rate calculations
print('Linear Regression:')
df_lin = hit_rate(dates,original_values,lin_predictions)

print('SVM:')
df_svm = hit_rate(dates,original_values,svm_predictions)

print('NN:')
df_nn = hit_rate(dates,original_values,nn_predictions)

print('Naive:')
df_naive = hit_rate(dates,original_values,naive_predictions)

Linear Regression:
Movement prediction accuracy: 93.1 %
Confusion matrix:
[[13  1]
 [ 1 14]]
SVM:
Movement prediction accuracy: 96.55 %
Confusion matrix:
[[14  0]
 [ 1 14]]
NN:
Movement prediction accuracy: 62.07 %
Confusion matrix:
[[ 7  7]
 [ 4 11]]
Naive:
Movement prediction accuracy: 58.62 %
Confusion matrix:
[[8 6]
 [6 9]]


# forecast model with smoothed inputs

In [8]:
# data preprocessing
df = pd.read_csv('./test_data/AirPassengers.csv') 
display(df)
df['MA_5'] = df['#Passengers'].rolling(window=ma_window).mean()
df.dropna(inplace=True)
df


Unnamed: 0,Month,#Passengers
0,1949-01,112
1,1949-02,118
2,1949-03,132
3,1949-04,129
4,1949-05,121
...,...,...
139,1960-08,606
140,1960-09,508
141,1960-10,461
142,1960-11,390


Unnamed: 0,Month,#Passengers,MA_5
4,1949-05,121,122.4
5,1949-06,135,127.0
6,1949-07,148,133.0
7,1949-08,148,136.2
8,1949-09,136,137.6
...,...,...,...
139,1960-08,606,539.2
140,1960-09,508,548.6
141,1960-10,461,546.4
142,1960-11,390,517.4


In [9]:
# initialize class object
smoothed = time_series_prediction(df['Month'],df['MA_5'],window_length,1)
smoothed.sliding_window_1(verbose=0) # time series to supervised ML problem
smoothed.train_test_split(split=split-ma_window) # want to test on the same set at the normal forecasting problem above
smoothed.test_train_plot()    # visualize training split

# perform some prediction tasks
smoothed.linear_regression()
smoothed.support_vector_machine(model_tunning=True)
smoothed.neural_net_mlp(model_tunning=True)
smoothed.naive_model()

# visualize results
smoothed.vis_results_time_series(second_plot='error')

# evaluation metric against original data not smoothed data

# add all these results for forecast window period together
df_results = pd.DataFrame(columns=['date','Original Values','Inverted linear','Inverted svm','Inverted NN'])
df_results['date'] = df['Month'].iloc[split-ma_window+window_length:]
df_results['Original Values'] = df['#Passengers'].iloc[split-ma_window+window_length:]
df_results['Inverted linear'] = smoothed.linear_reg_predictions
df_results['Inverted svm'] = smoothed.svm_predictions
df_results['Inverted NN'] = smoothed.neural_net_predictions

# RMSE of this method
mse_lin = mean_squared_error(df_results['Original Values'],df_results['Inverted linear'])
mse_svm = mean_squared_error(df_results['Original Values'],df_results['Inverted svm'])
mse_nn = mean_squared_error(df_results['Original Values'],df_results['Inverted NN'])

print(f'RMSE linear: {mse_lin**0.5}')
print(f'RMSE svm: {mse_svm**0.5}')
print(f'RMSE nn: {mse_nn**0.5}')

# df_results.plot()
tabulated_results_1 = smoothed.results()
tabulated_results_1.plot()
display(tabulated_results_1)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Training multivariate linear regression:

Linear regression coefficients: 
 [ 0.26714802  0.15951685 -1.08317582  0.30665284  0.7208186  -0.52500476
  0.30471375 -0.04499283 -0.19083488  0.29917389 -0.25893702  0.11687344
 -0.1581064  -0.51205362  1.59896034]
RMSE:  3.909402199669011
MAE:  3.2781382396203185

Training support vector machine:
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done 153 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-3)]: Done 240 out of 240 | elapsed:   44.5s finished


best_score:  -11.72509082212272
best_model:  SVR(C=10, kernel='linear')
best_params:  {'C': 10, 'epsilon': 0.1, 'kernel': 'linear'}
RMSE:  4.937315319399749
MAE:  3.9302145493773804

Training neural network: 
Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=-3)]: Done 140 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-3)]: Done 405 out of 405 | elapsed:    6.2s finished


best_score:  -180.51878320410975
best_model:  MLPRegressor(hidden_layer_sizes=(1000,), learning_rate_init=0.01, max_iter=1000,
             shuffle=False)
best_params:  {'activation': 'relu', 'hidden_layer_sizes': (1000,), 'learning_rate': 'constant', 'learning_rate_init': 0.01}
RMSE:  12.06107898500596
MAE:  10.604776828537565

Naive model results:
RMSE:  24.53082414704678
MAE:  21.353333333333335


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

RMSE linear: 67.94477819933
RMSE svm: 68.55088294519737
RMSE nn: 72.67904161411502


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Unnamed: 0,date,Value,Linear,SVM,NN,Naive
0,1949-05,122.4,,,,
1,1949-06,127.0,,,,
2,1949-07,133.0,,,,
3,1949-08,136.2,,,,
4,1949-09,137.6,,,,
...,...,...,...,...,...,...
135,1960-08,539.2,537.498,531.763,522.424,501.8
136,1960-09,548.6,555.575,554.238,540.659,539.2
137,1960-10,546.4,540.631,543.415,541.751,548.6
138,1960-11,517.4,522.087,515.968,524.337,546.4


In [10]:
# what is the accuracy of price movements for these predictions

# data to feed to hit_rate function:
dates = df_results['date']
original_values = df_results['Original Values']
lin_predictions = df_results['Inverted linear']
svm_predictions = df_results['Inverted svm']
nn_predictions =  df_results['Inverted NN']

# hit rate calculations
print('Linear Regression:')
df_lin = hit_rate(dates,original_values,lin_predictions)

print('SVM:')
df_svm = hit_rate(dates,original_values,svm_predictions)

print('NN:')
df_nn = hit_rate(dates,original_values,nn_predictions)


Linear Regression:
Movement prediction accuracy: 66.67 %
Confusion matrix:
[[ 9  5]
 [ 5 11]]
SVM:
Movement prediction accuracy: 66.67 %
Confusion matrix:
[[ 9  5]
 [ 5 11]]
NN:
Movement prediction accuracy: 50.0 %
Confusion matrix:
[[6 8]
 [7 9]]
