# Training, hyperparametering tuning and cross-validation

In [21]:
# cross validation and hyper-parameter search
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV

# some models
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

# interactive plots
%matplotlib widget

# typical data processing and visualization libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# model evalution metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# some helper functions

In [36]:
# some helper functions
def slide_window(data, slide_step_size): # column of data, integer
    # initialize input array
    num_rows = len(data) - slide_step_size
    array = np.zeros((num_rows, slide_step_size + 1))
    
    # loop through data and populate array
    for i in range(num_rows):
        # input features
        array[i,0:slide_step_size+1] = data[i:i+slide_step_size+1]
        # target feature
        array[i,-1] = data[i+slide_step_size]
        # show pattern
        # print(array[i,0:slide_step_size],' : ',array[i,slide_step_size])
    return array[:,0:-2], array[:,-1] 

def genSine(f0, fs, dur):
    t = np.arange(dur)
    sinusoid = np.sin(2*np.pi*t*(f0/fs))
    sinusoid = normalise(sinusoid)
    return sinusoid

In [177]:
# generate noisy sinusoid
f0 = 4
fs = 0.001
dur = 2000
t = np.linspace(0,1000,dur)
sinusoid = np.sin(t*1.2) + np.sin(t*0.05) + 0.0085*t 
noise = np.random.normal(0,4,dur)

result =  sinusoid + noise
plt.figure(figsize=(10,3))
plt.plot(t,result)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Prepare data for cross validaiton hyperparameter tuning

In [178]:
# import some data
sp_500 = pd.read_csv('./test_data/GSPC.csv')
training_data = result[-2000:-500]#sp_500['Volume'][-2500:-500]/1e9
testing_data = result[-500:]#sp_500['Volume'][-500:]/1e9

# transform time series data into supervised ml problem
window_length = 20
X_train, y_train = slide_window(np.array(training_data), window_length)
X_test, y_test =  slide_window(np.array(testing_data), window_length)

# different folds for cross validation
tscv = TimeSeriesSplit(n_splits=5)

# visualize cross validation splits
fig,ax = plt.subplots(5,1,sharex=True)
i = 0
for tr_index, val_index in tscv.split(training_data): # training and validation splits for 5 folds
    # print(tr_index, val_index)
    ax[i].plot(tr_index,training_data[tr_index[0]:tr_index[-1]+1],'b-')
    ax[i].plot(val_index,training_data[val_index[0]:val_index[-1]+1],'r-')
    i += 1
plt.tight_layout()
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Hyper parameter tuning, using cross validation for SVM

In [179]:
# define model: support vector machine for regression
model = SVR()

# hyperparameter values to check
param_grid = [
  {'C': [0.1, 1, 10, 100], 'kernel': ['linear','rbf','sigmoid'],'epsilon':[0.1,1,10,100]},
 ]

# perform grid search, using cross validaiton
gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_grid, scoring = 'neg_mean_squared_error',verbose=4,n_jobs=10)
gsearch.fit(X_train, y_train)
print('best_score: ', gsearch.best_score_)
print('best_model: ', gsearch.best_estimator_)
print('best_params: ',gsearch.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed:    0.0s
best_score:  -16.831621245973235
best_model:  SVR(C=0.1, kernel='linear')
best_params:  {'C': 0.1, 'epsilon': 0.1, 'kernel': 'linear'}
[Parallel(n_jobs=10)]: Done 240 out of 240 | elapsed:  1.4min finished


In [180]:
# check model on test data
# model
svm_regres_predictions = gsearch.best_estimator_.predict(X_test)

# evaluate
mse = mean_squared_error(y_test,svm_regres_predictions)
mae = mean_absolute_error(y_test,svm_regres_predictions)

print('RMSE: ',np.sqrt(mse))
print('MAE: ',mae)

# visualize results
fig,ax = plt.subplots(1,1,figsize=(10,4))
ax.plot(range(len(testing_data[window_length:])),np.array(testing_data[window_length:]),'o-',linewidth=3,label='real values',markersize=5) # plotting normalized training data
ax.plot(range(len(testing_data[window_length:])),svm_regres_predictions[:],'o-',label='svm regression prediction',markersize=5)
plt.grid()

RMSE:  4.1638863812838
MAE:  3.2469719538808426


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [181]:
# results from hyperparameter tuning and cross validation
ddf = pd.DataFrame(gsearch.cv_results_)
ddf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   mean_fit_time      48 non-null     float64
 1   std_fit_time       48 non-null     float64
 2   mean_score_time    48 non-null     float64
 3   std_score_time     48 non-null     float64
 4   param_C            48 non-null     object 
 5   param_epsilon      48 non-null     object 
 6   param_kernel       48 non-null     object 
 7   params             48 non-null     object 
 8   split0_test_score  48 non-null     float64
 9   split1_test_score  48 non-null     float64
 10  split2_test_score  48 non-null     float64
 11  split3_test_score  48 non-null     float64
 12  split4_test_score  48 non-null     float64
 13  mean_test_score    48 non-null     float64
 14  std_test_score     48 non-null     float64
 15  rank_test_score    48 non-null     int32  
dtypes: float64(11), int32(1), ob

In [182]:
ddf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_epsilon,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.048844,0.0348475,0.002402,0.001021,0.1,0.1,linear,"{'C': 0.1, 'epsilon': 0.1, 'kernel': 'linear'}",-16.319159,-17.111801,-16.567283,-15.62956,-18.53031,-16.83162,0.9738061,1
1,0.02342,0.01791191,0.004004,0.002193,0.1,0.1,rbf,"{'C': 0.1, 'epsilon': 0.1, 'kernel': 'rbf'}",-16.531602,-18.097802,-18.690651,-19.09296,-20.96223,-18.67505,1.437614,20
2,0.02242,0.01783347,0.004404,0.001745,0.1,0.1,sigmoid,"{'C': 0.1, 'epsilon': 0.1, 'kernel': 'sigmoid'}",-16.140093,-17.275637,-17.223723,-17.38122,-21.2259,-17.84931,1.747131,11
3,0.042039,0.02574459,0.002002,0.000896,0.1,1.0,linear,"{'C': 0.1, 'epsilon': 1, 'kernel': 'linear'}",-16.746795,-17.114863,-17.000449,-15.66123,-18.50653,-17.00597,0.9098442,7
4,0.015814,0.0116967,0.003203,0.001471,0.1,1.0,rbf,"{'C': 0.1, 'epsilon': 1, 'kernel': 'rbf'}",-16.252871,-18.454125,-19.076236,-19.01306,-21.17022,-18.7933,1.571427,22
5,0.017416,0.01377617,0.003604,0.002061,0.1,1.0,sigmoid,"{'C': 0.1, 'epsilon': 1, 'kernel': 'sigmoid'}",-16.226858,-17.793431,-17.680901,-17.49343,-20.75335,-17.9896,1.491681,12
6,0.004604,0.005281034,0.000601,0.00049,0.1,10.0,linear,"{'C': 0.1, 'epsilon': 10, 'kernel': 'linear'}",-16.407676,-17.693751,-18.098982,-18.77575,-22.32451,-18.66013,1.988132,16
7,0.000801,0.0004004479,0.0004,0.00049,0.1,10.0,rbf,"{'C': 0.1, 'epsilon': 10, 'kernel': 'rbf'}",-15.813016,-16.481489,-17.589289,-19.88515,-23.95525,-18.74484,2.949654,21
8,0.000801,0.0004004242,0.0002,0.000401,0.1,10.0,sigmoid,"{'C': 0.1, 'epsilon': 10, 'kernel': 'sigmoid'}",-15.778625,-16.471842,-17.512256,-19.40594,-23.83897,-18.60153,2.890286,15
9,0.000801,0.0004003764,0.0002,0.0004,0.1,100.0,linear,"{'C': 0.1, 'epsilon': 100, 'kernel': 'linear'}",-15.606365,-16.965713,-19.810199,-23.41167,-30.57488,-21.27377,5.36474,27


In [183]:
# normal lin reg model
reg_model = LinearRegression().fit(X_train,y_train)
linear_predictions = reg_model.predict(X_test)

# evaluate
mse = mean_squared_error(y_test,predictions)
mae = mean_absolute_error(y_test,predictions)

print(reg_model.coef_)
print('RMSE: ',np.sqrt(mse))
print('MAE: ',mae)

[ 0.05513105  0.08184611  0.11265548  0.03699413  0.0139196  -0.0030286
  0.02905063  0.03076681  0.06597417  0.04587086  0.06138448  0.05038053
  0.02653327  0.01449827  0.01543064  0.02995697  0.00998237  0.00922721
  0.0991654 ]
RMSE:  5.423869099758996
MAE:  4.341159653069514


In [184]:
from sklearn.svm import LinearSVR

# train model
svm_regres = LinearSVR(max_iter=1000,C=0.5).fit(X_train,y_train)

# predict
svm_predictions = svm_regres.predict(X_test)

# evaluate
mse = mean_squared_error(y_test,svm_predictions[:])
mae = mean_absolute_error(y_test,svm_predictions[:])

print('RMSE: ',np.sqrt(mse))
print('MAE: ',mae)

RMSE:  4.175708327970001
MAE:  3.2576018631727313


#  Hyper parameter tuning and cross validation neural network

In [185]:
# define model: support vector machine for regression
MLP = MLPRegressor(shuffle=False,max_iter=1000) # must set shuffle to false to avoid leakage of information due to sequance problem

# hyperparameter values to check
param_grid = [
  {'hidden_layer_sizes': [(10,),(100,),(1000,)], 'activation': ['logistic', 'tanh', 'relu'],'learning_rate': ['constant', 'invscaling', 'adaptive'], 'learning_rate_init':[0.001,0.01,1]}
 ]

# perform grid search, using cross validaiton
gsearch = GridSearchCV(estimator=MLP, cv=tscv, param_grid=param_grid, scoring = 'neg_mean_squared_error',verbose=4,n_jobs=10)
gsearch.fit(X_train, y_train)
print('best_score: ', gsearch.best_score_)
print('best_model: ', gsearch.best_estimator_)
print('best_params: ',gsearch.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed:    0.4s
[Parallel(n_jobs=10)]: Done  78 tasks      | elapsed:    5.0s
[Parallel(n_jobs=10)]: Done 201 tasks      | elapsed:   33.1s
[Parallel(n_jobs=10)]: Done 372 tasks      | elapsed:  1.1min
[Parallel(n_jobs=10)]: Done 405 out of 405 | elapsed:  1.4min finished
best_score:  -17.362663307404166
best_model:  MLPRegressor(activation='logistic', hidden_layer_sizes=(10,),
             learning_rate='invscaling', max_iter=1000, shuffle=False)
best_params:  {'activation': 'logistic', 'hidden_layer_sizes': (10,), 'learning_rate': 'invscaling', 'learning_rate_init': 0.001}


In [186]:
# check model on test data
# model
mlp_predictions = gsearch.best_estimator_.predict(X_test)

# evaluate
mse = mean_squared_error(y_test,mlp_predictions)
mae = mean_absolute_error(y_test,mlp_predictions)

print('RMSE: ',np.sqrt(mse))
print('MAE: ',mae)

# visualize results
fig,ax = plt.subplots(1,1,figsize=(10,4))
ax.plot(range(len(testing_data[window_length:])),np.array(testing_data[window_length:]),'o-',linewidth=3,label='real values',markersize=5) # plotting normalized training data
ax.plot(range(len(testing_data[window_length:])),mlp_predictions[:],'o-',label='mlp regression prediction',markersize=5)
ax.plot(range(len(testing_data[window_length:])),svm_regres_predictions[:],'o-',label='svm regression prediction',markersize=5)
ax.plot(range(len(testing_data[window_length:])),linear_predictions[:],'o-',label='linear regression prediction',markersize=5)
ax.legend()
plt.grid()

RMSE:  4.369280595459022
MAE:  3.453285930035183


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [187]:
# hyperparameter values to check
param_grid = [
  {'hidden_layer_sizes': [(60,),(70,),(80,),(90,),(100,),(110,),(120,),(130,),(140,)]}
 ]

# perform grid search, using cross validaiton
gsearch = GridSearchCV(estimator=MLP, cv=tscv, param_grid=param_grid, scoring = 'neg_mean_squared_error',verbose=4,n_jobs=10)
gsearch.fit(X_train, y_train)
print('best_score: ', gsearch.best_score_)
print('best_model: ', gsearch.best_estimator_)
print('best_params: ',gsearch.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed:    1.0s
[Parallel(n_jobs=10)]: Done  38 out of  45 | elapsed:    5.2s remaining:    0.9s
[Parallel(n_jobs=10)]: Done  45 out of  45 | elapsed:    5.9s finished
best_score:  -24.991721217149415
best_model:  MLPRegressor(hidden_layer_sizes=(130,), max_iter=1000, shuffle=False)
best_params:  {'hidden_layer_sizes': (130,)}
