# Training, hyperparametering tuning and cross-validation

In [32]:
# cross validation and hyper-parameter search
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV

# some models
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression

# interactive plots
%matplotlib widget

# typical data processing and visualization libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# model evalution metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
# some helper functions
def slide_window(data, slide_step_size): # column of data, integer
    # initialize input array
    num_rows = len(data) - slide_step_size
    array = np.zeros((num_rows, slide_step_size + 1))
    
    # loop through data and populate array
    for i in range(num_rows):
        # input features
        array[i,0:slide_step_size+1] = data[i:i+slide_step_size+1]
        # target feature
        array[i,-1] = data[i+slide_step_size]
        # show pattern
        # print(array[i,0:slide_step_size],' : ',array[i,slide_step_size])
    return array[:,0:-2], array[:,-1] 

In [17]:
# import some data
sp_500 = pd.read_csv('./test_data/GSPC.csv')
training_data = sp_500['Volume'][-2500:-500]/1e9
testing_data = sp_500['Volume'][-500:]/1e9

# transform time series data into supervised ml problem
X_train, y_train = slide_window(np.array(training_data), 5)
X_test, y_test =  slide_window(np.array(testing_data), 5)

# different folds for cross validation
tscv = TimeSeriesSplit(n_splits=5)

# visualize cross validation splits
fig,ax = plt.subplots(5,1,sharex=True)
i = 0
for tr_index, val_index in tscv.split(training_data): # training and validation splits for 5 folds
    # print(tr_index, val_index)
    ax[i].plot(tr_index,training_data[tr_index[0]:tr_index[-1]+1],'b-')
    ax[i].plot(val_index,training_data[val_index[0]:val_index[-1]+1],'r-')
    i += 1
plt.tight_layout()
plt.show()

# define model: support vector machine for regression
model = SVR()

# hyperparameter values to check
param_grid = [
  {'C': [0.1, 1, 10, 100], 'kernel': ['linear','rbf','sigmoid'],'epsilon':[0.1,1,10,100]},
 ]

# perform grid search, using cross validaiton
gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_grid, scoring = 'neg_mean_squared_error',verbose=4,n_jobs=-1)
gsearch.fit(X_train, y_train)
print('best_score: ', gsearch.best_score_)
print('best_model: ', gsearch.best_estimator_)
print('best_params: ',gsearch.best_params_)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:    6.1s finished
best_score:  -0.5464900465144715
best_model:  SVR(C=100, kernel='linear')
best_params:  {'C': 100, 'epsilon': 0.1, 'kernel': 'linear'}


In [25]:
testing_data

16718    3.39203
16719    3.75977
16720    3.54472
16721    3.62828
16722    4.95263
          ...   
17213    4.23537
17214    3.68413
17215    3.37651
17216    3.51779
17217    3.65164
Name: Volume, Length: 500, dtype: float64

In [30]:
# check model on test data
# model
svm_regres_predictions = gsearch.best_estimator_.predict(X_test)

# evaluate
mse = mean_squared_error(y_test,svm_regres_predictions)
mae = mean_absolute_error(y_test,svm_regres_predictions)

print('RMSE: ',np.sqrt(mse))
print('MAE: ',mae)

# visualize results
fig,ax = plt.subplots(1,1,figsize=(10,4))
ax.plot(sp_500['Volume'][-500+5:].index,np.array(testing_data[-495:]),'o-',linewidth=3,label='real values',markersize=5) # plotting normalized training data
ax.plot(sp_500['Volume'][-500+5:].index,svm_regres_predictions[:],'o-',label='linear regression prediction',markersize=5)
plt.grid()

RMSE:  0.6011615889902527
MAE:  0.39925008611755686


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [23]:
ddf = pd.DataFrame(gsearch.cv_results_)
ddf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   mean_fit_time      48 non-null     float64
 1   std_fit_time       48 non-null     float64
 2   mean_score_time    48 non-null     float64
 3   std_score_time     48 non-null     float64
 4   param_C            48 non-null     object 
 5   param_epsilon      48 non-null     object 
 6   param_kernel       48 non-null     object 
 7   params             48 non-null     object 
 8   split0_test_score  48 non-null     float64
 9   split1_test_score  48 non-null     float64
 10  split2_test_score  48 non-null     float64
 11  split3_test_score  48 non-null     float64
 12  split4_test_score  48 non-null     float64
 13  mean_test_score    48 non-null     float64
 14  std_test_score     48 non-null     float64
 15  rank_test_score    48 non-null     int32  
dtypes: float64(11), int32(1), ob

In [24]:
ddf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_epsilon,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.032229,0.02224999,0.002402,0.001020553,0.1,0.1,linear,"{'C': 0.1, 'epsilon': 0.1, 'kernel': 'linear'}",-1.145331,-0.482749,-0.360634,-0.343422,-0.416555,-0.549738,0.301756,4
1,0.034031,0.02486608,0.005205,0.002787948,0.1,0.1,rbf,"{'C': 0.1, 'epsilon': 0.1, 'kernel': 'rbf'}",-1.958234,-0.454519,-0.537714,-0.430055,-0.412847,-0.758674,0.601309,13
2,0.057252,0.0488026,0.010009,0.004150966,0.1,0.1,sigmoid,"{'C': 0.1, 'epsilon': 0.1, 'kernel': 'sigmoid'}",-3.125884,-1.210837,-1.541257,-1.011587,-0.553576,-1.488628,0.878912,17
3,0.020218,0.01650036,0.001802,0.0007489654,0.1,1.0,linear,"{'C': 0.1, 'epsilon': 1, 'kernel': 'linear'}",-1.222922,-0.561918,-0.508188,-0.426921,-0.442407,-0.632471,0.299143,9
4,0.008208,0.005605263,0.001802,0.0007491439,0.1,1.0,rbf,"{'C': 0.1, 'epsilon': 1, 'kernel': 'rbf'}",-2.155105,-0.555861,-0.713148,-0.574222,-0.441409,-0.887949,0.639428,16
5,0.020018,0.0138399,0.004404,0.001498262,0.1,1.0,sigmoid,"{'C': 0.1, 'epsilon': 1, 'kernel': 'sigmoid'}",-3.455841,-1.515729,-2.252759,-1.530909,-0.698146,-1.890677,0.924445,19
6,0.000601,0.0004903686,0.0004,0.0004903881,0.1,10.0,linear,"{'C': 0.1, 'epsilon': 10, 'kernel': 'linear'}",-5.189008,-5.17419,-8.391162,-8.599769,-6.40267,-6.75136,1.493691,23
7,0.000801,0.000400424,0.0002,0.0004004478,0.1,10.0,rbf,"{'C': 0.1, 'epsilon': 10, 'kernel': 'rbf'}",-5.189008,-5.17419,-8.391162,-8.599769,-6.40267,-6.75136,1.493691,23
8,0.000601,0.0004903297,0.0,0.0,0.1,10.0,sigmoid,"{'C': 0.1, 'epsilon': 10, 'kernel': 'sigmoid'}",-5.189008,-5.17419,-8.391162,-8.599769,-6.40267,-6.75136,1.493691,23
9,0.0004,0.0004904465,0.0002,0.0004002571,0.1,100.0,linear,"{'C': 0.1, 'epsilon': 100, 'kernel': 'linear'}",-5.189008,-5.17419,-8.391162,-8.599769,-6.40267,-6.75136,1.493691,35


In [33]:
# normal lin reg model
reg_model = LinearRegression().fit(X_train,y_train)
predictions = reg_model.predict(X_test)

# evaluate
mse = mean_squared_error(y_test,predictions)
mae = mean_absolute_error(y_test,predictions)

print('RMSE: ',np.sqrt(mse))
print('MAE: ',mae)

RMSE:  0.6092412706471098
MAE:  0.4167273472483654


In [35]:
from sklearn.svm import LinearSVR

# train model
svm_regres = LinearSVR(max_iter=1000,C=0.5).fit(X_train,y_train)

# predict
svm_predictions = svm_regres.predict(X_test)

# evaluate
mse = mean_squared_error(y_test,svm_predictions[:])
mae = mean_absolute_error(y_test,svm_predictions[:])

print('RMSE: ',np.sqrt(mse))
print('MAE: ',mae)

RMSE:  0.6023936628304294
MAE:  0.40177135648895923
