In [None]:
from collections import defaultdict
from itertools import product
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from algorithms.svr import SupportVectorRegressor
from algorithms.preprocessing import load_and_preprocess_dataset
from algorithms.model_selection import DEFAULT_METRICS
import config

In [None]:
# Unpack the four objects returned by the project's preprocessing helper.
# By naming convention X_* are features and y_* are targets for the
# train / test splits -- TODO confirm against load_and_preprocess_dataset.
X_train, y_train, X_test, y_test = load_and_preprocess_dataset()

In [None]:
# Coerce every split to a NumPy array in a single pass; np.asarray leaves
# inputs that are already ndarrays untouched (no copy).
X_train, y_train, X_test, y_test = (
    np.asarray(split) for split in (X_train, y_train, X_test, y_test)
)

In [None]:
# Log directories: the 'ridge', 'nn' and 'svr' sub-folders of config.LOGS_DIR.
ridge_logs_dir, nn_logs_dir, svr_logs_dir = (
    config.LOGS_DIR / subdir for subdir in ('ridge', 'nn', 'svr')
)

In [None]:
# SVR configurations whose logged results are analysed below.
# Each tuple is (C, epsilon, tolerance, kernel_type, gamma) -- the order in
# which entries are unpacked when the matching log file names are built.
hyperparameters = [
    (0.1, 0.001, 0.01, 'linear', 0.5),
    (0.1, 0.001, 0.01, 'rbf', 0.5),
    (0.1, 0.001, 0.001, 'linear', 0.5),
    (0.1, 0.001, 0.001, 'rbf', 0.5),
    (0.1, 0.001, 0.001, 'rbf', 1),
    (0.1, 0.01, 0.01, 'linear', 0.5),
    (0.1, 0.01, 0.001, 'rbf', 0.5),
    (0.2, 0.001, 0.001, 'rbf', 0.5),
    (0.2, 0.001, 0.001, 'rbf', 1),
    (0.2, 0.01, 0.001, 'linear', 1),
    (0.2, 0.01, 0.001, 'rbf', 1),
]

In [None]:
# Load the pickled training histories logged for each SVR configuration.
# svr_scores[i] holds the object stored for hyperparameters[i].
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only point svr_logs_dir at log files produced by this project.
svr_scores = []
for C, epsilon, tolerance, kernel_type, gamma in hyperparameters:
    # File name encodes the full configuration, one "name_value" pair per field.
    log_name = (
        f'C_{C}-epsilon_{epsilon}-tolerance_{tolerance}'
        f'-kernel_type_{kernel_type}-gamma_{gamma}.pkl'
    )
    with open(svr_logs_dir / log_name, 'rb') as f:
        svr_histories = pickle.load(f)
    svr_scores.append(svr_histories)

In [None]:
# For every configuration, average the final recorded value of each
# train/validation metric over all of its folds.
svr_last_scores = []

for fold_histories in svr_scores:
    n_folds = len(fold_histories)
    averaged = defaultdict(int)
    for fold_history in fold_histories:
        for metric_name in DEFAULT_METRICS.keys():
            for split in ('train', 'valid'):
                key = f'{split}_{metric_name}'
                # [-1] is the last recorded value of this metric for the fold;
                # dividing per fold accumulates the mean directly.
                averaged[key] += fold_history[key][-1] / n_folds
    svr_last_scores.append(averaged)

In [None]:
# Regroup the averaged validation scores by metric: one list per metric,
# with one entry per configuration (same order as svr_last_scores).
valid_svr = {
    metric_name: [scores[f'valid_{metric_name}'] for scores in svr_last_scores]
    for metric_name in DEFAULT_METRICS.keys()
}

In [None]:
# Histogram of the averaged validation MSE across configurations, drawn
# through the explicit Axes interface.
fig, ax = plt.subplots()
ax.set_title('The number of models with a given validation MSE')
ax.set_xlabel('MSE')
ax.set_ylabel('Number of models')
ax.hist(valid_svr['MSE'])
plt.show()

In [None]:
# Mean of the averaged validation MSE over all configurations.
mean_valid_mse = np.mean(valid_svr['MSE'])
print(mean_valid_mse)

In [None]:
# Select the configuration with the lowest fold-averaged validation MAPE.
# The strict '<' keeps the earliest entry on ties and never accepts NaN or
# +inf scores, so the index can remain at its -1 sentinel.
svr_best_score = np.inf
svr_best_model_index = -1

for i, model in enumerate(svr_last_scores):
    if model['valid_MAPE'] < svr_best_score:
        svr_best_model_index = i
        svr_best_score = model['valid_MAPE']

# A leftover -1 is a valid negative index and would silently select the LAST
# configuration in every lookup that uses it, so stop here instead.
assert svr_best_model_index >= 0, (
    'no SVR configuration was selected: svr_last_scores is empty '
    'or every valid_MAPE is NaN/inf'
)

In [None]:
# Training-side metrics of the selected configuration.
train_scores = {
    name: value
    for name, value in svr_last_scores[svr_best_model_index].items()
    if name.startswith('train')
}
for name, value in train_scores.items():
    print(name, value)

In [None]:
# Validation-side metrics of the selected configuration.
valid_scores = {
    name: value
    for name, value in svr_last_scores[svr_best_model_index].items()
    if name.startswith('valid')
}
for name, value in valid_scores.items():
    print(name, value)

In [None]:
# Mean and standard deviation of the averaged validation MSE across
# all configurations.
valid_mse = valid_svr['MSE']
print(np.mean(valid_mse), np.std(valid_mse))

In [None]:
# Hyperparameter tuple of the selected configuration. The lookup relies on
# svr_last_scores having been built in the same order as hyperparameters.
best_model_params = hyperparameters[svr_best_model_index]
print(best_model_params)

In [None]:
# Retrain the selected configuration on the training split. The values are
# taken from best_model_params rather than hard-coded so the retrained model
# always matches the configuration chosen above.
C, epsilon, tolerance, kernel_type, gamma = best_model_params
best_model = SupportVectorRegressor(
    X_train,
    y_train,
    C=C,
    epsilon=epsilon,
    tolerance=tolerance,
    kernel_type=kernel_type,
    gamma=gamma,
)
# The test split is passed to fit(), so the 'valid_*' entries of the returned
# history are presumably test-set metrics -- TODO confirm against
# SupportVectorRegressor.fit.
history = best_model.fit(X_test, y_test, max_iterations=250)

In [None]:
# Plot the first 50 recorded train/test MSE values from the final fit.
mse_curves = (
    pd.DataFrame.from_dict(history)
    .loc[:, ['train_MSE', 'valid_MSE']]
    .head(50)
    .rename(columns={'train_MSE': 'Train MSE', 'valid_MSE': 'Test MSE'})
)
ax = mse_curves.plot()
ax.set_xlabel('Epochs')
ax.set_ylabel('MSE')
ax.set_title('MSE vs Epoch')
plt.show()

In [None]:
# Display the complete 'valid_MSE' series recorded by the final fit. Since
# fit() received the test split, these are presumably test-set values --
# TODO confirm.
history['valid_MSE']