In [1]:
# Import libraries

import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyreadr

from data import pipeline

# Show floats with two decimals when pandas objects are displayed.
pd.options.display.precision = 2

# Data preparation

In [2]:
# Load the "SVR" dataset splits through the project pipeline.
# (Original note lists the alternatives: load / buildAndSave / build — presumably other pipeline entry points; TODO confirm.)
svr_splits = pipeline.load("SVR")
X_train, Y_train, X_test, Y_test = svr_splits
print(X_train.shape, Y_train.shape)

Loading SVR dataset
Done!
(1050578, 31) (1050578, 1)


In [3]:
from sklearn.model_selection import train_test_split

def reduceRandomly(X, Y, new_size, random_state=None):
    """Draw a random subset of paired rows from X and Y, without replacement.

    The same randomly chosen row indices are applied to both inputs, so
    feature rows and their targets stay aligned.

    Parameters
    ----------
    X, Y : array-like with the same number of rows
        Features and targets to subsample.
    new_size : int
        Number of rows to keep. If it is larger than ``len(X)`` every row is
        returned (in shuffled order).
    random_state : int or None, optional
        Seed for a reproducible draw. With ``None`` (default) NumPy's global
        random state is used, so ``np.random.seed`` is still honoured.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The subsampled X and Y.

    Raises
    ------
    ValueError
        If X and Y have different lengths or ``new_size`` is negative.
    """
    n_samples = len(X)
    if len(Y) != n_samples:
        raise ValueError(
            "X and Y must have the same number of rows (got %d and %d)" % (n_samples, len(Y))
        )
    if new_size < 0:
        raise ValueError("new_size must be non-negative (got %d)" % new_size)

    # Permute row indices only, instead of shuffling and re-concatenating the
    # full arrays: avoids copying the whole dataset just to keep a small prefix,
    # and also works for a single-row input.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    keep = rng.permutation(n_samples)[:new_size]

    return np.asarray(X)[keep], np.asarray(Y)[keep]

# Replace the full training arrays with a random 10,000-row subset.
subset_size = 10000
X_train, Y_train = reduceRandomly(X_train, Y_train, new_size=subset_size)

In [4]:
from sklearn.preprocessing import StandardScaler

# Make both targets 2-D column vectors.
Y_train, Y_test = Y_train.reshape(-1, 1), Y_test.reshape(-1, 1)

# Standardize the features with statistics learned on the training split only,
# then apply the same transform to the test split.
sc_X = StandardScaler().fit(X_train)
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)

# Standardize the training target and flatten it back to 1-D.
# Y_test is not transformed here and stays in its original units.
sc_y = StandardScaler().fit(Y_train)
Y_train = sc_y.transform(Y_train).ravel()

In [5]:
# Sanity check: shapes of the training arrays after subsampling and scaling.
print(X_train.shape, Y_train.shape)

(10000, 31) (10000,)


# Training

In [6]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import scipy.stats as stats

# Base RBF-kernel SVR. C=100 is only an initial value: it is replaced later
# when the tuner's best parameters are applied through set_params.
regressor = SVR(kernel= 'rbf', C= 100)

In [7]:
# Search space: C is drawn from an exponential distribution with scale 100.
rand_list = dict(C=stats.expon(scale=100))

# 10 random candidates, each scored with 5-fold cross-validation;
# the fixed random_state makes the sampled candidates repeatable.
tuner = RandomizedSearchCV(regressor, rand_list, n_iter=10, cv=5, random_state=0)

In [None]:
# To skip the search and reuse a previous run, load the pickled tuner instead:
# with open("svr_automl.pkl", "rb") as f:
#     tuner = pickle.load(f)

# Run the randomized search over C on the (subsampled, standardized) training data.
tuner.fit(X_train, Y_train)

# Persist the fitted search so it can be reloaded without re-running it.
with open("svr_automl.pkl", "wb") as f:
    pickle.dump(tuner, f)

# Display the per-candidate cross-validation results. This must be the last
# expression of the cell: a bare expression in the middle of a cell is
# evaluated and silently discarded.
pd.DataFrame(tuner.cv_results_)

In [None]:
# Reload the fitted search from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file — only
# load pickles that were produced by this notebook.
with open("svr_automl.pkl", "rb") as f:
    tuner = pickle.load(f)

# Apply the best hyper-parameters found by the search to the base regressor.
regressor.set_params(**tuner.best_params_)

# SVR.fit only accepts (X, y, sample_weight); passing n_jobs raises a TypeError,
# so the regressor is fitted without it.
regressor.fit(X_train, Y_train)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Training error, measured in standardized target units (Y_train was scaled with sc_y).
train_predictions = regressor.predict(X_train)
train_rmse = sqrt(mean_squared_error(Y_train, train_predictions))
print('Loss (RMSE) : %.3f' % train_rmse)

# Evaluation

In [None]:
# Evaluate on the first 1000 test rows only.
eval_rows = slice(None, 1000)
X_test, Y_test = X_test[eval_rows], Y_test[eval_rows]
print(X_test.shape, Y_test.shape)

In [None]:
# Naive baseline: within each consecutive window of `decades_per_year` rows,
# predict the mean of the true targets of that window.
decades_per_year = 37
Y_naive = []
for start in range(0, len(Y_test), decades_per_year):
    window = Y_test[start:start + decades_per_year]
    # Repeat the mean once per row actually present in the window. The last
    # window can be shorter than decades_per_year; padding it to the full
    # width would make Y_naive longer than Y_test and break the RMSE below.
    Y_naive += [window.mean()] * len(window)

In [None]:
# Baseline error of the per-window mean predictor against the raw test targets.
naive_rmse = sqrt(mean_squared_error(Y_test, Y_naive))
print('Test RMSE with naive model : %.3f' % naive_rmse)

In [None]:
# Predict on the test features. The regressor was fitted on the sc_y-scaled
# Y_train, so these predictions are in standardized target units.
Y_preds = regressor.predict(X_test)

In [None]:
# Y_preds is in standardized units (the regressor was fitted on the scaled
# Y_train) while Y_test was never scaled. Map the predictions back to the
# original units before scoring, so the RMSE is meaningful and comparable
# with the naive-model RMSE.
Y_preds_raw = sc_y.inverse_transform(Y_preds.reshape(-1, 1))
print('Test RMSE : %.3f' % sqrt(mean_squared_error(Y_test, Y_preds_raw)))

In [None]:
# Predictions in original target units. They are recomputed from X_test rather
# than inverse-transforming Y_preds in place, so re-running this cell does not
# apply the inverse transform a second time.
Y_preds = sc_y.inverse_transform(regressor.predict(X_test).reshape(-1, 1)).reshape(-1)

# Plot one window of `decades_per_year` consecutive rows: truth vs. SVR vs. naive model.
year = 5
window = slice(decades_per_year * year, decades_per_year * (year + 1))
decades = list(range(decades_per_year))

plt.plot(decades, Y_test[window], color="blue", label= "raw")
plt.plot(decades, Y_preds[window], color="red", label= "svr")
# Distinct colour for the baseline: it previously shared red with the SVR curve.
plt.plot(decades, Y_naive[window], color="green", label= "naive model")
plt.xlabel("Decades")
plt.ylabel("Growth")
plt.legend()
plt.show()