In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVR
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR

In [2]:
RANDOM_SEED = 2
CWD = Path.cwd()
# Data Dir
DATA_DIR = CWD.parent / "transformed"

In [3]:
train = pd.read_parquet("transformed/train.parquet")
val = pd.read_parquet("transformed/val.parquet")
test = pd.read_parquet("transformed/test.parquet")

In [14]:
X_train = train.iloc[:,:-2]
y_train = train.iloc[:,-1]

X_val = val.iloc[:,:-2]
y_val = val.iloc[:,-1]

X_test = test.iloc[:,:-2]
y_test = test.iloc[:,-1]

## LinearSVR model

In [5]:
def predict_scores(model, X, y_true):
    y_pred = model.predict(X)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    return {"RMSE": rmse, "MAE": mae}


def experiment(HPdict, model_args):
    grid = ParameterGrid(HPdict)

    scores = pd.DataFrame(columns=["RMSE", "MAE", "is_val"])
    for i, hps in enumerate(grid):
        print(f"training {i+1}th model")
        model = LinearSVR(**model_args)
        model.set_params(**hps)
        model.fit(X_train, y_train)
        
        row = predict_scores(model, X_train, y_train)
        scores = pd.concat(
            [scores, pd.DataFrame({**hps, **row, "is_val": False}, index=[0])],
            ignore_index=True)
        
        row = predict_scores(model, X_val, y_val)
        scores = pd.concat(
            [scores, pd.DataFrame({**hps, **row, "is_val": True}, index=[0])],
            ignore_index=True)
    return scores

In [6]:
hyperparams = {
    "C": [0.01, 0.1, 1, 10, 100, 1000],
    "tol":[1e-5]
}
model_args = {"dual":False, "loss":"squared_epsilon_insensitive", "random_state":RANDOM_SEED}
scores = experiment(hyperparams, model_args)
print("Done")

training 1th model
training 2th model
training 3th model
training 4th model
training 5th model
training 6th model
Done


In [7]:
print(scores[scores.is_val == False])
print(scores[scores.is_val == True])

        RMSE       MAE is_val        C      tol
0   7.885028  0.234479  False     0.01  0.00001
2   7.885028  0.234482  False     0.10  0.00001
4   7.885028  0.234483  False     1.00  0.00001
6   7.885028  0.234483  False    10.00  0.00001
8   7.885028  0.234483  False   100.00  0.00001
10  7.885028  0.234483  False  1000.00  0.00001
         RMSE       MAE is_val        C      tol
1   11.727788  0.241655   True     0.01  0.00001
3   11.727788  0.241659   True     0.10  0.00001
5   11.727788  0.241659   True     1.00  0.00001
7   11.727788  0.241659   True    10.00  0.00001
9   11.727788  0.241659   True   100.00  0.00001
11  11.727788  0.241659   True  1000.00  0.00001


In [8]:
model = LinearSVR(dual = False, C=0.01, loss='squared_epsilon_insensitive')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print("RMSE", rmse)
print("MAE", mae)

RMSE 3.04049225464921
MAE 0.21159881873611164


In [9]:
X_train_subset = X_train[:5000]
y_train_subset = y_train[:5000]

X_test_subset = X_test[:5000]
y_test_subset = y_test[:5000]

X_val_subset = X_val[:5000]
y_val_subset = y_val[:5000]

## SVR model

In [10]:
# SVR
def predict_scores(model, X, y_true):
    y_pred = model.predict(X)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    return {"RMSE": rmse, "MAE": mae}


def experiment(HPdict, model_args):
    grid = ParameterGrid(HPdict)

    scores = pd.DataFrame(columns=["RMSE", "MAE", "is_val"])
    for i, hps in enumerate(grid):
        print(f"training {i+1}th model")
        model = SVR(**model_args)
        model.set_params(**hps)
        model.fit(X_train_subset, y_train_subset)
        
        row = predict_scores(model, X_train_subset, y_train_subset)
        
        scores = pd.concat(
            [scores, pd.DataFrame({**hps, **row}, index=[0])])
        row = predict_scores(model, X_val, y_val)
        
    return scores

In [11]:
hyperparams = ({
    
    "kernel":['poly', 'rbf', 'sigmoid'],
    "C": [0.01, 0.1, 1, 10]
})
model_args = {}
scores = experiment(hyperparams, model_args)
print("Done")

training 1th model
training 2th model
training 3th model
training 4th model
training 5th model
training 6th model
training 7th model
training 8th model
training 9th model
training 10th model
training 11th model
training 12th model
Done


In [12]:
print(scores)

         RMSE         MAE is_val      C   kernel
0    0.754316    0.131708    NaN   0.01     poly
0    0.757307    0.134802    NaN   0.01      rbf
0    0.811837    0.236309    NaN   0.01  sigmoid
0    0.751345    0.129339    NaN   0.10     poly
0    0.754895    0.133446    NaN   0.10      rbf
0    3.994056    2.206653    NaN   0.10  sigmoid
0    0.749991    0.126968    NaN   1.00     poly
0    0.747692    0.126454    NaN   1.00      rbf
0   39.682383   21.865257    NaN   1.00  sigmoid
0    0.749241    0.125764    NaN  10.00     poly
0    0.725136    0.110171    NaN  10.00      rbf
0  359.568702  202.900404    NaN  10.00  sigmoid


In [13]:
model = SVR(kernel='poly', C=0.01)
model.fit(X_train, y_train_subset)
y_pred = model.predict(X_test_subset)
rmse = mean_squared_error(y_test_subset, y_pred, squared=False)
mae = mean_absolute_error(y_test_subset, y_pred)
print("RMSE", rmse)
print("MAE", mae)

RMSE 0.593893709957067
MAE 0.12227245704578672
