## Model 2: Support Vector Machines

In [1]:
import pandas as pd
import numpy as np
import time

import matplotlib as mpl
import matplotlib.pyplot as plt

# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_curve, auc, RocCurveDisplay
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Figure-export settings. Neither name is used in the cells visible in this
# section — presumably consumed by plotting cells elsewhere; verify before removing.
save_figures = False
figure_path = "../figures/models/"
# Pre-split datasets are disabled: the experiments below draw their own
# stratified train/test splits from the full table, so only prep_all is loaded.
# df_init = pd.read_parquet("../data/init.parquet")
# df_before = pd.read_parquet("../data/before_split.parquet")
# df_train = pd.read_parquet("../data/prep_train.parquet")
# df_val = pd.read_parquet("../data/prep_val.parquet")
# df_test = pd.read_parquet("../data/prep_test.parquet")
# df_tt_train = pd.read_parquet("../data/prep_tt_train.parquet")
# df_tt_test = pd.read_parquet("../data/prep_tt_test.parquet")
# Full preprocessed table; path is relative to the notebook's working directory.
df_all = pd.read_parquet("../data/prep_all.parquet")

In [2]:
# Preview the first rows to sanity-check the loaded columns (target column is "y").
df_all.head()

Unnamed: 0,Year_Y2015,Year_Y2016,Country_England,Country_Wales,Supermarket_Asda,Supermarket_Tesco Extra,Supermarket_Tesco Metro,Supermarket_Waitrose,Time_Evening,Time_Morning,...,YearCountryAge_Y2016EnglandAge_g2,YearCountryAge_Y2016EnglandAge_g3,YearCountryAge_Y2016WalesAge_g1,YearCountryAge_Y2016WalesAge_g2,YearCountryAge_Y2016WalesAge_g3,YearCountryAge_nan,ObsSize,FemaleN,MaleN,y
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-0.471136,-1.156159,0.871226,1
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-0.471136,-1.156159,0.871226,1
2,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,-0.471136,0.583858,-0.993069,1
3,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.811174,2.323876,-0.993069,1
4,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.471136,0.583858,-0.993069,1


In [3]:
# Split the preprocessed table into a feature matrix and a target vector.
# The target is removed *by name* rather than by position, so the feature matrix
# stays correct even if "y" is not the last column of the parquet file (the
# target itself was already selected by name).
X_all = df_all.drop(columns = "y").to_numpy()
y_all = df_all["y"].to_numpy()

# Force C-contiguous arrays — presumably so scikit-learn does not have to
# re-copy the data on every fit; TODO confirm this is still needed.
X_all = np.ascontiguousarray(X_all)
y_all = np.ascontiguousarray(y_all)

### Linear Kernel, Grid Search

In [4]:
# Base estimator for the linear-kernel experiments. C is not set here because
# the grid search defined further down supplies the candidate values.
SVC_lin = SVC(
    kernel = "linear",
    class_weight = "balanced",  # reweight classes inversely to their frequency
    tol = 0.002,
    max_iter = 200_000,         # hard cap on solver iterations per fit
    cache_size = 1000,          # kernel cache size, in MB
)
SVC_lin

SVC(cache_size=1000, class_weight='balanced', kernel='linear', max_iter=200000,
    tol=0.002)

In [5]:
# Candidate C values: 11 log-spaced points from 1e-3 to 1e2.
c_penalty = np.logspace(-3, 2, num = 11)
params_lin = dict(C = c_penalty)

# Seeds for the repeated stratified splits; the RBF section reuses this list.
random_states_split = [123, 456, 789, 741, 852, 963, 159, 753, 951, 357]

# 5-fold grid search over C, selected on balanced accuracy. With refit enabled,
# the best candidate is retrained on all data passed to .fit().
gridCV_lin = GridSearchCV(
    SVC_lin,
    params_lin,
    scoring = 'balanced_accuracy',
    cv = 5,
    refit = True,
    n_jobs = -2,
)
gridCV_lin

GridSearchCV(cv=5,
             estimator=SVC(cache_size=1000, class_weight='balanced',
                           kernel='linear', max_iter=200000, tol=0.002),
             n_jobs=-2,
             param_grid={'C': array([1.00000000e-03, 3.16227766e-03, 1.00000000e-02, 3.16227766e-02,
       1.00000000e-01, 3.16227766e-01, 1.00000000e+00, 3.16227766e+00,
       1.00000000e+01, 3.16227766e+01, 1.00000000e+02])},
             scoring='balanced_accuracy')

In [6]:
%%time

BalAccuracy_lin = []

for each_rs in random_states_split:
    start_time = time.time()
    X_other_temp, X_test_temp, y_other_temp, y_test_temp = \
        train_test_split(X_all, y_all, 
                         test_size = 0.1, 
                         random_state = each_rs,
                         stratify = y_all)
    
    gridCV_lin.fit(X_other_temp, y_other_temp)
    
    BalAccuracy_lin.append(gridCV_lin.best_score_)
    
    print(gridCV_lin.best_params_, flush = True)
    
    end_time = time.time()
    print(f"Iteration {each_rs}: {end_time - start_time:.3f} seconds", flush = True)

{'C': 31.622776601683793}
Iteration 123: 11.841 seconds
{'C': 0.03162277660168379}
Iteration 456: 10.300 seconds
{'C': 1.0}
Iteration 789: 10.650 seconds
{'C': 31.622776601683793}
Iteration 741: 11.726 seconds
{'C': 1.0}
Iteration 852: 11.308 seconds
{'C': 31.622776601683793}
Iteration 963: 11.188 seconds
{'C': 0.03162277660168379}
Iteration 159: 10.842 seconds
{'C': 0.1}
Iteration 753: 10.445 seconds
{'C': 0.03162277660168379}
Iteration 951: 10.381 seconds
{'C': 0.03162277660168379}
Iteration 357: 10.755 seconds
Wall time: 1min 49s


In [7]:
# Summarise the linear-kernel scores over all splits as "mean (std)".
# np.std is called with its default ddof, i.e. the population standard deviation.
print(f"{np.mean(BalAccuracy_lin):.4f} ({np.std(BalAccuracy_lin):.4f})")

0.6820 (0.0048)


### Radial Basis Function Kernel, Grid Search

In [8]:
# Base estimator for the RBF-kernel experiments. C and gamma are not set here
# because the grid search defined further down supplies the candidate values.
SVC_rbf = SVC(
    kernel = "rbf",
    class_weight = "balanced",  # reweight classes inversely to their frequency
    tol = 0.002,
    max_iter = 100_000,         # hard cap on solver iterations per fit
    cache_size = 1000,          # kernel cache size, in MB
)
SVC_rbf

SVC(cache_size=1000, class_weight='balanced', max_iter=100000, tol=0.002)

In [9]:
# RBF search space: 6 values of C (1e-3 .. 1e2) x 17 values of gamma
# (1e-2 .. 1e2), i.e. 102 candidates. Note that `c_penalty` is rebound here;
# `params_lin` keeps its own reference to the earlier 11-point grid.
c_penalty = np.logspace(-3, 2, num = 6)
gamma_penalty = np.logspace(-2, 2, num = 17)
params_rbf = dict(C = c_penalty, gamma = gamma_penalty)

# 5-fold grid search selected on balanced accuracy; verbose output reports
# progress because this search is much larger than the linear one.
gridCV_rbf = GridSearchCV(
    SVC_rbf,
    params_rbf,
    scoring = 'balanced_accuracy',
    cv = 5,
    refit = True,
    n_jobs = -2,
    verbose = 4,
)
gridCV_rbf

GridSearchCV(cv=5,
             estimator=SVC(cache_size=1000, class_weight='balanced',
                           max_iter=100000, tol=0.002),
             n_jobs=-2,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'gamma': array([1.00000000e-02, 1.77827941e-02, 3.16227766e-02, 5.62341325e-02,
       1.00000000e-01, 1.77827941e-01, 3.16227766e-01, 5.62341325e-01,
       1.00000000e+00, 1.77827941e+00, 3.16227766e+00, 5.62341325e+00,
       1.00000000e+01, 1.77827941e+01, 3.16227766e+01, 5.62341325e+01,
       1.00000000e+02])},
             scoring='balanced_accuracy', verbose=4)

In [10]:
%%time

BalAccuracy_rbf = []

for each_rs in random_states_split:
    start_time = time.time()
    X_other_temp, X_test_temp, y_other_temp, y_test_temp = \
        train_test_split(X_all, y_all, 
                         test_size = 0.1, 
                         random_state = each_rs, 
                         stratify = y_all)
    
    gridCV_rbf.fit(X_other_temp, y_other_temp)
    
    BalAccuracy_rbf.append(gridCV_rbf.best_score_)
    
    print(gridCV_rbf.best_params_, flush = True)
    
    end_time = time.time()
    print(f"Iteration {each_rs}: {end_time - start_time:.3f} seconds", flush = True)

Fitting 5 folds for each of 102 candidates, totalling 510 fits
{'C': 1.0, 'gamma': 0.03162277660168379}
Iteration 123: 85.417 seconds
Fitting 5 folds for each of 102 candidates, totalling 510 fits
{'C': 1.0, 'gamma': 0.03162277660168379}
Iteration 456: 89.629 seconds
Fitting 5 folds for each of 102 candidates, totalling 510 fits
{'C': 1.0, 'gamma': 0.05623413251903491}
Iteration 789: 89.425 seconds
Fitting 5 folds for each of 102 candidates, totalling 510 fits
{'C': 1.0, 'gamma': 0.05623413251903491}
Iteration 741: 87.508 seconds
Fitting 5 folds for each of 102 candidates, totalling 510 fits
{'C': 1.0, 'gamma': 0.05623413251903491}
Iteration 852: 95.596 seconds
Fitting 5 folds for each of 102 candidates, totalling 510 fits
{'C': 1.0, 'gamma': 0.05623413251903491}
Iteration 963: 92.271 seconds
Fitting 5 folds for each of 102 candidates, totalling 510 fits
{'C': 1.0, 'gamma': 0.03162277660168379}
Iteration 159: 91.968 seconds
Fitting 5 folds for each of 102 candidates, totalling 510 fits

In [11]:
# Summarise the RBF-kernel scores over all splits as "mean (std)".
# np.std is called with its default ddof, i.e. the population standard deviation.
print(f"{np.mean(BalAccuracy_rbf):.4f} ({np.std(BalAccuracy_rbf):.4f})")

0.6907 (0.0031)


In [12]:
# Side-by-side comparison, one "mean (std)" line per kernel: linear first, then RBF.
for _scores in (BalAccuracy_lin, BalAccuracy_rbf):
    print(f"{np.mean(_scores):.4f} ({np.std(_scores):.4f})")

0.6820 (0.0048)
0.6907 (0.0031)


In [13]:
import json
from pathlib import Path

# Persist the per-split scores of both kernels so they can be compared with
# other models outside this notebook. Values are coerced to built-in floats so
# the JSON encoder never has to deal with NumPy scalar types.
svc_scores = {"lin": [float(score) for score in BalAccuracy_lin],
              "rbf": [float(score) for score in BalAccuracy_rbf]}

results_file = Path("../results/svc_scores.json")
# Create the results directory if it is missing; otherwise a fresh checkout
# would fail here only after the long grid searches have finished.
results_file.parent.mkdir(parents = True, exist_ok = True)

with open(results_file, "w") as outfile:
    json.dump(svc_scores, outfile)
    
# To load the scores back:
# with open("../results/svc_scores.json", "r") as readfile:
#     dict_data = json.load(readfile)