# Support Vector Machines


Imports


In [133]:
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import (
    classification_report,
    ConfusionMatrixDisplay,
    confusion_matrix,
)
import pandas as pd
import numpy as np
from binning import bin

Read the train and test datasets


In [2]:
train_df = pd.read_csv("Training_set.csv")
test_df = pd.read_csv("Testing_set.csv")

Get the training features and label


In [26]:
X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]

Get the testing features and label


In [27]:
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1]

In [155]:
def bin(label: np.ndarray):
    label_copy = label.copy()
    label_copy[label <= 15] = 0
    label_copy[(label_copy > 15) & (label_copy <= 45)] = 1
    label_copy[(label_copy > 45) & (label_copy <= 120)] = 2
    label_copy[label_copy > 120] = 3

    return label_copy

In [55]:
def create_bins(df: pd.Series):
    quantiles = df.quantile([0, 1 / 3, 2 / 3, 1])
    print(quantiles)
    bins_df = pd.cut(
        df,
        bins=quantiles,
        labels=[0, 1, 2],
    )
    bins_df = bins_df.fillna(0)
    return bins_df

In [137]:
print("Train")
print(f"Less than 15: {len(y_train[y_train <= 15])}")
print(f"15 to 45: {len(y_train[(y_train > 15) & (y_train <= 45)])}")
print(f"45 to 105: {len(y_train[(y_train > 45) & (y_train <= 120)])}")
print(f"More than 105: {len(y_train[y_train > 120])}")

Train
Less than 15: 46697
15 to 45: 168836
45 to 105: 131077
More than 105: 38906


In [102]:
print("Test")
print(f"Less than 15: {len(y_test[y_test <= 30])}")
print(f"15 to 45: {len(y_test[(y_test > 30) & (y_test <= 45)])}")
print(f"45 to 105: {len(y_test[(y_test > 45) & (y_test <= 105)])}")
print(f"More than 105: {len(y_test[y_test > 120])}")

Test
Less than 15: 35520
15 to 45: 18414
45 to 105: 29469
More than 105: 9998


Bin the label


In [156]:
y_train_binned = bin(y_train.to_numpy())
y_test_binned = bin(y_test.to_numpy())

In [157]:
print(len(y_train_binned[y_train_binned == 0]))
print(len(y_train_binned[y_train_binned == 1]))
print(len(y_train_binned[y_train_binned == 2]))
print(len(y_train_binned[y_train_binned == 3]))

46697
168836
131077
38906


In [72]:
print(len(y_test_binned[y_test_binned == 0]))
print(len(y_test_binned[y_test_binned == 1]))
print(len(y_test_binned[y_test_binned == 2]))
print(len(y_test_binned[y_test_binned == 3]))

35520
30305
30554
0


Initialize an SVM classifier


In [146]:
svc = LinearSVC(dual=False, verbose=10)

Train the model


In [147]:
svc.fit(X_train, y_train_binned)

[LibLinear]iter  1 act 2.157e+05 pre 2.156e+05 delta 3.065e-04 f 3.855e+05 |g| 1.407e+09 CG   1
cg reaches trust region boundary
iter  2 act 2.370e+03 pre 2.450e+03 delta 3.357e-04 f 1.699e+05 |g| 2.287e+07 CG   3
cg reaches trust region boundary
iter  3 act 8.324e+02 pre 8.231e+02 delta 3.725e-04 f 1.675e+05 |g| 8.612e+06 CG   4
cg reaches trust region boundary
iter  4 act 2.505e+02 pre 2.283e+02 delta 4.509e-04 f 1.667e+05 |g| 2.163e+06 CG   6
cg reaches trust region boundary
iter  5 act 8.987e+01 pre 8.986e+01 delta 6.503e-04 f 1.664e+05 |g| 4.397e+05 CG   4
cg reaches trust region boundary
iter  6 act 1.402e+02 pre 1.404e+02 delta 7.881e-04 f 1.663e+05 |g| 1.367e+06 CG   6
cg reaches trust region boundary
iter  7 act 1.609e+02 pre 1.604e+02 delta 1.060e-03 f 1.662e+05 |g| 6.183e+05 CG   7
cg reaches trust region boundary
iter  8 act 1.193e+02 pre 1.187e+02 delta 1.073e-03 f 1.660e+05 |g| 1.314e+06 CG   7
iter  9 act 3.805e+01 pre 3.827e+01 delta 1.073e-03 f 1.659e+05 |g| 1.820e+07 

Make the predictions on the test set


In [148]:
y_pred = svc.predict(X_test)

Evaluate the predictions


In [151]:
print(
    classification_report(
        y_true=y_test_binned, y_pred=y_pred, zero_division=0, digits=5
    )
)

              precision    recall  f1-score   support

           0    1.00000   0.00009   0.00017     11718
           1    0.46732   0.84371   0.60148     42216
           2    0.41553   0.25817   0.31847     32447
           3    0.00000   0.00000   0.00000      9998

    accuracy                        0.45649     96379
   macro avg    0.47071   0.27549   0.23003     96379
weighted avg    0.46617   0.45649   0.37070     96379



In [154]:
cm = confusion_matrix(y_test_binned, y_pred, labels=[0, 1, 2, 3])
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1, 2, 3])
# disp.plot()
cm

array([[    1,  9870,  1847,     0],
       [    0, 35618,  6598,     0],
       [    0, 24070,  8377,     0],
       [    0,  6660,  3338,     0]])