# Reading Data

In [None]:
import torch
from sklearn.svm import SVC
from tqdm import tqdm
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# The feature file 'vit_b_16_features.pt' is produced by running
# 'feature_extraction.ipynb'.
feature_path = './Data/Features/vit_b_16_features.pt'
data = torch.load(feature_path)

# Report the shapes of the two arrays (X, y) stored for every split.
for split_name, split in data.items():
    print(f'{split_name.capitalize()} Data (X, y): ', split[0].shape, split[1].shape)

Train Data (X, y):  (60000, 768) (60000,)
Test Data (X, y):  (10000, 768) (10000,)


# Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5000 stratified samples of the training split as a validation set.
# The split is skipped when a 'val' entry already exists, so re-running this
# cell does not carve a second validation set out of the already-reduced
# training data. Reload `data` first to split again with different settings.
if 'val' not in data:
    X_train, X_val, y_train, y_val = train_test_split(
        data['train'][0], data['train'][1],
        test_size=5000, stratify=data['train'][1], random_state=10)
    data['train'] = [X_train, y_train]
    data['val'] = [X_val, y_val]

# Later cells assign to data['test'][0], so make sure this entry is a mutable
# list like 'train' and 'val'.
# NOTE(review): whether torch.load yields a list or a tuple here is not visible.
data['test'] = list(data['test'])

for key, value in data.items():
    print(f'{key.capitalize()} Data (X, y): ', value[0].shape, value[1].shape)

Train Data (X, y):  (55000, 768) (55000,)
Test Data (X, y):  (10000, 768) (10000,)
Val Data (X, y):  (5000, 768) (5000,)


# Data Transformation

## PCA

In [None]:
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise with statistics taken from the training split only, then fit a
# PCA with default settings on the standardised training features.
scalar = StandardScaler().fit(data['train'][0])
pca = PCA()
pca.fit(scalar.transform(data['train'][0]))

# Running total of the explained-variance ratio over the principal components.
explained_variance_ratios = np.cumsum(pca.explained_variance_ratio_)

# Apply the same fitted scaler and PCA to every split, replacing the stored
# feature matrices in place.
for split in ('train', 'val', 'test'):
    data[split][0] = pca.transform(scalar.transform(data[split][0]))

## Normalization

In [None]:
# Fit a fresh scaler on the current training features (the labels play no role
# in the fit) and standardise every split with it.
# NOTE: this rebinds `scalar`, so the scaler fitted before the PCA step is no
# longer reachable under that name.
scalar = StandardScaler().fit(data['train'][0])

for split in ('train', 'val', 'test'):
    data[split][0] = scalar.transform(data[split][0])

## ANOVA Test

In [None]:
from sklearn.feature_selection import f_classif

# ANOVA F-score of every feature column against the training labels; the
# second value returned by f_classif is not used.
scores, _ = f_classif(data['train'][0], data['train'][1])

# (column index, score) pairs ordered from the highest to the lowest score.
feature_score = list(enumerate(scores))
feature_score.sort(key=lambda pair: pair[1], reverse=True)

feature_ordering = [index for index, _score in feature_score]

# Reorder the columns of every split so the highest-scoring features come first.
for split in ('train', 'val', 'test'):
    data[split][0] = data[split][0][:, feature_ordering]

# Cumulative explained-variance ratio, taken in the new column order.
explained_variance_ratios = np.cumsum(
    pca.explained_variance_ratio_[feature_ordering])

## Feature Subset

In [None]:
# Number of leading feature columns handed to the SVM: the tuning cell slices
# every split with `[:, :n_component]`.
# NOTE(review): the rationale for the value 278 is not shown here — confirm.
n_component = 278

# SVM Hyperparameter Tuning

In [None]:
import os

# The SVCs below are created without `random_state`; presumably this global
# seed is what keeps the probability calibration repeatable — confirm against
# the scikit-learn version in use.
np.random.seed(10)

# Search space: every kernel is tried once per C, except 'poly', which is
# expanded over several degrees. Each entry is (tag for the file name,
# keyword arguments for SVC). The order matches a nested C -> kernel -> degree
# loop: linear, poly d2, poly d3, poly d4, rbf, sigmoid.
C_values = [0.1, 1, 10, 25]
kernel_settings = []
for k in ['linear', 'poly', 'rbf', 'sigmoid']:
    if k == 'poly':
        kernel_settings += [('poly_d' + str(degree), {'kernel': 'poly', 'degree': degree})
                            for degree in [2, 3, 4]]
    else:
        kernel_settings.append((k, {'kernel': k}))

# Only the first `n_component` columns are used; slice once instead of on
# every iteration.
X_train_sub = data['train'][0][:, :n_component]
X_val_sub = data['val'][0][:, :n_component]
train_labels = data['train'][1]
val_labels = data['val'][1]

# Create the output directory up front so a missing folder fails (or is fixed)
# before the search runs rather than after it.
model_dir = './Data/Models'
os.makedirs(model_dir, exist_ok=True)

# Best model, score and settings seen so far. The score starts below the
# smallest possible F1 (0), so the first candidate is always recorded and
# `best_model` is a fitted classifier by the time it is saved.
best_model = None
best_score = -1.0
C_best = 0
k_best = '0'

for C in tqdm(C_values):
    for label, kernel_params in kernel_settings:
        # probability=True is kept so the saved model is configured as before.
        # NOTE(review): if probability estimates are never needed downstream,
        # dropping it should shorten the search.
        clf = SVC(C=C, probability=True, **kernel_params).fit(X_train_sub, train_labels)
        val_preds = clf.predict(X_val_sub)
        s = f1_score(val_labels, val_preds, average='macro')

        settings = ', '.join(
            f'{name} : {value}' for name, value in {'C': C, **kernel_params}.items())
        print(f'{{{settings}}} -> f1_score = {s}')

        # Strict '>' keeps the earliest candidate when scores tie.
        if s > best_score:
            best_score, best_model, C_best, k_best = s, clf, C, label

torch.save(best_model, model_dir + '/HP_SVM_C' + str(C_best) + '_k_' + k_best
           + '_f1score_' + str(round(best_score, 3)) + '.pt')
