# Reading Data

In [1]:
import torch

feature_path = './Data/Features/vit_b_16_features.pt'
data = torch.load(feature_path)
for key, value in data.items():
    print(f'{key.capitalize()} Data (X, y): ', value[0].shape, value[1].shape)

Train Data (X, y):  (60000, 768) (60000,)
Test Data (X, y):  (10000, 768) (10000,)


# Data Splitting

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    data['train'][0], data['train'][1], test_size=5000, stratify=data['train'][1], random_state=10)
data['train'] = [X_train, y_train]
data['val'] = [X_val, y_val]

for key, value in data.items():
    print(f'{key.capitalize()} Data (X, y): ', value[0].shape, value[1].shape)

Train Data (X, y):  (55000, 768) (55000,)
Test Data (X, y):  (10000, 768) (10000,)
Val Data (X, y):  (5000, 768) (5000,)


# Data Transformation

## PCA

In [3]:
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

pca = PCA()
scalar = StandardScaler().fit(data['train'][0])
pca.fit(scalar.transform(data['train'][0]))
explained_variance_ratios = np.cumsum(pca.explained_variance_ratio_)

data['train'][0] = pca.transform(scalar.transform(data['train'][0]))
data['val'][0] = pca.transform(scalar.transform(data['val'][0]))
data['test'][0] = pca.transform(scalar.transform(data['test'][0]))

## Normalization

In [4]:
scalar = StandardScaler().fit(*data['train'])

data['train'][0] = scalar.transform(data['train'][0])
data['val'][0] = scalar.transform(data['val'][0])
data['test'][0] = scalar.transform(data['test'][0])

## Anova Test

In [5]:
from sklearn.feature_selection import f_classif

scores, _ = f_classif(*data['train'])

feature_score = list(zip(range(0, data['train'][0].shape[1]), scores))
feature_score = sorted(feature_score, key=lambda item: item[1], reverse=True)

feature_ordering = [item[0] for item in feature_score]

data['train'][0] = data['train'][0][:, feature_ordering]
data['val'][0] = data['val'][0][:, feature_ordering]
data['test'][0] = data['test'][0][:, feature_ordering]

explained_variance_ratios = np.cumsum(
    pca.explained_variance_ratio_[feature_ordering])

## Feature Subset

In [6]:
n_component = 278

# SVM

In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, PredefinedSplit
np.random.seed(10)

param_grid = {'C': [0.1, 1, 10, 100],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'degree': [2, 3 ,4]}

grid = GridSearchCV(SVC(), param_grid, scoring='f1_macro', n_jobs=-1, verbose=3, cv=PredefinedSplit(
    ([-1,]*data['train'][0][:, :n_component].shape[0]) + ([0,]*data['val'][0][:, :n_component].shape[0])))

grid.fit(np.concatenate([data['train'][0][:, :n_component], data['val'][0][:, :n_component]],
         axis=0), np.concatenate([data['train'][1], data['val'][1]], axis=0))

Fitting 1 folds for each of 48 candidates, totalling 48 fits


KeyboardInterrupt: 

In [1]:
grid.cv_results_

NameError: name 'grid' is not defined

In [None]:
from sklearn.metrics import classification_report

print(classification_report(data['val'][1], grid.best_estimator_.predict(data['val'][0][:, :n_component])))

In [None]:
torch.save(grid.best_estimator_, './Data/Models/best_SVM.pt')