# test svm fillip

In [30]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

from preprocessing import generate_balanced_data

In [23]:
# Single random seed shared by the data splits, the balancing helper and the
# randomized hyperparameter searches, so the notebook is reproducible.
seed = 42

In [24]:
# Load the feature matrix and labels. np.load on an .npz archive returns a
# file-backed NpzFile, so it is used as a context manager to close the archive
# as soon as both arrays have been read.
with np.load('../data/dataset.npz') as dataset:
    X, y = dataset['X'], dataset['y']

# 70% train, then the remaining 30% is halved into validation and test.
# Both splits are stratified on the labels.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, train_size=0.7, random_state=seed, stratify=y  # so we maintain class distribution
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, train_size=0.5, random_state=seed, stratify=y_temp
)

# Shapes are printed before balancing, so the training shape shown here is the
# pre-balancing one.
print(f"Training data shape: {X_train.shape}, {y_train.shape}")
print(f"Validation data shape: {X_val.shape}, {y_val.shape}")
print(f"Test data shape: {X_test.shape}, {y_test.shape}")


# Only the training split goes through the balancing helper.
# NOTE(review): presumably this resamples classes to equal size; confirm
# against preprocessing.generate_balanced_data.
X_train, y_train = generate_balanced_data(X_train, y_train, seed=seed)

# Fit the scaler on the training split only, then apply that same fitted
# transform to every held-out split so all three live in the same feature space.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

Training data shape: (6883, 400), (6883,)
Validation data shape: (1475, 400), (1475,)
Test data shape: (1476, 400), (1476,)


In [25]:
# Search space for the RBF-kernel SVM: 5 values of C x 4 values of gamma.
# The randomized search below samples 10 of these 20 combinations.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1e-3, 1e-4, 'scale', 'auto'],
    'kernel': ['rbf']
}
# Use the notebook-wide seed rather than a second hard-coded literal so the
# estimator and the search stay in sync if the seed is ever changed.
svc = SVC(kernel='rbf', random_state=seed)
random_search = RandomizedSearchCV(svc, param_grid, n_iter=10, cv=5, random_state=seed, n_jobs=-1)
random_search.fit(X_train, y_train)

print(f"Best Parameters: {random_search.best_params_}")


Best Parameters: {'kernel': 'rbf', 'gamma': 'auto', 'C': 10}


In [26]:
# Take the best estimator found by the search and score it on the test split.
best_svc = random_search.best_estimator_
y_pred = best_svc.predict(X_test)

# Per-class precision / recall / F1 on the test labels.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.86      0.90        94
           1       0.85      1.00      0.92        34
           2       0.86      0.94      0.90        79
           3       0.97      0.96      0.96       143
           4       0.88      0.89      0.89        75
           5       0.93      0.91      0.92       105
           6       0.98      0.96      0.97       137
           7       0.91      0.93      0.92        91
           8       0.93      0.88      0.90        64
           9       0.79      0.90      0.84        30
          10       0.94      0.90      0.92       124
          11       0.86      0.86      0.86        79
          12       0.96      0.95      0.95       143
          13       0.91      0.91      0.91       137
          14       0.90      0.82      0.86        11
          15       0.82      0.88      0.85        67
          16       1.00      1.00      1.00        63

    accuracy              

In [27]:
# Reduce dimensionality with PCA, keeping enough components to explain 95% of
# the variance. The projection is fitted on the training split and then reused
# unchanged for the test split.
# NOTE(review): this rebinds X_train / X_test, so re-running this cell feeds
# already-projected data back into PCA. Re-run the data-loading cell first.
pca = PCA(n_components=0.95)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

# Same randomized search as before, now on the reduced features. It reuses the
# `svc` estimator and `param_grid` defined in the earlier search cell.
random_search = RandomizedSearchCV(
    estimator=svc,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    random_state=seed,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

print(f"Best Parameters: {random_search.best_params_}")

Best Parameters: {'kernel': 'rbf', 'gamma': 'auto', 'C': 100}


In [28]:
# Score the best estimator from the PCA-feature search on the test split
# (X_test has been projected by the same PCA at this point).
best_svc = random_search.best_estimator_
y_pred = best_svc.predict(X_test)

# Per-class precision / recall / F1 on the test labels.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.86      0.90        94
           1       0.89      1.00      0.94        34
           2       0.88      0.94      0.91        79
           3       0.99      0.97      0.98       143
           4       0.86      0.89      0.88        75
           5       0.94      0.93      0.94       105
           6       0.99      0.98      0.98       137
           7       0.93      0.93      0.93        91
           8       0.92      0.91      0.91        64
           9       0.88      0.93      0.90        30
          10       0.93      0.91      0.92       124
          11       0.87      0.86      0.87        79
          12       0.95      0.95      0.95       143
          13       0.93      0.93      0.93       137
          14       0.75      0.82      0.78        11
          15       0.90      0.91      0.90        67
          16       1.00      1.00      1.00        63

    accuracy              

In [31]:
# Sweep the number of principal components and report the mean 3-fold
# cross-validation accuracy of an RBF SVM (C=10, gamma='scale') for each value.
#
# PCA is wrapped in a pipeline together with the classifier so that it is
# re-fitted on the training part of every fold. Fitting it once on all of
# X_train beforehand would let each held-out fold influence the projection.
#
# NOTE(review): X_train has already been projected by the 95%-variance PCA in an
# earlier cell, so this sweep runs on that reduced representation and its
# column count depends on the data.
components_range = [20, 30, 50, 64, 80, 100, 150]
n_features = X_train.shape[1]
for n in components_range:
    if n > n_features:
        # PCA cannot extract more components than there are input features.
        print(f"n_components: {n}, skipped (X_train has only {n_features} features)")
        continue
    pipeline = make_pipeline(
        # random_state pins the result if PCA picks a randomized SVD solver.
        PCA(n_components=n, random_state=seed),
        SVC(kernel='rbf', C=10, gamma='scale', random_state=seed),
    )
    scores = cross_val_score(pipeline, X_train, y_train, cv=3)
    print(f"n_components: {n}, Cross-validation accuracy: {np.mean(scores):.2f}")

n_components: 20, Cross-validation accuracy: 0.85
n_components: 30, Cross-validation accuracy: 0.86
n_components: 50, Cross-validation accuracy: 0.87
n_components: 64, Cross-validation accuracy: 0.87
n_components: 80, Cross-validation accuracy: 0.86
n_components: 100, Cross-validation accuracy: 0.86
n_components: 150, Cross-validation accuracy: 0.85
