# test svm fillip

In [2]:
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.metrics import classification_report
from preprocessing import generate_balanced_data
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

In [3]:
seed = 42

In [4]:
dataset = np.load('../data/dataset.npz')
X, y  = dataset['X'], dataset['y']

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, train_size=0.7, random_state=seed, stratify=y # so we maintain class distribution
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, train_size=0.5, random_state=seed, stratify=y_temp
)

print(f"Training data shape: {X_train.shape}, {y_train.shape}")
print(f"Validation data shape: {X_val.shape}, {y_val.shape}")
print(f"Test data shape: {X_test.shape}, {y_test.shape}")


X_train, y_train = generate_balanced_data(X_train, y_train, seed=seed)
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

Training data shape: (6883, 400), (6883,)
Validation data shape: (1475, 400), (1475,)
Test data shape: (1476, 400), (1476,)


In [5]:
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1e-3, 1e-4, 'scale', 'auto'],
    'kernel': ['rbf']
}
svc = SVC(kernel='rbf', random_state=42)
random_search = RandomizedSearchCV(svc, param_grid, n_iter=10, cv=5, random_state=seed, n_jobs=-1)
random_search.fit(X_train, y_train)

print(f"Best Parameters: {random_search.best_params_}")


Best Parameters: {'kernel': 'rbf', 'gamma': 'scale', 'C': 1000}


In [6]:
best_svc = random_search.best_estimator_
y_pred = best_svc.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.86      0.90        94
           1       0.85      1.00      0.92        34
           2       0.84      0.94      0.89        79
           3       0.97      0.97      0.97       143
           4       0.88      0.91      0.89        75
           5       0.92      0.90      0.91       105
           6       0.98      0.97      0.97       137
           7       0.93      0.92      0.93        91
           8       0.97      0.88      0.92        64
           9       0.85      0.93      0.89        30
          10       0.92      0.90      0.91       124
          11       0.86      0.85      0.85        79
          12       0.96      0.95      0.95       143
          13       0.92      0.92      0.92       137
          14       0.69      0.82      0.75        11
          15       0.85      0.87      0.86        67
          16       1.00      1.00      1.00        63

    accuracy              

In [27]:
pca = PCA(n_components=0.95)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

random_search = RandomizedSearchCV(svc, param_grid, n_iter=10, cv=5, random_state=seed, n_jobs=-1)
random_search.fit(X_train, y_train)

print(f"Best Parameters: {random_search.best_params_}")

Best Parameters: {'kernel': 'rbf', 'gamma': 'auto', 'C': 100}


In [28]:
best_svc = random_search.best_estimator_
y_pred = best_svc.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.86      0.90        94
           1       0.89      1.00      0.94        34
           2       0.88      0.94      0.91        79
           3       0.99      0.97      0.98       143
           4       0.86      0.89      0.88        75
           5       0.94      0.93      0.94       105
           6       0.99      0.98      0.98       137
           7       0.93      0.93      0.93        91
           8       0.92      0.91      0.91        64
           9       0.88      0.93      0.90        30
          10       0.93      0.91      0.92       124
          11       0.87      0.86      0.87        79
          12       0.95      0.95      0.95       143
          13       0.93      0.93      0.93       137
          14       0.75      0.82      0.78        11
          15       0.90      0.91      0.90        67
          16       1.00      1.00      1.00        63

    accuracy              

In [31]:
components_range = [20, 30, 50, 64, 80, 100, 150]
for n in components_range:
    pca = PCA(n_components=n)
    X_train_pca = pca.fit_transform(X_train)
    svc = SVC(kernel='rbf', C=10, gamma='scale', random_state=42)
    scores = cross_val_score(svc, X_train_pca, y_train, cv=3)
    print(f"n_components: {n}, Cross-validation accuracy: {np.mean(scores):.2f}")

n_components: 20, Cross-validation accuracy: 0.85
n_components: 30, Cross-validation accuracy: 0.86
n_components: 50, Cross-validation accuracy: 0.87
n_components: 64, Cross-validation accuracy: 0.87
n_components: 80, Cross-validation accuracy: 0.86
n_components: 100, Cross-validation accuracy: 0.86
n_components: 150, Cross-validation accuracy: 0.85


In [34]:
# ensabmle method?
estimators = [
    ('svc_rbf', SVC(kernel='rbf', C=10, gamma='scale', probability=True)),
    ('svc_poly', SVC(kernel='poly', degree=3, C=1, probability=True)),
    ('rf', RandomForestClassifier(n_estimators=100))
]

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5
)

stacking_clf.fit(X_train, y_train)
y_pred = stacking_clf.predict(X_test)

In [35]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.86      0.88        94
           1       0.94      1.00      0.97        34
           2       0.88      0.95      0.91        79
           3       0.99      0.96      0.98       143
           4       0.89      0.88      0.89        75
           5       0.91      0.93      0.92       105
           6       0.99      0.98      0.99       137
           7       0.93      0.92      0.93        91
           8       0.95      0.92      0.94        64
           9       0.78      0.93      0.85        30
          10       0.93      0.92      0.92       124
          11       0.89      0.85      0.87        79
          12       0.96      0.96      0.96       143
          13       0.92      0.91      0.92       137
          14       0.90      0.82      0.86        11
          15       0.87      0.93      0.90        67
          16       1.00      1.00      1.00        63

    accuracy              

In [None]:
model = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('feature_selection', RFE(estimator=SVC(kernel='linear'))),
    ('svc', SVC(kernel='rbf', random_state=seed))
])

param_grid = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': [1e-3, 1e-4, 'scale'],
    'feature_selection__n_features_to_select': [20, 50, 100, 150],
}

random_search = RandomizedSearchCV(model, param_grid, n_iter=20, cv=5, random_state=seed, n_jobs=-1, verbose=2)
random_search.fit(X_train, y_train)

print(f"Best Parameters: {random_search.best_params_}")

best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

print(classification_report(y_test, y_pred))


splitting it up so my laptop doesnt blue screen :P, adding PolynomialFeatures with degree exploded my memory, i think the total amount of features topped 80k without me knowing :skull:

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

#### loads of different rfe

In [9]:
k_best = SelectKBest(score_func=f_classif, k=30)

X_train_rfe = k_best.fit_transform(X_train_pca, y_train)

X_test_rfe = k_best.transform(X_test_pca)

In [None]:
svc_linear = SVC(kernel='linear', random_state=seed)
rfe = RFE(estimator=svc_linear, n_features_to_select=75)

X_train_rfe = rfe.fit_transform(X_train_pca, y_train)
X_test_rfe = rfe.transform(X_test_pca)

In [None]:
svc_linear_l1 = LinearSVC(C=0.01, penalty='l1', dual=False, random_state=seed)
rfe = RFE(estimator=svc_linear_l1, n_features_to_select=75)

X_train_rfe = rfe.fit_transform(X_train_pca, y_train)
X_test_rfe = rfe.transform(X_test_pca)

In [16]:
l1_svc = LinearSVC(C=0.01, penalty='l1', dual=False, random_state=seed)
l1_svc.fit(X_train_pca, y_train)

selector = SelectFromModel(l1_svc, prefit=True)
X_train_rfe = selector.transform(X_train_pca)
X_test_rfe = selector.transform(X_test_pca)

In [10]:
poly = PolynomialFeatures(degree=2)

X_train_poly = poly.fit_transform(X_train_rfe)
X_test_poly = poly.transform(X_test_rfe)

print(f"Shape after polynomial features (train): {X_train_poly.shape}")
print(f"Shape after polynomial features (test): {X_test_poly.shape}")

Shape after polynomial features (train): (11305, 496)
Shape after polynomial features (test): (1476, 496)


In [12]:
svc = SVC(kernel='rbf', random_state=42)

param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1e-3, 1e-4, 'scale']
}

random_search = RandomizedSearchCV(svc, param_grid, n_iter=10, cv=5, random_state=seed, n_jobs=-1, verbose=2)
random_search.fit(X_train_poly, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [13]:
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test_poly)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83        94
           1       0.94      1.00      0.97        34
           2       0.94      0.94      0.94        79
           3       0.98      0.94      0.96       143
           4       0.91      0.95      0.93        75
           5       0.95      0.96      0.96       105
           6       0.99      0.97      0.98       137
           7       0.96      0.93      0.94        91
           8       0.93      0.89      0.91        64
           9       0.91      0.97      0.94        30
          10       0.99      0.92      0.95       124
          11       0.80      0.89      0.84        79
          12       0.95      0.97      0.96       143
          13       0.91      0.85      0.88       137
          14       0.77      0.91      0.83        11
          15       0.94      0.88      0.91        67
          16       1.00      1.00      1.00        63

    accuracy              