# BASELINE: Support Vector Machines (SVM)

## 1. Preprocessing

In [2]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import pointbiserialr
from sklearn.metrics import make_scorer, recall_score, confusion_matrix
import matplotlib.pyplot as plt
from model_functions import preprocess_diabetes

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, recall_score

In [3]:
file = 'data/diabetes.csv'

df = pd.read_csv(file)
prep_data = preprocess_diabetes(df)

df = prep_data["df"]
X_train = prep_data["X_train"]
X_test = prep_data["X_test"]
y_train = prep_data["y_train"]
y_test = prep_data["y_test"]

Selected 18 features from 21 (threshold=0.05)


## 2. SVM Visualization

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm

# Numeric target for plotting colors
y_numeric = (y_train == "Malignant").astype(int)

# Train SVM on 2D PCA data
model = svm.SVC(kernel='rbf', degree=1, C=1.0)
model.fit(X_train_pca[:,[0,1]], y_train)

# Create grid
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train_pca[:, 1].min() - 1, X_train_pca[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, 500),
    np.linspace(y_min, y_max, 500)
)

# Decision values
Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# --- Plot ---
plt.figure(figsize=(8,6))

plt.scatter(
    X_train_pca[:, 0],
    X_train_pca[:, 1],
    c=y_numeric,
    cmap="coolwarm",
    s=40,
    edgecolors='k'
)

# Decision boundary + margins
plt.contour(xx, yy, Z, colors='k', levels=[0], linewidths=2)
plt.contour(xx, yy, Z, colors='k', levels=[-1, 1], linestyles='--')

# Support vectors
plt.scatter(
    model.support_vectors_[:, 0],
    model.support_vectors_[:, 1],
    s=120,
    facecolors='none',
    edgecolors='k'
)

plt.title("SVM Decision Boundary and Margins (PCA space)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.tight_layout()
plt.show()


NameError: name 'X_train_pca' is not defined

## 3. Hyperparameters tunning

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, recall_score

# Pipeline: scaling + SVM
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC())
])

# Scoring dictionary
scoring = {
    "accuracy": "accuracy",
    "recall": make_scorer(recall_score, pos_label=1)
}

# PARAM GRID:
# param_grid = [
#     # ------ Linear ------
#     {
#         "svm__kernel": ["linear"],
#         "svm__C": [0.001, 0.01, 0.1, 1, 10]
#     },
#     # ------ RBF ------
#     {
#         "svm__kernel": ["rbf"],
#         "svm__C": [0.01, 0.1, 1, 10, 100],
#         "svm__gamma": ["scale", 0.01, 0.1, 1]
#     },
#     # ------ Polynomial ------
#     {
#         "svm__kernel": ["poly"],
#         "svm__degree": [2],
#         "svm__C": [0.01, 0.1, 1, 10],
#         "svm__gamma": ["scale", 0.01, 0.1]
#     }
# ]

param_grid = [
    # ------ Linear ------
    {
        "svm__kernel": ["linear"],
        "svm__C": [0.1, 1, 10]
    },

    # ------ RBF ------
    {
        "svm__kernel": ["rbf"],
        "svm__C": [1, 10],          # reduced from 5 values → 2
        "svm__gamma": ["scale", 0.1]   # reduced from 4 → 2
    },

    # ------ Polynomial ------
    {
        "svm__kernel": ["poly"],
        "svm__degree": [2],         # keep only degree=2
        "svm__C": [1, 10],          # reduced from 4 → 2
        "svm__gamma": ["scale"]     # fixed to "scale"
    }
]

# GRID SEARCH
grid = GridSearchCV(
    pipe,
    param_grid,
    cv=30,
    scoring=scoring,
    refit="recall",   # optimize Recall (priority in medical contexts)
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

print("\nBest parameters:")
print(grid.best_params_)

print("\nBest Recall:", grid.best_score_)


Fitting 30 folds for each of 9 candidates, totalling 270 fits

Best parameters:
{'svm__C': 1, 'svm__degree': 2, 'svm__gamma': 'scale', 'svm__kernel': 'poly'}

Best Recall: 0.8161044730081751


Best parameters:
{'svm__C': 1, 'svm__degree': 2, 'svm__gamma': 'scale', 'svm__kernel': 'poly'}

Best Recall: 0.8161044730081751

## 4. SVM function

In [4]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
import numpy as np

def svm_cross_validation(X, y,
                         kernel='poly',
                         C=1,
                         gamma='scale',
                         probability=True,
                         degree=2,
                         k=5):

    if y.dtype == object or isinstance(y[0], str):
        y = np.where(y == "Malignant", 1, 0)

    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    accuracies, f1s, recalls, aucs = [], [], [], []
    all_preds = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"Fold {fold+1}/{k}")

        # Split (pandas vs numpy safe)
        if hasattr(X, "iloc"):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
        else:
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

        # Scaling inside fold
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)

        # Train SVM
        svm = SVC(kernel=kernel, C=C, gamma=gamma, degree=degree, probability=probability)
        svm.fit(X_train, y_train)

        # Predict
        y_pred = svm.predict(X_val)
        y_proba = svm.predict_proba(X_val)[:, 1]

        # Metrics
        accuracies.append(accuracy_score(y_val, y_pred))
        f1s.append(f1_score(y_val, y_pred, pos_label=1))
        recalls.append(recall_score(y_val, y_pred, pos_label=1))
        aucs.append(roc_auc_score(y_val, y_proba))

        all_preds.extend(y_pred)

    metrics = {
        "accuracy_mean": np.mean(accuracies),
        "accuracy_std": np.std(accuracies),
        "f1_mean": np.mean(f1s),
        "f1_std": np.std(f1s),
        "recall_mean": np.mean(recalls),
        "recall_std": np.std(recalls),
        "auc_mean": np.mean(aucs),
        "auc_std": np.std(aucs),
    }

    return metrics, np.array(all_preds)


In [1]:
import time

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

start = time.time()

metrics_svm, preds_svm = svm_cross_validation(
    X_train,
    y_train,
    k=5
)

print(metrics_svm)

end = time.time()

training_time = end - start
print(f"Training time: {training_time:.4f} seconds")

NameError: name 'X_train' is not defined