# SVM classification on the Ozone dataset
Author: Tomas Hobza

In [None]:
# SVM classification on the Breast Cancer dataset
# Author: Tomas Hobza

from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import (balanced_accuracy_score, f1_score, 
                             precision_score, recall_score, accuracy_score,
                             confusion_matrix, classification_report)
import matplotlib.pyplot as plt
import seaborn as sns
import time

# set random seed for reproducibility
np.random.seed(42)

## 0. Load the dataset

In [None]:
## 0. Load the dataset

# Load the parquet files
df = pd.read_parquet('br_train_cleaned.parquet')

print(f"Dataset shape: {df.shape}")
print(f"\nClass distribution:")
print(df['class'].value_counts())
print(f"\nClass balance ratio: {df['class'].value_counts().min() / df['class'].value_counts().max():.3f}")

# Display first few rows
print("\nFirst 5 rows:")
display(df.head())

# Check for missing values
print(f"\nMissing values: {df.isnull().sum().sum()}")

In [None]:
# Split the data into features and labels
X = df.drop(columns=['class'])
y = df['class']

# Split the data into training and testing sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
print(f"\nTraining set class distribution:")
print(y_train.value_counts())
print(f"\nTest set class distribution:")
print(y_test.value_counts())

## 1. Experiment - holdout vs cross-validation

In [26]:
# == HOLDOUT APPROACH ==
# train default model on t
model = LinearSVC(class_weight='balanced', max_iter=2000, random_state=42)
start = time.time()
model.fit(X_train, y_train)
holdout_time = time.time() - start
y_pred = model.predict(X_test)
holdout_acc = balanced_accuracy_score(y_test, y_pred)
print(f"Holdout balanced accuracy: {holdout_acc:.4f}")

# == CROSS-VALIDATION APPROACH ==
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
start = time.time()
cv_scores = cross_val_score(
    LinearSVC(C=0.01, class_weight='balanced', max_iter=2000, random_state=42),
    X,
    y,
    cv=kf,
    scoring='balanced_accuracy',
    n_jobs=-1
)
cv_time = time.time() - start

print(f"CV Scores per fold: {[f'{s:.4f}' for s in cv_scores]}")
print(f"Mean: {cv_scores.mean():.4f}")
print(f"Std:  {cv_scores.std():.4f}")
print(f"Training Time: {cv_time:.1f}s ({cv_time/holdout_time:.1f}x longer)\n")

# Comparison
print("="*60)
print("COMPARISON")
print("="*60)
print(f"Holdout:          {holdout_acc:.4f}")
print(f"CV Mean:          {cv_scores.mean():.4f}")
print(f"Difference:       {abs(holdout_acc - cv_scores.mean()):.4f}")
print(f"CV Std Dev:       ±{cv_scores.std():.4f}")

Holdout balanced accuracy: 0.8948
CV Scores per fold: ['0.9147', '0.9345', '0.9545', '0.8805', '0.9487']
Mean: 0.9266
Std:  0.0268
Training Time: 0.8s (275.9x longer)

COMPARISON
Holdout:          0.8948
CV Mean:          0.9266
Difference:       0.0318
CV Std Dev:       ±0.0268


After comparing holdout vs. cross-validation and finding CV provides more robust estimates, we proceeded with holdout validation for our extensive parameter experiments due to computational constraints. We used a fixed random seed (42) and stratified splitting to ensure fair comparisons across all experiments. This approach allowed us to test a wider range of hyperparameters while maintaining reproducibility.

## 2. Experiment - C parameter