# 3A: SVM kernels on the Spambase dataset

In [34]:
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# data pre-processing for SPAMBASE:
data = np.loadtxt("spambase/spambase.data", delimiter=",")
X = data[:, :-1]  # every column except the last is used as a feature
y = data[:, -1]   # the last column is used as the target

# Split BEFORE scaling: the scaler's mean/std must be estimated from the
# training rows only, otherwise test-set statistics leak into training.
# The row partition depends only on the sample count and random_state, so the
# same rows end up in train/test as when the split was done after scaling.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # fit on training data only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

# Kept so that any code referring to the full scaled matrix still works; it is
# scaled with the training-set statistics and is not used for the split.
X_normalized = scaler.transform(X)

1. SVM with Linear Kernel

In [None]:
# Linear-kernel SVM on the Spambase split.
print("\n 1. LINEAR KERNEL SVM")
print("="*60)

# NOTE: the timer spans fitting, both predict() calls and the accuracy
# computation, so the "Training Time" reported below is fit + evaluation time.
start_time = time.time()

# fit() returns the fitted estimator, so construction and training are chained.
svm_linear = SVC(kernel='linear', C=1.0, random_state=42).fit(X_train, y_train)

train_pred_linear = svm_linear.predict(X_train)
test_pred_linear = svm_linear.predict(X_test)

# Accuracy = fraction of predictions equal to the true label.
train_acc_linear = (train_pred_linear == y_train).mean()
test_acc_linear = (test_pred_linear == y_test).mean()

linear_time = time.time() - start_time

print(
    f"Train Accuracy: {train_acc_linear:.4f}",
    f"Test Accuracy: {test_acc_linear:.4f}",
    f"Number of Support Vectors: {sum(svm_linear.n_support_)}",
    f"Training Time: {linear_time:.2f} seconds",
    sep="\n",
)

2. SVM with RBF Kernel

In [None]:
# RBF-kernel SVM on the Spambase split.
print("\n 2. RBF Kernel SVM")
print("="*60)

# The timed region covers fit, both predictions and the accuracy computation.
start_time = time.time()

svm_rbf = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42).fit(X_train, y_train)

train_pred_rbf, test_pred_rbf = svm_rbf.predict(X_train), svm_rbf.predict(X_test)

# Mean of the element-wise match mask is the accuracy.
train_acc_rbf = (train_pred_rbf == y_train).mean()
test_acc_rbf = (test_pred_rbf == y_test).mean()

rbf_time = time.time() - start_time

print(
    f"Train Accuracy: {train_acc_rbf:.4f}",
    f"Test Accuracy: {test_acc_rbf:.4f}",
    f"Number of Support Vectors: {sum(svm_rbf.n_support_)}",
    f"Training Time: {rbf_time:.2f} seconds",
    sep="\n",
)

In [None]:
# Degree-3 polynomial-kernel SVM on the Spambase split.
print("\n3. POLYNOMIAL KERNEL SVM")
print("="*60)

# The timed region covers fit, both predictions and the accuracy computation.
start_time = time.time()

# NOTE(review): this cell uses C=10.0 while the linear and RBF cells use C=1.0 -
# confirm the difference is intentional when comparing the three kernels.
svm_poly = SVC(
    kernel='poly',
    degree=3,
    C=10.0,
    gamma='scale',
    random_state=42,
).fit(X_train, y_train)

train_pred_poly = svm_poly.predict(X_train)
test_pred_poly = svm_poly.predict(X_test)

# Fraction of predictions that match the labels.
train_acc_poly = (train_pred_poly == y_train).mean()
test_acc_poly = (test_pred_poly == y_test).mean()

poly_time = time.time() - start_time

print(
    f"Train Accuracy: {train_acc_poly:.4f}",
    f"Test Accuracy: {test_acc_poly:.4f}",
    f"Number of Support Vectors: {sum(svm_poly.n_support_)}",
    f"Training Time: {poly_time:.2f} seconds",
    sep="\n",
)

In [None]:
# Try different C values and parameters
def test_hyperparameters():
    """Fit one SVC per (kernel, C) pair and print its test-set accuracy.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    NOTE(review): accuracy is measured on the test split, so picking C from
    these numbers would amount to model selection on test data.

    Returns:
        dict mapping kernel name ('linear', 'rbf', 'poly') to a list of
        ``(C, test_accuracy)`` tuples, in the order the C values were tried.
    """
    banner = "="*60
    print("\n" + banner)
    print("HYPERPARAMETER TUNING")
    print(banner)

    # Kernel-specific constructor arguments; C is added inside the loop.
    kernel_kwargs = {
        'linear': {},
        'rbf': {'gamma': 'scale'},
        'poly': {'degree': 3, 'gamma': 'scale'},
    }

    results = {}
    for kernel_type, extra in kernel_kwargs.items():
        print(f"\nTesting {kernel_type.upper()} kernel with different C values:")
        print("-"*40)

        scores = []
        for C in (0.1, 1.0, 10.0):
            model = SVC(kernel=kernel_type, C=C, random_state=42, **extra)
            model.fit(X_train, y_train)
            test_acc = model.score(X_test, y_test)

            scores.append((C, test_acc))
            print(f"  C={C:4.1f}: Test Accuracy = {test_acc:.4f}")

        results[kernel_type] = scores

    return results

# Run hyperparameter testing.
# hp_results maps each kernel name to a list of (C, test accuracy) tuples.
hp_results = test_hyperparameters()

In [None]:
def print_summary_table():
    """Print a fixed-width comparison table for the three Spambase SVMs.

    Reads module-level results of the linear, RBF and polynomial cells: the
    ``train_acc_*`` / ``test_acc_*`` accuracies, the fitted ``svm_*`` models
    (for their support-vector counts) and the ``*_time`` timings.
    Returns nothing; output goes to stdout.
    """
    rule = "="*80
    print("\n" + rule)
    print("PROBLEM 3A: SPAMBASE SVM RESULTS SUMMARY")
    print(rule)

    # One row per kernel: label, train acc, test acc, total support vectors, seconds.
    table_rows = [
        (label, train_acc, test_acc, sum(model.n_support_), seconds)
        for label, train_acc, test_acc, model, seconds in (
            ("Linear", train_acc_linear, test_acc_linear, svm_linear, linear_time),
            ("RBF", train_acc_rbf, test_acc_rbf, svm_rbf, rbf_time),
            ("Polynomial", train_acc_poly, test_acc_poly, svm_poly, poly_time),
        )
    ]

    print(f"{'Kernel':<12} {'Train Acc':<12} {'Test Acc':<12} {'Support Vectors':<18} {'Time (s)':<10}")
    print("-"*70)

    for kernel, train, test, sv, time_s in table_rows:
        print(f"{kernel:<12} {train:<12.4f} {test:<12.4f} {sv:<18} {time_s:<10.2f}")

def check_expectation(kernel, accuracy):
    """Return whether ``accuracy`` lies in the expected range for ``kernel``.

    Ranges are inclusive at both ends: 'Linear' -> [0.89, 0.93],
    'RBF' -> [0.93, 0.97]; any other kernel label is checked against the
    polynomial range [0.85, 0.90].
    """
    named_ranges = (
        ('Linear', 0.89, 0.93),
        ('RBF', 0.93, 0.97),
    )
    # First matching label wins; anything unmatched uses the polynomial range.
    low, high = next(
        ((lo, hi) for name, lo, hi in named_ranges if kernel == name),
        (0.85, 0.90),
    )
    return low <= accuracy <= high

# Print the 3A comparison table from the results of the three kernel cells above.
print_summary_table()

# 3B: SVM kernels on the digits dataset (HAAR features)

In [37]:
# Load the four comma-separated digits files (train/test images and labels)
# in one pass; the unpacking order matches the order of the paths below.
X_train_digits, y_train_digits, X_test_digits, y_test_digits = (
    np.loadtxt(path, delimiter=",")
    for path in (
        "mnist_haar_bingyu/training_image.txt",
        "mnist_haar_bingyu/training_label.txt",
        "mnist_haar_bingyu/testing_image.txt",
        "mnist_haar_bingyu/testing_label.txt",
    )
)

In [38]:
# Sanity-check the loaded digits arrays: shapes, class count and feature count.
print(
    f"Training set shape: {X_train_digits.shape}",
    f"Training labels shape: {y_train_digits.shape}",
    f"Test set shape: {X_test_digits.shape}",
    f"Test labels shape: {y_test_digits.shape}",
    f"Number of classes: {len(np.unique(y_train_digits))}",
    f"Features per sample: {X_train_digits.shape[1]} (HAAR features)",
    sep="\n",
)

Training set shape: (60000, 200)
Training labels shape: (60000,)
Test set shape: (10000, 200)
Test labels shape: (10000,)
Number of classes: 10
Features per sample: 200 (HAAR features)


Multi-class classification with different kernels:

In [39]:
def train_svm_multiclass(X_train, y_train, X_test, y_test, kernel_type, **params):
    """Fit an SVC with the given kernel and report train/test accuracy.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: evaluation features and labels.
        kernel_type: one of "linear", "rbf" or "poly".
        **params: optional hyperparameters. ``C`` applies to every kernel,
            ``gamma`` to "rbf" and "poly", ``degree`` to "poly". Missing values
            fall back to C=1.0, gamma='scale', degree=3; parameters that do not
            apply to the chosen kernel are ignored.

    Returns:
        ``(svm, train_acc, test_acc, elapsed_time)`` - the fitted estimator,
        the two accuracies and the wall-clock seconds spent.

    Raises:
        ValueError: if ``kernel_type`` is not a supported kernel name.
    """
    # Hyperparameters each kernel accepts, with the default used when omitted.
    kernel_defaults = {
        "linear": {"C": 1.0},
        "rbf": {"C": 1.0, "gamma": "scale"},
        "poly": {"C": 1.0, "degree": 3, "gamma": "scale"},
    }
    # Fail early with a clear message instead of hitting an unbound `svm` later.
    if kernel_type not in kernel_defaults:
        raise ValueError(
            f"Unsupported kernel_type {kernel_type!r}; "
            f"expected one of {sorted(kernel_defaults)}"
        )
    svm_kwargs = {
        name: params.get(name, default)
        for name, default in kernel_defaults[kernel_type].items()
    }

    print(f"\n {kernel_type.upper()} KERNEL SVM")
    print("-"*40)

    # NOTE: the timer spans fit, both predict() calls and the accuracy
    # computation, so the reported "Training Time" includes evaluation.
    start_time = time.time()

    # SVM modelling:
    svm = SVC(kernel=kernel_type, random_state=42, **svm_kwargs)
    svm.fit(X_train, y_train)

    train_pred = svm.predict(X_train)
    test_pred = svm.predict(X_test)

    # Accuracy = fraction of predictions equal to the true label.
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)

    elapsed_time = time.time() - start_time

    print(f"Train Accuracy: {train_acc:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Total Support Vectors: {sum(svm.n_support_)}")
    print(f"Support Vectors per class: {svm.n_support_}")
    print(f"Training Time: {elapsed_time:.2f} seconds")

    return svm, train_acc, test_acc, elapsed_time

In [40]:
print("\n" + "="*60)
print("PROBLEM 3B: DIGITS DATASET WITH HAAR FEATURES")
print("="*60)

# kernel name -> (train accuracy, test accuracy, elapsed seconds)
results_digits = {}

# 1. Linear kernel, C=1.0
svm_linear_d, train_lin_d, test_lin_d, time_lin_d = train_svm_multiclass(
    X_train=X_train_digits, y_train=y_train_digits,
    X_test=X_test_digits, y_test=y_test_digits,
    kernel_type='linear', C=1.0,
)
results_digits.update(linear=(train_lin_d, test_lin_d, time_lin_d))

# 2. RBF kernel, C=10.0 with gamma='scale'
svm_rbf_d, train_rbf_d, test_rbf_d, time_rbf_d = train_svm_multiclass(
    X_train=X_train_digits, y_train=y_train_digits,
    X_test=X_test_digits, y_test=y_test_digits,
    kernel_type='rbf', C=10.0, gamma='scale',
)
results_digits.update(rbf=(train_rbf_d, test_rbf_d, time_rbf_d))

# 3. Polynomial kernel of degree 3, C=1.0
svm_poly_d, train_poly_d, test_poly_d, time_poly_d = train_svm_multiclass(
    X_train=X_train_digits, y_train=y_train_digits,
    X_test=X_test_digits, y_test=y_test_digits,
    kernel_type='poly', C=1.0, degree=3,
)
results_digits.update(poly=(train_poly_d, test_poly_d, time_poly_d))


PROBLEM 3B: DIGITS DATASET WITH HAAR FEATURES

 LINEAR KERNEL SVM
----------------------------------------
Train Accuracy: 0.9537
Test Accuracy: 0.9343
Total Support Vectors: 9830
Support Vectors per class: [ 449  430 1037 1353  962 1351  585  918 1356 1389]
Training Time: 1041.06 seconds

 RBF KERNEL SVM
----------------------------------------
Train Accuracy: 0.9887
Test Accuracy: 0.9761
Total Support Vectors: 8244
Support Vectors per class: [ 422  414  738 1140  923  967  527  793 1074 1246]
Training Time: 55.01 seconds

 POLY KERNEL SVM
----------------------------------------
Train Accuracy: 0.9702
Test Accuracy: 0.9645
Total Support Vectors: 11419
Support Vectors per class: [ 687  679  897 1560 1269 1597  654 1101 1330 1645]
Training Time: 49.42 seconds


In [41]:
from sklearn.model_selection import GridSearchCV

def tune_svm_digits(X_train, y_train, kernel_type, n_tune_samples=2000, random_state=42):
    """
    Find optimal hyperparameters for one SVC kernel on the digits dataset.

    A 3-fold grid search is run on a random subsample of the training data,
    then a fresh SVC with the winning parameters is fitted on all of it.

    Args:
        X_train, y_train: full training features and labels (indexable arrays).
        kernel_type: 'linear' or 'rbf'; any other value gets the polynomial
            parameter grid.
        n_tune_samples: maximum number of rows drawn for the grid search.
        random_state: seed for the subsample draw and for the SVC, so that
            repeated runs tune on the same rows.

    Returns:
        ``(best_svm, best_params)`` - the estimator fitted on the full
        training set and the dict of selected hyperparameters.
    """
    print(f"\nTuning {kernel_type.upper()} kernel...")
    print("-"*40)

    # Define parameter grids for each kernel
    if kernel_type == 'linear':
        param_grid = {
            'C': [0.01, 0.1, 1, 10, 100]
        }
    elif kernel_type == 'rbf':
        param_grid = {
            'C': [0.1, 1, 10, 100],
            'gamma': ['scale', 'auto', 0.001, 0.01]
        }
    else:  # polynomial
        param_grid = {
            'C': [0.1, 1, 10],
            'degree': [2, 3, 4],
            'gamma': ['scale', 'auto']
        }

    # Tune on at most n_tune_samples rows, drawn without replacement, to keep
    # the grid search fast. A seeded Generator makes the draw (and therefore
    # the selected parameters) reproducible across runs.
    n_samples = min(n_tune_samples, len(X_train))
    rng = np.random.default_rng(random_state)
    indices = rng.choice(len(X_train), size=n_samples, replace=False)
    X_subset = X_train[indices]
    y_subset = y_train[indices]

    # Grid search with 3-fold cross-validation
    grid_search = GridSearchCV(
        SVC(kernel=kernel_type, random_state=random_state),
        param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,  # Use all cores
        verbose=1
    )

    grid_search.fit(X_subset, y_subset)

    best_params = grid_search.best_params_
    print(f"Best parameters: {best_params}")
    print(f"Best CV score: {grid_search.best_score_:.4f}")

    # Train a fresh model with the best parameters on the full training set,
    # leaving the estimator stored inside grid_search untouched.
    print("Training on full dataset with best parameters...")
    best_svm = SVC(kernel=kernel_type, random_state=random_state, **best_params)
    best_svm.fit(X_train, y_train)

    return best_svm, best_params

# Run hyperparameter tuning for each kernel
print("\n" + "="*60)
print("HYPERPARAMETER TUNING FOR DIGITS DATASET")
print("="*60)

# The tuned digits accuracies get their own `_tuned_d` names. Reusing
# test_acc_linear / test_acc_rbf / test_acc_poly here would overwrite the
# Spambase results that print_summary_table() reads, so re-running the 3A
# summary after this cell would mix the two datasets.

# Tune Linear
best_linear_svm, best_linear_params = tune_svm_digits(X_train_digits, y_train_digits, 'linear')
test_lin_tuned_d = best_linear_svm.score(X_test_digits, y_test_digits)
print(f"Linear Test Accuracy with best params: {test_lin_tuned_d:.4f}\n")

# Tune RBF
best_rbf_svm, best_rbf_params = tune_svm_digits(X_train_digits, y_train_digits, 'rbf')
test_rbf_tuned_d = best_rbf_svm.score(X_test_digits, y_test_digits)
print(f"RBF Test Accuracy with best params: {test_rbf_tuned_d:.4f}\n")

# Tune Polynomial
best_poly_svm, best_poly_params = tune_svm_digits(X_train_digits, y_train_digits, 'poly')
test_poly_tuned_d = best_poly_svm.score(X_test_digits, y_test_digits)
print(f"Polynomial Test Accuracy with best params: {test_poly_tuned_d:.4f}\n")


HYPERPARAMETER TUNING FOR DIGITS DATASET

Tuning LINEAR kernel...
----------------------------------------
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best parameters: {'C': 0.01}
Best CV score: 0.8715
Training on full dataset with best parameters...
Linear Test Accuracy with best params: 0.9377


Tuning RBF kernel...
----------------------------------------
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best parameters: {'C': 10, 'gamma': 'scale'}
Best CV score: 0.9175
Training on full dataset with best parameters...
RBF Test Accuracy with best params: 0.9761


Tuning POLY kernel...
----------------------------------------
Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best parameters: {'C': 10, 'degree': 3, 'gamma': 'scale'}
Best CV score: 0.9165
Training on full dataset with best parameters...
Polynomial Test Accuracy with best params: 0.9760

