# SVM Parameter Optimization Experiment

This notebook implements a Support Vector Machine (SVM) parameter optimization experiment using the Breast Cancer Wisconsin dataset. We'll evaluate SVM performance across multiple random train/test splits (referred to as "samples" below) and randomly drawn kernel, C, and gamma configurations, tracking convergence and identifying the best parameters.

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import os
from datetime import datetime

# IPython magic (only valid inside IPython/Jupyter): render figures inline in the cell output
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to figures created after this call
plt.style.use('ggplot')

## 1. Load and Explore Data

In [None]:
# Fetch the breast-cancer dataset and expose it as a feature table plus label array
data = load_breast_cancer()
y = data.target
X = pd.DataFrame(data=data.data, columns=data.feature_names)

# Gather the quantities reported below: dimensions and per-class row counts
n_rows, n_cols = X.shape
class_counts = pd.Series(y).value_counts().to_dict()

# Report an overview of the dataset
print(f"Dataset shape: {X.shape}")
print(f"Number of features: {n_cols}")
print(f"Number of samples: {n_rows}")
print(f"Target classes: {np.unique(y)}")
print(f"Class names: {data.target_names}")
print(f"Class distribution: {class_counts}")

In [None]:
# Make sure the folders that later cells write results and figures into exist;
# exist_ok=True keeps re-running the cell harmless.
for output_dir in ('results', 'figures'):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Summary statistics transposed so that each row describes one feature
feature_stats = X.describe().T

# Display the ten features with the largest mean (last expression = cell output)
feature_stats.sort_values(by='mean', ascending=False).head(10)

In [None]:
# Bar chart of how many rows carry each target label
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=y, ax=ax)

# Replace the numeric tick labels 0/1 with the dataset's class names
ax.set_xticks([0, 1])
ax.set_xticklabels(data.target_names)

ax.set_title('Class Distribution')
ax.set_xlabel('Target Class')
ax.set_ylabel('Count')
plt.show()

## 2. Define Hyperparameter Search Space

In [None]:
# Search space: three kernel types and ten log-spaced values (1e-3 .. 1e3)
# for each of C and gamma
kernels = ['linear', 'rbf', 'poly']
C_range = np.logspace(-3, 3, num=10)
gamma_range = np.logspace(-3, 3, num=10)

print(f"Kernels to try: {kernels}")
print(f"C values: {np.round(C_range, 5)}")
print(f"Gamma values: {np.round(gamma_range, 5)}")

# Draw both grids side by side on a logarithmic y-axis.
# Each panel is described by (values, line format, parameter label).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panel_specs = [
    (C_range, 'bo-', 'C'),
    (gamma_range, 'ro-', 'Gamma'),
]

for ax, (values, line_format, label) in zip(axes, panel_specs):
    ax.plot(np.arange(len(values)), values, line_format)
    ax.set_yscale('log')
    ax.set_title(f'{label} Parameter Range (Log Scale)')
    ax.set_xlabel('Index')
    ax.set_ylabel(f'{label} Value')
    ax.grid(True)

plt.tight_layout()
plt.show()

## 3. Parameter Optimization Process

In [None]:
# Experiment configuration
random_seed = 42    # base seed; each train/test split offsets from it for reproducibility
n_samples = 10      # how many different train/test splits are evaluated
n_iterations = 100  # random hyperparameter combinations tried on each split

In [None]:
# Random hyperparameter search repeated over several train/test splits.
#
# For each of the n_samples splits, n_iterations random (kernel, C, gamma)
# combinations are fitted on the scaled training data and scored on the
# scaled test data.  The cell records every trial, the best trial per split,
# and the best trial overall.
#
# NOTE(review): configurations are ranked by accuracy on the held-out split
# itself, so the reported "best" accuracies are optimistic; an unbiased
# estimate would need a separate validation split or cross-validation.

# Initialize tracking variables
results = []            # one summary row per sample (feeds the results table)
best_accuracy = 0       # highest test accuracy seen across all samples
best_params = None      # kernel / C / gamma / sample index of that best trial
best_sample = None      # 0-based index of the sample containing the best trial
convergence_data = []   # per-iteration accuracies of the best sample
all_sample_data = {}    # 'S<k>' -> DataFrame with every trial of sample k

# Outer loop over different train/test splits
for sample in tqdm(range(n_samples), desc="Samples"):
    # The same offset seed drives both the split and the random draws below,
    # so every sample is reproducible on its own.
    split_seed = random_seed + sample
    np.random.seed(split_seed)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=split_seed
    )

    # Fit the scaler on the training part only, then apply it to both parts
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    sample_best_acc = 0
    sample_best_params = None
    iteration_accuracies = []
    sample_all_results = []

    # Inner loop: random search on hyperparameters
    for iteration in tqdm(range(n_iterations), desc=f"Sample {sample+1} Iterations", leave=False):
        # Draw order (kernel, C, then gamma) is kept fixed so seeded runs
        # reproduce the same sequence of configurations.
        kernel = np.random.choice(kernels)
        C = np.random.choice(C_range)

        svm_kwargs = dict(kernel=kernel, C=C, random_state=random_seed)
        if kernel == 'linear':
            # No gamma is drawn or passed for the linear kernel
            gamma = None
        else:
            gamma = np.random.choice(gamma_range)
            svm_kwargs['gamma'] = gamma
        svm = SVC(**svm_kwargs)

        svm.fit(X_train_scaled, y_train)
        acc = svm.score(X_test_scaled, y_test)
        iteration_accuracies.append(acc)

        # Store all results for this sample
        sample_all_results.append({
            'iteration': iteration,
            'kernel': kernel,
            'C': C,
            'gamma': gamma,
            'accuracy': acc
        })

        # Update bests (strict '>' keeps the earliest trial on ties)
        if acc > sample_best_acc:
            sample_best_acc = acc
            sample_best_params = dict(kernel=kernel, C=C, gamma=gamma)
        if acc > best_accuracy:
            best_accuracy = acc
            best_params = dict(kernel=kernel, C=C, gamma=gamma, sample=sample)
            best_sample = sample

    # Capture the convergence trace only once the sample has finished.
    # Copying it at the moment the best trial is found would cut the trace
    # off at that iteration and drop the rest of the run.
    if best_sample == sample:
        convergence_data = list(iteration_accuracies)

    # Store results for this sample
    all_sample_data[f'S{sample+1}'] = pd.DataFrame(sample_all_results)

    results.append({
        'Sample': f'S{sample+1}',
        'Best Accuracy': round(sample_best_acc, 3),
        'Parameters': f"{sample_best_params['kernel']}, "
                      f"{round(sample_best_params['C'], 3)}, "
                      f"{round(sample_best_params['gamma'], 3) if sample_best_params['gamma'] is not None else '—'}"
    })

## 4. Analyze Results

In [None]:
# Turn the per-sample summary rows into a table
results_df = pd.DataFrame(data=results)

print("\nResults Table:")
# Bare expression on the last line so the notebook renders the table as cell output
results_df

In [None]:
# Write the summary table to a timestamped CSV file.
# `timestamp` is reused by later cells to name the figures and summary file.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
results_path = f'results/svm_results_{timestamp}.csv'

results_df.to_csv(results_path, index=False)
print(f"Results saved to {results_path}")

In [None]:
# Plot the accuracy trace stored in convergence_data, with the overall best
# accuracy drawn as a dashed reference line
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(convergence_data, marker='o', linestyle='-', alpha=0.7)
ax.axhline(
    y=best_accuracy,
    color='r',
    linestyle='--',
    alpha=0.5,
    label=f'Best Accuracy: {best_accuracy:.3f}',
)

# best_sample is 0-based; show it 1-based to match the 'S<k>' labels
ax.set_title(f'Convergence of Best SVM (Sample {best_sample+1})')
ax.set_xlabel('Iteration')
ax.set_ylabel('Accuracy')
ax.grid(True, alpha=0.3)
ax.legend()
fig.tight_layout()

# Write the figure to disk before showing it
fig_path = f'figures/convergence_{timestamp}.png'
fig.savefig(fig_path, dpi=300, bbox_inches='tight')
print(f"Convergence plot saved to {fig_path}")

plt.show()

# Text recap of the best configuration; gamma is None for the linear kernel
best_gamma = best_params['gamma']
gamma_text = best_gamma if best_gamma is not None else '—'

print(f"\nOverall best accuracy: {best_accuracy:.3f}")
print(
    f"Best parameters: kernel={best_params['kernel']}, C={best_params['C']:.3f}, "
    f"gamma={gamma_text}"
)
print(f"Found in sample: {best_sample+1}")

In [None]:
# Visualize accuracy distribution by kernel type

# Pool every trial from every sample into one frame.  Each per-sample frame
# carries its own 0..n-1 index, so ignore_index=True is required to give the
# combined frame unique row labels instead of repeated ones.
all_results = pd.concat(all_sample_data.values(), ignore_index=True)

# One box per kernel, summarising the accuracies of all trials using it
plt.figure(figsize=(12, 6))
sns.boxplot(x='kernel', y='accuracy', data=all_results)
plt.title('Accuracy Distribution by Kernel Type')
plt.xlabel('Kernel')
plt.ylabel('Accuracy')
plt.grid(axis='y', alpha=0.3)

# Save figure (must happen before plt.show())
kernel_fig_path = f'figures/kernel_comparison_{timestamp}.png'
plt.savefig(kernel_fig_path, dpi=300, bbox_inches='tight')
print(f"Kernel comparison plot saved to {kernel_fig_path}")

plt.show()

In [None]:
# Visualize the relationship between C parameter and accuracy for each kernel
plt.figure(figsize=(14, 8))

# One panel per kernel.  The grid width follows len(kernels) rather than a
# fixed 3, so the cell keeps working when the kernel list is changed.
n_panels = len(kernels)
for position, kernel in enumerate(kernels, start=1):
    kernel_data = all_results[all_results['kernel'] == kernel]

    plt.subplot(1, n_panels, position)
    plt.scatter(kernel_data['C'], kernel_data['accuracy'], alpha=0.6)
    plt.xscale('log')  # C values are log-spaced, so use a log x-axis
    plt.title(f'{kernel.capitalize()} Kernel')
    plt.xlabel('C value (log scale)')
    plt.ylabel('Accuracy')
    plt.grid(True, alpha=0.3)

plt.tight_layout()

# Save figure (must happen before plt.show())
c_param_fig_path = f'figures/c_parameter_effect_{timestamp}.png'
plt.savefig(c_param_fig_path, dpi=300, bbox_inches='tight')
print(f"C parameter effect plot saved to {c_param_fig_path}")

plt.show()

## 5. Train Final Model with Best Parameters

In [None]:
# Refit one SVM using the best hyperparameters found by the search.
# NOTE(review): the split below is seeded with random_seed alone, which is the
# same split as the first search sample and not necessarily the split on which
# best_params was found; confirm this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=random_seed
)

# Scale with statistics learned from the training part only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Assemble the constructor arguments; gamma is forwarded only for the
# non-linear kernels (it is stored as None for 'linear')
final_kwargs = dict(
    kernel=best_params['kernel'],
    C=best_params['C'],
    random_state=random_seed,
)
if best_params['kernel'] != 'linear':
    final_kwargs['gamma'] = best_params['gamma']
final_model = SVC(**final_kwargs)

# Fit, then score on both parts of the split
final_model.fit(X_train_scaled, y_train)
train_acc = final_model.score(X_train_scaled, y_train)
test_acc = final_model.score(X_test_scaled, y_test)

print(f"Training accuracy: {train_acc:.3f}")
print(f"Testing accuracy: {test_acc:.3f}")

## 6. Evaluate Final Model

In [None]:
# Predict the held-out rows and tabulate true vs. predicted labels
y_pred = final_model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the confusion matrix, labelled with the class names
class_labels = data.target_names
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')

# Write the figure to disk before showing it
cm_fig_path = f'figures/confusion_matrix_{timestamp}.png'
fig.savefig(cm_fig_path, dpi=300, bbox_inches='tight')
print(f"Confusion matrix saved to {cm_fig_path}")

plt.show()

# Per-class precision / recall / F1 report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_labels))

## 7. Summary of Findings

In [None]:
# Build the summary text once, then print it and write it to a file.
# Entries ending in "\n" produce the blank separator lines of the report.
summary_lines = [
    "SVM Parameter Optimization Summary",
    "==================================\n",
    f"Dataset: Breast Cancer Wisconsin ({X.shape[0]} samples, {X.shape[1]} features)",
    f"Experiment: {n_samples} samples, {n_iterations} iterations per sample\n",
    "Best Results:",
    f"- Best Accuracy: {best_accuracy:.3f}",
    f"- Best Kernel: {best_params['kernel']}",
    f"- Best C: {best_params['C']:.6f}",
]
# gamma is only meaningful (and only numeric) for the non-linear kernels
if best_params['kernel'] != 'linear':
    summary_lines.append(f"- Best Gamma: {best_params['gamma']:.6f}")
summary_lines += [
    f"- Found in Sample: {best_sample+1}\n",
    "Final Model Performance:",
    f"- Training Accuracy: {train_acc:.3f}",
    f"- Testing Accuracy: {test_acc:.3f}",
]
summary_text = "\n".join(summary_lines)

print(summary_text)

# Also save this summary to a file (same text, terminated by a newline)
summary_path = f'results/summary_{timestamp}.txt'
with open(summary_path, 'w') as f:
    f.write(summary_text + "\n")

print(f"\nSummary saved to {summary_path}")