# Notebook 06: SVM and Kernel Methods

Understand Support Vector Machines (SVMs) and the kernel trick.

## Learning Objectives
- Understand SVM fundamentals and margins
- Explore different kernels (linear, RBF, polynomial)
- Tune C and gamma parameters
- Visualize support vectors and decision boundaries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, make_moons, make_circles, make_blobs
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score, classification_report

# Seed NumPy's global RNG so cells that draw from np.random are repeatable
np.random.seed(42)
# Use the 'seaborn-v0_8-whitegrid' matplotlib style sheet for subsequent figures
plt.style.use('seaborn-v0_8-whitegrid')

## Part 1: Linear SVM Fundamentals

In [None]:
# Toy two-cluster dataset intended to be linearly separable
X, y = make_blobs(
    n_samples=100,
    centers=2,
    cluster_std=1.0,
    random_state=42,
)

# Standardize the features; `scaler` is reused by later cells
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit a linear-kernel SVM on the standardized data (fit returns the estimator)
svm_linear = SVC(kernel='linear', C=1.0).fit(X_scaled, y)

# Report how many training points ended up as support vectors
n_support_total = len(svm_linear.support_vectors_)
print(f"Number of support vectors: {n_support_total}")
print(f"Support vectors per class: {svm_linear.n_support_}")

In [None]:
# Visualize SVM with margins
def plot_svm_decision_boundary(model, X, y, ax, title, step=0.02):
    """Draw a fitted classifier's decision regions, margins and support vectors.

    Args:
        model: Fitted estimator exposing ``predict``. When it also provides a
            ``decision_function`` with one score per sample, the -1 / 0 / +1
            level curves are drawn; when it has ``support_vectors_``, those
            points are circled in green.
        X: Array of samples with at least two columns; column 0 is plotted on
            the x-axis and column 1 on the y-axis.
        y: Labels used to colour the samples.
        ax: Matplotlib axes to draw on.
        title: Title for the axes.
        step: Spacing of the evaluation grid (0.02 reproduces the previous
            hard-coded resolution).
    """
    # Evaluation grid padded by one unit around the data
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))

    # Flatten once: predict and decision_function are evaluated on the same points
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    # Filled regions coloured by predicted class
    Z = model.predict(grid_points).reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')

    # Decision boundary (level 0) and margins (levels -1 and +1)
    if hasattr(model, 'decision_function'):
        scores = np.asarray(model.decision_function(grid_points))
        # A 2-D score matrix (one column per class) cannot be reshaped onto
        # the grid, so the margin lines are only drawn for 1-D scores.
        if scores.ndim == 1:
            ax.contour(xx, yy, scores.reshape(xx.shape), colors='k',
                       levels=[-1, 0, 1], alpha=0.5,
                       linestyles=['--', '-', '--'])

    # Plot data points
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlBu', edgecolors='black', s=50)

    # Highlight support vectors, guarded like decision_function above so that
    # estimators without this attribute can still be plotted
    if hasattr(model, 'support_vectors_'):
        support = model.support_vectors_
        ax.scatter(support[:, 0], support[:, 1],
                   s=200, facecolors='none', edgecolors='green', linewidths=2)

    ax.set_title(title)

# Single 10x6 figure showing the linear SVM fitted in the previous cell
fig, ax = plt.subplots(figsize=(10, 6))

n_sv = len(svm_linear.support_vectors_)
plot_svm_decision_boundary(
    svm_linear, X_scaled, y, ax,
    f'Linear SVM\nSupport Vectors: {n_sv}',
)

ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
plt.show()

## Part 2: Effect of C Parameter (Regularization)

In [None]:
# Wider clusters (std 1.5) so the two classes overlap somewhat
X, y = make_blobs(n_samples=100, centers=2, cluster_std=1.5, random_state=42)
X_scaled = scaler.fit_transform(X)  # re-fits the existing scaler on the new data

# Regularization strengths to compare, in increasing order
C_values = [0.01, 0.1, 1.0, 10.0, 100.0]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# One panel per C value; zip stops after five panels, leaving the sixth unused
for ax, C in zip(axes, C_values):
    svm = SVC(kernel='linear', C=C).fit(X_scaled, y)
    panel_title = f'C = {C}\nSV: {len(svm.support_vectors_)}'
    plot_svm_decision_boundary(svm, X_scaled, y, ax, panel_title)

axes[-1].axis('off')  # hide the empty sixth panel
plt.tight_layout()
plt.show()

# Short textual summary of what the panels illustrate
for line in (
    "\nC parameter interpretation:",
    "- Small C: Wider margin, more misclassifications allowed (soft margin)",
    "- Large C: Narrow margin, fewer misclassifications (hard margin)",
):
    print(line)

## Part 3: Non-linear Data and Kernels

In [None]:
# Two toy datasets that are not linearly separable
X_moons, y_moons = make_moons(n_samples=200, noise=0.2, random_state=42)
X_circles, y_circles = make_circles(
    n_samples=200, noise=0.1, factor=0.5, random_state=42
)

# Standardize each dataset; the scaler is re-fitted on every fit_transform call
X_moons_scaled = scaler.fit_transform(X_moons)
X_circles_scaled = scaler.fit_transform(X_circles)

# Side-by-side scatter plots of the raw (unscaled) points
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = [
    ('Moons Dataset', X_moons, y_moons),
    ('Circles Dataset', X_circles, y_circles),
]
for ax, (panel_title, points, labels) in zip(axes, panels):
    ax.scatter(points[:, 0], points[:, 1], c=labels, cmap='RdYlBu', edgecolors='black')
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Compare kernels on moons data
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

# The evaluation grid depends only on the data, not on the kernel, so it is
# built once here instead of being recomputed on every loop iteration.
h = 0.02
x_min, x_max = X_moons_scaled[:, 0].min() - 0.5, X_moons_scaled[:, 0].max() + 0.5
y_min, y_max = X_moons_scaled[:, 1].min() - 0.5, X_moons_scaled[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
grid_points = np.c_[xx.ravel(), yy.ravel()]

for idx, kernel in enumerate(kernels):
    svm = SVC(kernel=kernel, C=1.0, gamma='scale')
    svm.fit(X_moons_scaled, y_moons)

    # Accuracy on the same points the model was fitted on (training accuracy)
    acc = svm.score(X_moons_scaled, y_moons)

    # Predicted class at every grid point, shaped back onto the grid
    Z = svm.predict(grid_points).reshape(xx.shape)

    axes[idx].contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdYlBu)
    axes[idx].scatter(X_moons_scaled[:, 0], X_moons_scaled[:, 1], c=y_moons,
                      cmap=plt.cm.RdYlBu, edgecolors='black', s=30)
    axes[idx].set_title(f'{kernel.upper()} Kernel\nAccuracy: {acc:.3f}')

plt.tight_layout()
plt.show()

## Part 4: RBF Kernel - Gamma Parameter

In [None]:
# Effect of gamma on RBF kernel
gamma_values = [0.01, 0.1, 1.0, 10.0, 100.0]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# The evaluation grid depends only on the data, not on gamma, so it is built
# once here instead of being recomputed on every loop iteration.
h = 0.02
x_min, x_max = X_moons_scaled[:, 0].min() - 0.5, X_moons_scaled[:, 0].max() + 0.5
y_min, y_max = X_moons_scaled[:, 1].min() - 0.5, X_moons_scaled[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
grid_points = np.c_[xx.ravel(), yy.ravel()]

for idx, gamma in enumerate(gamma_values):
    svm = SVC(kernel='rbf', C=1.0, gamma=gamma)
    svm.fit(X_moons_scaled, y_moons)

    # Accuracy on the same points the model was fitted on (training accuracy)
    acc = svm.score(X_moons_scaled, y_moons)

    # Predicted class at every grid point, shaped back onto the grid
    Z = svm.predict(grid_points).reshape(xx.shape)

    axes[idx].contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdYlBu)
    axes[idx].scatter(X_moons_scaled[:, 0], X_moons_scaled[:, 1], c=y_moons,
                      cmap=plt.cm.RdYlBu, edgecolors='black', s=30)
    axes[idx].set_title(f'γ = {gamma}\nAcc: {acc:.3f}, SV: {len(svm.support_vectors_)}')

axes[-1].axis('off')  # five gamma values on a 2x3 grid leave the last panel empty
plt.tight_layout()
plt.show()

print("\nGamma parameter interpretation:")
print("- Small gamma: Smooth decision boundary, may underfit")
print("- Large gamma: Complex decision boundary, may overfit")

## Part 5: C and Gamma Grid Search

In [None]:
# Hold out a test set; it is not consulted while choosing hyper-parameters
X_train, X_test, y_train, y_test = train_test_split(
    X_moons_scaled, y_moons, test_size=0.2, random_state=42
)

# Parameter grid: 10 log-spaced values each for C and gamma
C_range = np.logspace(-2, 3, 10)
gamma_range = np.logspace(-3, 2, 10)

# results[i, j] holds the mean 5-fold cross-validated accuracy on the training
# split for (C_range[i], gamma_range[j]). Scoring on the test set here would
# let the test data pick the hyper-parameters and make the reported "best"
# accuracy optimistically biased.
results = np.zeros((len(C_range), len(gamma_range)))

for i, C in enumerate(C_range):
    for j, gamma in enumerate(gamma_range):
        svm = SVC(kernel='rbf', C=C, gamma=gamma)
        results[i, j] = cross_val_score(svm, X_train, y_train, cv=5).mean()

# Visualize as heatmap (rows: C, columns: gamma)
plt.figure(figsize=(10, 8))
sns.heatmap(results, annot=True, fmt='.2f', cmap='YlOrRd',
            xticklabels=[f'{g:.2e}' for g in gamma_range],
            yticklabels=[f'{c:.2e}' for c in C_range])
plt.xlabel('Gamma')
plt.ylabel('C')
plt.title('SVM RBF: 5-Fold CV Accuracy for C and Gamma')
plt.tight_layout()
plt.show()

# Find best parameters (first maximum in row-major order if there are ties)
best_idx = np.unravel_index(np.argmax(results), results.shape)
best_C = C_range[best_idx[0]]
best_gamma = gamma_range[best_idx[1]]
print(f"\nBest C: {best_C:.4f}")
print(f"Best gamma: {best_gamma:.4f}")
print(f"Best CV accuracy: {results[best_idx]:.4f}")

# Only now touch the test set: refit the selected model on the full training
# split and report its held-out accuracy once.
svm = SVC(kernel='rbf', C=best_C, gamma=best_gamma).fit(X_train, y_train)
print(f"Test accuracy: {svm.score(X_test, y_test):.4f}")

In [None]:
# Cross-validated search over a coarser C/gamma grid with GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 10],
    'kernel': ['rbf']
}

# 5-fold CV scored by accuracy, using all available cores; fit returns the searcher
grid_search = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

# Score of the refitted best estimator on the held-out split
held_out_score = grid_search.score(X_test, y_test)

print("GridSearchCV Results:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")
print(f"Test score: {held_out_score:.4f}")

## Part 6: Polynomial Kernel

In [None]:
# Compare polynomial degrees
degrees = [2, 3, 4, 5]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

# The evaluation grid depends only on the data, not on the degree, so it is
# built once here instead of being recomputed on every loop iteration.
h = 0.02
x_min, x_max = X_moons_scaled[:, 0].min() - 0.5, X_moons_scaled[:, 0].max() + 0.5
y_min, y_max = X_moons_scaled[:, 1].min() - 0.5, X_moons_scaled[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
grid_points = np.c_[xx.ravel(), yy.ravel()]

for idx, degree in enumerate(degrees):
    svm = SVC(kernel='poly', degree=degree, C=1.0, coef0=1)
    svm.fit(X_moons_scaled, y_moons)

    # Accuracy on the same points the model was fitted on (training accuracy)
    acc = svm.score(X_moons_scaled, y_moons)

    # Predicted class at every grid point, shaped back onto the grid
    Z = svm.predict(grid_points).reshape(xx.shape)

    axes[idx].contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdYlBu)
    axes[idx].scatter(X_moons_scaled[:, 0], X_moons_scaled[:, 1], c=y_moons,
                      cmap=plt.cm.RdYlBu, edgecolors='black', s=30)
    axes[idx].set_title(f'Polynomial Degree {degree}\nAccuracy: {acc:.3f}')

plt.tight_layout()
plt.show()

## Part 7: Multi-class Classification

In [None]:
# Generate multi-class data: 500 samples, 20 features (10 informative), 4 classes
X_multi, y_multi = make_classification(
    n_samples=500, n_features=20, n_informative=10,
    n_classes=4, n_clusters_per_class=1, random_state=42
)

# Stratify the split so each of the four classes keeps its share in both the
# training and the test split (same approach as the Part 9 split).
X_train, X_test, y_train, y_test = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42, stratify=y_multi
)

# Scale: fit the scaler on the training split only, then apply it to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train an RBF SVM with a one-vs-rest shaped decision function
svm_multi = SVC(kernel='rbf', C=1.0, gamma='scale', decision_function_shape='ovr')
svm_multi.fit(X_train_scaled, y_train)

# Evaluate on the held-out split
y_pred = svm_multi.predict(X_test_scaled)

print("Multi-class SVM Classification Report:")
print(classification_report(y_test, y_pred))

## Part 8: SVM for Regression (SVR)

In [None]:
# Noisy sine curve: sorted 1-D inputs scaled from rand() by 5, plus randn noise * 0.1
n_samples = 200
X_reg = np.sort(5 * np.random.rand(n_samples, 1), axis=0)
y_reg = np.sin(X_reg).ravel() + np.random.randn(n_samples) * 0.1

# One regressor per kernel, each fitted on the same data (fit returns the estimator)
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1).fit(X_reg, y_reg)
svr_linear = SVR(kernel='linear', C=100).fit(X_reg, y_reg)
svr_poly = SVR(kernel='poly', C=100, degree=3).fit(X_reg, y_reg)

# Dense, evenly spaced inputs on [0, 5] for drawing smooth prediction curves
X_plot = np.linspace(0, 5, 1000).reshape(-1, 1)
y_rbf = svr_rbf.predict(X_plot)
y_linear = svr_linear.predict(X_plot)
y_poly = svr_poly.predict(X_plot)

# Overlay the three fitted curves on the noisy samples
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(X_reg, y_reg, c='gray', alpha=0.5, label='Data')
for curve, line_fmt, curve_label in (
    (y_rbf, 'r-', 'RBF'),
    (y_linear, 'g-', 'Linear'),
    (y_poly, 'b-', 'Polynomial'),
):
    ax.plot(X_plot, curve, line_fmt, lw=2, label=curve_label)
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.set_title('SVR with Different Kernels')
ax.legend()
plt.show()

In [None]:
# Tube half-widths (epsilon) to compare, one panel each
epsilons = [0.01, 0.1, 0.5, 1.0]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

x_line = X_plot.ravel()  # 1-D x coordinates required by fill_between

for ax, eps in zip(axes, epsilons):
    svr = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=eps).fit(X_reg, y_reg)
    y_pred = svr.predict(X_plot)

    # Samples and the fitted curve
    ax.scatter(X_reg, y_reg, c='gray', alpha=0.5)
    ax.plot(X_plot, y_pred, 'r-', lw=2)

    # Shade the band of width eps on either side of the prediction
    tube_low, tube_high = y_pred - eps, y_pred + eps
    ax.fill_between(x_line, tube_low, tube_high, alpha=0.2, color='red')

    ax.set_title(f'ε = {eps}, SV: {len(svr.support_)}')
    ax.set_xlabel('X')
    ax.set_ylabel('y')

plt.tight_layout()
plt.show()

## Part 9: Practical Example with Real-ish Data

In [None]:
# Synthetic 3-class problem: 1000 samples, 20 features (15 informative, 3 redundant)
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=3,
    n_classes=3,
    n_clusters_per_class=2,
    random_state=42,
)

# Stratified 80/20 split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# (label, unfitted estimator) pairs to compare
configs = [
    ('Linear', SVC(kernel='linear', C=1.0)),
    ('RBF (C=1)', SVC(kernel='rbf', C=1.0, gamma='scale')),
    ('RBF (C=10)', SVC(kernel='rbf', C=10.0, gamma='scale')),
    ('Poly (d=3)', SVC(kernel='poly', degree=3, C=1.0))
]

print("SVM Configuration Comparison:")
print("="*50)

n_train = len(X_train)

for name, model in configs:
    model.fit(X_train_scaled, y_train)

    train_acc = model.score(X_train_scaled, y_train)
    test_acc = model.score(X_test_scaled, y_test)

    # Support-vector count, also expressed as a percentage of the training set
    n_sv = len(model.support_vectors_)
    sv_percent = n_sv/n_train*100

    print(f"\n{name}:")
    print(f"  Train Accuracy: {train_acc:.4f}")
    print(f"  Test Accuracy: {test_acc:.4f}")
    print(f"  Support Vectors: {n_sv} ({sv_percent:.1f}%)")

## Summary

In this notebook, you learned:

### SVM Fundamentals
- Maximum margin classifier
- Support vectors determine decision boundary
- Works well with high-dimensional data

### Key Parameters
- **C**: Regularization (soft vs hard margin)
  - Small C → wider margin, more errors allowed
  - Large C → narrow margin, fewer errors
- **gamma**: RBF kernel parameter
  - Small γ → smooth boundary
  - Large γ → complex boundary
- **degree**: Polynomial kernel degree

### Kernels
- **Linear**: For linearly separable data
- **RBF**: Most versatile, good default
- **Polynomial**: Captures interactions

### Key Takeaways
- Always scale features before using SVM
- RBF kernel is usually a good starting point
- Grid search C and gamma together
- The number of support vectors indicates model complexity

### Next Steps
Continue to **Notebook 07** for MLP fundamentals - the start of neural networks!