# 高斯 RBF 核 SVM 分类器

## 理论基础

RBF (Radial Basis Function) 核是最常用的非线性核函数，将数据隐式映射到无限维特征空间。

### 核函数定义

$$K(x, x') = \exp(-\gamma \|x - x'\|^2)$$

**参数 gamma ($\gamma$) 的作用:**
- $\gamma$ 大: 核函数"窄"，每个样本影响范围小，边界复杂
- $\gamma$ 小: 核函数"宽"，每个样本影响范围大，边界平滑

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_moons, make_circles
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import warnings
# Suppress every warning category so library warnings do not clutter the output.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator.
np.random.seed(42)

# Matplotlib defaults shared by all figures below: the sans-serif font
# candidates (presumably chosen so the Chinese labels render -- depends on the
# fonts installed locally), the minus-sign setting and the default figure size.
plt.rcParams.update({
    'font.sans-serif': ['Arial Unicode MS', 'SimHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
    'figure.figsize': (10, 6),
})

In [None]:
# Generate the two-moons dataset and hold out 20% of it as a test split.
X, y = make_moons(n_samples=200, noise=0.15, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scatter plot of the full dataset, one colour and legend entry per class.
fig, ax = plt.subplots(figsize=(10, 6))
for class_id, class_color, class_label in (
    (0, 'steelblue', '类别 0'),
    (1, 'coral', '类别 1'),
):
    members = y == class_id
    ax.scatter(X[members, 0], X[members, 1], c=class_color, s=50,
               edgecolors='white', label=class_label)
ax.set_title('月牙形数据集', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# RBF-kernel SVM: standardise the features, then fit an SVC on the result.
rbf_svm = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svm_clf', SVC(kernel='rbf', gamma=5, C=0.001, random_state=42)),
])
rbf_svm.fit(X_train, y_train)

# Evaluate on both splits and read the per-class support-vector counts.
train_accuracy = accuracy_score(y_train, rbf_svm.predict(X_train))
test_accuracy = accuracy_score(y_test, rbf_svm.predict(X_test))
support_counts = rbf_svm.named_steps['svm_clf'].n_support_

print(f"训练准确率: {train_accuracy:.4f}")
print(f"测试准确率: {test_accuracy:.4f}")
print(f"支持向量数: {support_counts}")

In [None]:
# gamma 参数分析
def plot_boundary(model, X, y, ax, title, h=0.02):
    """Shade a classifier's predicted regions and overlay the samples.

    The plotting window is the bounding box of the first two columns of ``X``
    padded by 0.5 on every side.  ``model.predict`` is evaluated on a regular
    grid over that window and the result is drawn with ``ax.contourf``.

    Args:
        model: Object with a ``predict`` method accepting an
            ``(n_points, 2)`` array and returning one label per row.
        X: 2-D array; columns 0 and 1 are used as plot coordinates.
        y: Label array aligned with the rows of ``X``.  Only samples
            labelled 0 or 1 are drawn.
        ax: Axes-like object providing ``contourf``, ``scatter``,
            ``set_title`` and ``grid``.
        title: Text passed to ``ax.set_title``.
        h: Grid spacing along both axes.  Must be positive; defaults to 0.02.

    Raises:
        ValueError: If ``h`` is not strictly positive.
    """
    if h <= 0:
        raise ValueError(f"h must be positive, got {h!r}")

    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Predict every grid node in one call, then fold back to the grid shape.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # The reversed map runs blue -> red, so the region predicted as class 0 is
    # bluish and class 1 reddish, matching the steelblue / coral points below.
    # The plain 'RdYlBu' map shaded each region in the other class's colour.
    ax.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu_r')
    ax.scatter(X[y==0, 0], X[y==0, 1], c='steelblue', s=30, edgecolors='white')
    ax.scatter(X[y==1, 0], X[y==1, 1], c='coral', s=30, edgecolors='white')
    ax.set_title(title, fontsize=11)
    ax.grid(True, alpha=0.3)

# Sweep gamma with C fixed at 1, drawing one decision-boundary panel per value.
gamma_values = [0.1, 0.5, 1, 5, 10, 50]
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for panel, gamma in zip(axes, gamma_values):
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('svm', SVC(kernel='rbf', gamma=gamma, C=1)),
    ])
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    # Total number of support vectors, summed over the per-class counts.
    n_sv = model.named_steps['svm'].n_support_.sum()
    panel_title = f'gamma={gamma}\nAcc={acc:.2%}, SV={n_sv}'
    plot_boundary(model, X, y, panel, panel_title)

plt.suptitle('gamma 参数对 RBF 核决策边界的影响', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

# Print the takeaway under the figure, one line at a time.
for summary_line in (
    "\ngamma 影响:",
    "- gamma 小: 边界平滑，可能欠拟合",
    "- gamma 大: 边界复杂，可能过拟合",
):
    print(summary_line)

In [None]:
# Interaction between C and gamma: each row of panels fixes C, each column
# fixes gamma.
C_values = [0.1, 1, 10]
gamma_values = [0.1, 1, 10]

fig, axes = plt.subplots(3, 3, figsize=(15, 15))

for axes_row, C in zip(axes, C_values):
    for cell_ax, gamma in zip(axes_row, gamma_values):
        model = Pipeline([
            ('scaler', StandardScaler()),
            ('svm', SVC(kernel='rbf', gamma=gamma, C=C)),
        ])
        model.fit(X_train, y_train)
        acc = accuracy_score(y_test, model.predict(X_test))
        cell_title = f'C={C}, gamma={gamma}\nAcc={acc:.2%}'
        plot_boundary(model, X, y, cell_ax, cell_title)

plt.suptitle('C 和 gamma 参数交互效应', fontsize=14, y=1.01)
plt.tight_layout()
plt.show()

In [None]:
# Grid search over C and gamma, scored by accuracy with cv=5.
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': [0.1, 1, 10],
}

search_pipeline = Pipeline([('scaler', StandardScaler()), ('svm', SVC(kernel='rbf'))])
grid = GridSearchCV(
    search_pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    return_train_score=True,
)
grid.fit(X_train, y_train)

best_params = grid.best_params_
print(f"最佳参数: {best_params}")
print(f"最佳 CV 准确率: {grid.best_score_:.4f}")
print(f"测试准确率: {accuracy_score(y_test, grid.predict(X_test)):.4f}")

# Plot the decision boundary of the best estimator found by the search.
best_C = best_params["svm__C"]
best_gamma = best_params["svm__gamma"]
fig, ax = plt.subplots(figsize=(10, 6))
plot_boundary(grid.best_estimator_, X, y, ax,
              f'最佳模型: C={best_C}, gamma={best_gamma}')
plt.tight_layout()
plt.show()

In [None]:
# 单元测试
def run_tests():
    """Run four smoke checks on a default RBF-kernel SVM and print a report.

    The checks are, in order: the scaler + SVC pipeline trains, its
    predictions have the same shape as ``y_test``, test accuracy exceeds 0.8,
    and the fitted SVC has at least one support vector.  The function reads
    the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Every failure is recorded instead of raised, so the full report is always
    printed.  A check whose prerequisite failed is reported as skipped
    ("跳过") with the reason, rather than failing on an unbound local
    variable.

    Returns:
        None.  The report is written to standard output.
    """
    tests = []
    m = None     # fitted pipeline; stays None when training fails
    pred = None  # test-set predictions; stays None when prediction fails

    # 1. Training.  Bind ``m`` only after ``fit`` succeeded, so later checks
    #    never see a half-initialised model.
    try:
        candidate = Pipeline([('s', StandardScaler()), ('svm', SVC(kernel='rbf'))])
        candidate.fit(X_train, y_train)
        m = candidate
        tests.append(("RBF SVM 训练", True, ""))
    except Exception as e:  # broad on purpose: any failure becomes a report row
        tests.append(("RBF SVM 训练", False, str(e)))

    # 2. Prediction shape.
    if m is None:
        tests.append(("预测输出", False, "跳过: 模型训练失败"))
    else:
        try:
            pred = m.predict(X_test)
            # Explicit raise instead of ``assert`` so the check survives -O.
            if pred.shape != y_test.shape:
                raise AssertionError(f"预测形状 {pred.shape} != {y_test.shape}")
            tests.append(("预测输出", True, ""))
        except Exception as e:
            tests.append(("预测输出", False, str(e)))

    # 3. Accuracy threshold.
    if pred is None:
        tests.append(("准确率", False, "跳过: 无预测结果"))
    else:
        try:
            acc = accuracy_score(y_test, pred)
            if not acc > 0.8:
                raise AssertionError(f"准确率 {acc} < 0.8")
            tests.append(("准确率", True, f"{acc:.2%}"))
        except Exception as e:
            tests.append(("准确率", False, str(e)))

    # 4. Support vectors exist.
    if m is None:
        tests.append(("支持向量", False, "跳过: 模型训练失败"))
    else:
        try:
            sv = m.named_steps['svm'].support_vectors_
            if not len(sv) > 0:
                raise AssertionError("支持向量数量为 0")
            tests.append(("支持向量", True, f"数量: {len(sv)}"))
        except Exception as e:
            tests.append(("支持向量", False, str(e)))

    print("="*50 + "\n单元测试结果\n" + "="*50)
    for name, ok, msg in tests:
        print(f"{'✓' if ok else '✗'} {name}" + (f" ({msg})" if msg else ""))
    print(f"\n通过: {sum(t[1] for t in tests)}/{len(tests)}")

# Run the smoke checks right away so the report is printed immediately.
run_tests()

## 知识总结

### RBF 核要点

| 参数 | 作用 | 调优建议 |
|------|------|----------|
| gamma | 核函数宽度的倒数 (gamma 越大，核越窄) | 对数尺度搜索 [0.001, 1000] |
| C | 误分类惩罚系数 (正则化强度的倒数，C 越大正则化越弱) | 对数尺度搜索 [0.001, 1000] |

### 参数选择指南
1. **gamma 和 C 需要同时调优** (网格/随机搜索)
2. **特征必须标准化** (RBF 对尺度敏感)
3. **RBF 是"万能核"**，不确定时先试 RBF

### 适用场景
- 中小规模数据集
- 非线性决策边界
- 对数据结构无先验知识时