# 2025美赛C题：敏感性分析

本Notebook用于统一评估模型对关键参数与建模设定的敏感性，覆盖：

1. **正则化强度（alpha）**：比较Lasso/Ridge在不同alpha下的表现。
2. **模型类别对比**：线性模型 vs. 随机森林。
3. **采样扰动稳健性**：对训练集进行Bootstrap重采样，考察测试集指标（RMSE、R²）的95%百分位区间与稳定性。

> 输出图表与表格将在本Notebook中生成：图按项目规范保存为PDF（输出到 `figures/` 目录），指标表格直接打印输出。

In [3]:
# 基础库
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Silence noisy library warnings to keep the notebook output readable ...
warnings.filterwarnings('ignore')
# ... but keep solver convergence warnings visible. A Lasso fit that stops
# before converging would otherwise go unnoticed in the alpha sweep and the
# bootstrap. Filters added later take precedence over earlier ones, so this
# overrides the blanket 'ignore' for this category only.
warnings.filterwarnings('default', category=ConvergenceWarning)

# Plot style: font fallbacks that can render CJK glyphs, and a proper minus
# sign when a non-ASCII font is active.
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
sns.set_theme(style='whitegrid')

# Notebook compatibility: __file__ is undefined in a notebook kernel, so fall
# back to the working directory (or to the notebook's folder when the kernel
# was started from the repository root).
try:
    base_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    repo_root = os.getcwd()
    candidate = os.path.join(repo_root, 'Simulation', '25C', 'Coder', '敏感性分析')
    base_dir = candidate if os.path.exists(candidate) else repo_root

# Data location (edit if the data lives elsewhere). It is anchored to
# base_dir rather than to the current working directory, so that a kernel
# started from the repository root resolves the same folder as one started
# next to the notebook.
DATA_DIR = os.path.normpath(os.path.join(base_dir, '..', '..'))

FIGURE_DIR = os.path.join(base_dir, 'figures')
os.makedirs(FIGURE_DIR, exist_ok=True)

print('✅ 环境初始化完成')

: 

In [None]:
# Data loading
medal_path = os.path.join(DATA_DIR, 'processed_medal_data.csv')
if not os.path.exists(medal_path):
    raise FileNotFoundError(f'未找到数据文件: {medal_path}')

df = pd.read_csv(medal_path)
print('✅ 数据加载成功:', df.shape)

# Features and target
feature_columns = [
    'total_rolling3_mean',
    'gold_lag1',
    'total_lag1',
    'total_lag2',
    'is_host',
    'total_events',
    'participation_count'
]

target = 'Total'

# Fail early, naming every missing column at once, instead of relying on the
# first KeyError raised by the column selection below.
model_columns = feature_columns + [target]
missing_columns = [col for col in model_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f'数据缺少必要的列: {missing_columns}')

# Rows with missing values in any model column are excluded up front: the
# linear models fitted later reject NaN inputs. `df` itself is left untouched
# and the original row index is preserved.
model_df = df.dropna(subset=model_columns)
n_dropped = len(df) - len(model_df)
if n_dropped:
    print(f'⚠️ 已剔除含缺失值的样本: {n_dropped} 行')

X = model_df[feature_columns]
y = model_df[target]

# Train/test split (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardisation (needed by the linear models); the scaler is fitted on the
# training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('✅ 特征准备完成')

In [None]:
# 1) Sensitivity to regularisation strength (Lasso / Ridge)
# Twelve alpha values, log-spaced from 1e-3 to 1e2.
alpha_grid = np.logspace(-3, 2, 12)

# Estimator factories, listed in the order their rows are recorded per alpha.
estimator_factories = (
    ('Ridge', lambda a: Ridge(alpha=a)),
    ('Lasso', lambda a: Lasso(alpha=a, max_iter=5000)),
)

records = []
for alpha in alpha_grid:
    for label, make_estimator in estimator_factories:
        # Fit on the scaled training split, score on the scaled test split.
        estimator = make_estimator(alpha)
        estimator.fit(X_train_scaled, y_train)
        y_hat = estimator.predict(X_test_scaled)
        records.append({
            'model': label,
            'alpha': alpha,
            'rmse': np.sqrt(mean_squared_error(y_test, y_hat)),
            'mae': mean_absolute_error(y_test, y_hat),
            'r2': r2_score(y_test, y_hat)
        })

sens_df = pd.DataFrame(records)
print(sens_df.head())

# Visualisation: test RMSE against alpha, one curve per model.
fig, ax = plt.subplots(figsize=(8, 5))
for label, curve in sens_df.groupby('model'):
    ax.plot(curve['alpha'], curve['rmse'], marker='o', label=label)
ax.set_xscale('log')
ax.set_xlabel('alpha (log scale)')
ax.set_ylabel('RMSE')
ax.legend()
fig.tight_layout()
fig.savefig(os.path.join(FIGURE_DIR, 'fig_sensitivity_alpha_rmse.pdf'))
plt.show()

In [None]:
# 2) Sensitivity to the model class
def _score_row(label, y_true, y_hat):
    """Return one metrics-table row (model, rmse, mae, r2) for a prediction."""
    return {
        'model': label,
        'rmse': np.sqrt(mean_squared_error(y_true, y_hat)),
        'mae': mean_absolute_error(y_true, y_hat),
        'r2': r2_score(y_true, y_hat)
    }


models = {
    'Linear': LinearRegression(),
    'Ridge(alpha=1.0)': Ridge(alpha=1.0),
    'Lasso(alpha=0.1)': Lasso(alpha=0.1, max_iter=5000),
}

# Linear models are fitted and evaluated on the standardised features.
metrics = [
    _score_row(
        label,
        y_test,
        estimator.fit(X_train_scaled, y_train).predict(X_test_scaled),
    )
    for label, estimator in models.items()
]

# Random forest is fitted on the raw (unscaled) features.
rf = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    min_samples_leaf=2
)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
metrics.append(_score_row('Random Forest', y_test, rf_pred))

metric_df = pd.DataFrame(metrics)
print(metric_df)

# Visualisation: test-set R2 for each model.
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(metric_df['model'], metric_df['r2'], color='steelblue')
ax.set_ylabel('R2')
plt.xticks(rotation=20, ha='right')
fig.tight_layout()
fig.savefig(os.path.join(FIGURE_DIR, 'fig_sensitivity_model_r2.pdf'))
plt.show()

In [None]:
# 3) Bootstrap robustness check (using Lasso as the example model)
# Seed the global NumPy RNG so the resampled indices are reproducible.
np.random.seed(42)

n_boot = 200
n_train = len(X_train_scaled)

bootstrap_rmse, bootstrap_r2 = [], []
for _ in range(n_boot):
    # Resample training rows with replacement; the test split stays fixed.
    rows = np.random.choice(n_train, n_train, replace=True)
    fitted = Lasso(alpha=0.1, max_iter=5000).fit(
        X_train_scaled[rows], y_train.iloc[rows]
    )
    y_hat = fitted.predict(X_test_scaled)

    bootstrap_rmse.append(np.sqrt(mean_squared_error(y_test, y_hat)))
    bootstrap_r2.append(r2_score(y_test, y_hat))

# Percentile interval: 2.5th and 97.5th percentiles of each metric.
ci_bounds = (2.5, 97.5)
rmse_ci = tuple(np.percentile(bootstrap_rmse, q) for q in ci_bounds)
r2_ci = tuple(np.percentile(bootstrap_r2, q) for q in ci_bounds)

print(f'RMSE 95% CI: {rmse_ci}')
print(f'R2   95% CI: {r2_ci}')

# Distribution of the test RMSE across bootstrap replicates.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(bootstrap_rmse, bins=25, color='steelblue', edgecolor='black', alpha=0.75)
ax.set_xlabel('RMSE (bootstrap)')
ax.set_ylabel('Frequency')
fig.tight_layout()
fig.savefig(os.path.join(FIGURE_DIR, 'fig_sensitivity_bootstrap_rmse.pdf'))
plt.show()