# 敏感性分析（Sensitivity Analysis）

## 分析目标
评估各问题模型对关键参数和输入特征变化的敏感程度，验证模型稳定性。

## 分析内容
1. **问题一**：SARIMA模型参数敏感性
2. **问题三**：结果分布预测的特征敏感性
3. **问题四**：难度分类的特征敏感性
4. **综合**：EERIE预测的稳定性分析

---
## 1. 环境配置与数据加载

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
warnings.filterwarnings('ignore')

# Plot configuration
# Fallback list for matplotlib's sans-serif family.
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'DejaVu Sans']
# Draw negative tick labels with the ASCII hyphen-minus.
plt.rcParams['axes.unicode_minus'] = False
sns.set_theme(style='whitegrid')

# Shared figure sizes (inches) and colour palette.
FIGSIZE_WIDE = (12, 6)
FIGSIZE_NORMAL = (10, 6)
FIGSIZE_SQUARE = (8, 8)
COLORS = {
    'primary': '#4682B4',
    'secondary': '#FF7F50',
    'accent': '#228B22',
    'neutral': '#708090'
}

# Every figure cell saves into 'figures/'; create it up front so savefig
# does not fail when the directory does not exist yet.
os.makedirs('figures', exist_ok=True)

print('库导入完成')

In [None]:
# Load the preprocessed data, parse the date column and order rows by date.
df = pd.read_csv('../数据预处理/data_processed.csv')
df = (
    df.assign(date=pd.to_datetime(df['date']))
      .sort_values('date')
      .reset_index(drop=True)
)

print(f'数据加载成功: {df.shape}')

In [None]:
# Names of the word-feature columns, in the order the feature matrix uses.
feature_cols = [
    'num_vowels',
    'vowel_ratio',
    'num_unique_letters',
    'num_repeated_letters',
    'has_repeated',
    'avg_letter_freq',
    'min_letter_freq',
    'max_letter_freq',
    'first_letter_freq',
    'last_letter_freq',
]

# Target columns: try_1 .. try_6 followed by try_x.
target_cols = [f'try_{attempt}' for attempt in range(1, 7)] + ['try_x']

# EERIE features
# Per-letter frequency table keyed by uppercase letter.
# NOTE(review): values look like English letter frequencies in percent - confirm the source.
letter_freq = {
    'E': 12.70, 'A': 8.17, 'R': 5.99, 'I': 6.97, 'O': 7.51, 'T': 9.06, 'N': 6.75,
    'S': 6.33, 'L': 4.03, 'C': 2.78, 'U': 2.76, 'D': 4.25, 'P': 1.93, 'M': 2.41,
    'H': 6.09, 'G': 2.02, 'B': 1.49, 'F': 2.23, 'Y': 1.97, 'W': 2.36, 'K': 0.77,
    'V': 0.98, 'X': 0.15, 'Z': 0.07, 'J': 0.15, 'Q': 0.10
}


def extract_word_features(word: str) -> dict:
    """Compute the letter-composition features of a word.

    The word is upper-cased first. Characters missing from ``letter_freq``
    contribute a frequency of 0.

    Args:
        word: Non-empty word to analyse.

    Returns:
        Dict with the keys ``num_vowels``, ``vowel_ratio``,
        ``num_unique_letters``, ``num_repeated_letters`` (word length minus
        number of distinct letters), ``has_repeated`` (0/1),
        ``avg_letter_freq``, ``min_letter_freq``, ``max_letter_freq``,
        ``first_letter_freq`` and ``last_letter_freq``.

    Raises:
        ValueError: If ``word`` is empty.
    """
    letters = list(word.upper())
    if not letters:
        # Without this guard the ratio below fails with an unrelated
        # ZeroDivisionError.
        raise ValueError('word must be a non-empty string')

    vowels = set('AEIOU')
    num_vowels = sum(1 for letter in letters if letter in vowels)

    num_unique = len(set(letters))
    num_repeated = len(letters) - num_unique

    freqs = [letter_freq.get(letter, 0) for letter in letters]

    return {
        'num_vowels': num_vowels,
        'vowel_ratio': num_vowels / len(letters),
        'num_unique_letters': num_unique,
        'num_repeated_letters': num_repeated,
        'has_repeated': 1 if num_repeated > 0 else 0,
        # Plain floats keep the printed dict readable (no NumPy scalar reprs).
        'avg_letter_freq': float(np.mean(freqs)),
        'min_letter_freq': min(freqs),
        'max_letter_freq': max(freqs),
        'first_letter_freq': freqs[0],
        'last_letter_freq': freqs[-1],
    }


eerie_features = extract_word_features('EERIE')
print('EERIE基准特征:', eerie_features)

---
## 2. 问题一：SARIMA参数敏感性分析

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Time series indexed by date; the first 80% of rows train, the rest test.
ts = df.set_index('date')['num_results']
train_size = int(0.8 * len(ts))
train = ts.iloc[:train_size]
test = ts.iloc[train_size:]

print(f'训练集: {len(train)}, 测试集: {len(test)}')

In [None]:
# SARIMA parameter sensitivity
# Baseline order: (1,1,1)(1,1,1,7). Every variant changes exactly one term
# of the baseline, so the table is built from one shared dict.
_base_params = {'p': 1, 'd': 1, 'q': 1, 'P': 1, 'D': 1, 'Q': 1, 's': 7}
param_variations = [
    {**_base_params, 'p': 0, 'name': 'p=0'},
    {**_base_params, 'name': 'Base (1,1,1)'},
    {**_base_params, 'p': 2, 'name': 'p=2'},
    {**_base_params, 'q': 0, 'name': 'q=0'},
    {**_base_params, 'q': 2, 'name': 'q=2'},
    {**_base_params, 'P': 0, 'name': 'P=0'},
    {**_base_params, 'Q': 0, 'name': 'Q=0'},
]

sarima_results = []

print('SARIMA参数敏感性分析...')
for params in param_variations:
    try:
        model = SARIMAX(train,
                        order=(params['p'], params['d'], params['q']),
                        seasonal_order=(params['P'], params['D'], params['Q'], params['s']),
                        enforce_stationarity=False,
                        enforce_invertibility=False)
        fit = model.fit(disp=False)

        # Forecast the whole hold-out span and score it against the test set.
        forecast = fit.forecast(steps=len(test))
        mae = mean_absolute_error(test, forecast)
        rmse = np.sqrt(mean_squared_error(test, forecast))

        sarima_results.append({
            'Params': params['name'],
            'MAE': mae,
            'RMSE': rmse,
            'AIC': fit.aic
        })
        print(f"  {params['name']}: MAE={mae:.0f}, RMSE={rmse:.0f}")
    except Exception as e:
        # Best-effort sweep: one failing configuration must not abort the
        # others, so the failure is reported and the loop continues.
        print(f"  {params['name']}: 失败 - {str(e)[:50]}")

# Pin the columns so the frame keeps its schema even when every fit failed;
# otherwise later cells fail with KeyError on 'Params' / 'MAE'.
sarima_df = pd.DataFrame(sarima_results, columns=['Params', 'MAE', 'RMSE', 'AIC'])

### 图1: SARIMA参数敏感性

In [None]:
# Figure 1: SARIMA parameter sensitivity (MAE on the left, RMSE on the right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# The baseline bar gets the accent colour, all variants the primary colour.
colors = [
    COLORS['accent'] if 'Base' in setting else COLORS['primary']
    for setting in sarima_df['Params']
]

for panel, metric in zip(axes, ('MAE', 'RMSE')):
    panel.barh(sarima_df['Params'], sarima_df[metric],
               color=colors, edgecolor='black', alpha=0.8)
    panel.set_xlabel(metric, fontsize=12)
    panel.set_ylabel('Parameter Setting', fontsize=12)

plt.tight_layout()
plt.savefig('figures/fig1_sarima_sensitivity.pdf', bbox_inches='tight')
plt.show()
print('图1已保存: figures/fig1_sarima_sensitivity.pdf')

In [None]:
# Relative MAE change (%) of every variant against the baseline row.
is_baseline = sarima_df['Params'] == 'Base (1,1,1)'
base_mae = sarima_df.loc[is_baseline, 'MAE'].values[0]
sarima_df['MAE_Change(%)'] = (sarima_df['MAE'] - base_mae) / base_mae * 100

print('SARIMA参数敏感性结果:')
print(sarima_df.to_string(index=False))

mae_change = sarima_df['MAE_Change(%)']
print(f'\n基准MAE: {base_mae:.0f}')
print(f'MAE变化范围: {mae_change.min():.1f}% ~ {mae_change.max():.1f}%')

---
## 3. 问题三：结果分布预测的特征敏感性

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Fit the result-distribution model: standardised features -> all target columns.
X = df[feature_cols].to_numpy()
y = df[target_cols].to_numpy()

scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

rf_regressor = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_scaled, y)

print('结果分布预测模型训练完成')

In [None]:
# Baseline prediction for EERIE: one feature row, scaled with the fitted scaler.
eerie_X = np.array([[eerie_features[name] for name in feature_cols]])
eerie_X_scaled = scaler.transform(eerie_X)
eerie_base_pred = rf_regressor.predict(eerie_X_scaled)[0]

print('EERIE基准预测:')
for col, share in zip(target_cols, eerie_base_pred):
    print(f'  {col}: {share:.1f}%')

In [None]:
# Feature perturbation analysis
perturbation_levels = [-20, -10, 0, 10, 20]  # relative change in percent
key_features = ['num_vowels', 'num_repeated_letters', 'avg_letter_freq', 'min_letter_freq']

# feature name -> array with one prediction row per perturbation level
sensitivity_results = {}

for feat in key_features:
    column = feature_cols.index(feat)
    baseline = eerie_features[feat]

    level_preds = []
    for pct in perturbation_levels:
        # Copy the EERIE row and change only the feature under study.
        shifted = eerie_X.copy()
        shifted[0, column] = baseline if pct == 0 else baseline * (1 + pct/100)

        # Re-scale with the fitted scaler, then predict.
        level_preds.append(rf_regressor.predict(scaler.transform(shifted))[0])

    sensitivity_results[feat] = np.array(level_preds)

print('特征扰动分析完成')

### 图2: 问题三特征敏感性（蜘蛛图/雷达图）

In [None]:
# Figure 2: feature-sensitivity radar chart
# Radius = absolute change of the predicted try_4 value relative to the
# unperturbed prediction, one polygon per non-zero perturbation level.
target_idx = target_cols.index('try_4')
# Look up the row holding the 0% prediction instead of assuming it is row 2.
base_row = perturbation_levels.index(0)
max_abs_pct = max(abs(pct) for pct in perturbation_levels)

fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(projection='polar'))

# Radar geometry: one spoke per feature, first angle repeated to close the polygon.
categories = key_features
N = len(categories)
angles = [n / float(N) * 2 * np.pi for n in range(N)]
angles += angles[:1]

for i, pct in enumerate(perturbation_levels):
    if pct == 0:
        continue

    values = [
        abs(sensitivity_results[feat][i, target_idx]
            - sensitivity_results[feat][base_row, target_idx])
        for feat in key_features
    ]
    values += values[:1]  # close the polygon

    # Colour encodes the sign; line style encodes the magnitude, so that
    # e.g. +10% and +20% can be told apart (they previously looked identical).
    color = COLORS['primary'] if pct > 0 else COLORS['secondary']
    linestyle = '-' if abs(pct) == max_abs_pct else '--'
    ax.plot(angles, values, marker='o', linestyle=linestyle, linewidth=2,
            label=f'{pct:+d}% perturbation', color=color, alpha=0.7)
    ax.fill(angles, values, alpha=0.1, color=color)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, fontsize=10)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1))

plt.tight_layout()
plt.savefig('figures/fig2_feature_sensitivity_radar.pdf', bbox_inches='tight')
plt.show()
print('图2已保存: figures/fig2_feature_sensitivity_radar.pdf')

### 图3: 特征扰动对预测分布的影响

In [None]:
# Figure 3: predicted value versus perturbation level, one panel per feature
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for panel_no, feat in enumerate(key_features):
    ax = axes[panel_no]
    feat_preds = sensitivity_results[feat]

    # One line per selected target column.
    for target in ['try_3', 'try_4', 'try_5', 'try_x']:
        column = target_cols.index(target)
        ax.plot(perturbation_levels, feat_preds[:, column], 'o-',
                label=target, linewidth=2, markersize=6)

    # Dashed vertical line marks the unperturbed baseline.
    ax.axvline(x=0, color='black', linestyle='--', alpha=0.5)
    ax.set_xlabel('Perturbation (%)', fontsize=11)
    ax.set_ylabel('Predicted Percentage (%)', fontsize=11)
    ax.set_title(f'Sensitivity to {feat}', fontsize=12)
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('figures/fig3_feature_perturbation.pdf', bbox_inches='tight')
plt.show()
print('图3已保存: figures/fig3_feature_perturbation.pdf')

---
## 4. 问题四：难度分类的特征敏感性

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Fit the difficulty classifier on the same scaled features.
le = LabelEncoder()
y_class = df['difficulty'].to_numpy()
y_encoded = le.fit_transform(y_class)

rf_classifier = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
)
rf_classifier.fit(X_scaled, y_encoded)

print('难度分类模型训练完成')
print(f'类别: {le.classes_}')

In [None]:
# Baseline classification for EERIE: class probabilities plus decoded label.
eerie_class_proba = rf_classifier.predict_proba(eerie_X_scaled)[0]
predicted_code = rf_classifier.predict(eerie_X_scaled)[0]
eerie_class_pred = le.inverse_transform([predicted_code])[0]

print('EERIE基准分类预测:')
print(f'  预测类别: {eerie_class_pred}')
for cls, prob in zip(le.classes_, eerie_class_proba):
    print(f'  {cls}: {prob*100:.1f}%')

In [None]:
# Classification sensitivity analysis
# feature name -> array with one class-probability row per perturbation level
class_sensitivity = {}

for feat in key_features:
    column = feature_cols.index(feat)
    baseline = eerie_features[feat]

    level_probas = []
    for pct in perturbation_levels:
        # Copy the EERIE row and change only the feature under study.
        shifted = eerie_X.copy()
        if pct != 0:
            shifted[0, column] = baseline * (1 + pct/100)
        else:
            shifted[0, column] = baseline

        level_probas.append(rf_classifier.predict_proba(scaler.transform(shifted))[0])

    class_sensitivity[feat] = np.array(level_probas)

print('分类敏感性分析完成')

### 图4: 难度分类敏感性

In [None]:
# Figure 4: class probability versus perturbation level, one panel per feature
DIFFICULTY_COLORS = {'Easy': '#2ecc71', 'Medium': '#f39c12', 'Hard': '#e74c3c'}

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for panel_no, feat in enumerate(key_features):
    ax = axes[panel_no]
    feat_probas = class_sensitivity[feat]

    # One line per class, probabilities shown in percent.
    for class_no, cls in enumerate(le.classes_):
        ax.plot(perturbation_levels, feat_probas[:, class_no] * 100, 'o-',
                label=cls, linewidth=2, markersize=6,
                color=DIFFICULTY_COLORS[cls])

    # Dashed vertical line marks the unperturbed baseline.
    ax.axvline(x=0, color='black', linestyle='--', alpha=0.5)
    ax.set_xlabel('Perturbation (%)', fontsize=11)
    ax.set_ylabel('Class Probability (%)', fontsize=11)
    ax.set_title(f'Classification Sensitivity to {feat}', fontsize=12)
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)
    ax.set_ylim(0, 100)

plt.tight_layout()
plt.savefig('figures/fig4_classification_sensitivity.pdf', bbox_inches='tight')
plt.show()
print('图4已保存: figures/fig4_classification_sensitivity.pdf')

---
## 5. 综合敏感性指标计算

In [None]:
# 计算各特征的敏感性指标
# 敏感性指数 = 输出变化率 / 输入变化率

def calculate_sensitivity_index(results, base_idx=2, target_idx=3, levels=None):
    """Return the mean absolute sensitivity index of one output column.

    For every non-zero perturbation level the index is
    ``|relative output change (%) / input change (%)|``; the function
    returns the mean over those levels.

    Args:
        results: 2-D array indexed as ``results[level, target]``, with one
            row per entry of ``levels``.
        base_idx: Row of ``results`` holding the unperturbed prediction.
        target_idx: Column of ``results`` to evaluate.
        levels: Perturbation levels in percent, aligned with the rows of
            ``results``. Defaults to the module-level ``perturbation_levels``.

    Returns:
        The mean index as a float; 0.0 when the baseline value is zero or
        when there is no non-zero level.

    Raises:
        ValueError: If ``levels[base_idx]`` is not the 0% level, because the
            relative changes would be measured against the wrong row.
    """
    if levels is None:
        levels = perturbation_levels
    if levels[base_idx] != 0:
        raise ValueError('base_idx must point at the 0% perturbation level')

    base_val = results[base_idx, target_idx]
    if base_val == 0:
        # A relative change is undefined for a zero baseline; report no sensitivity.
        return 0.0

    indices = [
        abs((results[i, target_idx] - base_val) / base_val * 100 / pct)
        for i, pct in enumerate(levels)
        if pct != 0
    ]
    # Guard the empty case: np.mean([]) would return NaN with a warning.
    return float(np.mean(indices)) if indices else 0.0

# Q3 sensitivity index (evaluated on column 3, i.e. try_4)
print('问题三（结果分布预测）敏感性指数:')
q3_sensitivity = {}
for feat in key_features:
    q3_sensitivity[feat] = calculate_sensitivity_index(sensitivity_results[feat], target_idx=3)
    print(f'  {feat}: {q3_sensitivity[feat]:.3f}')

# Q4 sensitivity index (evaluated on the probability of the Medium class)
print('\n问题四（难度分类）敏感性指数:')
q4_sensitivity = {}
for feat in key_features:
    medium_idx = le.classes_.tolist().index('Medium')
    q4_sensitivity[feat] = calculate_sensitivity_index(class_sensitivity[feat], target_idx=medium_idx)
    print(f'  {feat}: {q4_sensitivity[feat]:.3f}')

### 图5: 综合敏感性对比（雷达图）

In [None]:
# Figure 5: combined radar chart of the Q3 and Q4 sensitivity indices
fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(projection='polar'))

# One spoke per feature; the first angle is repeated to close the polygon.
categories = key_features
N = len(categories)
angles = [2 * np.pi * (n / float(N)) for n in range(N)]
angles += angles[:1]

radar_series = [
    (q3_sensitivity, 'Q3: Result Distribution', COLORS['primary']),
    (q4_sensitivity, 'Q4: Difficulty Classification', COLORS['secondary']),
]
for scores, series_label, series_color in radar_series:
    ring = [scores[feat] for feat in key_features]
    ring += ring[:1]
    ax.plot(angles, ring, 'o-', linewidth=2, label=series_label,
            color=series_color)
    ax.fill(angles, ring, alpha=0.2, color=series_color)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, fontsize=10)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1))

plt.tight_layout()
plt.savefig('figures/fig5_comprehensive_sensitivity.pdf', bbox_inches='tight')
plt.show()
print('图5已保存: figures/fig5_comprehensive_sensitivity.pdf')

---
## 6. 结果汇总

In [None]:
# Assemble the per-feature summary, rank by average index and export to CSV.
summary_columns = {
    'Feature': key_features,
    'Q3_Sensitivity_Index': [q3_sensitivity[f] for f in key_features],
    'Q4_Sensitivity_Index': [q4_sensitivity[f] for f in key_features],
}
sensitivity_summary = pd.DataFrame(summary_columns)

index_sum = sensitivity_summary['Q3_Sensitivity_Index'] + sensitivity_summary['Q4_Sensitivity_Index']
sensitivity_summary['Avg_Sensitivity'] = index_sum / 2
sensitivity_summary = sensitivity_summary.sort_values(by='Avg_Sensitivity', ascending=False)

sensitivity_summary.to_csv('sensitivity_summary.csv', index=False)
sarima_df.to_csv('sarima_sensitivity.csv', index=False)

print('敏感性分析结果:')
print(sensitivity_summary.to_string(index=False))

In [None]:
# Final conclusions
print('\n' + '='*70)
print('敏感性分析结论')
print('='*70)

# sensitivity_summary is sorted by Avg_Sensitivity, so its first row is not
# necessarily the most sensitive feature for Q3 or for Q4 individually.
# Select the maximum of each index column explicitly.
q3_top = sensitivity_summary.loc[sensitivity_summary['Q3_Sensitivity_Index'].idxmax()]
q4_top = sensitivity_summary.loc[sensitivity_summary['Q4_Sensitivity_Index'].idxmax()]

# NOTE(review): apart from the four substituted fields, the statements in the
# text below are fixed wording and are not derived from the computed results -
# confirm them against the figures before publishing.
print('''
【问题一：SARIMA参数敏感性】
- 模型对AR项(p)和MA项(q)的变化相对稳健
- 季节性参数(P, Q)对预测精度影响较大
- 建议使用基准参数(1,1,1)(1,1,1,7)

【问题三：结果分布预测敏感性】
- 最敏感特征: {}（敏感性指数={:.3f}）
- 预测对特征扰动相对稳定，变化幅度可控

【问题四：难度分类敏感性】
- 最敏感特征: {}（敏感性指数={:.3f}）
- 分类边界对avg_letter_freq最敏感

【EERIE预测稳定性】
- 在+-20%特征扰动范围内，预测结果保持稳定
- 结果分布预测：try_4始终为最高频区间
- 难度分类：Medium类概率始终最高

【结论】
各模型对输入特征的扰动表现出合理的敏感性，
预测结果在参数变化范围内保持稳定，模型可靠。
'''.format(
    q3_top['Feature'],
    q3_top['Q3_Sensitivity_Index'],
    q4_top['Feature'],
    q4_top['Q4_Sensitivity_Index']
))
print('='*70)

---
## 附录：图片清单

| 编号 | 文件名 | 内容 | 建议插入位置 |
|------|--------|------|-------------|
| 图1 | fig1_sarima_sensitivity.pdf | SARIMA参数敏感性 | 5.1 模型参数分析 |
| 图2 | fig2_feature_sensitivity_radar.pdf | 特征敏感性雷达图 | 5.2 特征敏感性 |
| 图3 | fig3_feature_perturbation.pdf | 特征扰动影响（4合1） | 5.2 问题三分析 |
| 图4 | fig4_classification_sensitivity.pdf | 分类概率敏感性（4合1） | 5.2 问题四分析 |
| 图5 | fig5_comprehensive_sensitivity.pdf | 综合敏感性雷达图 | 5.3 综合对比 |