In [None]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import os
import joblib
from pathlib import Path 
# Project directories: PROJECT_ROOT is taken to be the parent of the current
# working directory, with "data" and "output" folders directly beneath it.
CURRENT_DIR = Path.cwd()
PROJECT_ROOT = CURRENT_DIR.parent
DATA_DIR = PROJECT_ROOT / "data"
OUTPUT_DIR = PROJECT_ROOT / "output"

# --- 0. Global configuration ---
# Data paths
BASE_DATA_PATH = DATA_DIR
DEV_SET_FILE = DATA_DIR / "development_set_selected_features.xlsx"
TEST_SET_FILE = DATA_DIR / "final_test_set_selected_features.xlsx"
AUGMENTED_DATA_OUTPUT_FOLDER = DATA_DIR / "augmented_outputs_svr"  # intermediate augmented data (SVR)
MODEL_OUTPUT_PATH = DATA_DIR / "trained_models_svr"  # trained models (SVR)
OUTPUT_PLOT_PATH = OUTPUT_DIR  # exported figures

# Make sure every output folder exists before anything tries to write to it.
os.makedirs(AUGMENTED_DATA_OUTPUT_FOLDER, exist_ok=True)
os.makedirs(MODEL_OUTPUT_PATH, exist_ok=True)
os.makedirs(OUTPUT_PLOT_PATH, exist_ok=True)

TARGET_COLUMN = 'Rowing distance'
RANDOM_STATE = 42
N_SPLITS_KFOLD = 5

# Seed the global RNGs so that GAN weight initialisation, latent noise and
# DataLoader shuffling are repeatable on a fresh kernel. scikit-learn calls
# below receive RANDOM_STATE explicitly and do not depend on this.
# NOTE(review): some CUDA kernels may still be non-deterministic — confirm if
# bit-exact GPU reproducibility is required.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Default WGAN-GP hyper-parameters.
# The 'epochs' key read by the training helper is NOT defined here; each call
# site copies this dict and sets 'epochs' from one of the two presets below.
# CHO_LEVELS = np.array([0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2])
DEFAULT_WGAN_PARAMS = {
    'latent_dim': 100,
    'lambda_gp': 10,
    'n_critic': 5,
    'lr': 0.00005,  # learning rate for both WGAN-GP optimisers
    'batch_size': 32,
    'epochs_for_cv': 500,  # GAN epochs for per-fold augmentation inside K-fold CV
    'epochs_for_final': 2000  # GAN epochs for HPO / final augmentation (tunable)
}

# Search space for the SVR RandomizedSearchCV (keys use the Pipeline "svr__" prefix).
SVR_PARAM_GRID = {
    'svr__kernel': ['rbf', 'linear', 'poly'],
    'svr__C': [0.1, 1, 10, 50, 100, 200, 500],
    'svr__gamma': ['scale', 'auto', 0.001, 0.005, 0.01, 0.05, 0.1],
    'svr__epsilon': [0.01, 0.05, 0.1, 0.15, 0.2, 0.3],
    'svr__degree': [2, 3]  # only relevant for the poly kernel
}
N_ITER_RANDOMIZED_SEARCH = 50  # number of sampled configurations (can be increased)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"将使用设备: {device}")

# --- 1. WGAN-GP model definitions ---
class Generator(nn.Module):
    """Fully connected generator that maps latent vectors to synthetic rows.

    Layer widths are input_dim -> 128 -> 256 -> 512 -> output_dim. Every
    hidden layer is followed by ReLU; the output layer is left linear.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_widths = (input_dim, 128, 256, 512)
        stack = []
        # Build Linear+ReLU pairs in order so sub-module indices (and therefore
        # state-dict keys) are model.0, model.2, model.4, model.6.
        for fan_in, fan_out in zip(hidden_widths[:-1], hidden_widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.append(nn.Linear(hidden_widths[-1], output_dim))
        self.model = nn.Sequential(*stack)

    def forward(self, z):
        """Return the generated batch for latent input ``z``."""
        return self.model(z)

class Critic(nn.Module):
    """Fully connected WGAN critic that scores each input row with one scalar.

    Layer widths are input_dim -> 512 -> 256 -> 128 -> 1. Hidden layers use
    in-place LeakyReLU with negative slope 0.2; the output has no activation.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_widths = (input_dim, 512, 256, 128)
        stack = []
        for fan_in, fan_out in zip(hidden_widths[:-1], hidden_widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.LeakyReLU(0.2, inplace=True))
        stack.append(nn.Linear(hidden_widths[-1], 1))
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Return the critic score for each row of ``x``."""
        return self.model(x)

def gradient_penalty(critic_model, real_samples, fake_samples, device_in_use):
    """Compute the WGAN-GP gradient penalty on random real/fake interpolations.

    For each row a mixing weight alpha ~ U(0, 1) is drawn, the critic is
    evaluated at ``alpha * real + (1 - alpha) * fake`` and the penalty is the
    batch mean of ``(||d critic / d input||_2 - 1) ** 2``.

    Args:
        critic_model: Callable critic; its output is differentiated w.r.t. its input.
        real_samples: Batch of real samples; the first dimension is the batch.
        fake_samples: Batch of generated samples with the same shape.
        device_in_use: Device on which alpha and the grad-output ones are created.

    Returns:
        Scalar tensor that stays differentiable w.r.t. the critic parameters
        (``create_graph=True``), so it can be added to the critic loss.
    """
    # The penalty only has to train the critic. Detaching both inputs keeps
    # the double-backward graph from reaching back into the generator when the
    # caller passes non-detached fake samples.
    real_data = real_samples.detach()
    fake_data = fake_samples.detach()

    batch_size_gp = real_data.size(0)
    # One alpha per row, broadcast across the feature dimensions.
    alpha = torch.rand(batch_size_gp, 1, device=device_in_use).expand_as(real_data)
    interpolates = (alpha * real_data + ((1 - alpha) * fake_data)).requires_grad_(True)
    d_interpolates = critic_model(interpolates)
    fake_grad_output = torch.ones_like(d_interpolates, device=device_in_use)
    gradients = torch.autograd.grad(
        outputs=d_interpolates, inputs=interpolates, grad_outputs=fake_grad_output,
        create_graph=True, retain_graph=True, only_inputs=True
    )[0]
    # Flatten everything but the batch dimension before taking the per-row norm.
    gradients = gradients.reshape(gradients.size(0), -1)
    gradient_penalty_val = ((gradients.norm(2, dim=1) - 1) ** 2).mean()
    return gradient_penalty_val

# --- 2. WGAN-GP training and sample-generation helper ---
def train_and_generate_wgangp(input_original_df,
                              target_col_name,
                              wgan_hyperparams,
                              num_samples_to_generate,
                              current_device,
                              fold_num_for_logging=None,
                              output_augmented_data_path=None):
    """Train a WGAN-GP on ``input_original_df`` and return synthetic rows.

    All columns of the frame (target included) are standardised column-wise,
    a fresh Generator/Critic pair is trained on the result, and the generated
    samples are mapped back to the original scale. Each column is then clipped
    to the observed [min, max] widened by 1% of the range on both sides; the
    lower bound of 'CHO' and 'PRO' is additionally floored at 0.

    Args:
        input_original_df: DataFrame of real samples; values are cast to float32.
        target_col_name: Not used inside this function; kept so existing
            callers do not break.
        wgan_hyperparams: Dict providing 'latent_dim', 'lambda_gp', 'n_critic',
            'lr', 'batch_size' and 'epochs'.
        num_samples_to_generate: Number of synthetic rows to produce.
        current_device: Torch device used for the models and batches.
        fold_num_for_logging: int -> "Fold N" in the log prefix, str ->
            "Stage <name>", anything else -> no suffix.
        output_augmented_data_path: If truthy, the generated frame is written
            there with ``DataFrame.to_excel``; a failure is printed, not raised.

    Returns:
        DataFrame with the same columns as the input, in the original scale.
        An empty DataFrame with those columns is returned when the input has
        no rows, the batch size is 0, no samples are requested, or no
        DataLoader batch can be formed.
    """
    log_prefix = "[WGAN-GP"
    if isinstance(fold_num_for_logging, int):
        log_prefix += f" Fold {fold_num_for_logging}"
    elif isinstance(fold_num_for_logging, str):
        log_prefix += f" Stage {fold_num_for_logging}"
    log_prefix += "]"

    print(f"\n{log_prefix} 开始处理，输入数据形状: {input_original_df.shape}")

    all_feature_names = input_original_df.columns.tolist()
    n_rows = len(input_original_df)

    # Bail out before computing any statistics: mean/std of an empty array
    # only produce NaNs and runtime warnings.
    current_batch_size = min(wgan_hyperparams['batch_size'], n_rows)
    if current_batch_size == 0:
        print(f"{log_prefix} 错误：数据集为空或过小 ({n_rows} samples), 无法创建DataLoader。")
        return pd.DataFrame(columns=all_feature_names)

    # Nothing to generate: skip the (expensive) training instead of failing
    # later in np.concatenate on an empty list.
    num_samples_to_generate = int(num_samples_to_generate)
    if num_samples_to_generate <= 0:
        print(f"{log_prefix} 警告：请求生成的样本数为 {num_samples_to_generate}，跳过WGAN-GP训练。")
        return pd.DataFrame(columns=all_feature_names)

    original_data_values = input_original_df.values.astype(np.float32)

    # Column-wise standardisation; constant columns get std 1 to avoid division by zero.
    data_mean = np.mean(original_data_values, axis=0)
    data_std = np.std(original_data_values, axis=0)
    data_std[data_std == 0] = 1
    standardized_data = (original_data_values - data_mean) / data_std

    data_tensor = torch.tensor(standardized_data)
    dataset = TensorDataset(data_tensor)

    # Drop the ragged last batch only when at least two full batches exist.
    use_drop_last = len(dataset) >= current_batch_size * 2
    dataloader = DataLoader(dataset, batch_size=current_batch_size, shuffle=True, drop_last=use_drop_last)

    if len(dataloader) == 0 and len(dataset) > 0:
        print(f"{log_prefix} 警告: DataLoader为空，但输入数据集不为空。将尝试 drop_last=False")
        dataloader = DataLoader(dataset, batch_size=current_batch_size, shuffle=True, drop_last=False)
        if len(dataloader) == 0 and len(dataset) > 0:
            print(f"{log_prefix} 错误: DataLoader仍然为空。无法继续GAN训练。")
            return pd.DataFrame(columns=all_feature_names)

    latent_dim = wgan_hyperparams['latent_dim']
    n_epochs = wgan_hyperparams['epochs']
    n_critic_steps = wgan_hyperparams['n_critic']
    lambda_gp = wgan_hyperparams['lambda_gp']

    num_features = standardized_data.shape[1]
    generator = Generator(latent_dim, num_features).to(current_device)
    critic = Critic(num_features).to(current_device)
    optimizer_G = optim.Adam(generator.parameters(), lr=wgan_hyperparams['lr'], betas=(0.5, 0.9))
    optimizer_C = optim.Adam(critic.parameters(), lr=wgan_hyperparams['lr'], betas=(0.5, 0.9))

    print(f"{log_prefix} 开始WGAN-GP训练 ({n_epochs} 轮)...")
    # Last observed losses, kept as detached tensors so logging does not hold
    # on to autograd graphs.
    critic_loss_val, generator_loss_val = torch.tensor(0.0), torch.tensor(0.0)
    log_every = max(1, n_epochs // 10)
    for epoch in range(n_epochs):
        for (real_samples_batch,) in dataloader:
            if real_samples_batch.shape[0] == 0:
                continue
            real_samples_batch = real_samples_batch.to(current_device)
            current_real_batch_size = real_samples_batch.size(0)

            # Critic: n_critic updates per generator update.
            for _ in range(n_critic_steps):
                optimizer_C.zero_grad()
                z = torch.randn(current_real_batch_size, latent_dim, device=current_device)
                # The generator is frozen during critic updates, so no graph
                # through it is needed (neither for the score nor the penalty).
                with torch.no_grad():
                    fake_samples_batch = generator(z)
                critic_real = critic(real_samples_batch)
                critic_fake = critic(fake_samples_batch)
                gp = gradient_penalty(critic, real_samples_batch, fake_samples_batch, current_device)
                critic_loss = torch.mean(critic_fake) - torch.mean(critic_real) + lambda_gp * gp
                critic_loss.backward()
                optimizer_C.step()
                critic_loss_val = critic_loss.detach()

            # Generator: maximise the critic score of freshly generated samples.
            optimizer_G.zero_grad()
            z = torch.randn(current_real_batch_size, latent_dim, device=current_device)
            generated_for_g_loss = generator(z)
            generator_loss = -torch.mean(critic(generated_for_g_loss))
            generator_loss.backward()
            optimizer_G.step()
            generator_loss_val = generator_loss.detach()

        # Report roughly ten times over the whole run.
        if (epoch + 1) % log_every == 0:
            print(f"{log_prefix} [Epoch {epoch+1}/{n_epochs}] Critic Loss: {critic_loss_val.item():.4f}, Gen Loss: {generator_loss_val.item():.4f}")

    print(f"{log_prefix} WGAN-GP训练完成。")
    print(f"{log_prefix} 正在生成 {num_samples_to_generate} 个增强样本...")
    generator.eval()
    generated_samples_list = []
    remaining_samples = num_samples_to_generate
    gen_batch_size = wgan_hyperparams['batch_size']

    # Sample in chunks of at most batch_size rows.
    with torch.no_grad():
        while remaining_samples > 0:
            current_gen_size = min(gen_batch_size, remaining_samples)
            z_generate = torch.randn(current_gen_size, latent_dim, device=current_device)
            generated_batch_std = generator(z_generate).detach().cpu().numpy()
            generated_samples_list.append(generated_batch_std)
            remaining_samples -= current_gen_size

    # Undo the standardisation so the output is on the original data scale.
    generated_standardized_data_np = np.concatenate(generated_samples_list, axis=0)
    generated_data_original_scale_np = generated_standardized_data_np * data_std + data_mean
    generated_data_df = pd.DataFrame(generated_data_original_scale_np, columns=all_feature_names)

    # Post-processing: clip every column (CHO included) to the observed range
    # widened by 1% on each side; constant columns are clipped to their value.
    for col_name in all_feature_names:
        original_col_values = input_original_df[col_name]
        col_min_original = original_col_values.min()
        col_max_original = original_col_values.max()
        col_range = col_max_original - col_min_original

        clip_min_for_col = col_min_original - 0.01 * col_range if col_range != 0 else col_min_original
        clip_max_for_col = col_max_original + 0.01 * col_range if col_range != 0 else col_max_original

        # Columns that must stay non-negative; extend this list if needed.
        if col_name in ['CHO', 'PRO']:
            clip_min_for_col = max(0, clip_min_for_col)

        generated_data_df[col_name] = np.clip(generated_data_df[col_name], clip_min_for_col, clip_max_for_col)
    print(f"{log_prefix} 后处理完成。")

    if output_augmented_data_path:
        # Saving is best-effort: a failure is reported but the data is still returned.
        try:
            output_dir = os.path.dirname(output_augmented_data_path)
            if output_dir:  # a bare filename has no directory to create
                os.makedirs(output_dir, exist_ok=True)
            generated_data_df.to_excel(output_augmented_data_path, index=False)
            print(f"{log_prefix} 增强数据已保存到: {output_augmented_data_path}")
        except Exception as e:
            print(f"{log_prefix} 保存增强数据时发生错误: {e}")

    return generated_data_df

# --- 3. Main workflow ---
# Load the development set and the held-out final test set.
try:
    development_df_original = pd.read_excel(DEV_SET_FILE)
    final_test_df_original = pd.read_excel(TEST_SET_FILE)
    print(f"开发集形状: {development_df_original.shape}, 最终测试集形状: {final_test_df_original.shape}")
except FileNotFoundError as e:
    print(f"错误: 开发集或测试集文件未找到。请检查路径: {e}")
    # Re-raise instead of calling exit(): exit() is a helper for the
    # interactive shell and would end the process with status 0.
    raise

X_dev_original = development_df_original.drop(columns=[TARGET_COLUMN])
y_dev_original = development_df_original[TARGET_COLUMN]
X_final_test = final_test_df_original.drop(columns=[TARGET_COLUMN])
y_final_test = final_test_df_original[TARGET_COLUMN]

# --- Step 3, Part 1: find the best SVR hyper-parameters ---
print("\n--- 步骤三：Part 1 - SVR 超参数调优 ---")
print("为超参数调优生成开发集的增强版本...")
num_augmented_samples_for_hpo = len(development_df_original) * 1  # augmentation multiple (tunable)
current_wgan_hpo_params = DEFAULT_WGAN_PARAMS.copy()
current_wgan_hpo_params['epochs'] = DEFAULT_WGAN_PARAMS['epochs_for_final']  # train the GAN for the longer preset
augmented_dev_for_hpo_output_path = os.path.join(AUGMENTED_DATA_OUTPUT_FOLDER, "augmented_dev_for_svr_hpo.xlsx")

augmented_dev_for_hpo_df = train_and_generate_wgangp(
    input_original_df=development_df_original.copy(),
    target_col_name=TARGET_COLUMN,
    wgan_hyperparams=current_wgan_hpo_params,
    num_samples_to_generate=num_augmented_samples_for_hpo,
    current_device=device,
    fold_num_for_logging="HPO_Dev_Set",
    output_augmented_data_path=augmented_dev_for_hpo_output_path
)
if augmented_dev_for_hpo_df.empty:
    # Without augmented data the search below cannot run; stop with a real error.
    raise RuntimeError("错误：为SVR HPO生成的增强数据为空，无法继续。")

# Real and synthetic rows are pooled into one training table for the search.
X_augmented_dev_for_hpo = augmented_dev_for_hpo_df.drop(columns=[TARGET_COLUMN])
y_augmented_dev_for_hpo = augmented_dev_for_hpo_df[TARGET_COLUMN]
X_combined_dev_for_hpo = pd.concat([X_dev_original, X_augmented_dev_for_hpo], ignore_index=True)
y_combined_dev_for_hpo = pd.concat([y_dev_original, y_augmented_dev_for_hpo], ignore_index=True)
print(f"用于SVR HPO的总数据形状: {X_combined_dev_for_hpo.shape}")

# SVR is scale-sensitive, so scaling lives inside the Pipeline and is re-fit
# on the training part of every internal CV split.
svr_pipeline_for_hpo = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

# NOTE(review): the internal CV folds here contain synthetic rows produced by a
# GAN that saw the whole development set, so the HPO score is likely
# optimistic — confirm this is acceptable for model selection.
random_search_hpo_svr = RandomizedSearchCV(
    estimator=svr_pipeline_for_hpo, param_distributions=SVR_PARAM_GRID,
    n_iter=N_ITER_RANDOMIZED_SEARCH, cv=N_SPLITS_KFOLD,
    scoring='neg_mean_absolute_error', verbose=1, random_state=RANDOM_STATE, n_jobs=-1
)
print("开始SVR超参数搜索 (RandomizedSearchCV)...")
random_search_hpo_svr.fit(X_combined_dev_for_hpo, y_combined_dev_for_hpo)
best_overall_svr_pipeline_params = random_search_hpo_svr.best_params_
# Strip the "svr__" step prefix so the dict can be passed straight to SVR(**params).
# maxsplit=1 keeps any further "__" inside the parameter name intact.
best_overall_svr_params = {
    key.split('__', 1)[1]: value
    for key, value in best_overall_svr_pipeline_params.items()
    if key.startswith('svr__')
}

print(f"找到的最佳SVR Pipeline参数: {best_overall_svr_pipeline_params}")
print(f"提取的最佳SVR参数: {best_overall_svr_params}")
print(f"最佳SVR HPO MAE (负值): {random_search_hpo_svr.best_score_}")

# --- Step 3, Part 2: K-fold cross-validation with per-fold WGAN-GP augmentation (SVR) ---
# For every fold a new GAN is trained on that fold's training rows only, the
# synthetic rows are appended to the training rows, and the SVR (with the
# hyper-parameters found above) is evaluated on the untouched validation rows.
print(f"\n--- 步骤三：Part 2 - 在开发集上进行 {N_SPLITS_KFOLD}-折交叉验证 (SVR, 动态WGAN-GP增强) ---")
kf = KFold(n_splits=N_SPLITS_KFOLD, shuffle=True, random_state=RANDOM_STATE)

# One dict per completed fold: {'fold', 'MAE', 'RMSE', 'R2'}.
kfold_cv_val_metrics_list_svr = []
kfold_cv_train_metrics_list_svr = []
# SVR has no per-iteration evals_result() like XGBoost, so only each fold's
# final MAE is stored here (NaN for folds that were skipped).
cv_fold_train_maes_svr = []
cv_fold_val_maes_svr = []


# GAN settings for CV: the shorter 'epochs_for_cv' preset is used.
current_wgan_cv_params = DEFAULT_WGAN_PARAMS.copy()
current_wgan_cv_params['epochs'] = DEFAULT_WGAN_PARAMS['epochs_for_cv']
augmentation_factor_cv = 1 # synthetic rows generated per real training row inside CV

for fold_idx, (train_indices, val_indices) in enumerate(kf.split(development_df_original)):
    print(f"\n--- K-Fold (SVR): 第 {fold_idx + 1}/{N_SPLITS_KFOLD} 折 ---")
    cv_train_original_fold_df = development_df_original.iloc[train_indices]
    cv_val_original_fold_df = development_df_original.iloc[val_indices]

    # Validation data stays purely real; it is never augmented.
    X_cv_val_fold = cv_val_original_fold_df.drop(columns=[TARGET_COLUMN])
    y_cv_val_fold = cv_val_original_fold_df[TARGET_COLUMN]

    print(f"当前CV训练集（原始）形状: {cv_train_original_fold_df.shape}")
    num_augmented_samples_cv_fold = len(cv_train_original_fold_df) * augmentation_factor_cv
    dynamic_wgan_cv_params = current_wgan_cv_params.copy()
    # Cap the GAN batch size at half the fold's training rows (but at least 1).
    dynamic_wgan_cv_params['batch_size'] = min(current_wgan_cv_params['batch_size'], max(1, len(cv_train_original_fold_df) // 2 if len(cv_train_original_fold_df) > 1 else 1) )

    # Train the GAN on this fold's training rows only; nothing is written to disk.
    cv_augmented_fold_df = train_and_generate_wgangp(
        input_original_df=cv_train_original_fold_df.copy(),
        target_col_name=TARGET_COLUMN,
        wgan_hyperparams=dynamic_wgan_cv_params,
        num_samples_to_generate=num_augmented_samples_cv_fold,
        current_device=device,
        fold_num_for_logging=(fold_idx + 1),
        output_augmented_data_path=None
    )
    if cv_augmented_fold_df.empty:
        # Record NaN so the per-fold MAE lists keep one entry per fold; the
        # metric dict lists get no entry for a skipped fold.
        print(f"警告：Fold {fold_idx + 1} 的增强数据为空，跳过此折。")
        cv_fold_train_maes_svr.append(np.nan)
        cv_fold_val_maes_svr.append(np.nan)
        continue

    X_cv_train_original_fold = cv_train_original_fold_df.drop(columns=[TARGET_COLUMN])
    y_cv_train_original_fold = cv_train_original_fold_df[TARGET_COLUMN]
    X_cv_augmented_fold = cv_augmented_fold_df.drop(columns=[TARGET_COLUMN])
    y_cv_augmented_fold = cv_augmented_fold_df[TARGET_COLUMN]

    # Training table for this fold = real training rows + synthetic rows.
    X_cv_train_combined_fold = pd.concat([X_cv_train_original_fold, X_cv_augmented_fold], ignore_index=True)
    y_cv_train_combined_fold = pd.concat([y_cv_train_original_fold, y_cv_augmented_fold], ignore_index=True)
    print(f"当前CV训练集（原始+增强后）形状: {X_cv_train_combined_fold.shape}")

    # Scale inside the fold: the scaler is fit on the combined training table only.
    scaler_fold = StandardScaler()
    X_cv_train_combined_fold_scaled = scaler_fold.fit_transform(X_cv_train_combined_fold)
    X_cv_val_fold_scaled = scaler_fold.transform(X_cv_val_fold) # transform validation rows with the training-fit scaler

    model_fold_svr = SVR(**best_overall_svr_params) # SVR hyper-parameters found by the HPO step

    print(f"Fold {fold_idx + 1} (SVR): 开始训练SVR模型...")
    model_fold_svr.fit(X_cv_train_combined_fold_scaled, y_cv_train_combined_fold)

    # Validation metrics (real rows only).
    y_pred_val_fold = model_fold_svr.predict(X_cv_val_fold_scaled)
    mae_val = mean_absolute_error(y_cv_val_fold, y_pred_val_fold)
    rmse_val = np.sqrt(mean_squared_error(y_cv_val_fold, y_pred_val_fold))
    r2_val = r2_score(y_cv_val_fold, y_pred_val_fold)
    kfold_cv_val_metrics_list_svr.append({'fold': fold_idx + 1, 'MAE': mae_val, 'RMSE': rmse_val, 'R2': r2_val})
    cv_fold_val_maes_svr.append(mae_val)

    # Training metrics, computed on the combined (real + synthetic) table.
    y_pred_train_fold = model_fold_svr.predict(X_cv_train_combined_fold_scaled)
    mae_train = mean_absolute_error(y_cv_train_combined_fold, y_pred_train_fold)
    rmse_train = np.sqrt(mean_squared_error(y_cv_train_combined_fold, y_pred_train_fold))
    r2_train = r2_score(y_cv_train_combined_fold, y_pred_train_fold)
    kfold_cv_train_metrics_list_svr.append({'fold': fold_idx + 1, 'MAE': mae_train, 'RMSE': rmse_train, 'R2': r2_train})
    cv_fold_train_maes_svr.append(mae_train)

    print(f"Fold {fold_idx + 1} (SVR) - CV Train MAE: {mae_train:.4f}, R2: {r2_train:.4f} | CV Val MAE: {mae_val:.4f}, R2: {r2_val:.4f}")

# Average the per-fold validation metrics. Folds that were skipped contributed
# no row, so the means cover completed folds only.
avg_kfold_cv_val_metrics_df_svr = pd.DataFrame(kfold_cv_val_metrics_list_svr)
print("\n--- SVR K-折交叉验证平均CV验证性能 (开发集, WGAN-GP动态增强) ---")
if not avg_kfold_cv_val_metrics_df_svr.empty:
    avg_mae_cv_val_svr = avg_kfold_cv_val_metrics_df_svr['MAE'].mean()
    avg_rmse_cv_val_svr = avg_kfold_cv_val_metrics_df_svr['RMSE'].mean()
    avg_r2_cv_val_svr = avg_kfold_cv_val_metrics_df_svr['R2'].mean()
    print(f"平均 CV 验证集 MAE (SVR): {avg_mae_cv_val_svr:.4f}")
    print(f"平均 CV 验证集 RMSE (SVR): {avg_rmse_cv_val_svr:.4f}")
    print(f"平均 CV 验证集 R2 (SVR): {avg_r2_cv_val_svr:.4f}")
else:
    print("SVR K-Fold CV验证结果为空。")
    # Define all three averages (RMSE included) so later code can rely on
    # every name existing regardless of which branch ran.
    avg_mae_cv_val_svr, avg_rmse_cv_val_svr, avg_r2_cv_val_svr = np.nan, np.nan, np.nan

# Same aggregation for the training-side metrics (real + synthetic rows).
avg_kfold_cv_train_metrics_df_svr = pd.DataFrame(kfold_cv_train_metrics_list_svr)
print("\n--- SVR K-折交叉验证平均CV训练性能 (开发集, WGAN-GP动态增强) ---")
if not avg_kfold_cv_train_metrics_df_svr.empty:
    avg_mae_cv_train_svr = avg_kfold_cv_train_metrics_df_svr['MAE'].mean()
    avg_rmse_cv_train_svr = avg_kfold_cv_train_metrics_df_svr['RMSE'].mean()  # computed for completeness; not printed
    avg_r2_cv_train_svr = avg_kfold_cv_train_metrics_df_svr['R2'].mean()
    print(f"平均 CV 训练集 MAE (SVR): {avg_mae_cv_train_svr:.4f}")
    print(f"平均 CV 训练集 R2 (SVR): {avg_r2_cv_train_svr:.4f}")
else:
    print("SVR K-Fold CV训练结果为空。")
    avg_mae_cv_train_svr, avg_rmse_cv_train_svr, avg_r2_cv_train_svr = np.nan, np.nan, np.nan

# Figures for the K-fold results; skipped entirely when either average MAE is NaN.
if np.isnan(avg_mae_cv_val_svr) or np.isnan(avg_mae_cv_train_svr):
    print("由于SVR K-Fold平均结果为空或NaN，跳过图表绘制。")
else:
    print("\n--- 步骤三 Part 2 结束 (SVR)：生成K-Fold性能图表 ---")
    metrics_plot_names_en = ['MAE', 'R2 Score']
    values_cv_val_plot_svr = [avg_mae_cv_val_svr, avg_r2_cv_val_svr]
    values_cv_train_plot_svr = [avg_mae_cv_train_svr, avg_r2_cv_train_svr]
    x_axis_plot = np.arange(len(metrics_plot_names_en))

    # Figure 1: grouped bars, train (left) vs. validation (right) averages.
    plt.figure(figsize=(10, 6))
    for bar_shift, bar_heights, bar_label, bar_color in (
        (-0.2, values_cv_train_plot_svr, 'CV Train Avg.', 'skyblue'),
        (0.2, values_cv_val_plot_svr, 'CV Validation Avg.', 'salmon'),
    ):
        plt.bar(x_axis_plot + bar_shift, bar_heights, width=0.4,
                label=bar_label, align='center', color=bar_color)
    plt.xticks(x_axis_plot, metrics_plot_names_en)
    plt.ylabel('Score')
    plt.title('SVR: Average K-Fold CV Train vs. CV Validation Metrics (WGAN-GP Augmented)')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plot_filename_metrics_svr = os.path.join(OUTPUT_PLOT_PATH, "svr_kfold_avg_eval_metrics_wgangp.png")
    try:
        plt.savefig(plot_filename_metrics_svr, dpi=300, bbox_inches='tight')
        print(f"SVR图表1已保存到: {plot_filename_metrics_svr}")
    except Exception as e:
        print(f"保存SVR图表1时发生错误: {e}")
    plt.show()

    # Figure 2: train and validation MAE for each fold.
    if not (cv_fold_train_maes_svr and cv_fold_val_maes_svr):
        print("没有收集到足够的SVR MAE数据用于绘制图表2。")
    else:
        plt.figure(figsize=(12, 6))
        folds_svr = range(1, len(cv_fold_train_maes_svr) + 1)
        for fold_maes, line_marker, line_style, line_label, line_color in (
            (cv_fold_train_maes_svr, 'o', '-', 'CV Train MAE per Fold', 'dodgerblue'),
            (cv_fold_val_maes_svr, 'x', '--', 'CV Validation MAE per Fold', 'orangered'),
        ):
            plt.plot(folds_svr, fold_maes, marker=line_marker, linestyle=line_style,
                     label=line_label, color=line_color)
        plt.xlabel('Fold Number')
        plt.ylabel('Mean Absolute Error (MAE)')
        plt.title('SVR: Training and CV Validation MAE Across Folds (WGAN-GP Augmented)')
        plt.xticks(folds_svr)
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.7)
        plot_filename_loss_svr = os.path.join(OUTPUT_PLOT_PATH, "svr_kfold_mae_per_fold_wgangp.png")
        try:
            plt.savefig(plot_filename_loss_svr, dpi=300, bbox_inches='tight')
            print(f"SVR图表2已保存到: {plot_filename_loss_svr}")
        except Exception as e:
            print(f"保存SVR图表2时发生错误: {e}")
        plt.show()


# --- Step 4: train the final SVR model ---
# A GAN is trained on the full development set, twice as many synthetic rows
# as real rows are generated, and the SVR pipeline is fit on real + synthetic.
print("\n--- 步骤四：训练最终SVR模型 ---")
print("为最终SVR模型生成开发集的完整增强版本...")
final_wgan_params_svr = DEFAULT_WGAN_PARAMS.copy()
final_wgan_params_svr['epochs'] = DEFAULT_WGAN_PARAMS['epochs_for_final']
num_augmented_samples_final_svr = len(development_df_original) * 2  # the final model uses more synthetic rows
final_augmented_dev_output_path_svr = os.path.join(AUGMENTED_DATA_OUTPUT_FOLDER, "augmented_dev_for_final_svr_model.xlsx")

final_augmented_dev_df_svr = train_and_generate_wgangp(
    input_original_df=development_df_original.copy(),
    target_col_name=TARGET_COLUMN,
    wgan_hyperparams=final_wgan_params_svr,
    num_samples_to_generate=num_augmented_samples_final_svr,
    current_device=device,
    fold_num_for_logging="Final_Dev_Set_Aug_SVR",
    output_augmented_data_path=final_augmented_dev_output_path_svr
)
if final_augmented_dev_df_svr.empty:
    # Raise instead of exit(): exit() is an interactive-shell helper and
    # would terminate with status 0 although the run failed.
    raise RuntimeError("错误：为最终SVR模型生成的增强数据为空，无法继续。")

X_final_augmented_dev_svr = final_augmented_dev_df_svr.drop(columns=[TARGET_COLUMN])
y_final_augmented_dev_svr = final_augmented_dev_df_svr[TARGET_COLUMN]
X_train_final_model_svr = pd.concat([X_dev_original, X_final_augmented_dev_svr], ignore_index=True)
y_train_final_model_svr = pd.concat([y_dev_original, y_final_augmented_dev_svr], ignore_index=True)
print(f"用于训练最终SVR模型的总数据形状: {X_train_final_model_svr.shape}")

# The final Pipeline (scaler + SVR) is what gets trained, saved and used for prediction.
final_svr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR(**best_overall_svr_params))  # SVR hyper-parameters found by the HPO step
])

print("开始训练最终SVR模型 (Pipeline)...")
final_svr_pipeline.fit(X_train_final_model_svr, y_train_final_model_svr)
print("最终SVR模型训练完成。")

final_model_path_svr = os.path.join(MODEL_OUTPUT_PATH, "final_svr_wgangp_pipeline.joblib")
joblib.dump(final_svr_pipeline, final_model_path_svr)
print(f"最终SVR Pipeline已保存到: {final_model_path_svr}")

print("\n--- 步骤四结束 (SVR)：生成最终模型特征重要性图 ---")
# Permutation importance is run on the bare SVR step, so the data is scaled
# manually with the pipeline's already-fitted scaler first.
# NOTE(review): importance is measured on the training table (real + synthetic
# rows), not on held-out data — confirm this is the intended reference set.
final_scaler_for_perm = final_svr_pipeline.named_steps['scaler']
final_svr_model_for_perm = final_svr_pipeline.named_steps['svr']
X_train_final_model_svr_scaled = final_scaler_for_perm.transform(X_train_final_model_svr)

perm_importance_svr = permutation_importance(
    final_svr_model_for_perm, X_train_final_model_svr_scaled, y_train_final_model_svr,
    n_repeats=10, random_state=RANDOM_STATE, scoring='neg_mean_absolute_error', n_jobs=-1
)
# Sort features from most to least important.
sorted_idx_svr = perm_importance_svr.importances_mean.argsort()[::-1]
importance_df_svr = pd.DataFrame({
    'Feature': X_dev_original.columns[sorted_idx_svr],  # feature names come from the original development set
    'Importance': perm_importance_svr.importances_mean[sorted_idx_svr]
})

num_features_to_plot = min(20, len(X_dev_original.columns))
plot_height = max(6, num_features_to_plot * 0.4)
plt.figure(figsize=(10, plot_height))
plt.barh(importance_df_svr['Feature'][:num_features_to_plot], importance_df_svr['Importance'][:num_features_to_plot], color='mediumseagreen')
# With neg-MAE scoring the importance is (baseline score - permuted score),
# i.e. how much the MAE grows when the feature is shuffled.
plt.xlabel('Permutation Importance (SVR, increase in MAE when permuted)')
plt.ylabel('Feature')
plt.title(f'SVR: Top {num_features_to_plot} Feature Importances (Final Model with WGAN-GP)')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.grid(True, axis='x', linestyle='--', alpha=0.7)
plot_filename_importance_svr = os.path.join(OUTPUT_PLOT_PATH, "final_svr_model_feature_importances_wgangp.png")
try:
    plt.savefig(plot_filename_importance_svr, dpi=300, bbox_inches='tight')
    print(f"最终SVR模型特征重要性图已保存到: {plot_filename_importance_svr}")
except Exception as e:
    print(f"保存最终SVR模型特征重要性图时发生错误: {e}")
plt.show()

# When the selected kernel is linear, the fitted coefficients are plotted too.
if best_overall_svr_params.get('kernel') == 'linear':
    # Best-effort extra figure: any failure is reported and the run continues.
    try:
        linear_coeffs = final_svr_model_for_perm.coef_[0]
        # The coefficients refer to the scaled features, so their magnitude
        # reflects that scaling; only absolute values are shown here.
        coeff_df_svr = pd.DataFrame({
            'Feature': X_dev_original.columns,
            'Absolute Coefficient': np.abs(linear_coeffs)
        }).sort_values(by='Absolute Coefficient', ascending=False)

        plt.figure(figsize=(10, plot_height))
        plt.barh(coeff_df_svr['Feature'][:num_features_to_plot], coeff_df_svr['Absolute Coefficient'][:num_features_to_plot], color='lightcoral')
        plt.xlabel('Absolute Coefficient Value (SVR Linear Kernel)')
        plt.ylabel('Feature')
        plt.title(f'SVR: Top {num_features_to_plot} Abs. Coefficients (Linear Kernel, WGAN-GP)')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.grid(True, axis='x', linestyle='--', alpha=0.7)
        plot_filename_coeffs_svr = os.path.join(OUTPUT_PLOT_PATH, "final_svr_linear_coeffs_wgangp.png")
        plt.savefig(plot_filename_coeffs_svr, dpi=300, bbox_inches='tight')
        print(f"SVR线性核系数图已保存: {plot_filename_coeffs_svr}")
        plt.show()
    except Exception as e:
        print(f"尝试绘制SVR线性核系数时出错: {e}")

# --- Step 5: final evaluation of the SVR model on the held-out test set ---
print("\n--- 步骤五：在最终测试集上进行SVR无偏评估 ---")
# When running step by step, the saved pipeline can be reloaded instead:
# final_svr_pipeline_loaded = joblib.load(final_model_path_svr)
# The pipeline applies its fitted scaler before predicting.
y_pred_final_test_svr = final_svr_pipeline.predict(X_final_test)

# MAE, RMSE (square root of the MSE) and R2 on the test set.
mae_final_svr, rmse_final_svr, r2_final_svr = (
    mean_absolute_error(y_final_test, y_pred_final_test_svr),
    np.sqrt(mean_squared_error(y_final_test, y_pred_final_test_svr)),
    r2_score(y_final_test, y_pred_final_test_svr),
)
print(
    "--- 最终SVR模型在最终测试集上的性能 ---",
    f"MAE (SVR): {mae_final_svr:.4f}",
    f"RMSE (SVR): {rmse_final_svr:.4f}",
    f"R2 Score (SVR): {r2_final_svr:.4f}",
    sep="\n",
)



In [None]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import os
import joblib
# Scatter plot of actual vs. predicted values on the final test set, with the
# identity line as reference. Relies on y_final_test, y_pred_final_test_svr
# and OUTPUT_PLOT_PATH created by the previous cell.
print("\n--- 步骤五结束 (SVR)：生成最终测试集真实值 vs 预测值图 ---")
fig_actual_vs_pred, ax_actual_vs_pred = plt.subplots(figsize=(8, 8))
ax_actual_vs_pred.scatter(y_final_test, y_pred_final_test_svr, alpha=0.7, edgecolors='w', linewidth=0.5)

# The diagonal spans the joint range of actual and predicted values.
min_val = min(y_final_test.min(), y_pred_final_test_svr.min())
max_val = max(y_final_test.max(), y_pred_final_test_svr.max())
diagonal_ends = [min_val, max_val]
ax_actual_vs_pred.plot(diagonal_ends, diagonal_ends, 'k--', lw=2)

ax_actual_vs_pred.set_xlabel('Actual Rowing Distance')
ax_actual_vs_pred.set_ylabel('Predicted Rowing Distance (SVR)')
ax_actual_vs_pred.set_title('SVR Final Model: Actual vs. Predicted (Test Set, WGAN-GP Augmented)')
ax_actual_vs_pred.grid(True, linestyle='--', alpha=0.7)

plot_filename_actual_vs_pred_svr = os.path.join(OUTPUT_PLOT_PATH, "final_svr_model_actual_vs_predicted_wgangp.png")
try:
    plt.savefig(plot_filename_actual_vs_pred_svr, dpi=300, bbox_inches='tight')
    print(f"最终SVR测试集真实值 vs 预测值图已保存到: {plot_filename_actual_vs_pred_svr}")
except Exception as e:
    print(f"保存最终SVR测试集真实值 vs 预测值图时发生错误: {e}")
plt.show()

print("\n--- 整体流程 (SVR + WGAN-GP) 执行完毕 ---")