In [None]:
# 基于原始数据 - 21 (已添加最终测试集评估)
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import os # 导入os模块用于创建文件夹
from pathlib import Path 
# Project layout is resolved from the directory the notebook is started in:
# the data and output folders sit one level above the working directory.
CURRENT_DIR = Path.cwd()
PROJECT_ROOT = CURRENT_DIR.parent
DATA_DIR = PROJECT_ROOT.joinpath("data")
OUTPUT_DIR = PROJECT_ROOT.joinpath("output")

import xgboost as xgb
# Probe GPU training support before the expensive steps start.
# Constructing a scikit-learn style estimator only stores its parameters, so the
# probe has to run a (tiny) fit for tree_method='gpu_hist' to be exercised at all;
# without the fit the success message would be printed unconditionally.
# A failure is reported but not re-raised, keeping the check best-effort.
try:
    print("尝试使用GPU初始化一个简单的XGBoost模型...")
    temp_model = xgb.XGBClassifier(tree_method='gpu_hist', n_estimators=1)
    # Four rows with two classes are enough to run a single boosting round.
    temp_model.fit(np.array([[0.0], [1.0], [0.0], [1.0]]), np.array([0, 1, 0, 1]))
    print("XGBoost GPU tree_method ('gpu_hist') 初始化成功。")
except Exception as e:
    print(f"XGBoost GPU 初始化或测试失败: {e}")

# --- Configuration ---
# Name of the regression target column.
target_column_name = 'Rowing distance'

# Development set (80% of the data).
development_data_path = DATA_DIR.joinpath("development_set_selected_features.xlsx")
# Final test set (20% of the data).
final_test_data_path = DATA_DIR.joinpath("final_test_set_selected_features.xlsx")

# Plots are written directly into the output folder; create it when missing.
output_plot_path = OUTPUT_DIR
output_plot_path.mkdir(parents=True, exist_ok=True)


# --- Data loading ---
# Only the development set is read here; it is what training and tuning run on.
original_data = pd.read_excel(development_data_path)
# Split the frame into the target series and the remaining feature columns.
y_original = original_data[target_column_name]
X_original = original_data.drop(columns=target_column_name)
print(f"开发集数据加载成功，形状: {original_data.shape}")

# --- XGBoost hyperparameter tuning (on the development set) ---
# Candidate values the randomized search samples from.
param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[3, 5, 7, 9],
    learning_rate=[0.01, 0.05, 0.1, 0.15, 0.2],
    subsample=[0.7, 0.8, 0.9, 1],
    colsample_bytree=[0.7, 0.8, 0.9, 1],
    gamma=[0, 0.1, 0.2, 0.3],
    reg_alpha=[0, 0.01, 0.1, 0.5, 1],
    reg_lambda=[0.5, 1, 1.5, 2],
)

# Base estimator; the search only varies the parameters listed in param_grid.
xgb_regressor_base = XGBRegressor(
    objective='reg:squarederror',
    tree_method='gpu_hist',
    random_state=42,
)

print("\n开始超参数调优 (RandomizedSearchCV)...")
# 50 sampled candidates, each scored with 5-fold CV on negated MAE (higher is better).
# NOTE(review): n_jobs=-1 runs the fits in parallel workers that all request
# 'gpu_hist' — confirm the available GPU copes with concurrent training.
# NOTE(review): cv=5 uses different splits than the shuffled KFold used for the
# evaluation further down — confirm this is intended.
random_search_original = RandomizedSearchCV(
    xgb_regressor_base,
    param_grid,
    n_iter=50,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
random_search_original.fit(X_original, y_original)

# Keep the winning configuration for the evaluation and final-training steps.
best_params_original = random_search_original.best_params_
print("最佳参数 (原始数据):", best_params_original)
print(f"最佳交叉验证得分 ({random_search_original.scoring}): {random_search_original.best_score_}")


# --- K-fold cross-validation (on the development set) ---
# Re-estimates the performance of the tuned configuration with shuffled 5-fold CV
# and collects per-fold train/validation metrics in results_original_metrics
# (one dict per fold with keys mae_val, rmse_val, r2_val, mae_train, r2_train).
#
# Each fold model is built and fitted exactly like the final model trained
# afterwards (tuned parameters, plain fit, no early stopping). The held-out fold
# is used for scoring only: letting early stopping monitor that same fold would
# choose the number of trees on the very rows that are then scored, which biases
# the validation metrics optimistically and evaluates a different model than the
# one that is finally trained. It also avoids passing eval_metric /
# early_stopping_rounds to fit(), which recent XGBoost releases no longer accept.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
results_original_metrics = []

print("\n使用最佳参数进行K折交叉验证评估 (在开发集内部):")
for fold, (train_index, test_index) in enumerate(kf.split(X_original), 1):
    # Positional split of features and target for this fold.
    X_train, X_test = X_original.iloc[train_index], X_original.iloc[test_index]
    y_train, y_test = y_original.iloc[train_index], y_original.iloc[test_index]

    xgb_regressor_fold = XGBRegressor(**best_params_original, objective='reg:squarederror', tree_method='gpu_hist', random_state=42)
    xgb_regressor_fold.fit(X_train, y_train)

    y_pred_test = xgb_regressor_fold.predict(X_test)
    y_pred_train = xgb_regressor_fold.predict(X_train)

    # Validation metrics on the held-out fold.
    mae_test = mean_absolute_error(y_test, y_pred_test)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
    r2_test = r2_score(y_test, y_pred_test)

    # Training metrics, reported next to the validation ones to expose overfitting.
    mae_train = mean_absolute_error(y_train, y_pred_train)
    r2_train = r2_score(y_train, y_pred_train)

    print(f"Fold {fold} Results: Train MAE: {mae_train:.4f}, Train R2: {r2_train:.4f} | CV-Validation MAE: {mae_test:.4f}, CV-Validation R2: {r2_test:.4f}")
    results_original_metrics.append({'mae_val': mae_test, 'rmse_val': rmse_test, 'r2_val': r2_test, 'mae_train': mae_train, 'r2_train': r2_train})

# --- Average performance across the cross-validation folds ---
def _fold_average(metric_key):
    """Return the mean of one metric over all collected fold results."""
    return np.mean([fold_result[metric_key] for fold_result in results_original_metrics])


avg_mae_cv_val = _fold_average('mae_val')
avg_rmse_cv_val = _fold_average('rmse_val')
avg_r2_cv_val = _fold_average('r2_val')
avg_mae_cv_train = _fold_average('mae_train')
avg_r2_cv_train = _fold_average('r2_train')

# Validation-side summary.
print("\nOriginal Data - Average CV Validation Performance (on Development Set):")
for metric_label, metric_value in (
    ("MAE", avg_mae_cv_val),
    ("RMSE", avg_rmse_cv_val),
    ("R2 Score", avg_r2_cv_val),
):
    print(f"  {metric_label}: {metric_value:.4f}")

# Training-side summary.
print("\nOriginal Data - Average CV Train Performance (on Development Set):")
for metric_label, metric_value in (
    ("MAE", avg_mae_cv_train),
    ("R2 Score", avg_r2_cv_train),
):
    print(f"  {metric_label}: {metric_value:.4f}")



# --- Train the final baseline model (on the whole development set) ---
print("\n--- 训练最终基准模型 (在整个80%开发集上) ---")
# Fixed settings match the tuning base estimator; the tuned values are merged in.
final_model = XGBRegressor(
    objective='reg:squarederror',
    tree_method='gpu_hist',
    random_state=42,
    **best_params_original,
)
final_model.fit(X_original, y_original)
print("最终基准模型训练完成。")


# ==============================================================================
# ======================== 新增的最终评估代码块开始 ========================
# ==============================================================================

print("\n--- 最终无偏评估 (在20%最终留出测试集上) ---")

# --- 1. Load the final test set ---
# Reads the hold-out spreadsheet and splits it into features (X_final_test) and
# target (y_final_test). Any loading problem is reported and then re-raised:
# calling exit() here would end the run without a traceback and with a success
# status, hiding the failure from whoever (or whatever) launched the notebook.
try:
    final_test_df = pd.read_excel(final_test_data_path)
    print(f"最终测试集数据加载成功，形状: {final_test_df.shape}")
    X_final_test = final_test_df.drop(columns=[target_column_name])
    y_final_test = final_test_df[target_column_name]
except FileNotFoundError:
    print(f"错误: 最终测试集文件未找到: {final_test_data_path}")
    raise
except Exception as e:
    # Covers unreadable files as well as a missing target column.
    print(f"加载最终测试集时发生错误: {e}")
    raise

# --- 2. Predict with the already-trained final_model ---
# final_model was fitted on the whole development set (X_original, y_original).
y_pred_final_test = final_model.predict(X_final_test)

# --- 3. Compute and print the final performance metrics ---
mae_final = mean_absolute_error(y_final_test, y_pred_final_test)
rmse_final = np.sqrt(mean_squared_error(y_final_test, y_pred_final_test))
r2_final = r2_score(y_final_test, y_pred_final_test)

# Output layout is kept in line with the format used for the enhanced model's results.
print("\n--- 最终模型在最终测试集上的性能 ---")
for metric_label, metric_value in (
    ("MAE", mae_final),
    ("RMSE", rmse_final),
    ("R2 Score", r2_final),
):
    print(f"{metric_label}: {metric_value:.4f}")

# (Optional) Scatter plot of actual vs. predicted values for visual inspection.
fig_actual_vs_pred, ax_actual_vs_pred = plt.subplots(figsize=(8, 8))
ax_actual_vs_pred.scatter(y_final_test, y_pred_final_test, alpha=0.7, edgecolors='w', linewidth=0.5)

# Span of the reference diagonal: the overall range of actual and predicted values.
min_val = min(y_final_test.min(), y_pred_final_test.min())
max_val = max(y_final_test.max(), y_pred_final_test.max())
ax_actual_vs_pred.plot([min_val, max_val], [min_val, max_val], 'k--', lw=2)  # y = x diagonal

ax_actual_vs_pred.set_xlabel('Actual Rowing Distance')
ax_actual_vs_pred.set_ylabel('Predicted Rowing Distance')
ax_actual_vs_pred.set_title('Baseline Model: Actual vs. Predicted (Hold-Out Test Set)')
ax_actual_vs_pred.grid(True, linestyle='--', alpha=0.7)

# Saving is best-effort: a failure is reported and the figure is still shown.
plot_filename_actual_vs_pred = os.path.join(output_plot_path, "baseline_model_actual_vs_predicted.png")
try:
    fig_actual_vs_pred.savefig(plot_filename_actual_vs_pred, dpi=300, bbox_inches='tight')
    print(f"\n基准模型真实值 vs 预测值图已保存到: {plot_filename_actual_vs_pred}")
except Exception as e:
    print(f"保存基准模型真实值 vs 预测值图时发生错误: {e}")
plt.show()

# ==============================================================================
# ========================= 新增的最终评估代码块结束 =========================
# ==============================================================================

# The feature-importance analysis is kept as before; it inspects the final model
# that was trained on the development set.
print("\n--- 特征重要性分析 (基于在整个开发集上训练的模型) ---")
feature_importances = final_model.feature_importances_
# One row per feature column, ordered from most to least important.
importance_df = pd.DataFrame(
    {'Feature': X_original.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print("特征重要性 (高到低):")
print(importance_df.head(21))

In [None]:
# --- Feature importance analysis ---
print("\nFeature Importance Analysis:")
# Reuse the final_model already fitted on the whole development set instead of
# refitting an identical estimator: this skips a redundant training run and keeps
# the importances tied to the very model that was scored on the hold-out test set.
feature_importances = final_model.feature_importances_
importance_df = pd.DataFrame({'Feature': X_original.columns, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("Feature Importances (High to Low):")
print(importance_df.head(min(20, len(X_original.columns))))  # show the top entries, 20 at most

# Plot the feature importances as a horizontal bar chart (top 20 at most).
# The figure height grows with the number of bars so feature names stay readable.
num_features_to_plot = min(20, len(X_original.columns))
plot_height = max(6, num_features_to_plot * 0.4)
plt.figure(figsize=(10, plot_height))
plt.barh(importance_df['Feature'][:num_features_to_plot], importance_df['Importance'][:num_features_to_plot])
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title(f'Top {num_features_to_plot} Feature Importances')
plt.gca().invert_yaxis()  # first row of importance_df ends up at the top
plt.tight_layout()
plt.grid(True, axis='x', linestyle='--', alpha=0.7)  # grid lines along the x axis only
# Export the chart. Saving is best-effort, like the other figure in this notebook:
# a failed write is reported instead of aborting, so the chart is still displayed
# and the summary line below is still printed.
plot_filename_importance = os.path.join(output_plot_path, "feature_importances.png")
try:
    plt.savefig(plot_filename_importance, dpi=300, bbox_inches='tight')
except Exception as e:
    print(f"保存特征重要性图时发生错误: {e}")
plt.show()

print(f"\n所有图表已尝试保存至: {output_plot_path}")