## 补全除了随机森林模型之外的其他模型的训练，将训练结果进行保存

## 1.随机森林模型

In [9]:
import pandas as pd
import numpy as np
import time
import random
import os
import joblib
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import optuna
from optuna.samplers import TPESampler
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import warnings

# 忽略警告
warnings.filterwarnings('ignore')

# 设置随机种子保证可重复性
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# 全局文件路径常量
DATA_FILE_PATH = r"G:\Matlab\EX2025\AuxiliaryData\LFMC-gridMean-ML.xlsx"

def load_and_preprocess_selected_data():
    """数据加载函数 - 只选择6个VOD、LAI、Hveg和10个PFT特征，不进行归一化"""
    import pandas as pd
    import numpy as np
    import time

    print(f"加载数据集: {DATA_FILE_PATH}...")
    start_time = time.time()
    
    # 定义所有需要的列
    vod_columns = [
        'VOD_Ku_Hpol_Asc', 'VOD_Ku_Vpol_Asc',
        'VOD_X_Hpol_Asc', 'VOD_X_Vpol_Asc',
        'VOD_C_Hpol_Asc', 'VOD_C_Vpol_Asc'
    ]
    
    pft_columns = [
        'Grass_man', 'Grass_nat',
        'Shrub_bd', 'Shrub_be', 'Shrub_nd', 'Shrub_ne',
        'Tree_bd', 'Tree_be', 'Tree_nd', 'Tree_ne'
    ]
    
    required_columns = [
        'AGB', 'LFMCValue', 'SamplingDate',  # 用于计算目标变量
        'LAI', 'Hveg'  # 主要特征
    ] + vod_columns + pft_columns
    
    # 读取数据
    df = pd.read_excel(DATA_FILE_PATH, usecols=required_columns)
    
    # === 数据类型诊断 ===
    print("\n初始数据类型:")
    print(df.dtypes)
    
    # 确定需要转换为 float64 的列
    columns_to_convert = [col for col in df.columns if col != 'SamplingDate']
    
    # 清洗并转换为 float64
    for col in columns_to_convert:
        if df[col].dtype == 'object' or df[col].dtype.name == 'category':
            print(f"清洗并转换列: {col} (当前类型: {df[col].dtype}) 为 float64")
            # 转字符串，去掉空格、逗号、制表符等干扰字符
            df[col] = (
                df[col]
                .astype(str)
                .str.strip()
                .str.replace(r"[^\d\.\-eE]", "", regex=True)
                .replace({"": np.nan})
            )
            df[col] = pd.to_numeric(df[col], errors='coerce')
        elif df[col].dtype != 'float64':
            df[col] = df[col].astype('float64')
    
    # 检查缺失值
    if df.isnull().any().any():
        print("警告: 数据中存在缺失值，正在清理...")
        df = df.dropna()
        print(f"清理后样本数: {len(df)}")
    
    # 计算 VWC_sample
    df['VWC_sample'] = (df['AGB'] * df['LFMCValue']) / 1000
    df['VWC_sample'] = pd.to_numeric(df['VWC_sample'], errors='coerce')
    
    # 再次清理缺失值
    if df.isnull().any().any():
        print("警告: 类型转换后存在缺失值，正在清理...")
        df = df.dropna()
        print(f"清理后样本数: {len(df)}")
    
    # 过滤 VWC_sample
    df = df[df['VWC_sample'] <= 30]
    
    # 特征工程
    if 'SamplingDate' in df and pd.api.types.is_datetime64_any_dtype(df['SamplingDate']):
        df['Year_diff'] = df['SamplingDate'].dt.year.apply(lambda x: 2020 - x)
    else:
        print("警告: SamplingDate列不存在或不是日期类型，跳过年份差计算")
    
    # 定义特征列
    feature_columns = vod_columns + ['LAI', 'Hveg'] + pft_columns
    available_features = [col for col in feature_columns if col in df]
    missing_features = set(feature_columns) - set(available_features)
    
    if missing_features:
        print(f"警告: 以下特征不存在: {missing_features}")
    
    if not available_features:
        raise ValueError("错误: 没有找到任何特征列")
    
    X = df[available_features]
    y = df['VWC_sample']
    
    # 最终检查
    print("\n最终数据类型:")
    print(X.dtypes)
    print(f"目标变量类型: {y.dtype}")
    print(f"数据预处理完成, 耗时: {(time.time()-start_time)/60:.2f}分钟")
    print(f"使用特征: {len(available_features)}个 (6 VOD, 1 LAI, 1 Hveg, 10 PFT)")
    print(f"样本数量: {len(X)}")
    
    return X, y, df.index

def objective(trial, X_train, y_train):
    """贝叶斯优化目标函数"""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000, step=100),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'max_depth': trial.suggest_int('max_depth', 10, 50, step=10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 8),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }
    
    model = RandomForestRegressor(**params, random_state=SEED, n_jobs=-1)
    
    # 五折交叉验证
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    rmse_scores = []

    for train_idx, val_idx in kf.split(X_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)
    
    return np.mean(rmse_scores)

def plot_results_all(y_true, y_pred, filename):
    """结果可视化函数（更新文件名后缀）"""
    plt.rcParams['font.family'] = 'Times New Roman'
    plt.figure(figsize=(6, 6))
    
    # 计算RMSE和R2指标
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    
    # 绘制散点图
    plt.scatter(
        y_true, y_pred,
        marker='x',
        color='#FF0000',
        linewidths=0.5,
        s=40,
        alpha=0.8,
        zorder=2
    )
    
    # 添加1:1参考线
    max_val = max(np.max(y_true), np.max(y_pred))
    plt.plot([0, max_val], [0, max_val], 'k--', lw=1.5, alpha=0.7, zorder=1)
    
    # 设置坐标轴范围和标签
    plt.xlim(0, max_val + 1)
    plt.ylim(0, max_val + 1)
    plt.xlabel('Insitu VWC (kg/m2)', fontsize=12, fontweight='bold')
    plt.ylabel('RF VWC (kg/m2)', fontsize=12, fontweight='bold')
    
    # 添加标题（更新后缀）
    plt.title("6 VOD + LAI + PFTs + Hveg Random Forest Model", fontsize=16, pad=20, fontweight='bold')
    
    # 添加指标文本
    plt.text(0.05, 0.95,
             f'RMSE = {rmse:.3f} kg/m²\nR² = {r2:.4f}',
             transform=plt.gca().transAxes,
             fontsize=12,
             fontweight='bold',
             verticalalignment='top')
    
    # 网格线和样式调整
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.gca().set_axisbelow(True)
    plt.tight_layout()
    
    # 保存图像（更新后缀）
    os.makedirs("figures", exist_ok=True)
    plot_path = f"figures/{filename}_6VOD_LAI_PFTs_Hveg.png"  # 更新后缀
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"预测图保存至: {plot_path}")
    plt.close()

def plot_optimization_history(study, filename_prefix):
    """绘制贝叶斯优化过程历史图并保存数据"""
    # 创建优化历史数据框
    history_df = pd.DataFrame({
        'trial_number': [t.number for t in study.trials],
        'value': [t.value for t in study.trials],
        'params': [t.params for t in study.trials],
        'state': [t.state for t in study.trials]
    })
    
    # 保存优化历史到CSV
    os.makedirs("optimization_history", exist_ok=True)
    csv_path = f"optimization_history/{filename_prefix}_history.csv"
    history_df.to_csv(csv_path, index=False)
    print(f"优化历史数据已保存至: {csv_path}")
    
    # 提取所有有效试验的值
    valid_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
    values = [t.value for t in valid_trials]
    best_values = [min(values[:i+1]) for i in range(len(values))]
    
    # 绘制优化过程图
    plt.figure(figsize=(10, 6))
    
    # 绘制当前试验值和历史最佳值
    plt.plot(range(1, len(values)+1), values, 'o-', color='blue', alpha=0.5, label='当前试验RMSE')
    plt.plot(range(1, len(values)+1), best_values, 'r-', linewidth=2, label='历史最佳RMSE')
    
    # 标记全局最佳值
    best_value = min(values)
    best_index = values.index(best_value) + 1
    plt.scatter(best_index, best_value, marker='*', s=200, color='red', 
                label=f'全局最佳 (试验#{best_index})')
    
    # 设置图表元素
    plt.xlabel('试验次数', fontsize=12)
    plt.ylabel('RMSE', fontsize=12)
    plt.title('贝叶斯优化过程 (6 VOD + LAI + PFTs + Hveg)', fontsize=14)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    
    # 保存图像
    plot_path = f"optimization_history/{filename_prefix}_history.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"优化过程图保存至: {plot_path}")
    
    return history_df

def plot_optimization_contour(study, filename_prefix):
    """绘制贝叶斯优化等高线图（添加详细错误处理）"""
    try:
        import plotly
        import optuna.visualization as vis
        
        # 使用Optuna内置可视化工具
        fig = vis.plot_contour(study, params=['n_estimators', 'max_depth'])
        if fig:
            fig.update_layout(
                title='贝叶斯优化参数关系 (6 VOD + LAI + PFTs + Hveg)',
                font=dict(size=12),
                width=800,
                height=600
            )
            
            # 保存为HTML格式以便后续交互查看
            os.makedirs("optimization_history", exist_ok=True)
            html_path = f"optimization_history/{filename_prefix}_contour.html"
            fig.write_html(html_path)
            print(f"优化等高线图已保存至: {html_path}")
            
            # 尝试保存为静态图片
            img_path = f"optimization_history/{filename_prefix}_contour.png"
            try:
                # 明确指定使用kaleido引擎
                fig.write_image(img_path, engine="kaleido")
                print(f"优化等高线图已保存至: {img_path}")
            except Exception as e:
                print(f"警告: 使用kaleido引擎保存静态图片失败: {str(e)}")
                print("尝试使用orca引擎...")
                try:
                    fig.write_image(img_path, engine="orca")
                    print(f"使用orca引擎保存成功: {img_path}")
                except Exception as e2:
                    print(f"使用orca引擎也失败: {str(e2)}")
                    print("跳过静态图片保存")
    except ImportError:
        print("警告: plotly未安装，跳过等高线图绘制")
    except Exception as e:
        print(f"绘制等高线图时发生错误: {str(e)}")

def visualize_optimization_from_csv(csv_path):
    """从CSV文件重新绘制优化历史图"""
    try:
        history_df = pd.read_csv(csv_path)
        
        # 过滤有效试验
        history_df = history_df[history_df['state'] == 'COMPLETE'].copy()
        
        if history_df.empty:
            print("警告: CSV中没有有效的试验数据")
            return
            
        # 提取值和最佳值
        values = history_df['value'].tolist()
        best_values = [min(values[:i+1]) for i in range(len(values))]
        
        # 绘制优化过程图
        plt.figure(figsize=(10, 6))
        plt.plot(range(1, len(values)+1), values, 'o-', color='blue', alpha=0.5, label='当前试验RMSE')
        plt.plot(range(1, len(values)+1), best_values, 'r-', linewidth=2, label='历史最佳RMSE')
        
        # 标记全局最佳值
        best_value = min(values)
        best_index = values.index(best_value) + 1
        plt.scatter(best_index, best_value, marker='*', s=200, color='red', 
                    label=f'全局最佳 (试验#{best_index})')
        
        # 设置图表元素
        plt.xlabel('试验次数', fontsize=12)
        plt.ylabel('RMSE', fontsize=12)
        plt.title('贝叶斯优化过程 (从CSV文件生成)', fontsize=14)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.legend()
        
        # 保存和显示
        plt_path = csv_path.replace('.csv', '_from_csv.png')
        plt.savefig(plt_path, dpi=300, bbox_inches='tight')
        plt.show()
        print(f"优化过程图已保存至: {plt_path}")
        return True
        
    except Exception as e:
        print(f"可视化失败: {str(e)}")
        return False

def train_selected_model():
    """训练使用6 VOD + LAI + PFTs + Hveg的新模型"""
    print(f"\n{'='*80}")
    print("开始训练模型: 6 VOD + LAI + 10 PFTs + Hveg")
    print(f"{'='*80}\n")
    
    # 1. 数据加载与预处理（添加Hveg，去除归一化）
    X, y, processed_indices = load_and_preprocess_selected_data()
    
    # 2. 数据划分
    X_train, X_test, y_train, y_test, train_idx, test_idx = train_test_split(
        X, y, processed_indices,
        test_size=0.333,
        random_state=SEED,
        shuffle=True
    )
    
    print(f"训练集样本数: {len(X_train)}")
    print(f"测试集样本数: {len(X_test)}")
    
    # 3. 贝叶斯优化调参
    print("\n开始贝叶斯优化调参...")
    study = optuna.create_study(
        study_name="VWC_6VOD_LAI_PFTs_Hveg_Optimization",
        direction='minimize',
        sampler=TPESampler(seed=SEED)
    )
    
    study.optimize(lambda trial: objective(trial, X_train, y_train), 
                   n_trials=100)
    
    # 获取最佳参数
    best_params = study.best_params
    print("\n最佳参数组合:")
    for key, value in best_params.items():
        print(f"{key}: {value}")
    print(f"最佳验证RMSE: {study.best_value:.4f}")
    
    # 4. 优化过程可视化（保存图像和数据）
    history_df = plot_optimization_history(study, "optuna_study_6VOD_LAI_PFTs_Hveg")
    
    # 尝试绘制等高线图
    try:
        plot_optimization_contour(study, "optuna_study_6VOD_LAI_PFTs_Hveg")
    except Exception as e:
        print(f"绘制等高线图时发生错误: {str(e)}")
    
    # 5. 使用最佳参数训练最终模型
    print("\n训练最终模型...")
    final_model = RandomForestRegressor(
        **best_params,
        random_state=SEED,
        n_jobs=-1
    )
    final_model.fit(X_train, y_train)
    
    # 6. 测试集评估
    print("\n测试集评估...")
    y_pred = final_model.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    test_r2 = r2_score(y_test, y_pred)
    print(f"测试集 RMSE: {test_rmse:.4f}")
    print(f"测试集 R²: {test_r2:.4f}")
    
    # 7. 保存测试集对应原始行（更新后缀）
    test_data_dir = "test_data"
    os.makedirs(test_data_dir, exist_ok=True)

    # 读取原始表格（仅测试集对应行）
    # 重新读取原始数据以获取完整列
    full_df = pd.read_excel(DATA_FILE_PATH)
    test_rows = full_df.loc[test_idx]
    test_rows['y_pred'] = y_pred
    test_rows['y_true'] = y_test.values

    # 保存精简数据集（更新后缀）
    test_data_path = f"{test_data_dir}/test_rows_6VOD_LAI_PFTs_Hveg_Model.csv"
    test_rows.to_csv(test_data_path, index=False)
    print(f"测试集对应原始行已保存至: {test_data_path}") 
    
    # 8. 可视化预测结果（更新后缀）
    plot_results_all(y_test, y_pred, "prediction_results_6VOD_LAI_PFTs_Hveg")
    
    # 9. 保存模型（更新后缀）
    os.makedirs("models", exist_ok=True)
    model_path = "models/RFR_6VOD_LAI_PFTs_Hveg.pkl"  # 更新后缀
    joblib.dump(final_model, model_path)
    print(f"模型已保存至: {model_path}")
    
    # 10. 特征重要性分析
    if hasattr(final_model, 'feature_importances_'):
        feature_importances = pd.Series(final_model.feature_importances_, index=X.columns)
        feature_importances = feature_importances.sort_values(ascending=False)
        
        plt.figure(figsize=(10, 8))
        feature_importances.plot(kind='barh')
        plt.title('Feature Importance - 6 VOD + LAI + PFTs + Hveg Model', fontsize=16, fontweight='bold')
        plt.xlabel('Importance Score', fontsize=12, fontweight='bold')
        plt.tight_layout()
        
        # 保存图像（更新后缀）
        importance_path = f"figures/feature_importance_6VOD_LAI_PFTs_Hveg.png"
        plt.savefig(importance_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"特征重要性图保存至: {importance_path}")
        
        # 保存特征重要性数据
        feature_imp_df = pd.DataFrame({
            'feature': feature_importances.index,
            'importance': feature_importances.values
        }).sort_values('importance', ascending=False)
        
        feature_imp_path = f"figures/feature_importance_6VOD_LAI_PFTs_Hveg.csv"
        feature_imp_df.to_csv(feature_imp_path, index=False)
        print(f"特征重要性数据已保存至: {feature_imp_path}")
        
        # 打印关键特征重要性
        print("\nTop 10特征重要性:")
        for i, (feature, importance) in enumerate(feature_importances.head(10).items()):
            print(f"{i+1}. {feature}: {importance:.4f}")
    else:
        print("警告: 模型没有feature_importances_属性，跳过特征重要性分析")

if __name__ == "__main__":
    train_selected_model()
    print("\n" + "="*80 + "\n模型训练完成!\n" + "="*80)
    
    # 示例：如何使用可视化函数从CSV重新绘图
    # 注释掉以下两行以跳过示例
    # csv_path = "optimization_history/optuna_study_6VOD_LAI_PFTs_Hveg_history.csv"
    # visualize_optimization_from_csv(csv_path)


开始训练模型: 6 VOD + LAI + 10 PFTs + Hveg

加载数据集: G:\Matlab\EX2025\AuxiliaryData\LFMC-gridMean-ML.xlsx...

初始数据类型:
SamplingDate       datetime64[ns]
LFMCValue                  object
AGB                       float64
Hveg                      float64
Grass_man                 float64
Grass_nat                 float64
Shrub_bd                  float64
Shrub_be                   object
Shrub_nd                  float64
Shrub_ne                  float64
Tree_bd                    object
Tree_be                    object
Tree_nd                    object
Tree_ne                    object
LAI                       float64
VOD_Ku_Hpol_Asc           float64
VOD_X_Hpol_Asc            float64
VOD_C_Hpol_Asc            float64
VOD_Ku_Vpol_Asc           float64
VOD_X_Vpol_Asc            float64
VOD_C_Vpol_Asc            float64
dtype: object
清洗并转换列: LFMCValue (当前类型: object) 为 float64
清洗并转换列: Shrub_be (当前类型: object) 为 float64
清洗并转换列: Tree_bd (当前类型: object) 为 float64
清洗并转换列: Tree_be (当前类型: object) 为 fl

[I 2025-09-28 15:03:19,980] A new study created in memory with name: VWC_6VOD_LAI_PFTs_Hveg_Optimization


警告: 数据中存在缺失值，正在清理...
清理后样本数: 16099

最终数据类型:
VOD_Ku_Hpol_Asc    float64
VOD_Ku_Vpol_Asc    float64
VOD_X_Hpol_Asc     float64
VOD_X_Vpol_Asc     float64
VOD_C_Hpol_Asc     float64
VOD_C_Vpol_Asc     float64
LAI                float64
Hveg               float64
Grass_man          float64
Grass_nat          float64
Shrub_bd           float64
Shrub_be           float64
Shrub_nd           float64
Shrub_ne           float64
Tree_bd            float64
Tree_be            float64
Tree_nd            float64
Tree_ne            float64
dtype: object
目标变量类型: float64
数据预处理完成, 耗时: 0.61分钟
使用特征: 18个 (6 VOD, 1 LAI, 1 Hveg, 10 PFT)
样本数量: 16013
训练集样本数: 10680
测试集样本数: 5333

开始贝叶斯优化调参...


[I 2025-09-28 15:03:27,702] Trial 0 finished with value: 1.7484255120906451 and parameters: {'n_estimators': 500, 'max_features': 'sqrt', 'max_depth': 30, 'min_samples_split': 3, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 0 with value: 1.7484255120906451.
[I 2025-09-28 15:03:38,939] Trial 1 finished with value: 1.7610052873815731 and parameters: {'n_estimators': 700, 'max_features': 'sqrt', 'max_depth': 50, 'min_samples_split': 9, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 0 with value: 1.7484255120906451.
[I 2025-09-28 15:03:47,268] Trial 2 finished with value: 1.75415351649218 and parameters: {'n_estimators': 500, 'max_features': 'sqrt', 'max_depth': 20, 'min_samples_split': 7, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 0 with value: 1.7484255120906451.
[I 2025-09-28 15:03:55,241] Trial 3 finished with value: 1.7992555819007472 and parameters: {'n_estimators': 600, 'max_features': 'sqrt', 'max_depth': 30, 'min_samples_split': 7, 'min_sampl


最佳参数组合:
n_estimators: 900
max_features: log2
max_depth: 40
min_samples_split: 5
min_samples_leaf: 1
bootstrap: False
最佳验证RMSE: 1.7380
优化历史数据已保存至: optimization_history/optuna_study_6VOD_LAI_PFTs_Hveg_history.csv
优化过程图保存至: optimization_history/optuna_study_6VOD_LAI_PFTs_Hveg_history.png
警告: plotly未安装，跳过等高线图绘制

训练最终模型...

测试集评估...
测试集 RMSE: 1.7245
测试集 R²: 0.8817
测试集对应原始行已保存至: test_data/test_rows_6VOD_LAI_PFTs_Hveg_Model.csv
预测图保存至: figures/prediction_results_6VOD_LAI_PFTs_Hveg_6VOD_LAI_PFTs_Hveg.png
模型已保存至: models/RFR_6VOD_LAI_PFTs_Hveg.pkl
特征重要性图保存至: figures/feature_importance_6VOD_LAI_PFTs_Hveg.png
特征重要性数据已保存至: figures/feature_importance_6VOD_LAI_PFTs_Hveg.csv

Top 10特征重要性:
1. Hveg: 0.2533
2. Tree_ne: 0.2206
3. Grass_nat: 0.1632
4. LAI: 0.1055
5. Shrub_ne: 0.0504
6. VOD_X_Hpol_Asc: 0.0367
7. VOD_C_Hpol_Asc: 0.0321
8. VOD_Ku_Hpol_Asc: 0.0272
9. Grass_man: 0.0268
10. Tree_bd: 0.0219

模型训练完成!


## 2.LightGBM模型

In [8]:
import pandas as pd
import numpy as np
import time
import random
import os
import joblib
from datetime import datetime
import lightgbm as lgb  # 替换为LightGBM
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import optuna
from optuna.samplers import TPESampler
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import warnings

# 忽略警告
warnings.filterwarnings('ignore')

# 设置随机种子保证可重复性
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# 全局文件路径常量
DATA_FILE_PATH = r"G:\Matlab\EX2025\AuxiliaryData\LFMC-gridMean-ML.xlsx"

def load_and_preprocess_selected_data():
    """数据加载函数 - 只选择6个VOD、LAI、Hveg和10个PFT特征，不进行归一化"""
    import pandas as pd
    import numpy as np
    import time

    print(f"加载数据集: {DATA_FILE_PATH}...")
    start_time = time.time()
    
    # 定义所有需要的列
    vod_columns = [
        'VOD_Ku_Hpol_Asc', 'VOD_Ku_Vpol_Asc',
        'VOD_X_Hpol_Asc', 'VOD_X_Vpol_Asc',
        'VOD_C_Hpol_Asc', 'VOD_C_Vpol_Asc'
    ]
    
    pft_columns = [
        'Grass_man', 'Grass_nat',
        'Shrub_bd', 'Shrub_be', 'Shrub_nd', 'Shrub_ne',
        'Tree_bd', 'Tree_be', 'Tree_nd', 'Tree_ne'
    ]
    
    required_columns = [
        'AGB', 'LFMCValue', 'SamplingDate',  # 用于计算目标变量
        'LAI', 'Hveg'  # 主要特征
    ] + vod_columns + pft_columns
    
    # 读取数据
    df = pd.read_excel(DATA_FILE_PATH, usecols=required_columns)
    
    # === 数据类型诊断 ===
    print("\n初始数据类型:")
    print(df.dtypes)
    
    # 确定需要转换为 float64 的列
    columns_to_convert = [col for col in df.columns if col != 'SamplingDate']
    
    # 清洗并转换为 float64
    for col in columns_to_convert:
        if df[col].dtype == 'object' or df[col].dtype.name == 'category':
            print(f"清洗并转换列: {col} (当前类型: {df[col].dtype}) 为 float64")
            # 转字符串，去掉空格、逗号、制表符等干扰字符
            df[col] = (
                df[col]
                .astype(str)
                .str.strip()
                .str.replace(r"[^\d\.\-eE]", "", regex=True)
                .replace({"": np.nan})
            )
            df[col] = pd.to_numeric(df[col], errors='coerce')
        elif df[col].dtype != 'float64':
            df[col] = df[col].astype('float64')
    
    # 检查缺失值
    if df.isnull().any().any():
        print("警告: 数据中存在缺失值，正在清理...")
        df = df.dropna()
        print(f"清理后样本数: {len(df)}")
    
    # 计算 VWC_sample
    df['VWC_sample'] = (df['AGB'] * df['LFMCValue']) / 1000
    df['VWC_sample'] = pd.to_numeric(df['VWC_sample'], errors='coerce')
    
    # 再次清理缺失值
    if df.isnull().any().any():
        print("警告: 类型转换后存在缺失值，正在清理...")
        df = df.dropna()
        print(f"清理后样本数: {len(df)}")
    
    # 过滤 VWC_sample
    df = df[df['VWC_sample'] <= 30]
    
    # 特征工程
    if 'SamplingDate' in df and pd.api.types.is_datetime64_any_dtype(df['SamplingDate']):
        df['Year_diff'] = df['SamplingDate'].dt.year.apply(lambda x: 2020 - x)
    else:
        print("警告: SamplingDate列不存在或不是日期类型，跳过年份差计算")
    
    # 定义特征列
    feature_columns = vod_columns + ['LAI', 'Hveg'] + pft_columns
    available_features = [col for col in feature_columns if col in df]
    missing_features = set(feature_columns) - set(available_features)
    
    if missing_features:
        print(f"警告: 以下特征不存在: {missing_features}")
    
    if not available_features:
        raise ValueError("错误: 没有找到任何特征列")
    
    X = df[available_features]
    y = df['VWC_sample']
    
    # 最终检查
    print("\n最终数据类型:")
    print(X.dtypes)
    print(f"目标变量类型: {y.dtype}")
    print(f"数据预处理完成, 耗时: {(time.time()-start_time)/60:.2f}分钟")
    print(f"使用特征: {len(available_features)}个 (6 VOD, 1 LAI, 1 Hveg, 10 PFT)")
    print(f"样本数量: {len(X)}")
    
    return X, y, df.index

def objective(trial, X_train, y_train):
    """贝叶斯优化目标函数 - 修改为LightGBM超参数"""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 15, 255),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'random_state': SEED,
        'n_jobs': -1,
        'verbosity': -1  # 静默模式
    }
    
    model = lgb.LGBMRegressor(**params)  # 使用LightGBM模型
    
    # 五折交叉验证
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    rmse_scores = []
    
    for train_idx, val_idx in kf.split(X_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)
    
    return np.mean(rmse_scores)

def plot_results_all(y_true, y_pred, filename):
    """结果可视化函数（更新文件名后缀）"""
    plt.rcParams['font.family'] = 'Times New Roman'
    plt.figure(figsize=(6, 6))
    
    # 计算RMSE和R2指标
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    
    # 绘制散点图
    plt.scatter(
        y_true, y_pred,
        marker='x',
        color='#FF0000',
        linewidths=0.5,
        s=40,
        alpha=0.8,
        zorder=2
    )
    
    # 添加1:1参考线
    max_val = max(np.max(y_true), np.max(y_pred))
    plt.plot([0, max_val], [0, max_val], 'k--', lw=1.5, alpha=0.7, zorder=1)
    
    # 设置坐标轴范围和标签
    plt.xlim(0, max_val + 1)
    plt.ylim(0, max_val + 1)
    plt.xlabel('Insitu VWC (kg/m2)', fontsize=12, fontweight='bold')
    plt.ylabel('LightGBM VWC (kg/m2)', fontsize=12, fontweight='bold')  # 修改ylabel
    
    # 添加标题（更新后缀）
    plt.title("6 VOD + LAI + PFTs + Hveg LightGBM Model", fontsize=16, pad=20, fontweight='bold')  # 修改标题
    
    # 添加指标文本
    plt.text(0.05, 0.95,
             f'RMSE = {rmse:.3f} kg/m²\nR² = {r2:.4f}',
             transform=plt.gca().transAxes,
             fontsize=12,
             fontweight='bold',
             verticalalignment='top')
    
    # 网格线和样式调整
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.gca().set_axisbelow(True)
    plt.tight_layout()
    
    # 保存图像（更新后缀）
    os.makedirs("figures", exist_ok=True)
    plot_path = f"figures/{filename}_6VOD_LAI_PFTs_Hveg.png"  # 更新后缀
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"预测图保存至: {plot_path}")
    plt.close()

def plot_optimization_history(study, filename_prefix):
    """绘制贝叶斯优化过程历史图并保存数据"""
    # 创建优化历史数据框
    history_df = pd.DataFrame({
        'trial_number': [t.number for t in study.trials],
        'value': [t.value for t in study.trials],
        'params': [t.params for t in study.trials],
        'state': [t.state for t in study.trials]
    })
    
    # 保存优化历史到CSV
    os.makedirs("optimization_history", exist_ok=True)
    csv_path = f"optimization_history/{filename_prefix}_history.csv"
    history_df.to_csv(csv_path, index=False)
    print(f"优化历史数据已保存至: {csv_path}")
    
    # 提取所有有效试验的值
    valid_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
    values = [t.value for t in valid_trials]
    best_values = [min(values[:i+1]) for i in range(len(values))]
    
    # 绘制优化过程图
    plt.figure(figsize=(10, 6))
    
    # 绘制当前试验值和历史最佳值
    plt.plot(range(1, len(values)+1), values, 'o-', color='blue', alpha=0.5, label='当前试验RMSE')
    plt.plot(range(1, len(values)+1), best_values, 'r-', linewidth=2, label='历史最佳RMSE')
    
    # 标记全局最佳值
    best_value = min(values)
    best_index = values.index(best_value) + 1
    plt.scatter(best_index, best_value, marker='*', s=200, color='red', 
                label=f'全局最佳 (试验#{best_index})')
    
    # 设置图表元素
    plt.xlabel('试验次数', fontsize=12)
    plt.ylabel('RMSE', fontsize=12)
    plt.title('贝叶斯优化过程 (6 VOD + LAI + PFTs + Hveg)', fontsize=14)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    
    # 保存图像
    plot_path = f"optimization_history/{filename_prefix}_history.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"优化过程图保存至: {plot_path}")
    
    return history_df

def plot_optimization_contour(study, filename_prefix):
    """绘制贝叶斯优化等高线图（添加详细错误处理）"""
    try:
        import plotly
        import optuna.visualization as vis
        
        # 使用Optuna内置可视化工具
        fig = vis.plot_contour(study, params=['n_estimators', 'max_depth'])
        if fig:
            fig.update_layout(
                title='贝叶斯优化参数关系 (6 VOD + LAI + PFTs + Hveg)',
                font=dict(size=12),
                width=800,
                height=600
            )
            
            # 保存为HTML格式以便后续交互查看
            os.makedirs("optimization_history", exist_ok=True)
            html_path = f"optimization_history/{filename_prefix}_contour.html"
            fig.write_html(html_path)
            print(f"优化等高线图已保存至: {html_path}")
            
            # 尝试保存为静态图片
            img_path = f"optimization_history/{filename_prefix}_contour.png"
            try:
                # 明确指定使用kaleido引擎
                fig.write_image(img_path, engine="kaleido")
                print(f"优化等高线图已保存至: {img_path}")
            except Exception as e:
                print(f"警告: 使用kaleido引擎保存静态图片失败: {str(e)}")
                print("尝试使用orca引擎...")
                try:
                    fig.write_image(img_path, engine="orca")
                    print(f"使用orca引擎保存成功: {img_path}")
                except Exception as e2:
                    print(f"使用orca引擎也失败: {str(e2)}")
                    print("跳过静态图片保存")
    except ImportError:
        print("警告: plotly未安装，跳过等高线图绘制")
    except Exception as e:
        print(f"绘制等高线图时发生错误: {str(e)}")

def visualize_optimization_from_csv(csv_path):
    """从CSV文件重新绘制优化历史图"""
    try:
        history_df = pd.read_csv(csv_path)
        
        # 过滤有效试验
        history_df = history_df[history_df['state'] == 'COMPLETE'].copy()
        
        if history_df.empty:
            print("警告: CSV中没有有效的试验数据")
            return
            
        # 提取值和最佳值
        values = history_df['value'].tolist()
        best_values = [min(values[:i+1]) for i in range(len(values))]
        
        # 绘制优化过程图
        plt.figure(figsize=(10, 6))
        plt.plot(range(1, len(values)+1), values, 'o-', color='blue', alpha=0.5, label='当前试验RMSE')
        plt.plot(range(1, len(values)+1), best_values, 'r-', linewidth=2, label='历史最佳RMSE')
        
        # 标记全局最佳值
        best_value = min(values)
        best_index = values.index(best_value) + 1
        plt.scatter(best_index, best_value, marker='*', s=200, color='red', 
                    label=f'全局最佳 (试验#{best_index})')
        
        # 设置图表元素
        plt.xlabel('试验次数', fontsize=12)
        plt.ylabel('RMSE', fontsize=12)
        plt.title('贝叶斯优化过程 (从CSV文件生成)', fontsize=14)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.legend()
        
        # 保存和显示
        plt_path = csv_path.replace('.csv', '_from_csv.png')
        plt.savefig(plt_path, dpi=300, bbox_inches='tight')
        plt.show()
        print(f"优化过程图已保存至: {plt_path}")
        return True
        
    except Exception as e:
        print(f"可视化失败: {str(e)}")
        return False

def train_selected_model():
    """训练使用6 VOD + LAI + PFTs + Hveg的新模型"""
    print(f"\n{'='*80}")
    print("开始训练模型: 6 VOD + LAI + 10 PFTs + Hveg (LightGBM)")
    print(f"{'='*80}\n")
    
    # 1. 数据加载与预处理（添加Hveg，去除归一化）
    X, y, processed_indices = load_and_preprocess_selected_data()
    
    # 2. 数据划分
    X_train, X_test, y_train, y_test, train_idx, test_idx = train_test_split(
        X, y, processed_indices,
        test_size=0.333,
        random_state=SEED,
        shuffle=True
    )
    
    print(f"训练集样本数: {len(X_train)}")
    print(f"测试集样本数: {len(X_test)}")
    
    # 3. 贝叶斯优化调参
    print("\n开始贝叶斯优化调参...")
    study = optuna.create_study(
        study_name="VWC_6VOD_LAI_PFTs_Hveg_Optimization_LGBM",  # 更新study名称
        direction='minimize',
        sampler=TPESampler(seed=SEED)
    )
    
    study.optimize(lambda trial: objective(trial, X_train, y_train), 
                   n_trials=100)
    
    # 获取最佳参数
    best_params = study.best_params
    print("\n最佳参数组合:")
    for key, value in best_params.items():
        print(f"{key}: {value}")
    print(f"最佳验证RMSE: {study.best_value:.4f}")
    
    # 4. 优化过程可视化（保存图像和数据）
    history_df = plot_optimization_history(study, "optuna_study_6VOD_LAI_PFTs_Hveg_LGBM")  # 更新文件名
    
    # 尝试绘制等高线图
    try:
        plot_optimization_contour(study, "optuna_study_6VOD_LAI_PFTs_Hveg_LGBM")  # 更新文件名
    except Exception as e:
        print(f"绘制等高线图时发生错误: {str(e)}")
    
    # 5. 使用最佳参数训练最终模型
    print("\n训练最终模型...")
    final_model = lgb.LGBMRegressor(  # 使用LightGBM
        **best_params,
        random_state=SEED,
        n_jobs=-1
    )
    final_model.fit(X_train, y_train)
    
    # 6. 测试集评估
    print("\n测试集评估...")
    y_pred = final_model.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    test_r2 = r2_score(y_test, y_pred)
    print(f"测试集 RMSE: {test_rmse:.4f}")
    print(f"测试集 R²: {test_r2:.4f}")
    
    # 7. 保存测试集对应原始行（更新后缀）
    test_data_dir = "test_data"
    os.makedirs(test_data_dir, exist_ok=True)

    # 读取原始表格（仅测试集对应行）
    # 重新读取原始数据以获取完整列
    full_df = pd.read_excel(DATA_FILE_PATH)
    test_rows = full_df.loc[test_idx]
    test_rows['y_pred'] = y_pred
    test_rows['y_true'] = y_test.values

    # 保存精简数据集（更新后缀）
    test_data_path = f"{test_data_dir}/test_rows_6VOD_LAI_PFTs_Hveg_LGBM_Model.csv"  # 更新后缀
    test_rows.to_csv(test_data_path, index=False)
    print(f"测试集对应原始行已保存至: {test_data_path}") 
    
    # 8. 可视化预测结果（更新后缀）
    plot_results_all(y_test, y_pred, "prediction_results_6VOD_LAI_PFTs_Hveg_LGBM")  # 更新后缀
    
    # 9. 保存模型（更新后缀）
    os.makedirs("models", exist_ok=True)
    model_path = "models/LGBM_6VOD_LAI_PFTs_Hveg.pkl"  # 更新后缀为LGBM
    joblib.dump(final_model, model_path)
    print(f"模型已保存至: {model_path}")
    
    # 10. 特征重要性分析
    if hasattr(final_model, 'feature_importances_'):
        feature_importances = pd.Series(final_model.feature_importances_, index=X.columns)
        feature_importances = feature_importances.sort_values(ascending=False)
        
        plt.figure(figsize=(10, 8))
        feature_importances.plot(kind='barh')
        plt.title('Feature Importance - 6 VOD + LAI + PFTs + Hveg LightGBM Model', fontsize=16, fontweight='bold')  # 更新标题
        plt.xlabel('Importance Score', fontsize=12, fontweight='bold')
        plt.tight_layout()
        
        # 保存图像（更新后缀）
        importance_path = f"figures/feature_importance_6VOD_LAI_PFTs_Hveg_LGBM.png"  # 更新后缀
        plt.savefig(importance_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"特征重要性图保存至: {importance_path}")
        
        # 保存特征重要性数据
        feature_imp_df = pd.DataFrame({
            'feature': feature_importances.index,
            'importance': feature_importances.values
        }).sort_values('importance', ascending=False)
        
        feature_imp_path = f"figures/feature_importance_6VOD_LAI_PFTs_Hveg_LGBM.csv"  # 更新后缀
        feature_imp_df.to_csv(feature_imp_path, index=False)
        print(f"特征重要性数据已保存至: {feature_imp_path}")
        
        # 打印关键特征重要性
        print("\nTop 10特征重要性:")
        for i, (feature, importance) in enumerate(feature_importances.head(10).items()):
            print(f"{i+1}. {feature}: {importance:.4f}")
    else:
        print("警告: 模型没有feature_importances_属性，跳过特征重要性分析")

if __name__ == "__main__":
    train_selected_model()
    print("\n" + "="*80 + "\n模型训练完成!\n" + "="*80)
    
    # 示例：如何使用可视化函数从CSV重新绘图
    # 注释掉以下两行以跳过示例
    # csv_path = "optimization_history/optuna_study_6VOD_LAI_PFTs_Hveg_LGBM_history.csv"  # 更新路径
    # visualize_optimization_from_csv(csv_path)


开始训练模型: 6 VOD + LAI + 10 PFTs + Hveg (LightGBM)

加载数据集: G:\Matlab\EX2025\AuxiliaryData\LFMC-gridMean-ML.xlsx...

初始数据类型:
SamplingDate       datetime64[ns]
LFMCValue                  object
AGB                       float64
Hveg                      float64
Grass_man                 float64
Grass_nat                 float64
Shrub_bd                  float64
Shrub_be                   object
Shrub_nd                  float64
Shrub_ne                  float64
Tree_bd                    object
Tree_be                    object
Tree_nd                    object
Tree_ne                    object
LAI                       float64
VOD_Ku_Hpol_Asc           float64
VOD_X_Hpol_Asc            float64
VOD_C_Hpol_Asc            float64
VOD_Ku_Vpol_Asc           float64
VOD_X_Vpol_Asc            float64
VOD_C_Vpol_Asc            float64
dtype: object
清洗并转换列: LFMCValue (当前类型: object) 为 float64
清洗并转换列: Shrub_be (当前类型: object) 为 float64
清洗并转换列: Tree_bd (当前类型: object) 为 float64
清洗并转换列: Tree_be (当前类型: o

[I 2025-09-28 14:44:27,080] A new study created in memory with name: VWC_6VOD_LAI_PFTs_Hveg_Optimization_LGBM


清洗并转换列: Tree_ne (当前类型: object) 为 float64
警告: 数据中存在缺失值，正在清理...
清理后样本数: 16099

最终数据类型:
VOD_Ku_Hpol_Asc    float64
VOD_Ku_Vpol_Asc    float64
VOD_X_Hpol_Asc     float64
VOD_X_Vpol_Asc     float64
VOD_C_Hpol_Asc     float64
VOD_C_Vpol_Asc     float64
LAI                float64
Hveg               float64
Grass_man          float64
Grass_nat          float64
Shrub_bd           float64
Shrub_be           float64
Shrub_nd           float64
Shrub_ne           float64
Tree_bd            float64
Tree_be            float64
Tree_nd            float64
Tree_ne            float64
dtype: object
目标变量类型: float64
数据预处理完成, 耗时: 0.60分钟
使用特征: 18个 (6 VOD, 1 LAI, 1 Hveg, 10 PFT)
样本数量: 16013
训练集样本数: 10680
测试集样本数: 5333

开始贝叶斯优化调参...


[I 2025-09-28 14:44:29,777] Trial 0 finished with value: 1.8549321169755921 and parameters: {'n_estimators': 450, 'learning_rate': 0.2536999076681772, 'max_depth': 12, 'num_leaves': 159, 'min_child_samples': 19, 'subsample': 0.662397808134481, 'colsample_bytree': 0.6232334448672797, 'reg_alpha': 8.661761457749352, 'reg_lambda': 6.011150117432088}. Best is trial 0 with value: 1.8549321169755921.
[I 2025-09-28 14:44:41,556] Trial 1 finished with value: 1.8268650409299259 and parameters: {'n_estimators': 750, 'learning_rate': 0.010725209743171996, 'max_depth': 15, 'num_leaves': 215, 'min_child_samples': 25, 'subsample': 0.6727299868828402, 'colsample_bytree': 0.6733618039413735, 'reg_alpha': 3.0424224295953772, 'reg_lambda': 5.247564316322379}. Best is trial 1 with value: 1.8268650409299259.
[I 2025-09-28 14:44:45,449] Trial 2 finished with value: 1.8586187038101496 and parameters: {'n_estimators': 500, 'learning_rate': 0.02692655251486473, 'max_depth': 10, 'num_leaves': 48, 'min_child_sa


最佳参数组合:
n_estimators: 800
learning_rate: 0.018829031543052318
max_depth: 15
num_leaves: 202
min_child_samples: 7
subsample: 0.790482146446866
colsample_bytree: 0.6231804830313077
reg_alpha: 2.095487778593813
reg_lambda: 0.4035925553970658
最佳验证RMSE: 1.7745
优化历史数据已保存至: optimization_history/optuna_study_6VOD_LAI_PFTs_Hveg_LGBM_history.csv
优化过程图保存至: optimization_history/optuna_study_6VOD_LAI_PFTs_Hveg_LGBM_history.png
警告: plotly未安装，跳过等高线图绘制

训练最终模型...

测试集评估...
测试集 RMSE: 1.7920
测试集 R²: 0.8722
测试集对应原始行已保存至: test_data/test_rows_6VOD_LAI_PFTs_Hveg_LGBM_Model.csv
预测图保存至: figures/prediction_results_6VOD_LAI_PFTs_Hveg_LGBM_6VOD_LAI_PFTs_Hveg.png
模型已保存至: models/LGBM_6VOD_LAI_PFTs_Hveg.pkl
特征重要性图保存至: figures/feature_importance_6VOD_LAI_PFTs_Hveg_LGBM.png
特征重要性数据已保存至: figures/feature_importance_6VOD_LAI_PFTs_Hveg_LGBM.csv

Top 10特征重要性:
1. LAI: 16151.0000
2. VOD_Ku_Vpol_Asc: 15417.0000
3. VOD_X_Vpol_Asc: 14521.0000
4. VOD_Ku_Hpol_Asc: 14261.0000
5. VOD_C_Vpol_Asc: 13688.0000
6. VOD_C_Hpol_Asc: 1334

# 3.XGBoost模型

In [None]:
import pandas as pd
import numpy as np
import time
import random
import os
import joblib
from datetime import datetime
import xgboost as xgb  # 替换为XGBoost
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import optuna
from optuna.samplers import TPESampler
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import warnings

# 忽略警告
warnings.filterwarnings('ignore')

# 设置随机种子保证可重复性
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# 全局文件路径常量
DATA_FILE_PATH = r"G:\Matlab\EX2025\AuxiliaryData\LFMC-gridMean-ML.xlsx"

def load_and_preprocess_selected_data():
    """数据加载函数 - 只选择6个VOD、LAI、Hveg和10个PFT特征，不进行归一化"""
    import pandas as pd
    import numpy as np
    import time

    print(f"加载数据集: {DATA_FILE_PATH}...")
    start_time = time.time()
    
    # 定义所有需要的列
    vod_columns = [
        'VOD_Ku_Hpol_Asc', 'VOD_Ku_Vpol_Asc',
        'VOD_X_Hpol_Asc', 'VOD_X_Vpol_Asc',
        'VOD_C_Hpol_Asc', 'VOD_C_Vpol_Asc'
    ]
    
    pft_columns = [
        'Grass_man', 'Grass_nat',
        'Shrub_bd', 'Shrub_be', 'Shrub_nd', 'Shrub_ne',
        'Tree_bd', 'Tree_be', 'Tree_nd', 'Tree_ne'
    ]
    
    required_columns = [
        'AGB', 'LFMCValue', 'SamplingDate',  # 用于计算目标变量
        'LAI', 'Hveg'  # 主要特征
    ] + vod_columns + pft_columns
    
    # 读取数据
    df = pd.read_excel(DATA_FILE_PATH, usecols=required_columns)
    
    # === 数据类型诊断 ===
    print("\n初始数据类型:")
    print(df.dtypes)
    
    # 确定需要转换为 float64 的列
    columns_to_convert = [col for col in df.columns if col != 'SamplingDate']
    
    # 清洗并转换为 float64
    for col in columns_to_convert:
        if df[col].dtype == 'object' or df[col].dtype.name == 'category':
            print(f"清洗并转换列: {col} (当前类型: {df[col].dtype}) 为 float64")
            # 转字符串，去掉空格、逗号、制表符等干扰字符
            df[col] = (
                df[col]
                .astype(str)
                .str.strip()
                .str.replace(r"[^\d\.\-eE]", "", regex=True)
                .replace({"": np.nan})
            )
            df[col] = pd.to_numeric(df[col], errors='coerce')
        elif df[col].dtype != 'float64':
            df[col] = df[col].astype('float64')
    
    # 检查缺失值
    if df.isnull().any().any():
        print("警告: 数据中存在缺失值，正在清理...")
        df = df.dropna()
        print(f"清理后样本数: {len(df)}")
    
    # 计算 VWC_sample
    df['VWC_sample'] = (df['AGB'] * df['LFMCValue']) / 1000
    df['VWC_sample'] = pd.to_numeric(df['VWC_sample'], errors='coerce')
    
    # 再次清理缺失值
    if df.isnull().any().any():
        print("警告: 类型转换后存在缺失值，正在清理...")
        df = df.dropna()
        print(f"清理后样本数: {len(df)}")
    
    # 过滤 VWC_sample
    df = df[df['VWC_sample'] <= 30]
    
    # 特征工程
    if 'SamplingDate' in df and pd.api.types.is_datetime64_any_dtype(df['SamplingDate']):
        df['Year_diff'] = df['SamplingDate'].dt.year.apply(lambda x: 2020 - x)
    else:
        print("警告: SamplingDate列不存在或不是日期类型，跳过年份差计算")
    
    # 定义特征列
    feature_columns = vod_columns + ['LAI', 'Hveg'] + pft_columns
    available_features = [col for col in feature_columns if col in df]
    missing_features = set(feature_columns) - set(available_features)
    
    if missing_features:
        print(f"警告: 以下特征不存在: {missing_features}")
    
    if not available_features:
        raise ValueError("错误: 没有找到任何特征列")
    
    X = df[available_features]
    y = df['VWC_sample']
    
    # 最终检查
    print("\n最终数据类型:")
    print(X.dtypes)
    print(f"目标变量类型: {y.dtype}")
    print(f"数据预处理完成, 耗时: {(time.time()-start_time)/60:.2f}分钟")
    print(f"使用特征: {len(available_features)}个 (6 VOD, 1 LAI, 1 Hveg, 10 PFT)")
    print(f"样本数量: {len(X)}")
    
    return X, y, df.index

def objective(trial, X_train, y_train):
    """贝叶斯优化目标函数 - 修改为XGBoost超参数"""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'random_state': SEED,
        'n_jobs': -1,
        'verbosity': 0  # 静默模式
    }
    
    model = xgb.XGBRegressor(**params)  # 使用XGBoost模型
    
    # 五折交叉验证
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    rmse_scores = []
    
    for train_idx, val_idx in kf.split(X_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)
    
    return np.mean(rmse_scores)

def plot_results_all(y_true, y_pred, filename):
    """结果可视化函数（更新文件名后缀）"""
    plt.rcParams['font.family'] = 'Times New Roman'
    plt.figure(figsize=(6, 6))
    
    # 计算RMSE和R2指标
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    
    # 绘制散点图
    plt.scatter(
        y_true, y_pred,
        marker='x',
        color='#FF0000',
        linewidths=0.5,
        s=40,
        alpha=0.8,
        zorder=2
    )
    
    # 添加1:1参考线
    max_val = max(np.max(y_true), np.max(y_pred))
    plt.plot([0, max_val], [0, max_val], 'k--', lw=1.5, alpha=0.7, zorder=1)
    
    # 设置坐标轴范围和标签
    plt.xlim(0, max_val + 1)
    plt.ylim(0, max_val + 1)
    plt.xlabel('Insitu VWC (kg/m2)', fontsize=12, fontweight='bold')
    plt.ylabel('XGBoost VWC (kg/m2)', fontsize=12, fontweight='bold')  # 修改ylabel
    
    # 添加标题（更新后缀）
    plt.title("6 VOD + LAI + PFTs + Hveg XGBoost Model", fontsize=16, pad=20, fontweight='bold')  # 修改标题
    
    # 添加指标文本
    plt.text(0.05, 0.95,
             f'RMSE = {rmse:.3f} kg/m²\nR² = {r2:.4f}',
             transform=plt.gca().transAxes,
             fontsize=12,
             fontweight='bold',
             verticalalignment='top')
    
    # 网格线和样式调整
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.gca().set_axisbelow(True)
    plt.tight_layout()
    
    # 保存图像（更新后缀）
    os.makedirs("figures", exist_ok=True)
    plot_path = f"figures/{filename}_6VOD_LAI_PFTs_Hveg.png"  # 更新后缀
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"预测图保存至: {plot_path}")
    plt.close()

def plot_optimization_history(study, filename_prefix):
    """绘制贝叶斯优化过程历史图并保存数据"""
    # 创建优化历史数据框
    history_df = pd.DataFrame({
        'trial_number': [t.number for t in study.trials],
        'value': [t.value for t in study.trials],
        'params': [t.params for t in study.trials],
        'state': [t.state for t in study.trials]
    })
    
    # 保存优化历史到CSV
    os.makedirs("optimization_history", exist_ok=True)
    csv_path = f"optimization_history/{filename_prefix}_history.csv"
    history_df.to_csv(csv_path, index=False)
    print(f"优化历史数据已保存至: {csv_path}")
    
    # 提取所有有效试验的值
    valid_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
    values = [t.value for t in valid_trials]
    best_values = [min(values[:i+1]) for i in range(len(values))]
    
    # 绘制优化过程图
    plt.figure(figsize=(10, 6))
    
    # 绘制当前试验值和历史最佳值
    plt.plot(range(1, len(values)+1), values, 'o-', color='blue', alpha=0.5, label='当前试验RMSE')
    plt.plot(range(1, len(values)+1), best_values, 'r-', linewidth=2, label='历史最佳RMSE')
    
    # 标记全局最佳值
    best_value = min(values)
    best_index = values.index(best_value) + 1
    plt.scatter(best_index, best_value, marker='*', s=200, color='red', 
                label=f'全局最佳 (试验#{best_index})')
    
    # 设置图表元素
    plt.xlabel('试验次数', fontsize=12)
    plt.ylabel('RMSE', fontsize=12)
    plt.title('贝叶斯优化过程 (6 VOD + LAI + PFTs + Hveg)', fontsize=14)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    
    # 保存图像
    plot_path = f"optimization_history/{filename_prefix}_history.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"优化过程图保存至: {plot_path}")
    
    return history_df

def plot_optimization_contour(study, filename_prefix):
    """绘制贝叶斯优化等高线图（添加详细错误处理）"""
    try:
        import plotly
        import optuna.visualization as vis
        
        # 使用Optuna内置可视化工具
        fig = vis.plot_contour(study, params=['n_estimators', 'max_depth'])
        if fig:
            fig.update_layout(
                title='贝叶斯优化参数关系 (6 VOD + LAI + PFTs + Hveg)',
                font=dict(size=12),
                width=800,
                height=600
            )
            
            # 保存为HTML格式以便后续交互查看
            os.makedirs("optimization_history", exist_ok=True)
            html_path = f"optimization_history/{filename_prefix}_contour.html"
            fig.write_html(html_path)
            print(f"优化等高线图已保存至: {html_path}")
            
            # 尝试保存为静态图片
            img_path = f"optimization_history/{filename_prefix}_contour.png"
            try:
                # 明确指定使用kaleido引擎
                fig.write_image(img_path, engine="kaleido")
                print(f"优化等高线图已保存至: {img_path}")
            except Exception as e:
                print(f"警告: 使用kaleido引擎保存静态图片失败: {str(e)}")
                print("尝试使用orca引擎...")
                try:
                    fig.write_image(img_path, engine="orca")
                    print(f"使用orca引擎保存成功: {img_path}")
                except Exception as e2:
                    print(f"使用orca引擎也失败: {str(e2)}")
                    print("跳过静态图片保存")
    except ImportError:
        print("警告: plotly未安装，跳过等高线图绘制")
    except Exception as e:
        print(f"绘制等高线图时发生错误: {str(e)}")

def visualize_optimization_from_csv(csv_path):
    """从CSV文件重新绘制优化历史图"""
    try:
        history_df = pd.read_csv(csv_path)
        
        # 过滤有效试验
        history_df = history_df[history_df['state'] == 'COMPLETE'].copy()
        
        if history_df.empty:
            print("警告: CSV中没有有效的试验数据")
            return
            
        # 提取值和最佳值
        values = history_df['value'].tolist()
        best_values = [min(values[:i+1]) for i in range(len(values))]
        
        # 绘制优化过程图
        plt.figure(figsize=(10, 6))
        plt.plot(range(1, len(values)+1), values, 'o-', color='blue', alpha=0.5, label='当前试验RMSE')
        plt.plot(range(1, len(values)+1), best_values, 'r-', linewidth=2, label='历史最佳RMSE')
        
        # 标记全局最佳值
        best_value = min(values)
        best_index = values.index(best_value) + 1
        plt.scatter(best_index, best_value, marker='*', s=200, color='red', 
                    label=f'全局最佳 (试验#{best_index})')
        
        # 设置图表元素
        plt.xlabel('试验次数', fontsize=12)
        plt.ylabel('RMSE', fontsize=12)
        plt.title('贝叶斯优化过程 (从CSV文件生成)', fontsize=14)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.legend()
        
        # 保存和显示
        plt_path = csv_path.replace('.csv', '_from_csv.png')
        plt.savefig(plt_path, dpi=300, bbox_inches='tight')
        plt.show()
        print(f"优化过程图已保存至: {plt_path}")
        return True
        
    except Exception as e:
        print(f"可视化失败: {str(e)}")
        return False

def train_selected_model():
    """训练使用6 VOD + LAI + PFTs + Hveg的新模型"""
    print(f"\n{'='*80}")
    print("开始训练模型: 6 VOD + LAI + 10 PFTs + Hveg (XGBoost)")
    print(f"{'='*80}\n")
    
    # 1. 数据加载与预处理
    X, y, processed_indices = load_and_preprocess_selected_data()
    
    # 2. 数据划分
    X_train, X_test, y_train, y_test, train_idx, test_idx = train_test_split(
        X, y, processed_indices,
        test_size=0.333,
        random_state=SEED,
        shuffle=True
    )
    
    print(f"训练集样本数: {len(X_train)}")
    print(f"测试集样本数: {len(X_test)}")
    
    # 3. 贝叶斯优化调参
    print("\n开始贝叶斯优化调参...")
    study = optuna.create_study(
        study_name="VWC_6VOD_LAI_PFTs_Hveg_Optimization_XGB",  # 更新study名称
        direction='minimize',
        sampler=TPESampler(seed=SEED)
    )
    
    study.optimize(lambda trial: objective(trial, X_train, y_train), 
                   n_trials=100)
    
    # 获取最佳参数
    best_params = study.best_params
    print("\n最佳参数组合:")
    for key, value in best_params.items():
        print(f"{key}: {value}")
    print(f"最佳验证RMSE: {study.best_value:.4f}")
    
    # 4. 优化过程可视化
    history_df = plot_optimization_history(study, "optuna_study_6VOD_LAI_PFTs_Hveg_XGB")  # 更新文件名
    
    # 尝试绘制等高线图
    try:
        plot_optimization_contour(study, "optuna_study_6VOD_LAI_PFTs_Hveg_XGB")  # 更新文件名
    except Exception as e:
        print(f"绘制等高线图时发生错误: {str(e)}")
    
    # 5. 使用最佳参数训练最终模型
    print("\n训练最终模型...")
    final_model = xgb.XGBRegressor(  # 使用XGBoost
        **best_params,
        random_state=SEED,
        n_jobs=-1
    )
    final_model.fit(X_train, y_train)
    
    # 6. 测试集评估
    print("\n测试集评估...")
    y_pred = final_model.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    test_r2 = r2_score(y_test, y_pred)
    print(f"测试集 RMSE: {test_rmse:.4f}")
    print(f"测试集 R²: {test_r2:.4f}")
    
    # 7. 保存测试集对应原始行
    test_data_dir = "test_data"
    os.makedirs(test_data_dir, exist_ok=True)

    # 读取原始表格（仅测试集对应行）
    full_df = pd.read_excel(DATA_FILE_PATH)
    test_rows = full_df.loc[test_idx]
    test_rows['y_pred'] = y_pred
    test_rows['y_true'] = y_test.values

    # 保存精简数据集
    test_data_path = f"{test_data_dir}/test_rows_6VOD_LAI_PFTs_Hveg_XGB_Model.csv"  # 更新后缀
    test_rows.to_csv(test_data_path, index=False)
    print(f"测试集对应原始行已保存至: {test_data_path}") 
    
    # 8. 可视化预测结果
    plot_results_all(y_test, y_pred, "prediction_results_6VOD_LAI_PFTs_Hveg_XGB")  # 更新后缀
    
    # 9. 保存模型
    os.makedirs("models", exist_ok=True)
    model_path = "models/XGB_6VOD_LAI_PFTs_Hveg.pkl"  # 更新后缀为XGB
    joblib.dump(final_model, model_path)
    print(f"模型已保存至: {model_path}")
    
    # 10. 特征重要性分析
    if hasattr(final_model, 'feature_importances_'):
        feature_importances = pd.Series(final_model.feature_importances_, index=X.columns)
        feature_importances = feature_importances.sort_values(ascending=False)
        
        plt.figure(figsize=(10, 8))
        feature_importances.plot(kind='barh')
        plt.title('Feature Importance - 6 VOD + LAI + PFTs + Hveg XGBoost Model', fontsize=16, fontweight='bold')  # 更新标题
        plt.xlabel('Importance Score', fontsize=12, fontweight='bold')
        plt.tight_layout()
        
        # 保存图像
        importance_path = f"figures/feature_importance_6VOD_LAI_PFTs_Hveg_XGB.png"  # 更新后缀
        plt.savefig(importance_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"特征重要性图保存至: {importance_path}")
        
        # 保存特征重要性数据
        feature_imp_df = pd.DataFrame({
            'feature': feature_importances.index,
            'importance': feature_importances.values
        }).sort_values('importance', ascending=False)
        
        feature_imp_path = f"figures/feature_importance_6VOD_LAI_PFTs_Hveg_XGB.csv"  # 更新后缀
        feature_imp_df.to_csv(feature_imp_path, index=False)
        print(f"特征重要性数据已保存至: {feature_imp_path}")
        
        # 打印关键特征重要性
        print("\nTop 10特征重要性:")
        for i, (feature, importance) in enumerate(feature_importances.head(10).items()):
            print(f"{i+1}. {feature}: {importance:.4f}")
    else:
        print("警告: 模型没有feature_importances_属性，跳过特征重要性分析")

if __name__ == "__main__":
    train_selected_model()
    print("\n" + "="*80 + "\n模型训练完成!\n" + "="*80)
    
    # 示例：如何使用可视化函数从CSV重新绘图
    # 注释掉以下两行以跳过示例
    # csv_path = "optimization_history/optuna_study_6VOD_LAI_PFTs_Hveg_XGB_history.csv"  # 更新路径
    # visualize_optimization_from_csv(csv_path)

<!-- # 4.TabTransformer -->