In [1]:
# Ku波段H极化的VOD输入，LightGBM版本，贝叶斯优化超参数
import pandas as pd
import numpy as np
import time
import random
import os
import joblib
from datetime import datetime
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import optuna
from optuna.samplers import TPESampler
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

# 设置随机种子保证可重复性
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# ============================== 数据预处理部分 ==============================
def load_and_preprocess_data(file_path, sheet_name):
    """Load one sheet of the sample workbook and build the model inputs.

    Steps: read the required columns, drop rows with missing values, derive
    the target ``VWC_sample = AGB * LFMCValue / 1000``, discard samples whose
    target exceeds 30, normalise LAI / VOD / vegetation height, convert the
    soil fractions to ratios and add two interaction features.

    Args:
        file_path: Path of the Excel workbook.
        sheet_name: Name of the sheet to read.

    Returns:
        Tuple ``(X, y, df)``: the feature frame, the target series and the
        full processed frame (kept for later diagnostics). All three share
        the same row index.

    Raises:
        ValueError: Propagated from ``pandas.read_excel`` when the sheet does
            not contain every column listed in ``columns_to_extract``.
    """
    print("开始数据加载和预处理...")
    start_time = time.time()

    # Columns that must exist in the sheet (static features included).
    columns_to_extract = [
        'AGB', 'SM_Asc', 'Grass_man', 'Grass_nat',
        'Shrub_bd', 'Shrub_be', 'Shrub_nd', 'Shrub_ne',
        'Tree_bd', 'Tree_be', 'Tree_nd', 'Tree_ne',
        'LAI', 'LFMCValue', 'VOD_Ku_Hpol_Asc', 'SamplingDate',
        'Hveg', 'clay', 'silt', 'sand'
    ]

    df = pd.read_excel(file_path, sheet_name=sheet_name, usecols=columns_to_extract)
    df = df.dropna()

    # Target variable.
    df['VWC_sample'] = (df['AGB'] * df['LFMCValue']) / 1000

    # Remove outliers (VWC_sample > 30). The explicit copy makes the filtered
    # frame independent, so the column assignments below never write to a
    # view of the unfiltered data.
    original_count = len(df)
    df = df[df['VWC_sample'] <= 30].copy()
    filtered_count = len(df)
    print(f"  移除VWC_sample > 30的样本: 原始样本数 {original_count}, 过滤后样本数 {filtered_count} (移除 {original_count-filtered_count} 个样本)")

    df = df.drop(columns=['AGB', 'LFMCValue'])

    # Years between the sampling date and 2020. to_datetime is a no-op for
    # datetime columns and also accepts dates that were read as text.
    df['Year_diff'] = 2020 - pd.to_datetime(df['SamplingDate']).dt.year

    # Normalisation: clip to a fixed range, then scale.
    df['LAI'] = df['LAI'].clip(0, 6) / 6
    df['VOD'] = df['VOD_Ku_Hpol_Asc'].clip(0, 2) / 2
    df['Hveg'] = df['Hveg'] / 10

    # Soil texture expressed as a share of the three fractions' sum.
    df['soil_total'] = df[['clay', 'silt', 'sand']].sum(axis=1)
    for fraction in ('clay', 'silt', 'sand'):
        df[f'{fraction}_ratio'] = df[fraction] / df['soil_total']

    # Interaction features.
    df['LAI_SM'] = df['LAI'] * df['SM_Asc']
    df['VOD_Hveg'] = df['VOD'] * df['Hveg']

    df = df.rename(columns={'SM_Asc': 'SM'})

    feature_columns = [
        'VOD', 'LAI', 'SM', 'Hveg',
        'Grass_man', 'Grass_nat',
        'Shrub_bd', 'Shrub_be', 'Shrub_nd', 'Shrub_ne',
        'Tree_bd', 'Tree_be', 'Tree_nd', 'Tree_ne',
        'clay_ratio', 'silt_ratio', 'sand_ratio',
        'LAI_SM', 'VOD_Hveg'
    ]

    X = df[feature_columns]
    y = df['VWC_sample']

    # Summary of the filtered data.
    print("\n数据特征分析 (过滤后):")
    print(f"特征数量: {len(feature_columns)}")
    print(f"样本数量: {len(X)}")
    print(f"VWC范围: {y.min():.2f} - {y.max():.2f} kg/m²")

    # Rank the features by absolute correlation with the target. The target's
    # own (trivially 1.0) entry is dropped and five rows are shown so the
    # listing matches its "top 5" heading.
    corr_with_target = (
        pd.concat([X, y], axis=1).corr()['VWC_sample'].drop('VWC_sample')
    )
    print("\n特征相关性分析 (top 5):")
    print(corr_with_target.abs().sort_values(ascending=False).head(5))

    elapsed_time = (time.time() - start_time) / 60
    print(f"数据预处理完成，耗时: {elapsed_time:.2f} 分钟")

    return X, y, df

# ============================== 模型训练与评估部分 ==============================
def train_and_evaluate(X, y, df):
    """Tune, train, evaluate and save the LightGBM model.

    The data is split roughly 2:1 into training and test rows. A validation
    subset is carved out of the training rows and used for the Optuna search
    and for early stopping, so the test rows are only ever used for the final
    evaluation and the reported test metrics are not biased by tuning.

    Args:
        X: Feature frame.
        y: Target series, row-aligned with ``X``.
        df: Full processed frame, row-aligned with ``X``; the test rows are
            passed to ``model_diagnosis``.

    Returns:
        The trained LightGBM booster.
    """
    print("\n开始模型训练与评估流程(使用LightGBM)...")

    # 1. Train/test split (about 2:1). Positions are split once and reused
    #    for X, y and df so the three stay aligned.
    print("划分训练集和测试集...")
    train_idx, test_idx = train_test_split(
        np.arange(len(X)),
        test_size=0.333,
        random_state=SEED,
        shuffle=True
    )
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Validation rows come from the training portion only.
    fit_idx, val_idx = train_test_split(
        train_idx,
        test_size=0.2,
        random_state=SEED,
        shuffle=True
    )
    X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
    y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]

    print(f"训练集样本数: {len(X_train)}, 测试集样本数: {len(X_test)}")
    print(f"其中验证集样本数(用于调参与早停): {len(X_val)}")

    # 2. Bayesian hyper-parameter search, scored on the validation rows.
    print("\n开始贝叶斯优化调参...")
    study = optuna.create_study(
        study_name="VWC_LGBM_Optimization_KuHpol",
        direction='minimize',
        sampler=TPESampler(seed=SEED)
    )
    study.optimize(
        lambda trial: objective(trial, X_fit, y_fit, X_val, y_val),
        n_trials=100,
        show_progress_bar=True
    )

    best_params = study.best_params
    print("\n最佳参数组合:")
    for key, value in best_params.items():
        print(f"{key}: {value}")
    print(f"最佳验证RMSE: {study.best_value:.4f}")

    # 3. Final model. First find the number of boosting rounds with early
    #    stopping on the validation rows, then refit on the whole training
    #    portion with that many rounds.
    print("\n训练最终模型...")
    final_params = {
        **best_params,
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'seed': SEED,
    }
    lgb_fit = lgb.Dataset(X_fit, label=y_fit)
    lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_fit)
    probe_model = lgb.train(
        final_params,
        lgb_fit,
        num_boost_round=1000,
        valid_sets=[lgb_val],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100)
        ]
    )
    # best_iteration is 0 when no best round was recorded; fall back to the cap.
    best_rounds = probe_model.best_iteration or 1000
    final_model = lgb.train(
        final_params,
        lgb.Dataset(X_train, label=y_train),
        num_boost_round=best_rounds
    )

    # 4. Held-out evaluation.
    print("\n测试集评估...")
    y_pred = final_model.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    test_r2 = r2_score(y_test, y_pred)
    print(f"测试集RMSE: {test_rmse:.4f}")
    print(f"测试集R²: {test_r2:.4f}")

    # 5. Per-subgroup diagnostics on the test rows.
    model_diagnosis(final_model, X_test, y_test, df.iloc[test_idx])

    # 6. Scatter plot of predictions against observations.
    plot_results(y_test, y_pred)

    # 7. Persist the model.
    os.makedirs("models", exist_ok=True)
    model_path = "models/LGBM_Ku_Hpol_Type1.txt"
    final_model.save_model(model_path)
    print(f"\n模型已保存至: {model_path}")

    return final_model

# ============================== 贝叶斯优化目标函数 ==============================
def objective(trial, X_train, y_train, X_test, y_test):
    """Optuna objective: train one LightGBM model and return its RMSE.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X_train, y_train: Data the booster is fitted on.
        X_test, y_test: Data used as the validation set for early stopping
            and for the returned score.

    Returns:
        The best validation RMSE recorded by the booster.
    """
    # Sample the search space (order kept stable so seeded runs reproduce).
    # NOTE(review): LightGBM documents that bagging only takes effect when
    # bagging_freq (alias subsample_freq) is non-zero; no such key is set
    # here, so `subsample` presumably has no effect — confirm.
    search_space = {
        'num_leaves': trial.suggest_int('num_leaves', 20, 200, step=10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0, step=0.05),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 0.3),
        'max_bin': trial.suggest_int('max_bin', 100, 500, step=50)
    }

    # Sampled values plus the fixed training settings.
    train_params = dict(search_space)
    train_params.update(
        objective='regression',
        metric='rmse',
        verbosity=-1,
        seed=SEED,
    )

    fit_set = lgb.Dataset(X_train, label=y_train)
    val_set = lgb.Dataset(X_test, label=y_test)

    # Up to 1000 rounds, stopping after 30 rounds without improvement;
    # per-iteration logging is switched off.
    stop_early = lgb.early_stopping(stopping_rounds=30)
    silence = lgb.log_evaluation(False)
    booster = lgb.train(
        train_params,
        fit_set,
        num_boost_round=1000,
        valid_sets=[val_set],
        callbacks=[stop_early, silence]
    )

    # Record the five features with the largest gain on the trial.
    gains = booster.feature_importance(importance_type='gain')
    ranked = sorted(
        zip(X_train.columns, gains),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for rank, (name, gain) in enumerate(ranked[:5], start=1):
        trial.set_user_attr(f"top_feature_{rank}", name)
        trial.set_user_attr(f"importance_{rank}", gain)

    return booster.best_score['valid_0']['rmse']

# ============================== 模型诊断函数 ==============================
def _subset_rmse(model, X_test, y_test, mask):
    """Return ``(rmse, n_rows)`` for the rows selected by ``mask``, or None if it selects nothing."""
    n_rows = mask.sum()
    if n_rows == 0:
        return None
    predicted = np.asarray(model.predict(X_test[mask]), dtype=float)
    observed = np.asarray(y_test[mask], dtype=float)
    rmse = float(np.sqrt(np.mean((observed - predicted) ** 2)))
    return rmse, n_rows


def model_diagnosis(model, X_test, y_test, test_df):
    """Print feature importances and RMSE broken down by sample subgroup.

    Subgroups: dominant vegetation group, dominant soil fraction, five
    equal-width vegetation-height bins and five equal-width target bins.
    Subgroups without samples are skipped.

    Args:
        model: Trained booster exposing ``predict`` and ``feature_importance``.
        X_test: Test feature frame.
        y_test: Test target series, sharing the index of ``X_test``.
        test_df: Processed frame rows for the test samples (same index),
            holding the land-cover, soil-ratio, ``Hveg`` and ``VWC_sample``
            columns.
    """
    print("\n高级模型诊断:")

    # 1. Gain-based feature importance.
    importance_df = pd.DataFrame({
        'Feature': X_test.columns,
        'Importance': model.feature_importance(importance_type='gain')
    }).sort_values('Importance', ascending=False)

    print("\nTop 10特征重要性:")
    print(importance_df.head(10))

    # 2. By vegetation group. The cover columns carry different suffixes per
    #    group (Grass_man/_nat, Shrub_bd/_be/_nd/_ne, Tree_...), so every
    #    column with the group's prefix is summed instead of assuming
    #    "_man"/"_nat" exist for all groups.
    #    NOTE(review): the 0.5 threshold assumes cover is a 0-1 fraction — confirm.
    for veg_type in ['Grass', 'Shrub', 'Tree']:
        cover_cols = [c for c in test_df.columns if str(c).startswith(f"{veg_type}_")]
        if not cover_cols:
            continue
        mask = test_df[cover_cols].sum(axis=1) > 0.5
        stats = _subset_rmse(model, X_test, y_test, mask)
        if stats is not None:
            sub_rmse, n_rows = stats
            print(f"{veg_type}植被区域RMSE: {sub_rmse:.4f} (样本数: {n_rows})")

    # 3. By dominant soil fraction.
    soil_primary = test_df[['clay_ratio', 'silt_ratio', 'sand_ratio']].idxmax(axis=1)
    for soil_type in ['clay_ratio', 'silt_ratio', 'sand_ratio']:
        stats = _subset_rmse(model, X_test, y_test, soil_primary == soil_type)
        if stats is not None:
            sub_rmse, n_rows = stats
            soil_name = soil_type.replace('_ratio', '')
            print(f"{soil_name}土壤区域RMSE: {sub_rmse:.4f} (样本数: {n_rows})")

    # 4. By vegetation height. Equal-width bins can be empty, so each bin is
    #    checked before computing a metric.
    height_bins = pd.cut(test_df['Hveg'], bins=5)
    print("\n树高与预测精度关系:")
    for bin_group in height_bins.cat.categories:
        stats = _subset_rmse(model, X_test, y_test, height_bins == bin_group)
        if stats is not None:
            sub_rmse, n_rows = stats
            print(f"树高 {bin_group}: RMSE {sub_rmse:.4f} (样本数: {n_rows})")

    # 5. By target value range.
    vwc_bins = pd.cut(test_df['VWC_sample'], bins=5)
    print("\nVWC区间预测精度分析:")
    for bin_group in vwc_bins.cat.categories:
        mask = vwc_bins == bin_group
        stats = _subset_rmse(model, X_test, y_test, mask)
        if stats is not None:
            sub_rmse, n_rows = stats
            vwc_mean = test_df.loc[mask, 'VWC_sample'].mean()
            print(f"VWC区间 {bin_group} (均值{vwc_mean:.2f}): RMSE {sub_rmse:.4f} (样本数: {n_rows})")

# ============================== 可视化函数 ==============================
def _point_density(x, y):
    """Return a Gaussian-KDE density per (x, y) point, or None when it cannot be computed."""
    try:
        from scipy.stats import gaussian_kde
    except ImportError:
        return None
    xy = np.vstack([x, y])
    try:
        return gaussian_kde(xy)(xy)
    except (np.linalg.LinAlgError, ValueError):
        # Degenerate input (e.g. too few or collinear points): skip the
        # density layer rather than abort the whole figure.
        return None


def plot_results(y_true, y_pred):
    """Save a predicted-vs-observed scatter plot with RMSE and R² annotated.

    A plain scatter layer is always drawn. When SciPy is available and a
    kernel density estimate can be computed, a second layer coloured by point
    density is drawn on top, together with a colour bar.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, same length and order as ``y_true``.

    The figure is written to
    ``figures/lgbm_prediction_results_Ku_Hpol_Type1_filtered.png``.
    """
    # Plain arrays so positional reordering below is unaffected by any
    # pandas index carried by the inputs.
    observed = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    plt.rcParams['font.family'] = 'Times New Roman'
    plt.figure(figsize=(8, 8))

    rmse = np.sqrt(mean_squared_error(observed, predicted))
    r2 = r2_score(observed, predicted)

    # Base scatter layer.
    plt.scatter(
        observed, predicted,
        alpha=0.5,
        c='#1f77b4',
        edgecolors='none'
    )

    # 1:1 reference line.
    max_val = max(observed.max(), predicted.max()) * 1.05
    plt.plot([0, max_val], [0, max_val], 'k--', lw=1.5, alpha=0.7)

    # Axis range and labels.
    plt.xlim(0, 30)
    plt.ylim(0, 30)
    plt.xlabel('Insitu VWC (kg/m²)', fontsize=14, fontweight='bold')
    plt.ylabel('LGBM Predicted VWC (kg/m²)', fontsize=14, fontweight='bold')

    plt.title('Ku-Band, H-pol - LightGBM Prediction (VWC≤30)', fontsize=16, pad=20, fontweight='bold')

    # Metrics box in the upper-left corner.
    plt.text(0.05, 0.95,
             f'RMSE = {rmse:.3f} kg/m²\nR² = {r2:.3f}',
             transform=plt.gca().transAxes,
             fontsize=13,
             fontweight='bold',
             verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='white', alpha=0.7))

    # Density-coloured overlay. Points and colours are reordered with the
    # same permutation so each point keeps its own density value and the
    # densest points are drawn last (on top).
    density = _point_density(observed, predicted)
    if density is not None:
        order = density.argsort()
        density_layer = plt.scatter(
            observed[order], predicted[order],
            c=density[order],
            cmap='viridis',
            alpha=0.4,
            s=10
        )
        plt.colorbar(density_layer, label='Density')

    plt.grid(True, linestyle='--', alpha=0.3)
    plt.tight_layout()

    # Save the figure.
    os.makedirs("figures", exist_ok=True)
    plot_path = "figures/lgbm_prediction_results_Ku_Hpol_Type1_filtered.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"Prediction plot saved to: {plot_path}")
    plt.close()

# ============================== 主执行流程 ==============================
if __name__ == "__main__":
    # Input workbook and the sheet to process.
    sheet_name = "VOD_Ku_Hpol_Asc_Cleaned_Type1"
    file_path = r"E:\Matlab\EX2025\AuxiliaryData\VWC_ML_Data.xlsx"

    # Banner announcing the data set being processed.
    rule = '=' * 50
    print("\n" + rule)
    print(f"处理数据集: {sheet_name} (添加VWC_sample≤30过滤)")
    print(rule)

    # Load and preprocess, then tune / train / evaluate.
    X, y, df = load_and_preprocess_data(file_path, sheet_name)
    model = train_and_evaluate(X, y, df)

    # Completion banner.
    print(f"\n{rule}")
    print("模型训练完成 (添加VWC_sample>30过滤)!")
    print(rule)


处理数据集: VOD_Ku_Hpol_Asc_Cleaned_Type1 (添加VWC_sample≤30过滤)
开始数据加载和预处理...


ValueError: Usecols do not match columns, columns expected but not found: ['clay', 'sand', 'silt'] (sheet: VOD_Ku_Hpol_Asc_Cleaned_Type1)