# In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from skopt import BayesSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
import shap

# Configure a publication-style look for plots
def set_sci_style():
    """Apply the seaborn ``whitegrid`` style and enlarge Matplotlib fonts.

    Writes to the global ``plt.rcParams``; returns nothing.
    """
    sns.set_style("whitegrid")
    plt.rcParams.update({
        'font.sans-serif': ['Arial'],
        'font.size': 36,  # enlarged base font size
        'axes.labelsize': 36,
        'axes.titlesize': 40,  # enlarged title font size
        'xtick.labelsize': 36,
        'ytick.labelsize': 36,
        'legend.fontsize': 36,
        'figure.titlesize': 40,
    })

# Original colour scheme, kept as-is: one (first, second) hex pair per model name.
# plot_prediction uses the first colour for training points and the second for test points.
colors = dict(
    RandomForest=('#e41a1c', '#990000'),
    XGBoost=('#377eb8', '#00468B'),
    ElasticNet=('#4daf4a', '#006400'),
    SVR=('#984ea3', '#5f0080'),
    GradientBoosting=('#ff7f00', '#b35900'),
)

# Custom feature selector
class CustomFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep the ``k`` features scoring highest under ``mutual_info_regression``.

    Thin wrapper around ``SelectKBest`` that exposes ``k`` as a tunable
    hyper-parameter (addressed as ``feature_selector__k`` in the pipeline).

    Parameters
    ----------
    k : int, default=10
        Number of features to keep; forwarded to ``SelectKBest``.

    Attributes
    ----------
    selector : SelectKBest
        The underlying selector. It is rebuilt from the current ``k`` on
        every call to ``fit``.
    """

    def __init__(self, k=10):
        self.k = k
        # Kept so ``selector`` exists before fitting, as it always did;
        # ``fit`` replaces it with one built from the current ``k``.
        self.selector = SelectKBest(mutual_info_regression, k=self.k)

    def fit(self, X, y):
        """Fit a fresh ``SelectKBest`` on ``X``/``y`` and return ``self``."""
        # ``set_params(k=...)`` only rebinds ``self.k``. Without rebuilding
        # here, the selector created in ``__init__`` would keep its old k
        # and tuning ``feature_selector__k`` would have no effect.
        self.selector = SelectKBest(mutual_info_regression, k=self.k)
        self.selector.fit(X, y)
        return self

    def transform(self, X):
        """Return ``X`` reduced to the columns chosen during ``fit``."""
        return self.selector.transform(X)

# Load the Excel workbook; every output file is written to the workbook's folder
excel_path = r"D:\张雨林\电子科大合作文章\电磁三步筛选后特征\电磁特征建模.xlsx"
output_folder = os.path.dirname(excel_path)
df = pd.read_excel(excel_path, sheet_name="Sheet1", header=0)

# Observed-vs-predicted scatter plot
def plot_prediction(y_train, y_train_pred, y_test, y_test_pred, name, r2_train, r2_test):
    """Save an observed-vs-predicted scatter plot for one model.

    Training and test points are drawn in the two colours registered for
    ``name`` in ``colors``, with a dashed identity line spanning the range of
    the observed values and both R² scores annotated in the upper-left corner.
    The figure is written to ``<output_folder>/<name>_comparison_plot.png``.

    Parameters
    ----------
    y_train, y_test : objects providing ``.min()`` and ``.max()``
        Observed target values.
    y_train_pred, y_test_pred : array-like
        Predictions matching ``y_train`` / ``y_test``.
    name : str
        Model name; must be a key of ``colors``. Also used as plot title.
    r2_train, r2_test : float
        Scores shown in the annotation (three decimals).
    """
    set_sci_style()
    fig, ax = plt.subplots(figsize=(12, 10))  # enlarged figure

    train_color, test_color = colors[name]
    ax.scatter(y_train, y_train_pred, color=train_color, alpha=0.3, label='Training Set', s=120)
    ax.scatter(y_test, y_test_pred, color=test_color, alpha=0.6, label='Test Set', s=120)

    # Identity line, bounded by the observed (not predicted) values
    lower = min(y_train.min(), y_test.min())
    upper = max(y_train.max(), y_test.max())
    ax.plot([lower, upper], [lower, upper], 'k--', lw=3)

    ax.set_xlabel('Observed Values', fontsize=36)
    ax.set_ylabel('Predicted Values', fontsize=36)

    # Axes-fraction coordinates keep the annotation in the top-left corner
    ax.text(0.05, 0.95, f'Train R² = {r2_train:.3f}\nTest R² = {r2_test:.3f}',
            transform=ax.transAxes, fontsize=36,
            verticalalignment='top', fontweight='bold')

    ax.legend(fontsize=36, loc='lower right')

    ax.set_facecolor('#f0f0f0')
    ax.grid(True, linestyle='--', alpha=0.7)

    ax.set_title(name, fontsize=40, fontweight='bold', pad=20)
    ax.tick_params(axis='both', which='major', labelsize=36)

    fig.tight_layout()

    output_path = os.path.join(output_folder, f'{name}_comparison_plot.png')
    fig.savefig(output_path, dpi=600, bbox_inches='tight')
    print(f"Saved plot for {name} at {output_path}")
    plt.close(fig)

# Feature-importance bar chart
def plot_feature_importance(importance_df, model_name):
    """Save a horizontal bar chart of the first six rows of ``importance_df``.

    Parameters
    ----------
    importance_df : pandas.DataFrame
        Must have ``'Feature'`` and ``'Importance'`` columns; only
        ``importance_df.head(6)`` is plotted, in the order given.
    model_name : str
        Key of ``colors``; also used in the title and the output file name
        ``<output_folder>/<model_name>_Feature_Importance.png``.
    """
    set_sci_style()
    fig, ax = plt.subplots(figsize=(14, 10))  # enlarged figure

    leading_rows = importance_df.head(6)
    bar_palette = [colors[model_name][0], colors[model_name][1]]
    sns.barplot(x='Importance', y='Feature', data=leading_rows,
                palette=bar_palette, edgecolor='black', ax=ax)

    ax.set_title(f'Top 6 Features - {model_name}', fontweight='bold', fontsize=40)
    ax.set_xlabel('Importance Score', fontweight='bold', fontsize=36)
    ax.set_ylabel('', fontweight='bold')
    ax.tick_params(axis='both', which='major', labelsize=36)
    fig.tight_layout()

    output_path = os.path.join(output_folder, f'{model_name}_Feature_Importance.png')
    fig.savefig(output_path, dpi=600, bbox_inches='tight')
    print(f"Saved feature importance plot at {output_path}")
    plt.close(fig)

# Correlation heatmap
def plot_correlation_heatmap(corr_matrix, model_name):
    """Save an annotated heatmap of ``corr_matrix``.

    The colour scale is fixed to [-1, 1] and centred on 0. The image is
    written to ``<output_folder>/<model_name>_Correlation_Heatmap.png``.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square correlation matrix to display.
    model_name : str
        Used in the title and the output file name.
    """
    set_sci_style()
    plt.figure(figsize=(14, 12))  # enlarged figure

    heatmap_options = dict(
        annot=True,
        cmap='PuOr',
        vmin=-1,
        vmax=1,
        center=0,
        square=True,
        linewidths=0.5,
        cbar_kws={"shrink": .8},
        fmt='.2f',
        annot_kws={"size": 30},  # enlarged cell annotations
    )
    sns.heatmap(corr_matrix, **heatmap_options)

    plt.title(f'Correlation Heatmap of Top 6 Features - {model_name}', fontweight='bold', fontsize=40)
    plt.tick_params(axis='both', which='major', labelsize=36)
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()

    image_name = f'{model_name}_Correlation_Heatmap.png'
    output_path = os.path.join(output_folder, image_name)
    plt.savefig(output_path, dpi=600, bbox_inches='tight')
    print(f"Saved correlation heatmap at {output_path}")
    plt.close()

# SHAP value analysis
def plot_shap_values(model, X, feature_names, model_name, feature_importance):
    """Save a SHAP dot summary plot for a tree-based ``model``.

    Parameters
    ----------
    model : fitted estimator accepted by ``shap.TreeExplainer``
    X : pandas.DataFrame
        Samples to explain; columns are addressed positionally via ``iloc``.
    feature_names : list of str
        Names of the columns of ``X``, in column order.
    model_name : str
        Key of ``colors``; also used in the title and the output file name
        ``<output_folder>/<model_name>_SHAP_Feature_Impact.png``.
    feature_importance : pandas.DataFrame
        Its ``'Feature'`` column gives the order in which columns are handed
        to ``shap.summary_plot``; names missing from ``feature_names`` are
        skipped.
    """
    shap_values = shap.TreeExplainer(model).shap_values(X)

    # Column positions arranged in the row order of feature_importance['Feature'].
    # NOTE(review): summary_plot may apply its own ordering on top of this;
    # confirm against the SHAP documentation if the displayed order matters.
    ranked_names = feature_importance['Feature'].tolist()
    feature_idx = [feature_names.index(feat) for feat in ranked_names if feat in feature_names]
    ordered_names = [feature_names[pos] for pos in feature_idx]

    set_sci_style()
    plt.figure(figsize=(14, 12))  # enlarged figure
    shap.summary_plot(
        shap_values[:, feature_idx],
        X.iloc[:, feature_idx],
        plot_type="dot",
        feature_names=ordered_names,
        show=False,
        plot_size=(14, 12),
        color=colors[model_name][0],
    )
    plt.title(f'SHAP Feature Impact - {model_name}', fontweight='bold', fontsize=40)
    plt.tick_params(axis='both', which='major', labelsize=36)
    plt.tight_layout()

    image_name = f'{model_name}_SHAP_Feature_Impact.png'
    output_path = os.path.join(output_folder, image_name)
    plt.savefig(output_path, dpi=600, bbox_inches='tight')
    print(f"Saved SHAP feature impact plot at {output_path}")
    plt.close()

# Data preprocessing: coerce every column to numeric (unparseable cells become
# NaN), then replace all NaN with 0. This applies to the target column as well.
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)

# Separate features (all columns but the last) from the target (last column)
X, y = df.iloc[:, :-1], df.iloc[:, -1]
feature_names = list(X.columns)

print(f"\nInitial number of features: {X.shape[1]}")

# Hold out 20% of the rows as the test set (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Build the modelling pipeline
def create_pipeline(model):
    """Wrap ``model`` in a scale -> select-features -> estimate pipeline.

    The step names (``'scaler'``, ``'feature_selector'``, ``'model'``) are the
    prefixes used by the hyper-parameter search spaces.

    Parameters
    ----------
    model : estimator
        Final regressor placed in the ``'model'`` step.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    steps = [
        ('scaler', StandardScaler()),
        ('feature_selector', CustomFeatureSelector()),
        ('model', model),
    ]
    return Pipeline(steps)

# Tuned model configurations: one search space per model, then the
# name -> (pipeline, search space) mapping consumed by the training loop.
_xgb_space = {
    'feature_selector__k': [5, 10, 15],
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [3, 4, 5],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__subsample': [0.8, 0.9, 1.0],
    'model__colsample_bytree': [0.8, 0.9, 1.0],
    'model__reg_alpha': [0.1, 0.5, 1.0],
    'model__reg_lambda': [0.1, 0.5, 1.0],
}

_svr_space = {
    'feature_selector__k': [5, 10, 15],
    'model__kernel': ['rbf', 'poly'],
    'model__C': [0.1, 1, 10],
    'model__gamma': ['scale', 'auto'],
    'model__epsilon': [0.05, 0.1, 0.2],
}

_rf_space = {
    'feature_selector__k': [5, 10, 15],
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [5, 10, 15],
    'model__min_samples_split': [2, 5, 10],
}

_gb_space = {
    'feature_selector__k': [5, 10, 15],
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [3, 4, 5],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__subsample': [0.8, 0.9, 1.0],
}

models = {
    'XGBoost': (create_pipeline(xgb.XGBRegressor(random_state=42)), _xgb_space),
    'SVR': (create_pipeline(SVR()), _svr_space),
    'RandomForest': (create_pipeline(RandomForestRegressor(random_state=42)), _rf_space),
    'GradientBoosting': (create_pipeline(GradientBoostingRegressor(random_state=42)), _gb_space),
}

# Train and evaluate every configured model
best_models = {}
best_r2 = -np.inf
best_model_name = ''
results = {}

for name, (model, params) in models.items():
    print(f"\nTraining {name}...")
    opt = BayesSearchCV(
        model,
        params,
        # Seeded like the train/test split and the regressors, so the folds
        # are the same on every run.
        cv=RepeatedKFold(n_splits=5, n_repeats=3, random_state=42),
        scoring='r2',
        n_jobs=-1,
        n_iter=50,  # reduced iteration budget to keep the search fast
        # Seeds the optimizer so the sampled candidates are repeatable.
        random_state=42,
    )

    opt.fit(X_train, y_train)

    # Pipeline refitted on the full training set with the best parameters
    best_model = opt.best_estimator_
    best_models[name] = best_model

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)

    # Store metrics and predictions for the reporting steps
    results[name] = {
        'r2_train': r2_train,
        'r2_test': r2_test,
        'mae_train': mae_train,
        'mae_test': mae_test,
        'y_train_pred': y_train_pred,
        'y_test_pred': y_test_pred
    }

    print(f'{name} - Train R²: {r2_train:.3f}, Test R²: {r2_test:.3f}')
    print(f'Train MAE: {mae_train:.3f}, Test MAE: {mae_test:.3f}')

    # NOTE(review): the winner is chosen by test-set R², so the reported test
    # score of the selected model is optimistically biased. Consider ranking
    # by the cross-validated score (opt.best_score_) instead.
    if r2_test > best_r2:
        best_r2 = r2_test
        best_model_name = name

# Detailed analysis of the best model
print(f"\nBest Model: {best_model_name} with Test R²: {best_r2:.3f}")
best_model = best_models[best_model_name]
best_result = results[best_model_name]

# Reuse the predictions stored for the best model during training
y_train_pred = best_result['y_train_pred']
y_test_pred = best_result['y_test_pred']

# Draw the observed-vs-predicted plot
plot_prediction(
    y_train,
    y_train_pred,
    y_test,
    y_test_pred,
    best_model_name,
    best_result['r2_train'],
    best_result['r2_test'],
)

# Export predictions to CSV. Training and test rows are stacked vertically so
# the two sets may have different lengths.
train_ids = [f'Train_{i+1}' for i in range(len(y_train))]
test_ids = [f'Test_{i+1}' for i in range(len(y_test))]

# Training-set rows
train_data = pd.DataFrame({
    'Set': 'Training',
    'Sample_ID': train_ids,
    'Actual_Value': y_train.values,
    'Predicted_Value': y_train_pred,
})

# Test-set rows
test_data = pd.DataFrame({
    'Set': 'Test',
    'Sample_ID': test_ids,
    'Actual_Value': y_test.values,
    'Predicted_Value': y_test_pred,
})

# Row-wise concatenation with a fresh 0..n-1 index
prediction_data = pd.concat([train_data, test_data], ignore_index=True)

prediction_csv_path = os.path.join(
    output_folder,
    f'{best_model_name}_Prediction_Data.csv',
)
prediction_data.to_csv(prediction_csv_path, index=False)
print(f"Saved prediction data for {best_model_name} at {prediction_csv_path}")

# Recover the names of the columns the fitted selector kept
feature_selector = best_model.named_steps['feature_selector']
selected_features = feature_selector.selector.get_support(indices=True)
selected_feature_names = [feature_names[col] for col in selected_features]

# Feature importance: prefer tree importances, fall back to |coefficients|,
# and use zeros when the estimator exposes neither
fitted_estimator = best_model.named_steps['model']
if hasattr(fitted_estimator, 'feature_importances_'):
    importances = fitted_estimator.feature_importances_
elif hasattr(fitted_estimator, 'coef_'):
    importances = np.abs(fitted_estimator.coef_)
else:
    print(f"Cannot determine feature importance for {best_model_name}")
    importances = np.zeros(len(selected_feature_names))

# One row per selected feature, highest importance first
importance_df = (
    pd.DataFrame({'Feature': selected_feature_names, 'Importance': importances})
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
)

# Export the importance table to CSV (for plotting in Origin)
importance_csv_path = os.path.join(output_folder, f'{best_model_name}_Feature_Importance.csv')
importance_df.to_csv(importance_csv_path, index=False)
print(f"Saved feature importance data for {best_model_name} at {importance_csv_path}")

# Draw the feature-importance bar chart
plot_feature_importance(importance_df, best_model_name)

# Correlation analysis on the six most important features
top_features = importance_df['Feature'].head(6).tolist()
X_train_df = pd.DataFrame(X_train, columns=feature_names)
X_test_df = pd.DataFrame(X_test, columns=feature_names)

# Sequential sample IDs, numbered from 1 in row order
train_sample_ids = [f'Train_{i+1}' for i in range(len(X_train_df))]
test_sample_ids = [f'Test_{i+1}' for i in range(len(X_test_df))]

# Raw (unscaled) values of the top features, with the ID as first column
top_features_train_data = X_train_df[top_features].copy()
top_features_train_data.insert(0, 'Sample_ID', train_sample_ids)

top_features_test_data = X_test_df[top_features].copy()
top_features_test_data.insert(0, 'Sample_ID', test_sample_ids)

# Correlation matrix, computed on the training rows only
corr_matrix = X_train_df[top_features].corr()

# Export the correlation-related tables to CSV (for plotting in Origin)
# 1. Correlation matrix (index kept so rows stay labelled)
correlation_matrix_csv = os.path.join(output_folder, f'{best_model_name}_Correlation_Matrix.csv')
corr_matrix.to_csv(correlation_matrix_csv)
print(f"Saved correlation matrix data for {best_model_name} at {correlation_matrix_csv}")

# 2. Top-6 feature values, training set
top_features_train_csv = os.path.join(output_folder, f'{best_model_name}_Top6_Features_Train.csv')
top_features_train_data.to_csv(top_features_train_csv, index=False)
print(f"Saved Top6 features training data for {best_model_name} at {top_features_train_csv}")

# 3. Top-6 feature values, test set
top_features_test_csv = os.path.join(output_folder, f'{best_model_name}_Top6_Features_Test.csv')
top_features_test_data.to_csv(top_features_test_csv, index=False)
print(f"Saved Top6 features test data for {best_model_name} at {top_features_test_csv}")

# Draw the correlation heatmap
plot_correlation_heatmap(corr_matrix, best_model_name)

# SHAP analysis (tree-based models only, detected via ``feature_importances_``)
shap_estimator = best_model.named_steps['model']
if hasattr(shap_estimator, 'feature_importances_'):
    # The final estimator was fitted on the output of the pipeline's scaler and
    # feature selector, so the explainer must see data in that same space.
    # Passing raw X_test columns would evaluate the trees on unscaled values
    # and give SHAP values for inputs the model was never trained on.
    X_test_scaled = best_model.named_steps['scaler'].transform(X_test)
    X_test_selected = pd.DataFrame(
        np.asarray(best_model.named_steps['feature_selector'].transform(X_test_scaled)),
        columns=selected_feature_names,
        index=X_test.index,
    )
    plot_shap_values(shap_estimator, X_test_selected, selected_feature_names, best_model_name, importance_df)
else:
    print(f"SHAP analysis is not applicable to {best_model_name}")

# Export the performance summary: one row per model, one column per metric
metric_columns = {
    'r2_train': 'R2_Train',
    'r2_test': 'R2_Test',
    'mae_train': 'MAE_Train',
    'mae_test': 'MAE_Test',
}
performance_rows = {
    model_label: [scores[metric] for metric in metric_columns]
    for model_label, scores in results.items()
}
performance_df = pd.DataFrame.from_dict(
    performance_rows,
    orient='index',
    columns=list(metric_columns.values()),
)
performance_csv_path = os.path.join(output_folder, 'Model_Performance_Summary.csv')
performance_df.to_csv(performance_csv_path)
print(f"Saved model performance summary at {performance_csv_path}")

print("\nAll plots and data tables have been generated.")
    