导入包

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import shap
import pickle
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor,VotingRegressor 
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import xgboost as XGB
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.base import BaseEstimator, RegressorMixin
from joblib import Parallel, delayed
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset. NOTE(review): the path is an empty placeholder and must be
# filled in before this cell can run.
data = pd.read_excel('')

# Features: every column from index 1 up to (but excluding) the second-to-last.
# Target: the second-to-last column. Column 0 and the last column are not used here.
X, Y = data.iloc[:, 1:-2], data.iloc[:, -2]

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

In [None]:
# 5-fold shuffled splitter shared by every grid search below.
cross_Valid = KFold(n_splits=5, shuffle=True, random_state=0)

# Hyperparameter grid for each model. All grids are currently empty
# placeholders; fill them in to actually search over hyperparameters.
parameter_XGBR = {}
parameter_RF = {}
parameter_CBR = {}
parameter_LGBM = {}
parameter_ABR = {}
parameter_GBRT = {}

# Base (untuned) estimator for each model name.
estimators = {
    'XGBR': XGB.XGBRegressor(random_state=0),
    'RF': RandomForestRegressor(random_state=0),
    'CBR': CatBoostRegressor(verbose=False, random_state=0),
    'LGBM': LGBMRegressor(random_state=0, verbosity=-1),
    'ABR': AdaBoostRegressor(random_state=0),
    'GBRT': GradientBoostingRegressor(random_state=0),
}

# Hyperparameter grid for each model name (same keys as `estimators`).
params_mapping = {
    'XGBR': parameter_XGBR,
    'RF': parameter_RF,
    'CBR': parameter_CBR,
    'LGBM': parameter_LGBM,
    'ABR': parameter_ABR,
    'GBRT': parameter_GBRT,
}

# One R2-scored grid search per model, pairing each estimator with its grid.
grid_searches = {
    name: GridSearchCV(
        estimator,
        params_mapping[name],
        scoring='r2',
        cv=cross_Valid,
        n_jobs=-1,
    )
    for name, estimator in estimators.items()
}

# Fit every search on the training split and report the winning parameters.
for name, grid in grid_searches.items():
    grid.fit(x_train, y_train)
    print(f"{name} best parameters: {grid.best_params_}")

In [None]:
# Define a function to initialize the model dictionary.
def initialize_best_estimators(grid_searches, random_state=42):
    """Build fresh, unfitted regressors from each grid search's best parameters.

    Args:
        grid_searches: Mapping that contains the keys 'XGBR', 'RF', 'CBR',
            'LGBM', 'ABR' and 'GBRT'; each value must expose a
            ``best_params_`` dict (i.e. a fitted search object).
        random_state: Seed passed to every regressor. Defaults to 42, the
            value that used to be hard-coded, so existing callers get the
            same models as before.

    Returns:
        dict mapping each model name to a new, unfitted estimator, in the
        order XGBR, RF, CBR, LGBM, ABR, GBRT.

    Raises:
        KeyError: If one of the six model names is missing from
            ``grid_searches``.
        TypeError: If a ``best_params_`` dict contains a key that is also
            passed explicitly here (``random_state``, or ``verbose`` /
            ``verbosity`` for CBR / LGBM), because the keyword would be
            supplied twice.
    """
    return {
        'XGBR': XGB.XGBRegressor(**grid_searches['XGBR'].best_params_, random_state=random_state),
        'RF': RandomForestRegressor(**grid_searches['RF'].best_params_, random_state=random_state),
        'CBR': CatBoostRegressor(**grid_searches['CBR'].best_params_, verbose=False, random_state=random_state),
        'LGBM': LGBMRegressor(**grid_searches['LGBM'].best_params_, random_state=random_state, verbosity=-1),
        'ABR': AdaBoostRegressor(**grid_searches['ABR'].best_params_, random_state=random_state),
        'GBRT': GradientBoostingRegressor(**grid_searches['GBRT'].best_params_, random_state=random_state),
    }

# Define a function to calculate model performance and feature importance.
def evaluate_models(seed, X, Y, grid_searches, submodel_r2_sums, submodel_rmse_sums, num_seeds):
    """Cross-validate each tuned submodel and their voting ensemble for one CV seed.

    Every submodel is scored with 10-fold shuffled cross-validation (R2 and
    RMSE), then refit on all of ``X``/``Y`` to read its feature importances.
    The per-model importances are normalized to sum to 1 and combined into a
    single vector weighted by each model's mean R2.

    Args:
        seed: ``random_state`` of the KFold splitter. It only controls how
            the folds are shuffled; the estimators keep the seed chosen by
            ``initialize_best_estimators``.
        X: Feature table; must provide ``.shape`` and ``.columns``
            (e.g. a pandas DataFrame).
        Y: Target values aligned with the rows of ``X``.
        grid_searches: Fitted searches forwarded to
            ``initialize_best_estimators``.
        submodel_r2_sums: dict name -> running R2 total. MUTATED IN PLACE:
            this call's mean R2 of each submodel is added to it.
        submodel_rmse_sums: dict name -> running RMSE total. MUTATED IN
            PLACE in the same way.
        num_seeds: Unused; kept so existing call sites keep working.

    Returns:
        dict with the keys 'submodel_r2_means', 'submodel_rmse_means',
        'voting_regressor_r2_mean', 'voting_regressor_rmse_mean',
        'submodel_feature_importances' (name -> {feature: importance}) and
        'weighted_feature_importances' ({feature: importance}).
    """
    # Imported locally so this function works without touching the file's import cell.
    from sklearn.model_selection import cross_validate

    cross_validator = KFold(n_splits=10, shuffle=True, random_state=seed)
    best_estimators = initialize_best_estimators(grid_searches)

    # Create a VotingRegressor that averages the predictions of all submodels.
    submodels = [(name, estimator) for name, estimator in best_estimators.items() if estimator is not None]
    voting_regressor = VotingRegressor(submodels)

    # Use the built-in RMSE scorer instead of
    # make_scorer(mean_squared_error, squared=False): the `squared` argument is
    # deprecated/removed in newer scikit-learn releases, and with warnings
    # silenced a failing scorer can surface only as NaN scores.
    scoring = {'r2': 'r2', 'rmse': 'neg_root_mean_squared_error'}

    def _cv_means(model):
        """Return (mean R2, mean RMSE) of *model* over the folds of cross_validator."""
        # One multi-metric pass fits each fold once instead of once per metric.
        scores = cross_validate(model, X, Y, cv=cross_validator, scoring=scoring, n_jobs=-1)
        # The scorer reports negated RMSE (higher is better); flip the sign back.
        return np.mean(scores['test_r2']), -np.mean(scores['test_rmse'])

    # Mean R2 / RMSE and normalized feature importances of each submodel.
    submodel_r2_means = {}
    submodel_rmse_means = {}
    submodel_feature_importances = {}
    feature_importances_weighted_sum = np.zeros(X.shape[1])
    total_r2 = 0

    for name, estimator in submodels:
        r2_mean, rmse_mean = _cv_means(estimator)
        submodel_r2_means[name] = r2_mean
        submodel_rmse_means[name] = rmse_mean
        # Accumulate into the caller's running totals (in-place side effect).
        submodel_r2_sums[name] += r2_mean
        submodel_rmse_sums[name] += rmse_mean

        # Feature importances come from a model refit on the full data set.
        estimator.fit(X, Y)
        importances = np.asarray(estimator.feature_importances_, dtype=float)

        # Normalize to sum to 1 so models with different importance scales are
        # comparable; an all-zero vector stays zero instead of becoming NaN.
        importance_total = importances.sum()
        if importance_total != 0:
            importances_normalized = importances / importance_total
        else:
            importances_normalized = np.zeros_like(importances)

        submodel_feature_importances[name] = dict(zip(X.columns, importances_normalized))
        # Weight each model's importances by its mean R2.
        feature_importances_weighted_sum += importances_normalized * r2_mean
        total_r2 += r2_mean

    # Mean R2 and mean RMSE of the fusion (voting) model.
    voting_regressor_r2_mean, voting_regressor_rmse_mean = _cv_means(voting_regressor)

    # Divide by the total weight to get the R2-weighted average importance;
    # skip the division when the weights sum to exactly zero.
    weighted_feature_importances = feature_importances_weighted_sum / total_r2 if total_r2 != 0 else feature_importances_weighted_sum

    weighted_feature_importances_dict = dict(zip(X.columns, weighted_feature_importances))

    return {
        'submodel_r2_means': submodel_r2_means,
        'submodel_rmse_means': submodel_rmse_means,
        'voting_regressor_r2_mean': voting_regressor_r2_mean,
        'voting_regressor_rmse_mean': voting_regressor_rmse_mean,
        'submodel_feature_importances': submodel_feature_importances,
        'weighted_feature_importances': weighted_feature_importances_dict
    }

# Collect one result dict per cross-validation seed.
all_results = []
seeds = range(100)

# Running totals of each submodel's per-seed mean R2 and RMSE, keyed by model
# name; evaluate_models adds to these dicts in place.
submodel_r2_sums = dict.fromkeys(initialize_best_estimators(grid_searches), 0.0)
submodel_rmse_sums = dict.fromkeys(initialize_best_estimators(grid_searches), 0.0)

# Running total of the ensemble's weighted feature importances, one slot per feature.
weighted_feature_importances_sums = np.zeros(len(X.columns))

# Evaluate every seed, accumulate the importances and print a per-seed report.
for seed in seeds:
    result = evaluate_models(
        seed, X, Y, grid_searches, submodel_r2_sums, submodel_rmse_sums, len(seeds)
    )
    all_results.append(result)
    weighted_feature_importances_sums += np.array(
        list(result['weighted_feature_importances'].values())
    )

    print("")
    print(f"Seed {seed} - Voting Regressor R2 Mean: {result['voting_regressor_r2_mean']}, RMSE Mean: {result['voting_regressor_rmse_mean']}")
    print("Submodel R2 and RMSE Means:")
    for name in result['submodel_r2_means']:
        print(f"{name}: R2 Mean = {result['submodel_r2_means'][name]}, RMSE Mean = {result['submodel_rmse_means'][name]}")

# Average the accumulated weighted importances over all seeds.
avg_weighted_feature_importances = weighted_feature_importances_sums / len(seeds)

# Pair each feature name with its averaged importance, most important first.
sorted_feature_importances = sorted(
    zip(X.columns, avg_weighted_feature_importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Output the average R2 and RMSE of the fusion model and the sorted feature importances.
# Average the ensemble's per-seed mean R2 and RMSE over every evaluated seed.
avg_voting_regressor_r2_mean = np.mean(
    [res['voting_regressor_r2_mean'] for res in all_results]
)
avg_voting_regressor_rmse_mean = np.mean(
    [res['voting_regressor_rmse_mean'] for res in all_results]
)

print("")
print(f"Average Voting Regressor R2 Mean: {avg_voting_regressor_r2_mean}")
print(f"Average Voting Regressor Weighted RMSE Mean: {avg_voting_regressor_rmse_mean}")
print("Average Submodel R2 and RMSE Means:")
# The *_sums dicts hold totals over all seeds; divide by the seed count for the mean.
for name in submodel_r2_sums:
    print(f"{name}: R2 Mean = {submodel_r2_sums[name] / len(seeds)}, RMSE Mean = {submodel_rmse_sums[name] / len(seeds)}")

# Horizontal bar chart of the averaged importances, in the already-sorted order.
features, importances = zip(*sorted_feature_importances)
plt.figure(figsize=(10, 8))
sns.barplot(x=list(importances), y=list(features))
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Feature Importances Sorted by Importance')
plt.show()

# Text listing of the same ranking.
print("Sorted Feature Importances:")
for feature, importance in sorted_feature_importances:
    print(f"{feature}: {importance}")
