In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from lightgbm import LGBMRegressor
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVR
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [2]:
def evaluate_performance(best_model, X, y_true):
    """Score a fitted regressor on one dataset.

    Parameters
    ----------
    best_model : object exposing ``predict(X)``
        The already-fitted estimator to evaluate.
    X : feature matrix passed straight to ``best_model.predict``.
    y_true : ground-truth targets aligned with the rows of ``X``.

    Returns
    -------
    dict
        ``{'MAE': ..., 'MSE': ..., 'R2': ...}`` computed with
        ``mean_absolute_error``, ``mean_squared_error`` and ``r2_score``.
    """
    predictions = best_model.predict(X)

    # (label, scorer) pairs; the order fixes the key order of the result dict.
    scorers = (
        ('MAE', mean_absolute_error),
        ('MSE', mean_squared_error),
        ('R2', r2_score),
    )
    return {label: scorer(y_true, predictions) for label, scorer in scorers}

In [3]:
# Nested, group-aware cross-validation of a random forest regressor.
#   outer loop: GroupKFold -> held-out test fold per round
#   inner loop: GroupKFold inside BayesSearchCV -> hyper-parameter search / validation

# --- Configuration ------------------------------------------------------------
SEED = 42             # shared by the forest and the Bayesian search for reproducibility
N_OUTER_SPLITS = 10   # outer GroupKFold folds (test estimate)
N_INNER_SPLITS = 10   # inner GroupKFold folds (hyper-parameter search / validation)
N_SEARCH_ITER = 100   # parameter settings sampled by BayesSearchCV per outer fold

# --- Load data ----------------------------------------------------------------
data = pd.read_csv("data02.csv")
# NOTE(review): every column from position 2 onward is treated as a feature.
# This assumes the target ('value') sits in the first two columns - TODO confirm,
# and check whether 'Time' is meant to be part of the feature matrix.
X = data.iloc[:, 2:]
y = data['value']
group = data['group']

# Per-outer-fold performance records
train_performance = []
val_performance = []
test_performance = []

# Per-outer-fold test predictions (concatenated and saved after the loop)
all_test_preds = []

gkf = GroupKFold(n_splits=N_OUTER_SPLITS)

# |val R2 - test R2| and the tuned parameters of each outer fold
val_test_diff_list = []
param_list = []

# --- Outer loop ---------------------------------------------------------------
for round_id, (train_index, test_index) in enumerate(gkf.split(X, y, group), start=1):
    print(f"\nNew round {round_id}")

    remain_data = data.iloc[train_index]
    test_data = data.iloc[test_index]

    X_remain = remain_data.iloc[:, 2:]
    y_remain = remain_data['value']
    group_remain = remain_data['group']

    X_test = test_data.iloc[:, 2:]
    y_test = test_data['value']

    # Random forest search space (rebuilt every round so no state is shared
    # between successive searches)
    p_grid = {
        'n_estimators': Integer(100, 400),
        'criterion': Categorical(['squared_error', 'absolute_error']),
        'max_depth': Integer(5, 15),
        'min_samples_split': Integer(2, 8),
        'min_samples_leaf': Integer(1, 4),
        'max_features': Categorical(['log2', 'sqrt'])
    }

    # Inner cross-validation used for hyper-parameter optimisation
    cv_inner = GroupKFold(n_splits=N_INNER_SPLITS)

    model = RandomForestRegressor(random_state=SEED, n_jobs=-1, verbose=0)

    # random_state makes the sequence of sampled parameter settings repeatable
    search = BayesSearchCV(
        model,
        p_grid,
        n_iter=N_SEARCH_ITER,
        verbose=0,
        scoring='r2',
        cv=cv_inner,
        n_jobs=-1,
        refit=True,
        random_state=SEED,
    )

    # Run the search; groups keep samples of one group inside a single inner fold
    result = search.fit(X_remain, y_remain, groups=group_remain)

    # refit=True means best_estimator_ is already trained on all of X_remain with
    # the best parameters, so no additional fit call is needed here.
    best_model = result.best_estimator_
    best_model_remain = best_model

    performance_train = evaluate_performance(best_model_remain, X_remain, y_remain)
    performance_test = evaluate_performance(best_model_remain, X_test, y_test)

    # Keep the held-out predictions of this fold
    y_test_pred = best_model_remain.predict(X_test)
    test_pred_df = pd.DataFrame({
        'true_value': y_test.values,
        'predicted_value': y_test_pred,
        'group': test_data['group'].values,  # optional: keep group information
        'time': test_data['Time'].values  # optional: keep time information
    })
    all_test_preds.append(test_pred_df)

    # Validation performance: re-run the inner folds with the best parameters.
    # Each fold trains a clone, so best_model stays fitted on the full X_remain.
    allfold_performance_val = []
    for train_idx, valid_idx in cv_inner.split(X_remain, y_remain, groups=group_remain):
        X_train = X_remain.iloc[train_idx]
        y_train = y_remain.iloc[train_idx]
        X_val = X_remain.iloc[valid_idx]
        y_val = y_remain.iloc[valid_idx]

        model_train = clone(best_model).fit(X_train, y_train)
        allfold_performance_val.append(evaluate_performance(model_train, X_val, y_val))

    # Average over the inner folds once, after all of them have been scored
    performance_val = pd.DataFrame(allfold_performance_val).mean()

    # Store this round's performance
    train_performance.append(performance_train)
    val_performance.append(performance_val)
    test_performance.append(performance_test)

    print("performance_train=")
    print(performance_train)
    print("performance_val=")
    print(performance_val)
    print("performance_test=")
    print(performance_test)

    val_r2 = performance_val['R2']
    test_r2 = performance_test['R2']
    val_test_diff = abs(val_r2 - test_r2)
    val_test_diff_list.append(val_test_diff)
    param_list.append(result.best_params_)


# --- Aggregate ----------------------------------------------------------------
# Combine the test predictions of all outer folds and save them
final_test_result = pd.concat(all_test_preds, ignore_index=True)
final_test_result.to_csv('test_results_RF.csv', index=False)

# Mean and standard deviation of each metric across the outer folds
avg_train_performance = pd.DataFrame(train_performance).mean()
avg_val_performance = pd.DataFrame(val_performance).mean()
avg_test_performance = pd.DataFrame(test_performance).mean()
std_train_performance = pd.DataFrame(train_performance).std()
std_val_performance = pd.DataFrame(val_performance).std()
std_test_performance = pd.DataFrame(test_performance).std()

# Print average performance metrics (same layout for all three splits)
print("\n" + "Average Train Performance:", avg_train_performance, "+/-", std_train_performance)
print("\n" + "Average Validation Performance:", avg_val_performance, "+/-", std_val_performance)
print("\n" + "Average Test Performance:", avg_test_performance, "+/-", std_test_performance)


New round 1




performance_train=
{'MAE': 0.15548131643756657, 'MSE': 0.059679544431365826, 'R2': 0.9275785236040583}
performance_val=
MAE    0.463318
MSE    0.391725
R2     0.514815
dtype: float64
performance_test=
{'MAE': 0.45631645444907226, 'MSE': 0.4070887165529903, 'R2': 0.3457431826177063}

New round 2
performance_train=
{'MAE': 0.16205673491807762, 'MSE': 0.06705721635846246, 'R2': 0.9149156762011823}
performance_val=
MAE    0.460320
MSE    0.395615
R2     0.483369
dtype: float64
performance_test=
{'MAE': 0.47771703570459245, 'MSE': 0.3854781951340959, 'R2': 0.5951982025152358}

New round 3
performance_train=
{'MAE': 0.16793467485824562, 'MSE': 0.07206842680792173, 'R2': 0.9087499285899423}
performance_val=
MAE    0.460582
MSE    0.391546
R2     0.500336
dtype: float64
performance_test=
{'MAE': 0.5081780595753719, 'MSE': 0.42488346481060385, 'R2': 0.5456975617243909}

New round 4
performance_train=
{'MAE': 0.17281848388390955, 'MSE': 0.07484060306183696, 'R2': 0.9090028898685903}
performance_



performance_train=
{'MAE': 0.17394886815597665, 'MSE': 0.07792500643258259, 'R2': 0.9043608826685285}
performance_val=
MAE    0.455464
MSE    0.380153
R2     0.518793
dtype: float64
performance_test=
{'MAE': 0.48005817117144384, 'MSE': 0.4430370250140133, 'R2': 0.393200339084117}

New round 6




performance_train=
{'MAE': 0.17846481362623187, 'MSE': 0.0815220523269626, 'R2': 0.8967667717668629}
performance_val=
MAE    0.466934
MSE    0.396877
R2     0.485578
dtype: float64
performance_test=
{'MAE': 0.46209889955871875, 'MSE': 0.4145836589498106, 'R2': 0.5273953575342973}

New round 7




performance_train=
{'MAE': 0.15715555429511185, 'MSE': 0.06341487160243638, 'R2': 0.9190527745021919}
performance_val=
MAE    0.462229
MSE    0.395546
R2     0.483489
dtype: float64
performance_test=
{'MAE': 0.5171868597798439, 'MSE': 0.45100575646041596, 'R2': 0.5520087781870278}

New round 8




performance_train=
{'MAE': 0.15473166775964642, 'MSE': 0.061530526283365757, 'R2': 0.9246230114095741}
performance_val=
MAE    0.452969
MSE    0.377077
R2     0.518439
dtype: float64
performance_test=
{'MAE': 0.49072563809704006, 'MSE': 0.39898666454269466, 'R2': 0.4415494697321859}

New round 9




performance_train=
{'MAE': 0.15700383682297472, 'MSE': 0.06175668484224683, 'R2': 0.9242942685744596}
performance_val=
MAE    0.466569
MSE    0.397231
R2     0.506276
dtype: float64
performance_test=
{'MAE': 0.40773138229084444, 'MSE': 0.3421669883747001, 'R2': 0.5186949207631912}

New round 10
performance_train=
{'MAE': 0.15582748563882473, 'MSE': 0.06350606085574546, 'R2': 0.9221743577191774}
performance_val=
MAE    0.473989
MSE    0.407695
R2     0.489351
dtype: float64
performance_test=
{'MAE': 0.39311113956017957, 'MSE': 0.27679767607935524, 'R2': 0.5929058856978588}

Average Train Performance:MAE    0.163542
MSE    0.068330
R2     0.915152
dtype: float64+/-MAE    0.008970
MSE    0.007716
R2     0.010141
dtype: float64

Average Validation Performance: MAE    0.463660
MSE    0.393786
R2     0.500046
dtype: float64 +/- MAE    0.007007
MSE    0.009476
R2     0.014231
dtype: float64

Average Test Performance: MAE    0.460327
MSE    0.386305
R2     0.501464
dtype: float64 +/- MAE    0.

In [4]:
# Build the per-fold summary table: one row per outer fold holding every metric
# for the train / validation / test splits plus the tuned hyper-parameters.
split_records = (
    ('Train', train_performance),
    ('Val', val_performance),
    ('Test', test_performance),
)

fold_summary = {'Fold': list(range(1, len(train_performance) + 1))}
# Metric-major ordering keeps the columns grouped as R2, then MAE, then MSE.
for metric_name in ('R2', 'MAE', 'MSE'):
    for split_name, records in split_records:
        fold_summary[f'{split_name}_{metric_name}'] = [rec[metric_name] for rec in records]
fold_summary['Val_Test_R2_Diff'] = val_test_diff_list
fold_summary['Best_Params'] = param_list

performance_by_fold = pd.DataFrame(fold_summary)

# Save as CSV
performance_by_fold.to_csv("RF_results.csv", index=False)

In [5]:
# Pick the outer fold whose validation R2 was closest to its test R2 and reuse
# that fold's tuned hyper-parameters for the final model.
# NOTE(review): this choice looks at the held-out test scores, so the test
# metrics reported by the cross-validation no longer give an unbiased estimate
# for the model saved below - confirm this selection rule is intended.
best_param_idx = int(np.argmin(val_test_diff_list))
best_params = param_list[best_param_idx]

# Retrain the final model on all data. random_state matches the seed used for
# the forests during the search so the saved model is reproducible; n_jobs only
# affects training speed.
final_model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
final_model.fit(X, y)

# Save the model as a pickle (only load this file from a trusted location)
with open("RF_model.pkl", "wb") as f:
    pickle.dump(final_model, f)

print(f"\n✅ 最优模型已保存：第 {best_param_idx + 1} 折参数（val-test最接近）用于全数据训练")



✅ 最优模型已保存：第 4 折参数（val-test最接近）用于全数据训练
