In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from lightgbm import LGBMRegressor
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GroupKFold, ShuffleSplit, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, Real

In [2]:
def evaluate_performance(best_model, X, y_true):
    """Score a fitted regressor on one dataset.

    Parameters
    ----------
    best_model : fitted estimator exposing ``predict``.
    X : feature matrix passed to ``best_model.predict``.
    y_true : ground-truth targets aligned with ``X``.

    Returns
    -------
    dict
        Keys ``'MAE'``, ``'MSE'`` and ``'R2'`` mapped to the values returned by
        ``mean_absolute_error``, ``mean_squared_error`` and ``r2_score``.
    """
    predictions = best_model.predict(X)
    return {
        'MAE': mean_absolute_error(y_true, predictions),
        'MSE': mean_squared_error(y_true, predictions),
        'R2': r2_score(y_true, predictions),
    }

In [None]:
# Load the dataset. Features are every column from the third one onward; the
# target and the grouping key are read from the 'value' and 'group' columns.
data = pd.read_csv("data02.csv")
X = data.iloc[:, 2:]
y, group = data['value'], data['group']

# Outer cross-validation splitter: 10 group-wise folds.
gkf = GroupKFold(n_splits=10)

# Per-fold accumulators filled by the outer loop.
train_performance, val_performance, test_performance = [], [], []
all_test_preds = []       # one DataFrame of test-set predictions per fold
val_test_diff_list = []   # |validation R2 - test R2| per fold
param_list = []           # best hyper-parameters found in each fold

# Counter used for the "New round" progress message.
round_id = 1

# Hyper-parameter search space for the decision tree. It does not depend on
# the fold, so it is built once rather than on every outer iteration.
p_grid = {
    'criterion': Categorical(['squared_error', 'friedman_mse', 'absolute_error']),
    'splitter': Categorical(['best', 'random']),
    'max_depth': Integer(5, 20),
    'min_samples_split': Integer(2, 6),
    'min_samples_leaf': Integer(1, 4),
    'max_features': Categorical([None, 'sqrt', 'log2']),
    'ccp_alpha': Real(0.0, 0.15)
}

# Inner cross-validation splitter: used both for the hyper-parameter search
# and for the validation metrics computed below.
cv_inner = GroupKFold(n_splits=10)

# Outer loop: every fold holds out one set of groups as the test set.
for round_id, (train_index, test_index) in enumerate(gkf.split(X, y, group), start=1):
    print(f"\nNew round {round_id}")

    remain_data = data.iloc[train_index]
    test_data = data.iloc[test_index]

    X_remain = remain_data.iloc[:, 2:]
    y_remain = remain_data['value']
    group_remain = remain_data['group']

    X_test = test_data.iloc[:, 2:]
    y_test = test_data['value']
    group_test = test_data['group']

    model = DecisionTreeRegressor(random_state=42)

    # Bayesian hyper-parameter search over the inner folds. random_state is
    # fixed so that the sequence of sampled candidates (and therefore the
    # selected parameters) is reproducible across runs.
    search = BayesSearchCV(
        model, p_grid, n_iter=100, verbose=0, scoring='r2', cv=cv_inner,
        n_jobs=-1, refit=True, random_state=42,
    )
    result = search.fit(X_remain, y_remain, groups=group_remain)

    # refit=True means best_estimator_ is already trained on the whole
    # remaining (train + validation) set with the best parameters, so no
    # additional fit is required here.
    best_model = result.best_estimator_
    best_model_remain = best_model
    performance_train = evaluate_performance(best_model_remain, X_remain, y_remain)
    performance_test = evaluate_performance(best_model_remain, X_test, y_test)

    # Keep the raw test-set predictions of this fold.
    y_test_pred = best_model_remain.predict(X_test)
    test_pred_df = pd.DataFrame({
        'true_value': y_test.values,
        'predicted_value': y_test_pred,
        'group': group_test.values,
        'time': test_data['Time'].values
    })
    all_test_preds.append(test_pred_df)

    # Validation metrics: re-run the inner cross-validation with the best
    # parameters. Each inner fold trains a fresh clone so that the model fitted
    # on the full remaining set (best_model_remain) is never overwritten.
    allfold_performance_val = []
    for train_idx, valid_idx in cv_inner.split(X_remain, y_remain, groups=group_remain):
        X_train = X_remain.iloc[train_idx]
        y_train = y_remain.iloc[train_idx]
        X_val = X_remain.iloc[valid_idx]
        y_val = y_remain.iloc[valid_idx]

        model_train = clone(best_model).fit(X_train, y_train)
        fold_performance_val = evaluate_performance(model_train, X_val, y_val)
        allfold_performance_val.append(fold_performance_val)

    # Average over the inner folds once, after all of them have been scored.
    performance_val = pd.DataFrame(allfold_performance_val).mean()

    # Store the metrics of this outer fold.
    train_performance.append(performance_train)
    val_performance.append(performance_val)
    test_performance.append(performance_test)

    print("performance_train=")
    print(performance_train)
    print("performance_val=")
    print(performance_val)
    print("performance_test=")
    print(performance_test)

    # Gap between validation and test R2, plus the parameters that produced it.
    val_r2 = performance_val['R2']
    test_r2 = performance_test['R2']
    val_test_diff = abs(val_r2 - test_r2)
    val_test_diff_list.append(val_test_diff)
    param_list.append(result.best_params_)


# Combine the held-out predictions of every outer fold and persist them.
final_test_result = pd.concat(all_test_preds, ignore_index=True)
final_test_result.to_csv('test_results_DT.csv', index=False)

# Build each per-fold metrics table once, then derive mean and std from it.
train_performance_df = pd.DataFrame(train_performance)
val_performance_df = pd.DataFrame(val_performance)
test_performance_df = pd.DataFrame(test_performance)

# Calculate average performance metrics
avg_train_performance = train_performance_df.mean()
avg_val_performance = val_performance_df.mean()
avg_test_performance = test_performance_df.mean()
std_train_performance = train_performance_df.std()
std_val_performance = val_performance_df.std()
std_test_performance = test_performance_df.std()

# Print average performance metrics (same layout for all three splits)
print("\n" + "Average Train Performance:", avg_train_performance, "+/-", std_train_performance)
print("\n" + "Average Validation Performance:", avg_val_performance, "+/-", std_val_performance)
print("\n" + "Average Test Performance:", avg_test_performance, "+/-", std_test_performance)


New round 1
performance_train=
{'MAE': 0.3464648321546915, 'MSE': 0.291014459302318, 'R2': 0.6468522506990892}
performance_val=
MAE    0.352739
MSE    0.295483
R2     0.635325
dtype: float64
performance_test=
{'MAE': 0.31967987196216896, 'MSE': 0.24330328541763735, 'R2': 0.6089726226659525}

New round 2
performance_train=
{'MAE': 0.33420638699394195, 'MSE': 0.2789261125505471, 'R2': 0.6460896982461565}
performance_val=
MAE    0.341198
MSE    0.286712
R2     0.627337
dtype: float64
performance_test=
{'MAE': 0.32614851653593946, 'MSE': 0.2406500119220682, 'R2': 0.7472864649143247}

New round 3
performance_train=
{'MAE': 0.36458776978999746, 'MSE': 0.27925782611261496, 'R2': 0.6464152514039256}
performance_val=
MAE    0.377681
MSE    0.310899
R2     0.603493
dtype: float64
performance_test=
{'MAE': 0.3977054392296418, 'MSE': 0.3068425317791742, 'R2': 0.6719116607277749}

New round 4




performance_train=
{'MAE': 0.28299642110397294, 'MSE': 0.2101359514708976, 'R2': 0.7445001304603802}
performance_val=
MAE    0.329652
MSE    0.273708
R2     0.665100
dtype: float64
performance_test=
{'MAE': 0.32668475428625476, 'MSE': 0.2814558657405264, 'R2': 0.5608554056043729}

New round 5




performance_train=
{'MAE': 0.32314073570301494, 'MSE': 0.26149698967179047, 'R2': 0.6790588487319027}
performance_val=
MAE    0.335550
MSE    0.271565
R2     0.660282
dtype: float64
performance_test=
{'MAE': 0.3777626409552334, 'MSE': 0.36120410917742823, 'R2': 0.5052816839329526}

New round 6




performance_train=
{'MAE': 0.35132526956039534, 'MSE': 0.2771371619931868, 'R2': 0.6490549111646426}
performance_val=
MAE    0.364556
MSE    0.292547
R2     0.626206
dtype: float64
performance_test=
{'MAE': 0.38289836572420116, 'MSE': 0.3259005418600578, 'R2': 0.6284896769561358}

New round 7




performance_train=
{'MAE': 0.3449103134619611, 'MSE': 0.28628168437944246, 'R2': 0.6345698181550095}
performance_val=
MAE    0.355407
MSE    0.297822
R2     0.608773
dtype: float64
performance_test=
{'MAE': 0.37083555361664566, 'MSE': 0.30541459600990833, 'R2': 0.6966268033920252}

New round 8




performance_train=
{'MAE': 0.31835572378371535, 'MSE': 0.2578195698748155, 'R2': 0.6841622532636098}
performance_val=
MAE    0.335305
MSE    0.276686
R2     0.645791
dtype: float64
performance_test=
{'MAE': 0.34466203365448717, 'MSE': 0.2967641884544763, 'R2': 0.5846274245861026}

New round 9




performance_train=
{'MAE': 0.3250981244445225, 'MSE': 0.2626458318187949, 'R2': 0.6780300812049236}
performance_val=
MAE    0.333226
MSE    0.270493
R2     0.662453
dtype: float64
performance_test=
{'MAE': 0.38341598102179486, 'MSE': 0.390681308797552, 'R2': 0.4504528353821482}

New round 10




performance_train=
{'MAE': 0.3179276140255942, 'MSE': 0.2614779201574638, 'R2': 0.6795630715510327}
performance_val=
MAE    0.336040
MSE    0.284525
R2     0.641153
dtype: float64
performance_test=
{'MAE': 0.27285365067948714, 'MSE': 0.17816444612519997, 'R2': 0.737968546474804}

Average Train Performance:MAE    0.330901
MSE    0.266619
R2     0.668830
dtype: float64+/-MAE    0.022891
MSE    0.022949
R2     0.032202
dtype: float64

Average Validation Performance: MAE    0.346135
MSE    0.286044
R2     0.637591
dtype: float64 +/- MAE    0.015840
MSE    0.013258
R2     0.021609
dtype: float64

Average Test Performance: MAE    0.350265
MSE    0.293038
R2     0.619247
dtype: float64 +/- MAE    0.039003
MSE    0.061614
R2     0.097484
dtype: float64


In [4]:
# Assemble a per-fold summary table: every metric for every split, the
# validation/test R2 gap, and the best hyper-parameters found in that fold.
fold_results = {
    'Train': train_performance,
    'Val': val_performance,
    'Test': test_performance,
}

summary_columns = {'Fold': list(range(1, len(train_performance) + 1))}
for metric in ('R2', 'MAE', 'MSE'):
    for split_name, fold_metrics in fold_results.items():
        summary_columns[f'{split_name}_{metric}'] = [p[metric] for p in fold_metrics]
summary_columns['Val_Test_R2_Diff'] = val_test_diff_list
summary_columns['Best_Params'] = param_list

performance_by_fold = pd.DataFrame(summary_columns)

# Save as CSV
performance_by_fold.to_csv("DT_results.csv", index=False)

In [7]:
# Pick the outer fold whose validation R2 and test R2 are closest.
# NOTE(review): this choice looks at the outer test folds, so the test metrics
# reported earlier are presumably no longer an unbiased estimate for the final
# model — confirm this selection rule is intended.
best_param_idx = int(np.argmin(val_test_diff_list))
best_params = param_list[best_param_idx]

# Retrain the final model on the full dataset with the selected parameters.
final_model = DecisionTreeRegressor(**best_params, random_state=42)
final_model.fit(X, y)

# Persist the model with pickle (imported in the notebook's import cell).
with open("DT_model.pkl", "wb") as f:
    pickle.dump(final_model, f)

print(f"\n✅ 最优模型已保存：第 {best_param_idx + 1} 折参数（val-test最接近）用于全数据训练")



✅ 最优模型已保存：第 6 折参数（val-test最接近）用于全数据训练
