In [3]:
# 导入库
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from bayes_opt import BayesianOptimization
from sklearn.exceptions import NotFittedError

# 读取数据
data = pd.read_csv('data/temps_extended.csv')

# 数据预处理
# 1. 处理缺失值（假设数据中有缺失值）
data = data.dropna()
# 2. 独热编码
features = pd.get_dummies(data)
# 3. 分离特征和标签
labels = features['actual']
features = features.drop('actual', axis=1)

# 转换为 NumPy 数组
features = np.array(features)
labels = np.array(labels)

# 划分训练集和测试集
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.25, random_state=42
)

# 定义贝叶斯优化的目标函数
def rf_cv(n_estimators, max_depth, max_features, min_samples_split, min_samples_leaf):
    """
    目标函数：随机森林的交叉验证性能（负MAE，贝叶斯优化会最大化该值）
    """
    # 将参数转换为整数（贝叶斯优化默认用浮点数）
    params = {
        'n_estimators': int(n_estimators),
        'max_depth': int(max_depth),
        'max_features': max_features if isinstance(max_features, str) else float(max_features),
        'min_samples_split': int(min_samples_split),
        'min_samples_leaf': int(min_samples_leaf),
        'bootstrap': True,
        'random_state': 42
    }
    
    # 处理 max_features 的合法性
    if isinstance(params['max_features'], float):
        if not (0.0 < params['max_features'] <= 1.0):
            return -9999  # 返回极差的值以跳过非法参数
    elif params['max_features'] not in ['sqrt', 'log2']:
        return -9999
    
    # 创建模型并训练
    model = RandomForestRegressor(**params)
    try:
        model.fit(train_features, train_labels)
    except NotFittedError:
        return -9999
    
    # 预测并计算负MAE（贝叶斯优化会最大化该值）
    preds = model.predict(test_features)
    return -mean_absolute_error(test_labels, preds)

# 定义参数空间
param_bounds = {
    'n_estimators': (100, 2000),
    'max_depth': (5, 50),
    'max_features': (0.1, 1.0), 
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 10)
}

# 初始化贝叶斯优化器
optimizer = BayesianOptimization(
    f=rf_cv,
    pbounds=param_bounds,
    random_state=42,
    verbose=2  # 显示详细过程
)

# 执行优化
optimizer.maximize(
    init_points=10,  # 初始随机探索的步数
    n_iter=100       # 贝叶斯优化的迭代次数
)

# 输出最佳参数
print("\n\033[1;36m=== 贝叶斯优化最终参数 ===\033[0m")  # 添加彩色标题
for param, value in sorted(optimizer.max['params'].items()):
    # 参数类型转换逻辑
    if param in ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf']:
        display_value = f"{int(round(value))}棵" if param == 'n_estimators' else int(round(value))
    else:
        display_value = f"{value:.3f}" if isinstance(value, float) else value
    
    # 参数说明映射
    param_names = {
        'n_estimators': '决策树数量',
        'max_depth': '树最大深度',
        'max_features': '特征采样比例',
        'min_samples_split': '分裂最小样本',
        'min_samples_leaf': '叶节点样本'
    }
    
    print(f"│ \033[1;33m{param_names.get(param, param).ljust(10)}\033[0m │ {str(display_value).center(8)} │")

print("\033[1;36m===========================\033[0m")

# 用最佳参数训练最终模型
best_params = optimizer.max['params']
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['max_depth'] = int(best_params['max_depth'])
best_params['min_samples_split'] = int(best_params['min_samples_split'])
best_params['min_samples_leaf'] = int(best_params['min_samples_leaf'])

# 处理 max_features 类型
if isinstance(best_params['max_features'], float):
    best_params['max_features'] = best_params['max_features']
else:
    best_params['max_features'] = best_params['max_features']  # 已经是 'sqrt' 或 'log2'

final_model = RandomForestRegressor(**best_params)
final_model.fit(train_features, train_labels)

# 评估最终模型
test_preds = final_model.predict(test_features)
mae = mean_absolute_error(test_labels, test_preds)
print(f"测试集 MAE: {mae}")
print(f"测试集 ACCURACY：{100-mae}")

|   iter    |  target   | max_depth | max_fe... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [39m1        [39m | [39m-3.634   [39m | [39m21.85    [39m | [39m0.9556   [39m | [39m7.588    [39m | [39m12.78    [39m | [39m396.4    [39m |
| [39m2        [39m | [39m-3.834   [39m | [39m12.02    [39m | [39m0.1523   [39m | [39m8.796    [39m | [39m12.82    [39m | [39m1.445e+03[39m |
| [39m3        [39m | [39m-3.637   [39m | [39m5.926    [39m | [39m0.9729   [39m | [39m8.492    [39m | [39m5.822    [39m | [39m445.5    [39m |
| [35m4        [39m | [35m-3.6     [39m | [35m13.25    [39m | [35m0.3738   [39m | [35m5.723    [39m | [35m9.775    [39m | [35m653.3    [39m |
| [39m5        [39m | [39m-3.622   [39m | [39m32.53    [39m | [39m0.2255   [39m | [39m3.629    [39m | [39m8.595    [39m | [39m966.5    [39m |
| [39m6        [39m | [39m-3.634   [39m | [