In [1]:

# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from ibug import IBUGWrapper
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv(r"F:\PycharmProjects\gihub\2025\3.3增量树\data.csv")
features = [
    "train_direction", "station_name", "scheduled_arrival_time",
    "wind", "weather", "temperature", "major_holiday"
]
target = "arrival_delay"
X = df[features].copy()
y = df[target]
X.loc[:, "scheduled_arrival_time"] = X["scheduled_arrival_time"].apply(
    lambda x: int(x.split(":")[0]) * 60 + int(x.split(":")[1]))
X = X.assign(
    wind_speed=X["wind"].str.split().str[0],
    wind_direction=X["wind"].str.split("from the ").str[-1]
).drop("wind", axis=1)
X.loc[:, "major_holiday"] = X["major_holiday"].astype(int)
categorical_cols = ["train_direction", "station_name", "weather", "wind_speed", "wind_direction"]
X_encoded = pd.get_dummies(X, columns=categorical_cols)

#数据分割
X_full = X_encoded.values.astype(np.float32)
y_full = y.values.astype(np.float32)

# 第一次分割：60%初始数据，40%后续数据
X_initial, X_temp, y_initial, y_temp = train_test_split(
    X_full, y_full, test_size=0.4, random_state=42
)

# 第二次分割：40%后续数据分为20%增量池和20%测试集
X_incremental_pool, X_test, y_incremental_pool, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# 初始训练集和验证集划分
X_train, X_val, y_train, y_val = train_test_split(
    X_initial, y_initial, test_size=0.3, random_state=42
)

In [9]:
from sklearn.metrics import r2_score

# 初始模型训练
initial_params = {
    'objective': 'reg:squarederror',  # 学习目标：回归任务
    'learning_rate': 0.1,            # 学习速度（小步慢走更稳定）
    'max_depth': 3,                   # 每棵树最多3层（防止过于复杂）
    'subsample': 0.8,                 # 每次随机用80%的数据训练
    'colsample_bytree': 0.8,          # 每次随机用80%的特征
    'seed': 42 ,                       # 固定随机种子（保证结果可复现）
    'n_estimators': 50,
    'reg_alpha': 0,
    'gamma': 0.1,
    'reg_lambda': 0,
    'random_state': 42,
    'scale_pos_weight':1,
    'base_score':0.5,
    'warm_start': True
}

base_model = XGBRegressor(**initial_params)
base_model.fit(X_train, y_train)

prob_model = IBUGWrapper().fit(
    base_model,
    X_train,
    y_train,
    X_val=X_val,
    y_val=y_val
)

# 从增量池取数据（30%增量池数据）
_, X_new, _, y_new = train_test_split(
    X_incremental_pool, y_incremental_pool, 
    test_size=0.3, random_state=42
)

# 动态验证集更新（添加20%新数据到验证集）
val_update_ratio = 0.2  # 验证集更新比例
n_val_update = int(len(X_val) * val_update_ratio)
X_val_updated = np.vstack([X_val[n_val_update:], X_new[:n_val_update]])
y_val_updated = np.concatenate([y_val[n_val_update:], y_new[:n_val_update]])

# 数据保留策略（保留50%旧数据 + 全部新数据）
_, X_retain, _, y_retain = train_test_split(
    X_initial, y_initial,
    test_size=0.5,
    random_state=42
)

# 合并训练数据集
X_combined = np.vstack([X_retain, X_new])
y_combined = np.concatenate([y_retain, y_new])

# 增量模型参数
incremental_params = {
    'learning_rate': 0.05,            # 学习速度（小步慢走更稳定）
    'max_depth': 3,                   # 保持一致即可，不然不好兼容
    'subsample': 0.8,                 # 每次随机用80%的数据训练
    'colsample_bytree': 0.8,          # 每次随机用80%的特征
    'seed': 42 ,                       # 固定随机种子（保证结果可复现）
    'reg_alpha': 0,  # IBUG检查
    'gamma': 0        #IBUG检查
}

# 模型增量更新
base_model.set_params(**incremental_params)
base_model.n_estimators += 30  # 增加30棵新树
base_model.fit(
    X_combined, 
    y_combined,
    xgb_model=base_model,
    eval_set=[(X_val_updated, y_val_updated)],  # 使用更新后的验证集
    verbose=False
)

# 重新校准概率模型
updated_prob_model = IBUGWrapper().fit(
    base_model,
    X_combined,
    y_combined,
    X_val=X_val_updated,  # 使用动态更新的验证集
    y_val=y_val_updated
)

# ================ 评估部分 ================
def evaluate_prob_model(model, X, y, data_name):
    pred_mean, pred_std = model.pred_dist(X)
    mse = mean_squared_error(y, pred_mean)
    coverage = np.mean((y >= pred_mean - 1.96*pred_std) & (y <= pred_mean + 1.96*pred_std))
    rmse = np.sqrt(mse)
    r2 = r2_score(y, pred_mean)
    avg_std = np.mean(pred_std)
    print(f"{data_name}:  MSE = {mse:.4f} | RMSE = {rmse:.4f} | R² = {r2:.4f} | 95%覆盖 = {coverage:.2%} | 平均方差 = {avg_std:.4f}")

print("\n【初始模型】")
evaluate_prob_model(prob_model, X_initial, y_initial, "旧训练数据")
evaluate_prob_model(prob_model, X_new, y_new, "新数据")
evaluate_prob_model(prob_model, X_test, y_test, "测试数据")

print("\n【增量模型】")
evaluate_prob_model(updated_prob_model, X_initial, y_initial, "旧训练数据")
evaluate_prob_model(updated_prob_model, X_new, y_new, "新数据")
evaluate_prob_model(updated_prob_model, X_test, y_test, "测试数据")

# 全量模型对比
full_params = initial_params.copy()
full_model = XGBRegressor(**full_params)
full_model.fit(
    np.vstack([X_initial, X_incremental_pool]), 
    np.concatenate([y_initial, y_incremental_pool])
)

# 全量模型预测
test_pred_mean, _ = prob_model.pred_dist(X_test)
print(f"\n初始模型测试集 "
      f"MSE: {mean_squared_error(y_test, test_pred_mean):.4f}, "
      f"R平方: {r2_score(y_test, test_pred_mean):.4f}, "
      f"RMSE: {mean_squared_error(y_test, test_pred_mean, squared=False):.4f}")

# 增量模型
updated_pred_mean, _ = updated_prob_model.pred_dist(X_test)
print("增量模型测试集 "
      f"MSE: {mean_squared_error(y_test, updated_pred_mean):.4f}, "
      f"R平方: {r2_score(y_test, updated_pred_mean):.4f}, "
      f"RMSE: {mean_squared_error(y_test, updated_pred_mean, squared=False):.4f}")

# 全量模型
full_pred = full_model.predict(X_test)
print("全量模型测试集 "
      f"MSE: {mean_squared_error(y_test, full_pred):.4f}, "
      f"R平方: {r2_score(y_test, full_pred):.4f}, "
      f"RMSE: {mean_squared_error(y_test, full_pred, squared=False):.4f}")



【初始模型】
旧训练数据:  MSE = 0.8976 | RMSE = 0.9474 | R² = 0.6761 | 95%覆盖 = 100.00% | 平均方差 = 1.6055
新数据:  MSE = 2.2584 | RMSE = 1.5028 | R² = -1.5407 | 95%覆盖 = 100.00% | 平均方差 = 1.6453
测试数据:  MSE = 1.1712 | RMSE = 1.0822 | R² = -0.2820 | 95%覆盖 = 100.00% | 平均方差 = 1.4851

【增量模型】
旧训练数据:  MSE = 0.9142 | RMSE = 0.9561 | R² = 0.6701 | 95%覆盖 = 58.33% | 平均方差 = 0.1092
新数据:  MSE = 0.0129 | RMSE = 0.1134 | R² = 0.9855 | 95%覆盖 = 66.67% | 平均方差 = 0.0952
测试数据:  MSE = 2.1140 | RMSE = 1.4540 | R² = -1.3140 | 95%覆盖 = 22.22% | 平均方差 = 0.1121

初始模型测试集 MSE: 1.1712, R平方: -0.2820, RMSE: 1.0822
增量模型测试集 MSE: 2.1140, R平方: -1.3140, RMSE: 1.4540
全量模型测试集 MSE: 1.6179, R平方: -0.7709, RMSE: 1.2720
