In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# Load the California housing dataset and wrap the features in a DataFrame
data = fetch_california_housing()
X, y = pd.DataFrame(data.data, columns=data.feature_names), data.target

# Hold out 20% of the rows as a test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Create the three gradient-boosting regressors, all with the same seed
lgb_model = lgb.LGBMRegressor(random_state=42)
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
cat_model = cb.CatBoostRegressor(verbose=0, random_state=42)

# Fit each model on the training split
lgb_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
cat_model.fit(X_train, y_train)

# Predict the test split with every fitted model (LightGBM, XGBoost, CatBoost order)
lgb_preds, xgb_preds, cat_preds = [
    booster.predict(X_test) for booster in (lgb_model, xgb_model, cat_model)
]

# Score each set of predictions with mean squared error against the held-out targets
lgb_mse, xgb_mse, cat_mse = [
    mean_squared_error(y_test, preds) for preds in (lgb_preds, xgb_preds, cat_preds)
]

print(f"LGBM MSE: {lgb_mse:.4f}")
print(f"XGB MSE: {xgb_mse:.4f}")
print(f"CatBoost MSE: {cat_mse:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000435 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.071947
LGBM MSE: 0.2148
XGB MSE: 0.2246
CatBoost MSE: 0.1989


In [13]:
# 5-fold cross-validation of the LightGBM regressor, scored with MSE.
# The returned results dict is the cell's displayed output.
# NOTE(review): single_model_regression_cv is defined in a cell that appears later in
# this notebook (In [11]); on a fresh kernel that cell has to run before this one.
single_model_regression_cv(
    model=lgb_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    cv=5,
    seed=42,
    verbose=1,
    metric="mse",
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002344 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 13209, number of used features: 8
[LightGBM] [Info] Start training from score 2.066917
Fold 1 mse: 0.2406
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000368 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 13209, number of used features: 8
[LightGBM] [Info] Start training from score 2.073789
Fold 2 mse: 0.2166
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000319 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 13210, number of used feat

{'train_predictions': array([1.34408973, 3.52637272, 2.15333585, ..., 1.97501915, 2.62226002,
        3.28115353]),
 'test_predictions': array([0.59301304, 0.95281747, 4.89589475, ..., 5.17097376, 0.63120748,
        1.77517752]),
 'cv_metrics': [0.2405911968967214,
  0.21664349660529608,
  0.2115931870292395,
  0.2290636159764674,
  0.20890274019987193],
 'cv_mean': 0.22135884734151925,
 'cv_std': 0.011849084252854303}

In [11]:
from sklearn.model_selection import KFold
from sklearn.base import clone
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

def single_model_regression_cv(model, X_train, X_test, y_train, cv=5, seed=42, verbose=1, metric="r2"):
    """Cross-validate one regressor with shuffled K-fold CV, then predict the test set.

    For every fold a fresh clone of ``model`` is fitted on the training part of
    the fold and scored on the held-out part; the held-out predictions are
    written into a single out-of-fold vector. Afterwards ``model`` itself is
    refitted on all of ``X_train`` / ``y_train`` and used to predict ``X_test``.

    Rows are always selected by position, so ``X_train`` and ``y_train`` may be
    pandas objects with any index (for example the shuffled index left behind
    by ``train_test_split``), NumPy arrays, or plain sequences.

    Parameters
    ----------
    model : estimator
        Object that works with ``sklearn.base.clone`` and exposes ``fit`` and
        ``predict``. It is refitted in place on the full training data.
    X_train : DataFrame or array-like
        Training features.
    X_test : DataFrame or array-like
        Features to predict with the model fitted on all training rows.
    y_train : Series or array-like
        Training targets, one per row of ``X_train``.
    cv : int, default 5
        Number of folds passed to ``KFold``.
    seed : int, default 42
        Seed for the fold shuffling; also used to seed NumPy's global RNG.
    verbose : int or bool, default 1
        When truthy, print each fold's score and the final average.
    metric : {"r2", "mse", "mae"}, default "r2"
        Score computed on each validation fold.

    Returns
    -------
    dict
        ``train_predictions`` (out-of-fold predictions for the training rows),
        ``test_predictions`` (predictions for ``X_test``), ``cv_metrics``
        (list of per-fold scores), ``cv_mean`` and ``cv_std``.

    Raises
    ------
    ValueError
        If ``metric`` is not supported or ``X_train`` and ``y_train`` differ
        in length.
    """
    # Also seeds NumPy's global RNG, which is a process-wide side effect.
    np.random.seed(seed)
    kf = KFold(n_splits=cv, shuffle=True, random_state=seed)

    # Choose the metric
    scorers = {
        "r2": r2_score,
        "mse": mean_squared_error,
        "mae": mean_absolute_error,
    }
    scorer = scorers.get(metric)
    if scorer is None:
        raise ValueError("Unsupported metric. Use 'r2', 'mse', or 'mae'.")

    # KFold yields positional indices. A plain ndarray view of the targets makes
    # ``y[idx]`` positional too; on a pandas Series ``y[idx]`` would look up
    # index labels instead and pick the wrong rows (or raise KeyError).
    y_values = np.asarray(y_train)
    n_train = len(X_train)
    if len(y_values) != n_train:
        raise ValueError(
            f"X_train and y_train must have the same length, got {n_train} and {len(y_values)}."
        )

    metrics = []
    train_predictions = np.zeros(n_train)

    # Cross-validation loop
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), 1):
        X_train_fold = _take_rows(X_train, train_idx)
        X_val_fold = _take_rows(X_train, val_idx)
        y_train_fold, y_val_fold = y_values[train_idx], y_values[val_idx]

        # A fresh clone per fold keeps the folds independent of each other
        # and of the caller's model.
        cloned_model = clone(model)
        cloned_model.fit(X_train_fold, y_train_fold)

        # Each training row receives its prediction from the fold model that
        # did not see it during fitting.
        val_pred = cloned_model.predict(X_val_fold)
        train_predictions[val_idx] = val_pred

        fold_metric = scorer(y_val_fold, val_pred)
        metrics.append(fold_metric)

        if verbose:
            print(f"Fold {fold} {metric}: {fold_metric:.4f}")

    # Refit the caller's model (in place) on all training rows for the test predictions
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)

    cv_mean = np.mean(metrics)
    cv_std = np.std(metrics)

    if verbose:
        print(f"Average {metric}: {cv_mean:.4f} (+/- {cv_std:.4f})")

    return {
        "train_predictions": train_predictions,
        "test_predictions": test_predictions,
        "cv_metrics": metrics,
        "cv_mean": cv_mean,
        "cv_std": cv_std,
    }


def _take_rows(data, positions):
    """Return the rows of ``data`` at integer ``positions`` (pandas object or array-like)."""
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return np.asarray(data)[positions]