In [4]:
# compare_gbdt_models.py
import time
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
import warnings

# Try imports for external libs (may not be installed)
have_xgb = have_lgb = have_cat = False
try:
    import xgboost as xgb
    have_xgb = True
except Exception as e:
    warnings.warn(f"xgboost not available: {e}")
try:
    import lightgbm as lgb
    have_lgb = True
except Exception as e:
    warnings.warn(f"lightgbm not available: {e}")
try:
    from catboost import CatBoostClassifier
    have_cat = True
except Exception as e:
    warnings.warn(f"catboost not available: {e}")

# Load dataset
data = load_breast_cancer()
X, y = data.data, data.target
feature_names = data.feature_names

# Train/test split (reproducible)
RND = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=RND, stratify=y
)

# Common hyperparams (simple, comparable)
common_params = dict(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=RND)

results = []

# 1) sklearn GradientBoosting
print("Training sklearn GradientBoosting...")
t0 = time.time()
clf_sklearn = GradientBoostingClassifier(**common_params)
clf_sklearn.fit(X_train, y_train)
t1 = time.time()
pred = clf_sklearn.predict(X_test)
probs = clf_sklearn.predict_proba(X_test)[:, 1]
results.append(("sklearn-GBDT", t1 - t0, accuracy_score(y_test, pred), roc_auc_score(y_test, probs), getattr(clf_sklearn, "feature_importances_", None)))

# 2) XGBoost
if have_xgb:
    print("Training XGBoost (XGBClassifier)...")
    t0 = time.time()
    clf_xgb = xgb.XGBClassifier(**common_params, use_label_encoder=False, eval_metric='logloss')
    clf_xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    t1 = time.time()
    pred = clf_xgb.predict(X_test)
    probs = clf_xgb.predict_proba(X_test)[:, 1]
    results.append(("XGBoost", t1 - t0, accuracy_score(y_test, pred), roc_auc_score(y_test, probs), clf_xgb.feature_importances_))
else:
    print("Skipping XGBoost (not installed).")

# 3) LightGBM
if have_lgb:
    print("Training LightGBM (LGBMClassifier)...")
    t0 = time.time()
    clf_lgb = lgb.LGBMClassifier(**common_params, n_jobs= -1)
    clf_lgb.fit(X_train, y_train, eval_set=[(X_test, y_test)])
    t1 = time.time()
    pred = clf_lgb.predict(X_test)
    probs = clf_lgb.predict_proba(X_test)[:, 1]
    results.append(("LightGBM", t1 - t0, accuracy_score(y_test, pred), roc_auc_score(y_test, probs), clf_lgb.feature_importances_))
else:
    print("Skipping LightGBM (not installed).")

# 4) CatBoost
if have_cat:
    print("Training CatBoost (CatBoostClassifier)...")
    t0 = time.time()
    clf_cat = CatBoostClassifier(iterations=common_params['n_estimators'],
                                 learning_rate=common_params['learning_rate'],
                                 depth=common_params['max_depth'],
                                 verbose=False,
                                 random_seed=RND)
    # no categorical features here; if present, pass cat_features indices
    clf_cat.fit(X_train, y_train, eval_set=(X_test, y_test))
    t1 = time.time()
    pred = clf_cat.predict(X_test)
    probs = clf_cat.predict_proba(X_test)[:, 1]
    results.append(("CatBoost", t1 - t0, accuracy_score(y_test, pred), roc_auc_score(y_test, probs), clf_cat.get_feature_importance()))
else:
    print("Skipping CatBoost (not installed).")

# Print summary
print("\nSummary (model, train_time_sec, accuracy, auc):")
for name, t, acc, auc, fi in results:
    print(f"{name:10s}  {t:6.3f}s   acc={acc:.4f}   auc={auc:.4f}")

# Print top-10 feature importances for each model (if available)
print("\nTop features per model:")
topk = 10
for name, t, acc, auc, fi in results:
    if fi is None:
        print(f"{name}: no importances")
        continue
    # fi corresponds to feature importances vector of length n_features
    fi_arr = np.array(fi)
    idx = np.argsort(fi_arr)[::-1][:topk]
    print(f"\n{name}:")
    for i in idx:
        print(f"  {feature_names[i]:30s} {fi_arr[i]:.4f}")


Training sklearn GradientBoosting...
Training XGBoost (XGBClassifier)...
Training LightGBM (LGBMClassifier)...
[LightGBM] [Info] Number of positive: 239, number of negative: 142
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000205 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3816
[LightGBM] [Info] Number of data points in the train set: 381, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.627297 -> initscore=0.520636
[LightGBM] [Info] Start training from score 0.520636


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training CatBoost (CatBoostClassifier)...





Summary (model, train_time_sec, accuracy, auc):
sklearn-GBDT   0.601s   acc=0.9255   auc=0.9831
XGBoost      0.152s   acc=0.9574   auc=0.9960
LightGBM     0.268s   acc=0.9681   auc=0.9948
CatBoost     0.346s   acc=0.9681   auc=0.9955

Top features per model:

sklearn-GBDT:
  worst radius                   0.7223
  worst concave points           0.1262
  worst texture                  0.0528
  texture error                  0.0219
  worst concavity                0.0195
  mean fractal dimension         0.0115
  mean texture                   0.0070
  smoothness error               0.0053
  worst symmetry                 0.0052
  compactness error              0.0049

XGBoost:
  worst perimeter                0.4929
  worst radius                   0.2323
  worst concave points           0.0452
  worst area                     0.0444
  mean perimeter                 0.0310
  mean concave points            0.0275
  concavity error                0.0205
  worst texture                  0.