In [1]:
import numpy as np
from scipy.sparse import load_npz
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict


In [2]:
# Load the train / validation feature matrices (scipy sparse, .npz) and the
# matching label arrays (.npy). All four files live in the same directory, so
# the location is defined once here.
DATA_DIR = "../data_process/processed_fe"

X_train = load_npz(f"{DATA_DIR}/X_train_proc_fe.npz")
X_valid = load_npz(f"{DATA_DIR}/X_valid_proc_fe.npz")

y_train = np.load(f"{DATA_DIR}/y_train_fe.npy")
y_valid = np.load(f"{DATA_DIR}/y_valid_fe.npy")

# Fail fast if the artifacts on disk are out of sync with each other, instead
# of surfacing as an opaque error (or silent misalignment) during training.
assert X_train.shape[0] == y_train.shape[0], "X_train / y_train row count mismatch"
assert X_valid.shape[0] == y_valid.shape[0], "X_valid / y_valid row count mismatch"
assert X_train.shape[1] == X_valid.shape[1], "train / valid feature count mismatch"

print("Train:", X_train.shape)
print("Valid:", X_valid.shape)


Train: (475195, 67)
Valid: (118799, 67)


In [3]:
# Base learners for the ensemble. Hyper-parameters are collected in dicts so
# each configuration can be read (and tweaked) in one place.

# LightGBM: the parameter set that performed best on validation.
# NOTE(review): per the LightGBM docs, row subsampling is only applied when
# `subsample_freq` is > 0; it is left at its default here, so `subsample=0.8`
# may have no effect - confirm before relying on it.
lgbm_params = dict(
    objective="binary",
    learning_rate=0.03,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    n_estimators=500,
    n_jobs=-1,
)
lgbm = LGBMClassifier(**lgbm_params)

# CatBoost: a simple, untuned configuration.
cat_params = dict(
    iterations=800,
    learning_rate=0.03,
    depth=6,
    loss_function="Logloss",
    verbose=False,
    random_seed=42,
)
cat = CatBoostClassifier(**cat_params)

# Fit both base models on the full training split (LightGBM first, then CatBoost).
for base_model in (lgbm, cat):
    base_model.fit(X_train, y_train)

# Positive-class probabilities on the validation split.
valid_lgbm, valid_cat = (
    base_model.predict_proba(X_valid)[:, 1] for base_model in (lgbm, cat)
)

# Report the validation ROC-AUC of each base model.
for label, scores in (("LGBM", valid_lgbm), ("CatBoost", valid_cat)):
    print(f"{label} AUC:", roc_auc_score(y_valid, scores))


[LightGBM] [Info] Number of positive: 379595, number of negative: 95600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017511 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2157
[LightGBM] [Info] Number of data points in the train set: 475195, number of used features: 67
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.798819 -> initscore=1.378932
[LightGBM] [Info] Start training from score 1.378932
LGBM AUC: 0.9197290940586428
CatBoost AUC: 0.9184689584756063


In [None]:
# Stacking: a logistic-regression meta-learner blends the two base models.
#
# The meta-learner is trained on OUT-OF-FOLD (OOF) predictions. Using the
# predictions `lgbm` / `cat` make on the very rows they were fitted on would
# feed it over-confident inputs that do not resemble the predictions it sees
# at validation time, which biases the learned blend weights.
N_STACK_FOLDS = 5
stack_cv = StratifiedKFold(n_splits=N_STACK_FOLDS, shuffle=True, random_state=42)

# cross_val_predict clones the estimator and refits the clone on each fold, so
# the already-fitted `lgbm` and `cat` objects are left untouched. The seeded
# splitter yields the same folds for both calls, keeping the two OOF columns
# aligned fold-for-fold.
# NOTE: each base model is retrained N_STACK_FOLDS times - this cell is slow.
oof_lgbm = cross_val_predict(
    lgbm, X_train, y_train, cv=stack_cv, method="predict_proba"
)[:, 1]
oof_cat = cross_val_predict(
    cat, X_train, y_train, cv=stack_cv, method="predict_proba"
)[:, 1]

# Meta-features: one column per base model (positive-class probability).
X_train_stack = np.column_stack([oof_lgbm, oof_cat])
# Validation meta-features come from the base models fitted on the full
# training split, whose predictions on X_valid are already out-of-sample.
X_valid_stack = np.column_stack([valid_lgbm, valid_cat])

meta_lr = LogisticRegression(max_iter=1000)
meta_lr.fit(X_train_stack, y_train)

stack_valid_pred = meta_lr.predict_proba(X_valid_stack)[:, 1]
auc_stack = roc_auc_score(y_valid, stack_valid_pred)
print("Stacking AUC:", auc_stack)
