In [1]:
import numpy as np
from scipy.sparse import load_npz
import lightgbm as lgb
from sklearn.metrics import roc_auc_score


In [2]:
# Load the feature-engineered splits: sparse feature matrices (.npz) and
# dense label arrays (.npy). All files live in the same directory, so the
# prefix is defined once here.
DATA_DIR = "../data_process/processed_fe"

X_train = load_npz(f"{DATA_DIR}/X_train_proc_fe.npz")
X_valid = load_npz(f"{DATA_DIR}/X_valid_proc_fe.npz")
# X_test is loaded alongside the other splits; it is not referenced by the
# training/evaluation cells shown here.
X_test  = load_npz(f"{DATA_DIR}/X_test_proc_fe.npz")

y_train = np.load(f"{DATA_DIR}/y_train_fe.npy")
y_valid = np.load(f"{DATA_DIR}/y_valid_fe.npy")

# Fail fast on mismatched artifacts (e.g. files produced by different
# preprocessing runs) instead of surfacing an opaque error during training.
assert X_train.shape[0] == y_train.shape[0], "X_train / y_train row count mismatch"
assert X_valid.shape[0] == y_valid.shape[0], "X_valid / y_valid row count mismatch"
assert X_train.shape[1] == X_valid.shape[1] == X_test.shape[1], (
    "train / valid / test feature counts differ"
)

print("Train shape:", X_train.shape)
print("Valid shape:", X_valid.shape)
print("Type:", type(X_train))


Train shape: (475195, 67)
Valid shape: (118799, 67)
Type: <class 'scipy.sparse._csr.csr_matrix'>


In [3]:
# Wrap the feature matrices and labels in LightGBM Dataset objects.
train_data = lgb.Dataset(X_train, label=y_train)
# Tie the validation set to the training set explicitly so it is binned with
# the training set's feature discretisation, as the LightGBM docs recommend
# for validation data.
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)


In [4]:
# LightGBM training configuration for a binary classifier evaluated by AUC.
params = dict(
    # Task and evaluation metric.
    objective="binary",
    metric="auc",
    # Boosting step size and per-tree leaf budget.
    learning_rate=0.03,
    num_leaves=31,
    # Column subsampling, plus row subsampling (bagging) with its frequency.
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    # Negative verbosity keeps LightGBM's own logging quiet.
    verbose=-1,
)


In [5]:
# Early stopping is configured through the callbacks API: training halts once
# the validation metric has gone 100 consecutive rounds without improving.
# (Original author's note: the callback form was chosen for compatibility
# across LightGBM versions.)
callbacks = [lgb.early_stopping(stopping_rounds=100, verbose=True)]

# Train for at most 2000 boosting rounds, monitoring the validation set.
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=2000,
    valid_sets=[valid_data],
    callbacks=callbacks,
)


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1806]	valid_0's auc: 0.921553


In [6]:
# Score the validation split using only the trees up to the best iteration
# recorded on the trained model, then report ROC AUC.
best_iter = model.best_iteration
y_valid_pred = model.predict(X_valid, num_iteration=best_iter)
auc_lgb = roc_auc_score(y_true=y_valid, y_score=y_valid_pred)
print("LightGBM AUC:", auc_lgb)


LightGBM AUC: 0.9215532227810928
