In [1]:
import numpy as np
from scipy.sparse import load_npz
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score


In [2]:
# 还是用 A 的 processed_fe 特征
X_train = load_npz("../data_process/processed_fe/X_train_proc_fe.npz")
X_valid = load_npz("../data_process/processed_fe/X_valid_proc_fe.npz")

y_train = np.load("../data_process/processed_fe/y_train_fe.npy")
y_valid = np.load("../data_process/processed_fe/y_valid_fe.npy")

# 调参时我们直接在 full(train+valid) 上做 CV
from scipy.sparse import vstack
X_full = vstack([X_train, X_valid])
y_full = np.concatenate([y_train, y_valid])

print("Full training shape:", X_full.shape)


Full training shape: (593994, 67)


In [3]:
base_lgbm = LGBMClassifier(
    objective="binary",
    metric="auc",
    n_estimators=500,
    n_jobs=-1
)

param_dist = {
    "num_leaves": [31, 63, 127],
    "learning_rate": [0.03, 0.02, 0.01],
    "min_child_samples": [20, 50, 100],
    "subsample": [0.7, 0.8, 1.0],          # bagging_fraction
    "colsample_bytree": [0.7, 0.8, 1.0],   # feature_fraction
    "reg_lambda": [0.0, 0.1, 1.0]
}


In [4]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=base_lgbm,
    param_distributions=param_dist,
    n_iter=15,                 # 搜 15 组就够了
    scoring="roc_auc",
    cv=cv,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_full, y_full)


Fitting 5 folds for each of 15 candidates, totalling 75 fits
[LightGBM] [Info] Number of positive: 474494, number of negative: 119500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037274 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2157
[LightGBM] [Info] Number of data points in the train set: 593994, number of used features: 67
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.798820 -> initscore=1.378933
[LightGBM] [Info] Start training from score 1.378933


In [5]:
print("Best CV AUC:", random_search.best_score_)
print("Best params:", random_search.best_params_)

best_lgbm = random_search.best_estimator_

# 再在原来的 valid 上测一次，和你之前的 0.92155 对比
best_lgbm.fit(X_train, y_train)
y_valid_pred = best_lgbm.predict_proba(X_valid)[:, 1]
auc_valid = roc_auc_score(y_valid, y_valid_pred)
print("Tuned LGBM Valid AUC:", auc_valid)


Best CV AUC: 0.9216654109245482
Best params: {'subsample': 0.7, 'reg_lambda': 1.0, 'num_leaves': 127, 'min_child_samples': 100, 'learning_rate': 0.03, 'colsample_bytree': 0.7}
[LightGBM] [Info] Number of positive: 379595, number of negative: 95600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026458 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2157
[LightGBM] [Info] Number of data points in the train set: 475195, number of used features: 67
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.798819 -> initscore=1.378932
[LightGBM] [Info] Start training from score 1.378932
Tuned LGBM Valid AUC: 0.9212416936023725
