In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from scipy.sparse import load_npz, vstack


In [2]:
# Load the pre-processed sparse feature matrices for the train, validation and test splits.
X_train = load_npz("../data_process/processed_fe/X_train_proc_fe.npz")
X_valid = load_npz("../data_process/processed_fe/X_valid_proc_fe.npz")
X_test  = load_npz("../data_process/processed_fe/X_test_proc_fe.npz")

# Load the labels for the two labelled splits (the test split has none).
y_train = np.load("../data_process/processed_fe/y_train_fe.npy")
y_valid = np.load("../data_process/processed_fe/y_valid_fe.npy")

# Fail fast on inconsistent artefacts: a stale or mismatched file on disk would
# otherwise surface much later as a confusing training or prediction error.
assert X_train.shape[0] == len(y_train), "train features/labels row count mismatch"
assert X_valid.shape[0] == len(y_valid), "valid features/labels row count mismatch"
assert X_train.shape[1] == X_valid.shape[1] == X_test.shape[1], "splits disagree on feature count"

print("Train:", X_train.shape, " Valid:", X_valid.shape, " Test:", X_test.shape)


Train: (475195, 67)  Valid: (118799, 67)  Test: (254569, 67)


In [3]:
# Merge the train and validation splits row-wise so the final model is fit on
# every labelled row. (`vstack` is imported in the top import cell.)
X_full = vstack([X_train, X_valid])
y_full = np.concatenate([y_train, y_valid])

# Guard against a feature/label misalignment before any training happens.
assert X_full.shape[0] == len(y_full), "stacked features and labels are misaligned"

print("Full training set shape:", X_full.shape)


Full training set shape: (593994, 67)


In [4]:
# Wrap the combined feature matrix and labels in a LightGBM Dataset for training.
train_full = lgb.Dataset(data=X_full, label=y_full)


In [5]:
# Hyper-parameters passed to lgb.train for the final binary classifier.
params = dict(
    # Task definition and the metric reported during training.
    objective="binary",
    metric="auc",
    # Boosting step size and per-tree leaf budget.
    learning_rate=0.03,
    num_leaves=31,
    # Feature and row subsampling settings.
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    # Logging verbosity of the LightGBM library.
    verbose=-1,
)


In [8]:
# Print the evaluation metric every 50 boosting rounds.
# log_evaluation only reports on datasets passed through `valid_sets`; without
# one the callback is silent. The validation split has already been merged into
# train_full, so the training set itself is monitored here. The logged value is
# therefore a training-fit metric, not an estimate of generalisation.
callbacks = [
    lgb.log_evaluation(period=50)
]

model_final = lgb.train(
    params,
    train_full,
    num_boost_round=500,
    valid_sets=[train_full],
    valid_names=["train"],
    callbacks=callbacks,
)


In [9]:
# Score the test features with the final model, then preview the first ten predictions.
test_pred = model_final.predict(data=X_test)
print(test_pred[0:10])


[0.92283028 0.98161102 0.464886   0.91750101 0.95807182 0.97547377
 0.98467939 0.97151898 0.93653807 0.00403346]


In [10]:
# Start from the competition's sample submission so ids and column layout are kept.
sample = pd.read_csv(filepath_or_buffer="../data_process/data/sample_submission.csv")

# Show the template before it is overwritten with predictions.
print(sample.head())

# The target is taken to be the last column of the template
# (e.g. loan_paid_back or target); the name is read from the file rather than hard-coded.
target_col = sample.columns[-1]

# Replace the placeholder target values with the model's predictions.
sample[target_col] = test_pred

# Persist the submission without the DataFrame index.
sample.to_csv(path_or_buf="submission_lgbm.csv", index=False)

print("Saved submission_lgbm.csv!")


       id  loan_paid_back
0  593994               0
1  593995               0
2  593996               0
3  593997               0
4  593998               0
Saved submission_lgbm.csv!
