初始数据：用原始预处理特征训练逻辑回归基线

In [1]:
from scipy import sparse
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Baseline run: read the sparse feature matrices and the label arrays
# from the "processed" directory (relative to the working directory).
X_train, X_valid, X_test = (
    sparse.load_npz("processed/X_train_proc.npz"),
    sparse.load_npz("processed/X_valid_proc.npz"),
    sparse.load_npz("processed/X_test_proc.npz"),
)
y_train, y_valid = (
    np.load("processed/y_train.npy"),
    np.load("processed/y_valid.npy"),
)

# fit() returns the estimator itself, so construction and training are chained.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of clf.classes_[1];
# that score is what ROC AUC is computed from on the validation split.
y_valid_pred = clf.predict_proba(X_valid)[:, 1]
print("Valid AUC:", roc_auc_score(y_valid, y_valid_pred))

Valid AUC: 0.9103415002631514


基于这些列再造一些比率/非线性特征

In [2]:
from scipy import sparse
import numpy as np
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Feature-engineered artifacts are resolved relative to the current
# working directory, under "processed_fe".
BASE_DIR = Path.cwd()
FE_DIR = BASE_DIR / "processed_fe"

# Sparse feature matrices for each split; X_test is loaded here but not
# scored in this cell.
X_train = sparse.load_npz(FE_DIR / "X_train_proc_fe.npz")
X_valid = sparse.load_npz(FE_DIR / "X_valid_proc_fe.npz")
X_test  = sparse.load_npz(FE_DIR / "X_test_proc_fe.npz")

# Label arrays matching the train / validation matrices.
y_train = np.load(FE_DIR / "y_train_fe.npy")
y_valid = np.load(FE_DIR / "y_valid_fe.npy")

# n_jobs is intentionally not passed: LogisticRegression only uses it to
# parallelize across classes in one-vs-rest mode, and this is a single
# binary fit (AUC below is computed from the [:, 1] probability column),
# so n_jobs=-1 added no speed-up. Omitting it also keeps the estimator
# settings identical to the baseline run this score is compared against.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Probability of clf.classes_[1] on the validation split, scored with ROC AUC.
y_valid_prob = clf.predict_proba(X_valid)[:, 1]
auc = roc_auc_score(y_valid, y_valid_prob)
print("Valid AUC with FE:", auc)

Valid AUC with FE: 0.9103856476171693


试试只保留 DTI / loan_to_income_ratio / credit_score / purpose 类特征

In [1]:
from pathlib import Path
from scipy import sparse
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Reduced-feature artifacts are resolved relative to the current working
# directory, under "processed_small".
BASE_DIR = Path.cwd()
SMALL_DIR = BASE_DIR / "processed_small"

# Sparse feature matrices for each split; X_test is loaded here but not
# scored in this cell.
X_train = sparse.load_npz(SMALL_DIR / "X_train_proc_small.npz")
X_valid = sparse.load_npz(SMALL_DIR / "X_valid_proc_small.npz")
X_test  = sparse.load_npz(SMALL_DIR / "X_test_proc_small.npz")

# Label arrays matching the train / validation matrices.
y_train = np.load(SMALL_DIR / "y_train_small.npy")
y_valid = np.load(SMALL_DIR / "y_valid_small.npy")

# n_jobs is intentionally not passed: LogisticRegression only uses it to
# parallelize across classes in one-vs-rest mode, and this is a single
# binary fit (AUC below is computed from the [:, 1] probability column),
# so n_jobs=-1 added no speed-up. Omitting it also keeps the estimator
# settings identical across the feature-set experiments being compared.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Probability of clf.classes_[1] on the validation split, scored with ROC AUC.
y_valid_prob = clf.predict_proba(X_valid)[:, 1]
auc_small = roc_auc_score(y_valid, y_valid_prob)
print("Valid AUC (small feature set):", auc_small)

Valid AUC (small feature set): 0.7745968096184708
