
# Rank-Stack Training Notebook (timefix)
此 Notebook 讀取 `preprocess_alert_timefix.py` 產生的 `features_train.csv / features_pred.csv / features_meta.json`，
完成：
- LGBM 分類器 (K-fold) 產生 OOF 機率
- Platt (Logistic Regression) 機率校準
- 邊界精修 Ranker（以窄帶資料再訓練一個分類器，bagging 平均）
- 以 `final = α * meta_cal + (1-α) * rank_score` 融合，輸出：
  - `acct_predict_out_stack.csv`（連續分數與中間欄位，用於 k-sweep）
  - `submit_stack_topk.csv`（二值提交檔）

> 若尚未執行資料前處理，請先執行：`python preprocess_alert_timefix.py`


In [1]:

# =============== 可調參數 ===============
FEATURES_TRAIN = "features_train.csv"
FEATURES_PRED  = "features_pred.csv"
FEATURES_META  = "features_meta.json"

SUBMIT_CSV     = "submit_stack_topk.csv"
SCORES_CSV     = "acct_predict_out_stack.csv"

# Public ACC0 (用來推估 Top-K 比例)：若 ACC0_PUBLIC = 0.933, 則 rate ≈ 0.067
ACC0_PUBLIC    = 0.933
RATE           = max(0.001, 1.0 - ACC0_PUBLIC)   # 目標 Top-K 比例
K_SHIFT        = 0.0000                           # 可微調 k 比例（例如 +0.0015 / -0.0015）

# Ranker 訓練的 OOF 機率邊界帶（分位）
BAND           = (0.03, 0.15)     # 窄一點：可試 (0.04, 0.12)；寬一點：可試 (0.02, 0.18)
RANKER_SEEDS   = [42, 73, 101, 137]  # bagging 平均

# 融合權重：final = α * meta_cal + (1-α) * rank_score
ALPHA          = 0.65

# PU 設定（未標示樣本的權重）
GAMMA_FIXED    = 0.30

# 交叉驗證
N_FOLDS        = 5
RANDOM_STATE   = 42


In [2]:

import json, gc, math
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

# LightGBM
try:
    import lightgbm as lgb
except Exception as e:
    print("[INFO] lightgbm 未安裝，嘗試安裝中...")
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lightgbm", "-q"])
    import lightgbm as lgb

def fit_lgbm_classifier(X, y, sample_weight=None, valid=None, seed=42):
    params = dict(
        objective="binary",
        boosting_type="gbdt",
        n_estimators=2000,
        learning_rate=0.03,
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=-1,
        num_leaves=64,
        min_child_samples=40,
        reg_alpha=0.0,
        reg_lambda=1.0,
        random_state=seed,
        n_jobs=-1
    )
    if valid is not None:
        Xv, yv = valid
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X, y,
            sample_weight=sample_weight,
            eval_set=[(Xv, yv)],
            eval_metric="auc",
            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
        )
    else:
        model = lgb.LGBMClassifier(**params)
        model.fit(X, y, sample_weight=sample_weight)
    return model

def k_from_rate(n, rate, k_shift=0.0):
    rate2 = max(0.0, min(1.0, rate + k_shift))
    return int(round(n * rate2))

def safe_rank_series(x):
    # 返回 0-1 區間的秩（越大越前）
    r = pd.Series(x).rank(method="average", pct=True).fillna(0.0).values
    return r


In [3]:

# 讀取 features
meta = json.load(open(FEATURES_META, "r", encoding="utf-8"))
feature_cols = meta.get("feature_cols", [])
if not feature_cols:
    # fallback：從 train 檔自動取（排除 acct/label/is_unlabeled）
    df_train_head = pd.read_csv(FEATURES_TRAIN, nrows=1)
    feature_cols = [c for c in df_train_head.columns if c not in ("acct","label","is_unlabeled")]

train = pd.read_csv(FEATURES_TRAIN)
pred  = pd.read_csv(FEATURES_PRED)

assert "acct" in train.columns and "label" in train.columns and "is_unlabeled" in train.columns
for c in feature_cols:
    if c not in pred.columns:
        pred[c] = 0.0
pred = pred[["acct"] + feature_cols]

X = train[feature_cols].copy()
y = train["label"].astype(int).values
u = train["is_unlabeled"].astype(int).values

X_te = pred[feature_cols].copy()
acct_te = pred["acct"].astype(str).values

print("Train shape:", X.shape, "Test shape:", X_te.shape, "Features:", len(feature_cols))


Train shape: (21084, 49) Test shape: (4780, 49) Features: 49


In [4]:

# PU 權重
w = np.where(y==1, 1.0, GAMMA_FIXED).astype("float32")

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
oof = np.zeros(len(X), dtype="float32")
models = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
    Xtr, ytr = X.iloc[tr_idx], y[tr_idx]
    Xva, yva = X.iloc[va_idx], y[va_idx]
    wtr = w[tr_idx]

    model = fit_lgbm_classifier(Xtr, ytr, sample_weight=wtr, valid=(Xva, yva), seed=RANDOM_STATE+fold)
    models.append(model)
    oof[va_idx] = model.predict_proba(Xva)[:,1].astype("float32")
    print(f"[FOLD {fold}] AUC={roc_auc_score(yva, oof[va_idx]):.5f}, best_iter={model.best_iteration_}")

auc_oof = roc_auc_score(y, oof)
print(f"[OOF] AUC={auc_oof:.5f}")


[LightGBM] [Info] Number of positive: 803, number of negative: 16064
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002660 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9232
[LightGBM] [Info] Number of data points in the train set: 16867, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142827 -> initscore=-1.792009
[LightGBM] [Info] Start training from score -1.792009
[FOLD 1] AUC=0.96080, best_iter=255
[LightGBM] [Info] Number of positive: 803, number of negative: 16064
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003232 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9229
[LightGBM] [Info] Number of data points in the train set: 16867, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142827 -> initscore=-1.792009
[LightGBM] [Info] Start training f

In [5]:

# Platt 校準：用 OOF -> y 訓練 LR，對 Test 以 Kfold 模型預測平均後再做校準
platt = LogisticRegression(max_iter=1000, solver="lbfgs")
platt.fit(oof.reshape(-1,1), y)
meta_cal_oof = platt.predict_proba(oof.reshape(-1,1))[:,1]

# 推論端：對 test 先平均機率，再套用相同校準
pred_raw_list = [m.predict_proba(X_te)[:,1] for m in models]
meta_raw_te = np.mean(pred_raw_list, axis=0).astype("float32")
meta_cal_te  = platt.predict_proba(meta_raw_te.reshape(-1,1))[:,1]

print(f"[CAL] OOF after Platt AUC={roc_auc_score(y, meta_cal_oof):.5f}")


[CAL] OOF after Platt AUC=0.95496


In [6]:

# 以 OOF 機率的分位帶（BAND）擷取邊界資料，再訓練一個精修分類器作為 Ranker
q_lo, q_hi = np.quantile(oof, BAND[0]), np.quantile(oof, BAND[1])
band_mask = (oof >= q_lo) & (oof <= q_hi)

X_band = X.loc[band_mask].reset_index(drop=True)
y_band = y[band_mask]

print(f"[BAND] size={X_band.shape[0]}  ({BAND[0]*100:.1f}%~{BAND[1]*100:.1f}%)")

ranker_models = []
for i, seed in enumerate(RANKER_SEEDS, 1):
    m = fit_lgbm_classifier(X_band, y_band, sample_weight=None, valid=None, seed=seed)
    ranker_models.append(m)
    print(f"[RANKER] seed={seed}  n_estimators={m.n_estimators_ if hasattr(m,'n_estimators_') else 'unk'}")

# 產生訓練端的 rank_score（供檢查），與測試端 rank_score
rank_score_oof = np.zeros(len(X), dtype="float32")
if len(X_band) > 0:
    # 只對同一個分位帶內的樣本給 rank_score，其它地方用 meta_cal 補
    rank_score_oof[band_mask] = np.mean([m.predict_proba(X_band)[:,1] for m in ranker_models], axis=0).astype("float32")

# 測試端 rank_score：直接對全測試集做預測（ranker 在實戰上會對全部給分，這裡沿用）
rank_score_te = np.mean([m.predict_proba(X_te)[:,1] for m in ranker_models], axis=0).astype("float32")


[BAND] size=2530  (3.0%~15.0%)
[LightGBM] [Info] Number of positive: 6, number of negative: 2524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000934 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7117
[LightGBM] [Info] Number of data points in the train set: 2530, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002372 -> initscore=-6.041841
[LightGBM] [Info] Start training from score -6.041841
[RANKER] seed=42  n_estimators=2000
[LightGBM] [Info] Number of positive: 6, number of negative: 2524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000917 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7117
[LightGBM] [Info] Number of data points in the train set: 2530, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002372 -> initscore=-6.041841
[LightGBM] 

In [7]:

# 融合：final = α * meta_cal + (1-α) * rank_score
# 訓練端（檢查用）
final_oof = ALPHA * meta_cal_oof + (1-ALPHA) * np.where(rank_score_oof>0, rank_score_oof, meta_cal_oof)
print(f"[FINAL] OOF AUC={roc_auc_score(y, final_oof):.5f}")

# 測試端
final_te = ALPHA * meta_cal_te + (1-ALPHA) * rank_score_te

# 產出 k 與提交檔
n_test = len(final_te)
k = k_from_rate(n_test, RATE, K_SHIFT)
k = max(1, min(n_test, k))  # 保底

order = np.argsort(-final_te)  # 由大到小
topk_idx = set(order[:k])

submit = pd.DataFrame({
    "acct": acct_te,
    "predict": [1 if i in topk_idx else 0 for i in range(n_test)]
})
submit.to_csv(SUBMIT_CSV, index=False, encoding="utf-8-sig")

scores = pd.DataFrame({
    "acct": acct_te,
    "final_score": final_te,
    "meta_cal": meta_cal_te,
    "rank_score": rank_score_te,
})
scores.sort_values("final_score", ascending=False, inplace=True)
scores.to_csv(SCORES_CSV, index=False, encoding="utf-8-sig")

print(f"[OK] Saved: {SUBMIT_CSV} (k={k}, rate={k/n_test*100:.4f}%)")
print(f"[OK] Saved: {SCORES_CSV} (for k-sweep / 調 k 用)")


[FINAL] OOF AUC=0.96019
[OK] Saved: submit_stack_topk.csv (k=320, rate=6.6946%)
[OK] Saved: acct_predict_out_stack.csv (for k-sweep / 調 k 用)


In [8]:
# %% [markdown]
# 1) 用 features_train.csv 訓練輕量 LGBM，對 features_pred.csv 打分（new_score）
# 2) 與 acct_predict_out_stack.csv (old_score) 合併
# 3) 公榜導向 rerank（雙排名候選 + 候選內舊分數決勝）
# 4) 輸出 acct,label 可直接上傳
# ※ 依你目錄結構自動處理欄位對齊，不吃 is_unlabeled

import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier

# ---------- 路徑設定（照你目錄檔名） ----------
TRAIN_CSV   = "features_train.csv"          # 有 acct,label,is_unlabeled,... 特徵
PRED_CSV    = "features_pred.csv"           # 有 acct,... 特徵（無 label）
OLD_RANK_CSV= "acct_predict_out_stack.csv"  # 舊分數（final_score/meta_cal/rank_score）

# ---------- 讀檔 ----------
train = pd.read_csv(TRAIN_CSV)
pred  = pd.read_csv(PRED_CSV)
old   = pd.read_csv(OLD_RANK_CSV)

# ---------- 擇欄（扣掉 acct/label/is_unlabeled，僅取數值/布林欄位；與 pred 取交集） ----------
drop_cols = {"acct","label","is_unlabeled"}
num_cols_train = [c for c in train.columns if c not in drop_cols]
num_cols_pred  = [c for c in pred.columns  if c not in {"acct"}]

FEATS = sorted(list(set(num_cols_train) & set(num_cols_pred)))
# 只保留數值/布林；非數值嘗試轉 float，轉不動就丟掉
ok_feats = []
for c in FEATS:
    if np.issubdtype(train[c].dtype, np.number) and np.issubdtype(pred[c].dtype, np.number):
        ok_feats.append(c)
    else:
        # 嘗試轉型
        try:
            train[c] = pd.to_numeric(train[c], errors="coerce")
            pred[c]  = pd.to_numeric(pred[c],  errors="coerce")
            ok_feats.append(c)
        except Exception:
            pass
FEATS = ok_feats

# 缺值處理：用 0（簡單、穩定）
train[FEATS] = train[FEATS].fillna(0)
pred[FEATS]  = pred[FEATS].fillna(0)

print(f"[INFO] train rows={len(train)}, pred rows={len(pred)}, feats={len(FEATS)}")

# ---------- 訓練輕量 LGBM（穩定版參數） ----------
y = train["label"].astype(int).values
lgbm = LGBMClassifier(
    objective="binary",
    learning_rate=0.03,
    n_estimators=1200,
    num_leaves=64,
    feature_fraction=0.85,
    bagging_fraction=0.85,
    bagging_freq=1,
    min_data_in_leaf=40,
    lambda_l1=1.0,
    lambda_l2=2.0,
    random_state=42,
    verbose=-1
)
lgbm.fit(train[FEATS], y)

# ---------- 對 pred 打分：new_score ----------
pred_new = pred[["acct"]].copy()
pred_new["new_score"] = lgbm.predict_proba(pred[FEATS])[:,1]
pred_new.to_csv("pred_new_scores.csv", index=False)
print("[OK] Saved new scores -> pred_new_scores.csv")

# ---------- 整合舊分數：old_score 優先取 final_score / meta_cal / rank_score ----------
old_score_col = None
for col in ["final_score","meta_cal","rank_score","score","pred"]:
    if col in old.columns:
        old_score_col = col
        break
if old_score_col is None:
    raise ValueError(f"{OLD_RANK_CSV} 找不到舊分數欄位（final_score/meta_cal/rank_score/score/pred）")

old = old[["acct", old_score_col]].rename(columns={old_score_col:"old_score"})

df = old.merge(pred_new, on="acct", how="inner")
print(f"[INFO] merged rows = {len(df)}")

# ---------- 雙排名候選 + 舊分數決勝（公榜穩定） ----------
df["rank_old"] = df["old_score"].rank(method="first", ascending=False)
df["rank_new"] = df["new_score"].rank(method="first", ascending=False)
df["double_rank"] = (df["rank_old"] + df["rank_new"]) / 2.0

# 候選池：可調 1500~2200；太小會漏召回，太大會混入噪音
POOL = 1800
cand = df.sort_values("double_rank").head(POOL)
cand = cand.sort_values("old_score", ascending=False)  # 候選內用舊分數決勝

# K：先試 320 (=6.7%)；若分數曲線夠陡可再試 335 (=7.0%)
K = 320
topk = cand.head(K)[["acct"]].copy()
topk["label"] = 1

submit = df[["acct"]].merge(topk, on="acct", how="left").fillna(0).astype({"label":int})
rate = submit["label"].mean()*100
out_name = f"submit_public_rerank_k{K}.csv"
submit.to_csv(out_name, index=False)
print(f"[OK] Saved: {out_name}  (rate={rate:.3f}%)")

# 也輸出 7.0% 的版本，備選一次投放
K2 = 335
topk2 = cand.head(K2)[["acct"]].copy(); topk2["label"] = 1
submit2 = df[["acct"]].merge(topk2, on="acct", how="left").fillna(0).astype({"label":int})
out_name2 = f"submit_public_rerank_k{K2}.csv"
submit2.to_csv(out_name2, index=False)
print(f"[OK] Saved: {out_name2} (rate={submit2['label'].mean()*100:.3f}%)")

# 檢視前 10
display(submit.head(10))


[INFO] train rows=21084, pred rows=4780, feats=49
[OK] Saved new scores -> pred_new_scores.csv
[INFO] merged rows = 4780
[OK] Saved: submit_public_rerank_k320.csv  (rate=6.695%)
[OK] Saved: submit_public_rerank_k335.csv (rate=7.008%)


Unnamed: 0,acct,label
0,02abfd04fe2f90c9eb255d4502d5754f9173e83ee0f527...,1
1,48f5b877ae5e2d712f18c2300cb63cec325db0e6388882...,1
2,2ba041eb570dc5b958a97ffa4fd7bf8cb85b11fbb536d0...,1
3,0ff3d74ec0506b90e6e4ba931861d963c68d1dec7cc844...,1
4,2c8d96265fcef2eeab1185699e0a2f160f67bd19b4bf17...,1
5,fd3b76089b4c103a85c0ceb4beb5d5843177713391178b...,1
6,10d11642e5ae91b7c1cbb72491f8700ab393af3ecca0ac...,1
7,a5b7f49ca9020b63a8bf98d19443d9d1748882f6c53adf...,1
8,0f371328f8937236fd6be131149356a0b7594713dd38c3...,1
9,c5cb6d3a26c32f0ff64f74f0204c816c43c0e3a2ed4e92...,1
