In [1]:
# 05_train_model_zoo.py — Compare your classifier list (with MultinomialNB) on 5-fold CV

import os, json, gc, warnings
import numpy as np
import pandas as pd

from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
from sklearn.base import clone

warnings.filterwarnings("ignore")

# ===================== PATHS & CONFIG =====================
BASE_OUT   = "/Users/tree/Projects/recommemdation_bank/outputs"
MM_JSONL   = f"{BASE_OUT}/json/mm/json_balanced_mm.jsonl"
LABELS_PAR = f"{BASE_OUT}/balanced/labels_mm_folded.parquet"

RESULTS_DIR = f"{BASE_OUT}/metrics/model_zoo"
os.makedirs(RESULTS_DIR, exist_ok=True)

FOLDS = [0,1,2,3,4]
RANDOM_STATE = 42

# Sparse TF-IDF
WORD_VECT = dict(
    max_features=200_000,
    ngram_range=(1,2),
    min_df=2, max_df=0.995,
    lowercase=True,
    sublinear_tf=True,
    preprocessor=lambda s: s.replace('.', '_')  # keep a6.24 → a6_24
)
CHAR_VECT = dict(
    analyzer="char_wb",
    ngram_range=(3,5),
    min_df=2, max_df=1.0,
    lowercase=True
)

SVD_DIMS = 512  # dense projection size for dense-only models

# ===================== HELPERS =====================
def load_mm_texts(path):
    rows=[]
    with open(path) as f:
        for line in f:
            r = json.loads(line)
            rows.append((str(r["client_id"]), r["text"]))
    return pd.DataFrame(rows, columns=["client_id","text"])

def ndcg_at_k(y_true_row, y_score_row, k):
    k = min(k, len(y_true_row))
    if k <= 0: return np.nan
    order = np.argsort(-y_score_row)
    rel_k = np.take(y_true_row, order[:k])
    discounts = 1.0 / np.log2(np.arange(2, k+2))
    dcg  = np.sum((2**rel_k - 1) * discounts)
    ideal = np.sort(y_true_row)[::-1][:k]
    idcg = np.sum((2**ideal - 1) * discounts)
    return np.nan if idcg == 0 else dcg / idcg

def hit_at_k(y_true_row, y_score_row, k):
    if y_true_row.sum() == 0: return np.nan
    order = np.argsort(-y_score_row)[:k]
    return 1.0 if y_true_row[order].sum() > 0 else 0.0

def safe_auc(y_true, y_prob):
    y = np.asarray(y_true)
    return np.nan if len(np.unique(y)) < 2 else roc_auc_score(y, y_prob)

def safe_ap(y_true, y_prob):
    y = np.asarray(y_true)
    return np.nan if len(np.unique(y)) < 2 else average_precision_score(y, y_prob)

def safe_logloss(y_true, y_prob):
    y = np.asarray(y_true)
    p = np.clip(y_prob, 1e-6, 1-1e-6)
    return np.nan if len(np.unique(y)) < 2 else log_loss(y, p, labels=[0,1])

def build_features(train_text, valid_text):
    # sparse: word + char TF-IDF
    vec_w = TfidfVectorizer(**WORD_VECT)
    vec_c = TfidfVectorizer(**CHAR_VECT)
    Xtr_w = vec_w.fit_transform(train_text)
    Xva_w = vec_w.transform(valid_text)
    Xtr_c = vec_c.fit_transform(train_text)
    Xva_c = vec_c.transform(valid_text)
    X_tr_sparse = hstack([Xtr_w, Xtr_c], format="csr")
    X_va_sparse = hstack([Xva_w, Xva_c], format="csr")
    # dense: SVD -> Standardize (with_mean=True) for LDA/MLP/SVC/KNN/etc.
    svd = TruncatedSVD(n_components=SVD_DIMS, random_state=RANDOM_STATE)
    X_tr_dense = svd.fit_transform(X_tr_sparse)
    X_va_dense = svd.transform(X_va_sparse)
    scaler = StandardScaler(with_mean=True)
    X_tr_dense = scaler.fit_transform(X_tr_dense)
    X_va_dense = scaler.transform(X_va_dense)
    return (X_tr_sparse, X_va_sparse), (X_tr_dense, X_va_dense)

def class_weights(y):
    pos = (y == 1).sum()
    neg = (y == 0).sum()
    if pos == 0 or neg == 0:
        return None
    w_pos = neg / max(pos, 1)
    return np.where(y == 1, w_pos, 1.0)

# ===================== CLASSIFIERS =====================
# Your list, with GaussianNB → MultinomialNB
sparse_models = {
    "Logistic Regression": LogisticRegression(
        solver="liblinear", C=1.0, class_weight="balanced", max_iter=300, random_state=RANDOM_STATE
    ),
    "MultinomialNB": MultinomialNB(alpha=1.0)
}

dense_models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostClassifier(random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(kernel="rbf", probability=True, random_state=RANDOM_STATE),
    "LDA": LinearDiscriminantAnalysis(),
    "MLP (Neural Network)": MLPClassifier(max_iter=1000, random_state=RANDOM_STATE)
}

# ===================== LOAD =====================
texts  = load_mm_texts(MM_JSONL)
labels = pd.read_parquet(LABELS_PAR)
labels["client_id"] = labels["client_id"].astype(str)
df = labels.merge(texts, on="client_id", how="inner")

target_cols = [c for c in df.columns if c.startswith("target_")]
assert "fold" in df.columns
print("Data rows:", len(df), "| Targets:", target_cols)
print("Fold counts:", df["fold"].value_counts().sort_index().to_dict())

# ===================== TRAIN / EVAL =====================
all_metrics_rows = []
all_rank_rows    = []

for fold in FOLDS:
    train = df[df.fold != fold].reset_index(drop=True)
    valid = df[df.fold == fold].reset_index(drop=True)
    print(f"\n=== Fold {fold} | train={len(train)} valid={len(valid)} ===")

    (X_tr_sparse, X_va_sparse), (X_tr_dense, X_va_dense) = build_features(train["text"], valid["text"])
    Y_tr = train[target_cols].values.astype(int)
    Y_va = valid[target_cols].values.astype(int)

    # ---- sparse-friendly ----
    for mname, model in sparse_models.items():
        S_va = np.zeros_like(Y_va, dtype=float)
        for ti, t in enumerate(target_cols):
            y_tr = Y_tr[:, ti]
            weights = class_weights(y_tr)

            clf = clone(model)
            # MultinomialNB & LR both handle sparse; pass sample_weight if supported
            try:
                clf.fit(X_tr_sparse, y_tr, sample_weight=weights)
            except TypeError:
                clf.fit(X_tr_sparse, y_tr)

            if hasattr(clf, "predict_proba"):
                pr = clf.predict_proba(X_va_sparse)[:, 1]
            elif hasattr(clf, "decision_function"):
                pr = 1.0 / (1.0 + np.exp(-clf.decision_function(X_va_sparse)))
            else:
                pr = clf.predict(X_va_sparse).astype(float)

            S_va[:, ti] = pr
            all_metrics_rows.append({
                "model": mname, "fold": fold, "target": t,
                "AUC": safe_auc(Y_va[:, ti], pr),
                "AP":  safe_ap(Y_va[:, ti], pr),
                "LogLoss": safe_logloss(Y_va[:, ti], pr),
                "pos_valid": int(Y_va[:, ti].sum()), "n_valid": int(len(Y_va))
            })

        # ranking
        hits1, hits3, ndcg3, ndcg5 = [], [], [], []
        for i in range(len(valid)):
            y_row, s_row = Y_va[i], S_va[i]
            if y_row.sum() == 0: continue
            hits1.append(hit_at_k(y_row, s_row, 1))
            hits3.append(hit_at_k(y_row, s_row, 3))
            ndcg3.append(ndcg_at_k(y_row, s_row, 3))
            ndcg5.append(ndcg_at_k(y_row, s_row, 5))
        all_rank_rows.append({
            "model": mname, "fold": fold,
            "users_eval": int(len(hits1)),
            "Hit@1": float(np.nanmean(hits1)) if hits1 else np.nan,
            "Hit@3": float(np.nanmean(hits3)) if hits3 else np.nan,
            "NDCG@3": float(np.nanmean(ndcg3)) if ndcg3 else np.nan,
            "NDCG@5": float(np.nanmean(ndcg5)) if ndcg5 else np.nan,
        })

    # ---- dense-only ----
    for mname, model in dense_models.items():
        S_va = np.zeros_like(Y_va, dtype=float)
        for ti, t in enumerate(target_cols):
            y_tr = Y_tr[:, ti]
            weights = class_weights(y_tr)
            clf = clone(model)

            try:
                clf.fit(X_tr_dense, y_tr, sample_weight=weights)
            except TypeError:
                clf.fit(X_tr_dense, y_tr)

            if hasattr(clf, "predict_proba"):
                pr = clf.predict_proba(X_va_dense)[:, 1]
            elif hasattr(clf, "decision_function"):
                dfc = clf.decision_function(X_va_dense)
                pr  = 1.0 / (1.0 + np.exp(-dfc))
            else:
                pr = clf.predict(X_va_dense).astype(float)

            S_va[:, ti] = pr
            all_metrics_rows.append({
                "model": mname, "fold": fold, "target": t,
                "AUC": safe_auc(Y_va[:, ti], pr),
                "AP":  safe_ap(Y_va[:, ti], pr),
                "LogLoss": safe_logloss(Y_va[:, ti], pr),
                "pos_valid": int(Y_va[:, ti].sum()), "n_valid": int(len(Y_va))
            })

        hits1, hits3, ndcg3, ndcg5 = [], [], [], []
        for i in range(len(valid)):
            y_row, s_row = Y_va[i], S_va[i]
            if y_row.sum() == 0: continue
            hits1.append(hit_at_k(y_row, s_row, 1))
            hits3.append(hit_at_k(y_row, s_row, 3))
            ndcg3.append(ndcg_at_k(y_row, s_row, 3))
            ndcg5.append(ndcg_at_k(y_row, s_row, 5))
        all_rank_rows.append({
            "model": mname, "fold": fold,
            "users_eval": int(len(hits1)),
            "Hit@1": float(np.nanmean(hits1)) if hits1 else np.nan,
            "Hit@3": float(np.nanmean(hits3)) if hits3 else np.nan,
            "NDCG@3": float(np.nanmean(ndcg3)) if ndcg3 else np.nan,
            "NDCG@5": float(np.nanmean(ndcg5)) if ndcg5 else np.nan,
        })

    # free memory
    del X_tr_sparse, X_va_sparse, X_tr_dense, X_va_dense, Y_tr, Y_va
    gc.collect()

# ===================== SUMMARIZE & SAVE =====================
metrics_df = pd.DataFrame(all_metrics_rows)
rank_df    = pd.DataFrame(all_rank_rows)

summary_per_model_target = (metrics_df
    .groupby(["model","target"], as_index=False)
    .agg(AUC_mean=("AUC","mean"), AP_mean=("AP","mean"), LL_mean=("LogLoss","mean"),
         pos_valid=("pos_valid","sum"), n_valid=("n_valid","sum"))
)

summary_macro = (summary_per_model_target
    .groupby("model", as_index=False)
    .agg(AUC_macro=("AUC_mean","mean"),
         AP_macro=("AP_mean","mean"),
         LL_macro=("LL_mean","mean"))
)

summary_rank = (rank_df
    .groupby("model", as_index=False)
    .agg(users_eval=("users_eval","sum"),
         Hit1=("Hit@1","mean"), Hit3=("Hit@3","mean"),
         NDCG3=("NDCG@3","mean"), NDCG5=("NDCG@5","mean"))
)

print("\n=== Macro (avg over targets) ===")
print(summary_macro.sort_values("AUC_macro", ascending=False).round(4).to_string(index=False))

print("\n=== Ranking (avg over folds) ===")
print(summary_rank.sort_values("Hit3", ascending=False).round(4).to_string(index=False))

# Save
metrics_df.to_csv(os.path.join(RESULTS_DIR, "fold_metrics_per_target.csv"), index=False)
rank_df.to_csv(os.path.join(RESULTS_DIR, "fold_metrics_ranking.csv"), index=False)
summary_per_model_target.to_csv(os.path.join(RESULTS_DIR, "summary_per_model_target.csv"), index=False)
summary_macro.to_csv(os.path.join(RESULTS_DIR, "summary_per_model_macro.csv"), index=False)
summary_rank.to_csv(os.path.join(RESULTS_DIR, "summary_per_model_ranking.csv"), index=False)

print("\nSaved results to:", RESULTS_DIR)

Data rows: 2127 | Targets: ['target_1', 'target_2', 'target_3', 'target_4']
Fold counts: {0: 425, 1: 423, 2: 419, 3: 446, 4: 414}

=== Fold 0 | train=1702 valid=425 ===

=== Fold 1 | train=1704 valid=423 ===

=== Fold 2 | train=1708 valid=419 ===

=== Fold 3 | train=1681 valid=446 ===

=== Fold 4 | train=1713 valid=414 ===

=== Macro (avg over targets) ===
                 model  AUC_macro  AP_macro  LL_macro
   Logistic Regression     0.7266    0.3379    0.6711
                   LDA     0.7061    0.3063    0.4321
         Random Forest     0.7027    0.3072    0.3399
         MultinomialNB     0.6950    0.3153    0.9757
     Gradient Boosting     0.6948    0.2917    0.4058
Support Vector Machine     0.6899    0.3146    0.3269
  MLP (Neural Network)     0.6789    0.2951    0.5008
              AdaBoost     0.6490    0.2375    0.6198
   K-Nearest Neighbors     0.5974    0.2022    0.8061
         Decision Tree     0.5625    0.1577    3.2798

=== Ranking (avg over folds) ===
             

In [2]:
# 05_train_lgbm.py — LightGBM per-target with 5-fold CV on sparse TF-IDF features

import os, json, gc, warnings, numpy as np, pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

# ====== CONFIG ======
BASE_OUT   = "/Users/tree/Projects/recommemdation_bank/outputs"
MM_JSONL   = f"{BASE_OUT}/json/mm/json_balanced_mm.jsonl"
LABELS_PAR = f"{BASE_OUT}/balanced/labels_mm_folded.parquet"

METRICS_DIR = f"{BASE_OUT}/metrics/lgbm"
PRED_DIR    = f"{BASE_OUT}/predictions/lgbm"
os.makedirs(METRICS_DIR, exist_ok=True)
os.makedirs(PRED_DIR, exist_ok=True)

FOLDS = [0,1,2,3,4]
RANDOM_STATE = 42

# TF-IDF settings (same spirit as your baseline)
WORD_VECT = dict(
    max_features=200_000,
    ngram_range=(1,2),
    min_df=2, max_df=0.995,
    lowercase=True,
    sublinear_tf=True,
    preprocessor=lambda s: s.replace('.', '_')  # keep a6.24 → a6_24
)
CHAR_VECT = dict(
    analyzer="char_wb",
    ngram_range=(3,5),
    min_df=2, max_df=1.0,
    lowercase=True
)

# LightGBM params
try:
    import lightgbm as lgb
except Exception as e:
    raise SystemExit(
        f"LightGBM not installed: {e}\n"
        "Install with one of:\n"
        "  pip install lightgbm\n"
        "  conda install -c conda-forge lightgbm"
    )

LGBM_KW = dict(
    objective="binary",
    boosting_type="gbdt",
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=0.0,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

# ====== helpers ======
def load_mm_texts(path):
    rows=[]
    with open(path) as f:
        for line in f:
            r = json.loads(line)
            rows.append((str(r["client_id"]), r["text"]))
    return pd.DataFrame(rows, columns=["client_id","text"])

def build_sparse_features(train_text, valid_text):
    vw = TfidfVectorizer(**WORD_VECT)
    vc = TfidfVectorizer(**CHAR_VECT)
    Xtr_w = vw.fit_transform(train_text); Xva_w = vw.transform(valid_text)
    Xtr_c = vc.fit_transform(train_text); Xva_c = vc.transform(valid_text)
    X_tr = hstack([Xtr_w, Xtr_c], format="csr")
    X_va = hstack([Xva_w, Xva_c], format="csr")
    return X_tr, X_va

def class_weights(y):
    pos = (y == 1).sum()
    neg = (y == 0).sum()
    if pos == 0 or neg == 0:
        return None
    w_pos = neg / max(pos, 1)
    return np.where(y == 1, w_pos, 1.0)

def safe_auc(y_true, y_prob):
    y = np.asarray(y_true)
    return np.nan if len(np.unique(y)) < 2 else roc_auc_score(y, y_prob)

def safe_ap(y_true, y_prob):
    y = np.asarray(y_true)
    return np.nan if len(np.unique(y)) < 2 else average_precision_score(y, y_prob)

def safe_logloss(y_true, y_prob):
    y = np.asarray(y_true)
    p = np.clip(y_prob, 1e-6, 1-1e-6)
    return np.nan if len(np.unique(y)) < 2 else log_loss(y, p, labels=[0,1])

def ndcg_at_k(y_true_row, y_score_row, k):
    k = min(k, len(y_true_row))
    if k <= 0: return np.nan
    order = np.argsort(-y_score_row)
    rel_k = np.take(y_true_row, order[:k])
    discounts = 1.0 / np.log2(np.arange(2, k+2))
    dcg  = np.sum((2**rel_k - 1) * discounts)
    ideal = np.sort(y_true_row)[::-1][:k]
    idcg = np.sum((2**ideal - 1) * discounts)
    return np.nan if idcg == 0 else dcg / idcg

def hit_at_k(y_true_row, y_score_row, k):
    if y_true_row.sum() == 0: return np.nan
    order = np.argsort(-y_score_row)[:k]
    return 1.0 if y_true_row[order].sum() > 0 else 0.0

# ====== load data ======
texts  = load_mm_texts(MM_JSONL)
labels = pd.read_parquet(LABELS_PAR)
labels["client_id"] = labels["client_id"].astype(str)
df = labels.merge(texts, on="client_id", how="inner")
target_cols = [c for c in df.columns if c.startswith("target_")]
assert "fold" in df.columns

print("Data rows:", len(df), "| Targets:", target_cols)
print("Fold counts:", df["fold"].value_counts().sort_index().to_dict())

# ====== CV train/eval ======
metric_rows = []
rank_rows   = []
pred_rows   = []   # per-user per-target probs from their own valid fold

for fold in FOLDS:
    train = df[df.fold != fold].reset_index(drop=True)
    valid = df[df.fold == fold].reset_index(drop=True)
    print(f"\n=== Fold {fold} | train={len(train)} valid={len(valid)} ===")

    X_tr, X_va = build_sparse_features(train["text"], valid["text"])
    Y_tr = train[target_cols].values.astype(int)
    Y_va = valid[target_cols].values.astype(int)

    # One LGBM per target (one-vs-rest)
    S_va = np.zeros_like(Y_va, dtype=float)
    for ti, t in enumerate(target_cols):
        y_tr = Y_tr[:, ti]
        weights = class_weights(y_tr)

        clf = lgb.LGBMClassifier(**LGBM_KW, verbosity=-1)
        try:
            clf.fit(X_tr, y_tr, sample_weight=weights)
        except TypeError:
            clf.fit(X_tr, y_tr)

        pr = clf.predict_proba(X_va)[:, 1]
        S_va[:, ti] = pr

        metric_rows.append({
            "fold": fold, "target": t,
            "AUC": safe_auc(Y_va[:, ti], pr),
            "AP":  safe_ap(Y_va[:, ti], pr),
            "LogLoss": safe_logloss(Y_va[:, ti], pr),
            "pos_valid": int(Y_va[:, ti].sum()), "n_valid": int(len(Y_va))
        })

    # save fold preds
    fold_pred = pd.DataFrame(S_va, columns=[f"{t}_prob" for t in target_cols])
    fold_pred.insert(0, "client_id", valid["client_id"].values)
    fold_pred.insert(1, "fold", int(fold))
    pred_rows.append(fold_pred)

    # ranking per user with ≥1 positive
    hits1, hits3, ndcg3, ndcg5 = [], [], [], []
    for i in range(len(valid)):
        y_row, s_row = Y_va[i], S_va[i]
        if y_row.sum() == 0: continue
        hits1.append(hit_at_k(y_row, s_row, 1))
        hits3.append(hit_at_k(y_row, s_row, 3))
        ndcg3.append(ndcg_at_k(y_row, s_row, 3))
        ndcg5.append(ndcg_at_k(y_row, s_row, 5))

    rank_rows.append({
        "fold": fold, "users_eval": int(len(hits1)),
        "Hit@1": float(np.nanmean(hits1)) if hits1 else np.nan,
        "Hit@3": float(np.nanmean(hits3)) if hits3 else np.nan,
        "NDCG@3": float(np.nanmean(ndcg3)) if ndcg3 else np.nan,
        "NDCG@5": float(np.nanmean(ndcg5)) if ndcg5 else np.nan,
    })

    del X_tr, X_va, Y_tr, Y_va
    gc.collect()

# ====== summarize & save ======
metrics_df = pd.DataFrame(metric_rows)
rank_df    = pd.DataFrame(rank_rows)
cv_preds   = pd.concat(pred_rows, ignore_index=True)

summary_target = (metrics_df
                  .groupby("target", as_index=False)
                  .agg(AUC_mean=("AUC","mean"),
                       AP_mean=("AP","mean"),
                       LL_mean=("LogLoss","mean"),
                       pos_valid=("pos_valid","sum"),
                       n_valid=("n_valid","sum")))

macro_row = {
    "target": "MACRO",
    "AUC_mean": summary_target["AUC_mean"].mean(),
    "AP_mean":  summary_target["AP_mean"].mean(),
    "LL_mean":  summary_target["LL_mean"].mean(),
    "pos_valid": int(summary_target["pos_valid"].sum()),
    "n_valid":   int(summary_target["n_valid"].sum()),
}
summary_target = pd.concat([summary_target, pd.DataFrame([macro_row])], ignore_index=True)

summary_rank = (rank_df
                .agg({"users_eval":"sum",
                      "Hit@1":"mean","Hit@3":"mean",
                      "NDCG@3":"mean","NDCG@5":"mean"})
                .to_frame(name="mean").T)

print("\n=== Per-target (mean over folds) ===")
print(summary_target[["target","AUC_mean","AP_mean","LL_mean","pos_valid","n_valid"]]
      .round(4).to_string(index=False))

print("\n=== Ranking metrics (avg over folds) ===")
print(summary_rank.round(4).to_string(index=False))

# write metrics
metrics_df.to_csv(os.path.join(METRICS_DIR, "fold_metrics_per_target.csv"), index=False)
rank_df.to_csv(os.path.join(METRICS_DIR, "fold_metrics_ranking.csv"), index=False)
summary_target.to_csv(os.path.join(METRICS_DIR, "summary_per_target.csv"), index=False)
summary_rank.to_csv(os.path.join(METRICS_DIR, "summary_ranking.csv"), index=False)

# write predictions + top-3
cv_path = os.path.join(PRED_DIR, "lgbm_cv_preds.parquet")
cv_preds.to_parquet(cv_path, index=False)
print("Saved CV preds:", cv_path)

top3_path = os.path.join(PRED_DIR, "lgbm_cv_top3.jsonl")
with open(top3_path, "w") as f:
    for _, r in cv_preds.iterrows():
        probs = [(c.replace("_prob",""), r[c]) for c in cv_preds.columns if c.endswith("_prob")]
        probs.sort(key=lambda x: -x[1])
        top3 = [k for k,_ in probs[:3]]
        f.write(json.dumps({"client_id": r["client_id"], "fold": int(r["fold"]), "top3": top3}) + "\n")
print("Saved Top-3 per user:", top3_path)

print("\nSaved metrics to:", METRICS_DIR)

Data rows: 2127 | Targets: ['target_1', 'target_2', 'target_3', 'target_4']
Fold counts: {0: 425, 1: 423, 2: 419, 3: 446, 4: 414}

=== Fold 0 | train=1702 valid=425 ===

=== Fold 1 | train=1704 valid=423 ===

=== Fold 2 | train=1708 valid=419 ===

=== Fold 3 | train=1681 valid=446 ===

=== Fold 4 | train=1713 valid=414 ===

=== Per-target (mean over folds) ===
  target  AUC_mean  AP_mean  LL_mean  pos_valid  n_valid
target_1    0.6973   0.4105   1.2415        556     2127
target_2    0.7785   0.2134   0.1725         33     2127
target_3    0.6912   0.2150   1.0337        276     2127
target_4    0.7190   0.2952   0.9060        237     2127
   MACRO    0.7215   0.2835   0.8384       1102     8508

=== Ranking metrics (avg over folds) ===
 users_eval  Hit@1  Hit@3  NDCG@3  NDCG@5
     1039.0 0.5515 0.9745  0.7981  0.8096
Saved CV preds: /Users/tree/Projects/recommemdation_bank/outputs/predictions/lgbm/lgbm_cv_preds.parquet
Saved Top-3 per user: /Users/tree/Projects/recommemdation_bank/ou

In [3]:
# 05_svc_oof.py — SVC (RBF) out-of-fold predictions on MM text

import os, json, gc, numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.base import clone
from scipy.sparse import hstack

# ====== CONFIG ======
BASE_OUT   = "/Users/tree/Projects/recommemdation_bank/outputs"
MM_JSONL   = f"{BASE_OUT}/json/mm/json_balanced_mm.jsonl"
LABELS_PAR = f"{BASE_OUT}/balanced/labels_mm_folded.parquet"

PRED_DIR = f"{BASE_OUT}/predictions/svc_rbf"
os.makedirs(PRED_DIR, exist_ok=True)

FOLDS = [0,1,2,3,4]
RANDOM_STATE = 42
SVD_DIMS = 512

# TF-IDF (same spirit as baseline)
WORD_VECT = dict(
    max_features=200_000, ngram_range=(1,2),
    min_df=2, max_df=0.995, lowercase=True, sublinear_tf=True,
    preprocessor=lambda s: s.replace('.', '_')   # keep tokens like a6.24 -> a6_24
)
CHAR_VECT = dict(
    analyzer="char_wb", ngram_range=(3,5),
    min_df=2, max_df=1.0, lowercase=True
)

# ====== helpers ======
def load_mm_texts(path):
    rows=[]
    with open(path) as f:
        for line in f:
            r = json.loads(line)
            rows.append((str(r["client_id"]), r["text"]))
    return pd.DataFrame(rows, columns=["client_id","text"])

def build_features(train_text, valid_text):
    # Sparse TF-IDF (word + char)
    vec_w = TfidfVectorizer(**WORD_VECT)
    vec_c = TfidfVectorizer(**CHAR_VECT)
    Xtr_w = vec_w.fit_transform(train_text); Xva_w = vec_w.transform(valid_text)
    Xtr_c = vec_c.fit_transform(train_text); Xva_c = vec_c.transform(valid_text)
    X_tr_sparse = hstack([Xtr_w, Xtr_c], format="csr")
    X_va_sparse = hstack([Xva_w, Xva_c], format="csr")
    # Dense projection for SVC
    svd = TruncatedSVD(n_components=SVD_DIMS, random_state=RANDOM_STATE)
    X_tr = svd.fit_transform(X_tr_sparse)
    X_va = svd.transform(X_va_sparse)
    scaler = StandardScaler(with_mean=True)
    X_tr = scaler.fit_transform(X_tr)
    X_va = scaler.transform(X_va)
    return X_tr, X_va

def class_weights(y):
    pos = (y == 1).sum()
    neg = (y == 0).sum()
    if pos == 0 or neg == 0:
        return None
    w_pos = neg / max(pos, 1)
    return np.where(y == 1, w_pos, 1.0)

# ====== load ======
texts  = load_mm_texts(MM_JSONL)
labels = pd.read_parquet(LABELS_PAR)
labels["client_id"] = labels["client_id"].astype(str)
df = labels.merge(texts, on="client_id", how="inner")

target_cols = [c for c in df.columns if c.startswith("target_")]
assert "fold" in df.columns
print("Data rows:", len(df), "| Targets:", target_cols)
print("Fold counts:", df["fold"].value_counts().sort_index().to_dict())

# ====== CV OOF preds ======
pred_rows = []

for fold in FOLDS:
    train = df[df.fold != fold].reset_index(drop=True)
    valid = df[df.fold == fold].reset_index(drop=True)
    print(f"\n=== Fold {fold} | train={len(train)} valid={len(valid)} ===")

    X_tr, X_va = build_features(train["text"], valid["text"])
    Y_tr = train[target_cols].values.astype(int)

    S_va = np.zeros((len(valid), len(target_cols)), dtype=float)

    for ti, t in enumerate(target_cols):
        y_tr = Y_tr[:, ti]
        weights = class_weights(y_tr)

        clf = SVC(kernel="rbf", probability=True, random_state=RANDOM_STATE)
        try:
            clf.fit(X_tr, y_tr, sample_weight=weights)
        except TypeError:
            clf.fit(X_tr, y_tr)

        S_va[:, ti] = clf.predict_proba(X_va)[:, 1]

    fold_pred = pd.DataFrame(S_va, columns=[f"{t}_prob" for t in target_cols])
    fold_pred.insert(0, "client_id", valid["client_id"].values)
    fold_pred.insert(1, "fold", int(fold))
    pred_rows.append(fold_pred)

    del X_tr, X_va; gc.collect()

cv_preds = pd.concat(pred_rows, ignore_index=True)

# Save parquet
cv_path = os.path.join(PRED_DIR, "svc_cv_preds.parquet")
cv_preds.to_parquet(cv_path, index=False)
print("Saved CV preds:", cv_path)

# Also save Top-3 per user (for quick inspection)
top3_path = os.path.join(PRED_DIR, "svc_cv_top3.jsonl")
with open(top3_path, "w") as f:
    for _, r in cv_preds.iterrows():
        probs = [(c.replace("_prob",""), r[c]) for c in cv_preds.columns if c.endswith("_prob")]
        probs.sort(key=lambda x: -x[1])
        top3 = [k for k,_ in probs[:3]]
        f.write(json.dumps({"client_id": r["client_id"], "fold": int(r["fold"]), "top3": top3}) + "\n")
print("Saved Top-3 per user:", top3_path)

Data rows: 2127 | Targets: ['target_1', 'target_2', 'target_3', 'target_4']
Fold counts: {0: 425, 1: 423, 2: 419, 3: 446, 4: 414}

=== Fold 0 | train=1702 valid=425 ===

=== Fold 1 | train=1704 valid=423 ===

=== Fold 2 | train=1708 valid=419 ===

=== Fold 3 | train=1681 valid=446 ===

=== Fold 4 | train=1713 valid=414 ===
Saved CV preds: /Users/tree/Projects/recommemdation_bank/outputs/predictions/svc_rbf/svc_cv_preds.parquet
Saved Top-3 per user: /Users/tree/Projects/recommemdation_bank/outputs/predictions/svc_rbf/svc_cv_top3.jsonl


In [4]:
# 05_lr_oof.py — Logistic Regression out-of-fold predictions on MM text

import os, json, gc, numpy as np, pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# ====== CONFIG ======
BASE_OUT   = "/Users/tree/Projects/recommemdation_bank/outputs"
MM_JSONL   = f"{BASE_OUT}/json/mm/json_balanced_mm.jsonl"
LABELS_PAR = f"{BASE_OUT}/balanced/labels_mm_folded.parquet"

PRED_DIR = f"{BASE_OUT}/predictions/lr"
os.makedirs(PRED_DIR, exist_ok=True)

FOLDS = [0,1,2,3,4]
RANDOM_STATE = 42

# TF-IDF (same spirit as your baseline/model zoo)
WORD_VECT = dict(
    max_features=200_000, ngram_range=(1,2),
    min_df=2, max_df=0.995, lowercase=True, sublinear_tf=True,
    preprocessor=lambda s: s.replace('.', '_')  # keep tokens like a6.24 -> a6_24
)
CHAR_VECT = dict(
    analyzer="char_wb", ngram_range=(3,5),
    min_df=2, max_df=1.0, lowercase=True
)

LR_KW = dict(
    solver="liblinear",
    C=1.0,
    class_weight="balanced",
    max_iter=300,
    random_state=RANDOM_STATE
)

# ====== helpers ======
def load_mm_texts(path):
    rows=[]
    with open(path) as f:
        for line in f:
            r = json.loads(line)
            rows.append((str(r["client_id"]), r["text"]))
    return pd.DataFrame(rows, columns=["client_id","text"])

def build_sparse(train_text, valid_text):
    vw = TfidfVectorizer(**WORD_VECT)
    vc = TfidfVectorizer(**CHAR_VECT)
    Xtr_w = vw.fit_transform(train_text); Xva_w = vw.transform(valid_text)
    Xtr_c = vc.fit_transform(train_text); Xva_c = vc.transform(valid_text)
    return hstack([Xtr_w, Xtr_c], format="csr"), hstack([Xva_w, Xva_c], format="csr")

# ====== load ======
texts  = load_mm_texts(MM_JSONL)
labels = pd.read_parquet(LABELS_PAR)
labels["client_id"] = labels["client_id"].astype(str)
df = labels.merge(texts, on="client_id", how="inner")

target_cols = [c for c in df.columns if c.startswith("target_")]
assert "fold" in df.columns, "labels_mm_folded.parquet must contain a 'fold' column."

print("Data rows:", len(df), "| Targets:", target_cols)
print("Fold counts:", df["fold"].value_counts().sort_index().to_dict())

# ====== CV OOF preds ======
pred_rows = []

for fold in FOLDS:
    train = df[df.fold != fold].reset_index(drop=True)
    valid = df[df.fold == fold].reset_index(drop=True)
    print(f"\n=== Fold {fold} | train={len(train)} valid={len(valid)} ===")

    X_tr, X_va = build_sparse(train["text"], valid["text"])

    S_va = np.zeros((len(valid), len(target_cols)), dtype=float)
    for ti, t in enumerate(target_cols):
        y_tr = train[t].values.astype(int)
        clf = LogisticRegression(**LR_KW)
        clf.fit(X_tr, y_tr)
        S_va[:, ti] = clf.predict_proba(X_va)[:, 1]

    fold_pred = pd.DataFrame(S_va, columns=[f"{t}_prob" for t in target_cols])
    fold_pred.insert(0, "client_id", valid["client_id"].values)
    fold_pred.insert(1, "fold", int(fold))
    pred_rows.append(fold_pred)

    del X_tr, X_va; gc.collect()

cv_preds = pd.concat(pred_rows, ignore_index=True)

# Save parquet (for stacking)
cv_path = os.path.join(PRED_DIR, "lr_cv_preds.parquet")
cv_preds.to_parquet(cv_path, index=False)
print("Saved CV preds:", cv_path)

# Also save Top-3 per user (optional quick check)
top3_path = os.path.join(PRED_DIR, "lr_cv_top3.jsonl")
with open(top3_path, "w") as f:
    for _, r in cv_preds.iterrows():
        probs = [(c.replace("_prob",""), r[c]) for c in cv_preds.columns if c.endswith("_prob")]
        probs.sort(key=lambda x: -x[1])
        top3 = [k for k,_ in probs[:3]]
        f.write(json.dumps({"client_id": r["client_id"], "fold": int(r["fold"]), "top3": top3}) + "\n")
print("Saved Top-3 per user:", top3_path)

Data rows: 2127 | Targets: ['target_1', 'target_2', 'target_3', 'target_4']
Fold counts: {0: 425, 1: 423, 2: 419, 3: 446, 4: 414}

=== Fold 0 | train=1702 valid=425 ===

=== Fold 1 | train=1704 valid=423 ===

=== Fold 2 | train=1708 valid=419 ===

=== Fold 3 | train=1681 valid=446 ===

=== Fold 4 | train=1713 valid=414 ===
Saved CV preds: /Users/tree/Projects/recommemdation_bank/outputs/predictions/lr/lr_cv_preds.parquet
Saved Top-3 per user: /Users/tree/Projects/recommemdation_bank/outputs/predictions/lr/lr_cv_top3.jsonl


In [5]:
# 05_stack_meta.py — OOF stacking of LGBM + LR (+ SVC if present)

import os, json, gc, warnings
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
warnings.filterwarnings("ignore")

BASE_OUT = "/Users/tree/Projects/recommemdation_bank/outputs"
LABELS_PAR = f"{BASE_OUT}/balanced/labels_mm_folded.parquet"
MM_JSONL   = f"{BASE_OUT}/json/mm/json_balanced_mm.jsonl"
P_LGBM = f"{BASE_OUT}/predictions/lgbm/lgbm_cv_preds.parquet"
P_LR   = f"{BASE_OUT}/predictions/lr/lr_cv_preds.parquet"
P_SVC  = f"{BASE_OUT}/predictions/svc_rbf/svc_cv_preds.parquet"
METRICS_DIR = f"{BASE_OUT}/metrics/stack"; PRED_DIR = f"{BASE_OUT}/predictions/stack"
os.makedirs(METRICS_DIR, exist_ok=True); os.makedirs(PRED_DIR, exist_ok=True)
FOLDS=[0,1,2,3,4]; RANDOM_STATE=42

def safe_auc(y,p): 
    y=np.asarray(y); 
    return np.nan if len(np.unique(y))<2 else roc_auc_score(y,p)
def safe_ap(y,p):  
    y=np.asarray(y); 
    return np.nan if len(np.unique(y))<2 else average_precision_score(y,p)
def safe_ll(y,p):  
    y=np.asarray(y); 
    p=np.clip(p,1e-6,1-1e-6); 
    return np.nan if len(np.unique(y))<2 else log_loss(y,p,labels=[0,1])
def ndcg_at_k(y_true_row, y_score_row, k):
    k=min(k, len(y_true_row)); 
    if k<=0: return np.nan
    order=np.argsort(-y_score_row); rel=np.take(y_true_row, order[:k])
    disc=1.0/np.log2(np.arange(2,k+2)); 
    dcg=np.sum((2**rel-1)*disc); ideal=np.sort(y_true_row)[::-1][:k]
    idcg=np.sum((2**ideal-1)*disc); 
    return np.nan if idcg==0 else dcg/idcg
def hit_at_k(y_true_row, y_score_row, k):
    if y_true_row.sum()==0: return np.nan
    order=np.argsort(-y_score_row)[:k]; 
    return 1.0 if y_true_row[order].sum()>0 else 0.0
def hit_ndcg_block(Y,S):
    H1,H3,N3,N5=[],[],[],[]
    for i in range(Y.shape[0]):
        if Y[i].sum()==0: continue
        H1.append(hit_at_k(Y[i],S[i],1)); H3.append(hit_at_k(Y[i],S[i],3))
        N3.append(ndcg_at_k(Y[i],S[i],3)); N5.append(ndcg_at_k(Y[i],S[i],5))
    return dict(users_eval=len(H1), Hit1=float(np.nanmean(H1)) if H1 else np.nan,
                Hit3=float(np.nanmean(H3)) if H3 else np.nan,
                NDCG3=float(np.nanmean(N3)) if N3 else np.nan,
                NDCG5=float(np.nanmean(N5)) if N5 else np.nan)

def ensure_lr_preds():
    if os.path.exists(P_LR): 
        print("[OK] Found LR CV preds:", P_LR); 
        return
    print("[MAKE] LR CV preds not found — training LR OOF now…")
    rows=[]; 
    with open(MM_JSONL) as f:
        for line in f:
            r=json.loads(line); rows.append((str(r["client_id"]), r["text"]))
    texts = pd.DataFrame(rows, columns=["client_id","text"])
    labels = pd.read_parquet(LABELS_PAR); labels["client_id"]=labels["client_id"].astype(str)
    df = labels.merge(texts, on="client_id", how="inner")
    target_cols = [c for c in df.columns if c.startswith("target_")]

    WORD_VECT = dict(max_features=200_000, ngram_range=(1,2), min_df=2, max_df=0.995,
                     lowercase=True, sublinear_tf=True, preprocessor=lambda s: s.replace('.', '_'))
    CHAR_VECT = dict(analyzer="char_wb", ngram_range=(3,5), min_df=2, max_df=1.0, lowercase=True)
    from sklearn.linear_model import LogisticRegression
    from sklearn.feature_extraction.text import TfidfVectorizer
    from scipy.sparse import hstack
    LR_KW = dict(solver="liblinear", C=1.0, class_weight="balanced", max_iter=300, random_state=RANDOM_STATE)

    def build_sparse(tr_text, va_text):
        vw=TfidfVectorizer(**WORD_VECT); vc=TfidfVectorizer(**CHAR_VECT)
        Xtr_w=vw.fit_transform(tr_text); Xva_w=vw.transform(va_text)
        Xtr_c=vc.fit_transform(tr_text); Xva_c=vc.transform(va_text)
        return hstack([Xtr_w,Xtr_c],format="csr"), hstack([Xva_w,Xva_c],format="csr")

    pred_rows=[]
    for fold in FOLDS:
        tr=df[df.fold!=fold].reset_index(drop=True); va=df[df.fold==fold].reset_index(drop=True)
        Xtr,Xva = build_sparse(tr["text"], va["text"])
        S = np.zeros((len(va), len(target_cols)))
        for ti,t in enumerate(target_cols):
            y = tr[t].values.astype(int)
            clf = LogisticRegression(**LR_KW).fit(Xtr, y)
            S[:,ti] = clf.predict_proba(Xva)[:,1]
        out = pd.DataFrame(S, columns=[f"{t}_prob" for t in target_cols])
        out.insert(0,"client_id", va["client_id"].values)
        out.insert(1,"fold", int(fold))
        pred_rows.append(out)
        del Xtr,Xva; gc.collect()

    cv_lr = pd.concat(pred_rows, ignore_index=True)
    os.makedirs(os.path.dirname(P_LR), exist_ok=True)
    cv_lr.to_parquet(P_LR, index=False)
    print("[SAVED] LR CV preds ->", P_LR)

# prerequisites
assert os.path.exists(P_LGBM), f"LGBM preds not found at {P_LGBM}. Run the LGBM cell first."
ensure_lr_preds()
use_svc = os.path.exists(P_SVC)
print("[INFO] Using models in stack:", ["LGBM","LR"] + (["SVC"] if use_svc else []))

# load OOF preds
lgbm = pd.read_parquet(P_LGBM)
lr   = pd.read_parquet(P_LR)
if use_svc: svc = pd.read_parquet(P_SVC)

# rename with suffixes and join
def add_suffix(df, suffix):
    df = df.copy()
    for c in list(df.columns):
        if c.endswith("_prob"):
            df[c.replace("_prob","")+f"_prob_{suffix}"] = df[c]
            del df[c]
    return df
lgbm = add_suffix(lgbm, "lgbm"); lr = add_suffix(lr, "lr")
base = lgbm.merge(lr, on=["client_id","fold"], how="inner")
if use_svc:
    svc = add_suffix(svc, "svc")
    base = base.merge(svc, on=["client_id","fold"], how="inner")

# labels
labels = pd.read_parquet(LABELS_PAR); labels["client_id"]=labels["client_id"].astype(str)
targets = [c for c in labels.columns if c.startswith("target_")]
base = base.merge(labels[["client_id","fold"]+targets], on=["client_id","fold"], how="left")

# meta-learning OOF
meta_rows=[]; rank_rows=[]; pred_rows=[]
for fold in FOLDS:
    tr = base[base.fold != fold].reset_index(drop=True)
    va = base[base.fold == fold].reset_index(drop=True)
    Y_va = va[targets].values.astype(int)

    def feat_cols_for(t):
        cols=[f"{t}_prob_lgbm", f"{t}_prob_lr"]
        if use_svc: cols.append(f"{t}_prob_svc")
        return cols

    S_va = np.zeros((len(va), len(targets)), dtype=float)
    for ti,t in enumerate(targets):
        Xtr = tr[feat_cols_for(t)].values; ytr = tr[t].values.astype(int)
        Xva = va[feat_cols_for(t)].values
        meta = LogisticRegression(solver="liblinear", class_weight="balanced", random_state=RANDOM_STATE)
        meta.fit(Xtr, ytr)
        p = meta.predict_proba(Xva)[:,1]
        S_va[:, ti] = p
        meta_rows.append({"fold":fold,"target":t,"AUC":safe_auc(Y_va[:, ti], p),
                          "AP":safe_ap(Y_va[:, ti], p),"LogLoss":safe_ll(Y_va[:, ti], p),
                          "pos_valid":int(Y_va[:, ti].sum()),"n_valid":int(len(Y_va))})

    fold_pred = pd.DataFrame(S_va, columns=[f"{t}_prob_stack" for t in targets])
    fold_pred.insert(0,"client_id", va["client_id"].values); fold_pred.insert(1,"fold", int(fold))
    pred_rows.append(fold_pred)

    rk = hit_ndcg_block(Y_va, S_va); rk["fold"]=fold; rank_rows.append(rk)

# summarize + save
met = pd.DataFrame(meta_rows)
summary_target = (met.groupby("target", as_index=False)
                    .agg(AUC_mean=("AUC","mean"), AP_mean=("AP","mean"), LL_mean=("LogLoss","mean"),
                         pos_valid=("pos_valid","sum"), n_valid=("n_valid","sum")))
macro = {"target":"MACRO","AUC_mean":summary_target["AUC_mean"].mean(),
         "AP_mean":summary_target["AP_mean"].mean(),"LL_mean":summary_target["LL_mean"].mean(),
         "pos_valid":int(summary_target["pos_valid"].sum()),"n_valid":int(summary_target["n_valid"].sum())}
summary_target = pd.concat([summary_target, pd.DataFrame([macro])], ignore_index=True)

rkdf = pd.DataFrame(rank_rows)
summary_rank = (rkdf.agg({"users_eval":"sum","Hit1":"mean","Hit3":"mean","NDCG3":"mean","NDCG5":"mean"})
                .to_frame("mean").T)

print("\n=== Stacked (LGBM + LR" + (" + SVC" if use_svc else "") + ") — per-target mean over folds ===")
print(summary_target[["target","AUC_mean","AP_mean","LL_mean","pos_valid","n_valid"]]
      .round(4).to_string(index=False))
print("\n=== Stacked — ranking (avg over folds) ===")
print(summary_rank.round(4).to_string(index=False))

met.to_csv(os.path.join(METRICS_DIR, "stack_fold_metrics_per_target.csv"), index=False)
summary_target.to_csv(os.path.join(METRICS_DIR, "stack_summary_per_target.csv"), index=False)
summary_rank.to_csv(os.path.join(METRICS_DIR, "stack_summary_ranking.csv"), index=False)

cv_stack = pd.concat(pred_rows, ignore_index=True)
cv_path = os.path.join(PRED_DIR, "stack_cv_preds.parquet")
cv_stack.to_parquet(cv_path, index=False)
print("Saved stacked CV preds:", cv_path)

top3_path = os.path.join(PRED_DIR, "stack_cv_top3.jsonl")
with open(top3_path, "w") as f:
    for _, r in cv_stack.iterrows():
        probs = [(c.replace("_prob_stack",""), r[c]) for c in cv_stack.columns if c.endswith("_prob_stack")]
        probs.sort(key=lambda x: -x[1])
        f.write(json.dumps({"client_id": r["client_id"], "fold": int(r["fold"]), "top3": [k for k,_ in probs[:3]]}) + "\n")
print("Saved Top-3 per user:", top3_path)

[OK] Found LR CV preds: /Users/tree/Projects/recommemdation_bank/outputs/predictions/lr/lr_cv_preds.parquet
[INFO] Using models in stack: ['LGBM', 'LR', 'SVC']

=== Stacked (LGBM + LR + SVC) — per-target mean over folds ===
  target  AUC_mean  AP_mean  LL_mean  pos_valid  n_valid
target_1    0.6954   0.4139   0.6396        556     2127
target_2    0.7728   0.3450   0.5359         33     2127
target_3    0.6840   0.2280   0.6465        276     2127
target_4    0.7570   0.3789   0.5746        237     2127
   MACRO    0.7273   0.3414   0.5992       1102     8508

=== Stacked — ranking (avg over folds) ===
 users_eval   Hit1   Hit3  NDCG3  NDCG5
     1039.0 0.4778 0.9105  0.726 0.7687
Saved stacked CV preds: /Users/tree/Projects/recommemdation_bank/outputs/predictions/stack/stack_cv_preds.parquet
Saved Top-3 per user: /Users/tree/Projects/recommemdation_bank/outputs/predictions/stack/stack_cv_top3.jsonl


In [6]:
# 05_blend_meta.py — grid-searched weight blend of SVC + LGBM + LR to optimize NDCG@3

import os, json, numpy as np, pandas as pd

BASE_OUT = "/Users/tree/Projects/recommemdation_bank/outputs"
LABELS_PAR = f"{BASE_OUT}/balanced/labels_mm_folded.parquet"
P_SVC  = f"{BASE_OUT}/predictions/svc_rbf/svc_cv_preds.parquet"
P_LGBM = f"{BASE_OUT}/predictions/lgbm/lgbm_cv_preds.parquet"
P_LR   = f"{BASE_OUT}/predictions/lr/lr_cv_preds.parquet"

OUT_DIR = f"{BASE_OUT}/predictions/blend"
MET_DIR = f"{BASE_OUT}/metrics/blend"
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(MET_DIR, exist_ok=True)

FOLDS = [0,1,2,3,4]

# --- helpers ---
def ndcg_at_k(y_true_row, y_score_row, k):
    k = min(k, len(y_true_row))
    if k <= 0: return np.nan
    order = np.argsort(-y_score_row)
    rel_k = np.take(y_true_row, order[:k])
    discounts = 1.0/np.log2(np.arange(2, k+2))
    dcg = np.sum((2**rel_k - 1) * discounts)
    ideal = np.sort(y_true_row)[::-1][:k]
    idcg = np.sum((2**ideal - 1) * discounts)
    return np.nan if idcg == 0 else dcg/idcg

def hit_at_k(y_true_row, y_score_row, k):
    if y_true_row.sum() == 0: return np.nan
    order = np.argsort(-y_score_row)[:k]
    return 1.0 if y_true_row[order].sum() > 0 else 0.0

def rank_metrics(Y, S):
    H1,H3,N3,N5=[],[],[],[]
    for i in range(Y.shape[0]):
        if Y[i].sum()==0: continue
        H1.append(hit_at_k(Y[i], S[i], 1))
        H3.append(hit_at_k(Y[i], S[i], 3))
        N3.append(ndcg_at_k(Y[i], S[i], 3))
        N5.append(ndcg_at_k(Y[i], S[i], 5))
    return {
        "users_eval": len(H1),
        "Hit@1": float(np.nanmean(H1)) if H1 else np.nan,
        "Hit@3": float(np.nanmean(H3)) if H3 else np.nan,
        "NDCG@3": float(np.nanmean(N3)) if N3 else np.nan,
        "NDCG@5": float(np.nanmean(N5)) if N5 else np.nan,
    }

# --- load labels & preds ---
labels = pd.read_parquet(LABELS_PAR)
labels["client_id"] = labels["client_id"].astype(str)
targets = [c for c in labels.columns if c.startswith("target_")]

svc  = pd.read_parquet(P_SVC)
lgbm = pd.read_parquet(P_LGBM)
lr   = pd.read_parquet(P_LR)

# align and stack into dict[fold] -> (Y, S_model)
def fold_pack(df, model_suffix):
    cols = [c for c in df.columns if c.endswith("_prob")]
    packed = {}
    for f in FOLDS:
        d = df[df.fold==f].merge(labels[["client_id","fold"]+targets], on=["client_id","fold"], how="left")
        Y = d[targets].values.astype(int)
        S = d[[c for c in cols]].values.astype(float)
        # ensure same order of targets for all models
        order = [f"{t}_prob" for t in targets]
        S = d[order].values.astype(float)
        packed[f] = (Y, S)
    return packed

P_svc  = fold_pack(svc,  "svc")
P_lgbm = fold_pack(lgbm, "lgbm")
P_lr   = fold_pack(lr,   "lr")

# --- grid search weights (w_svc, w_lgbm, w_lr >=0, sum=1) to maximize mean NDCG@3 ---
best = None
grid = np.linspace(0, 1, 11)  # step 0.1; small & fast
for ws in grid:
    for wl in grid:
        wr = 1.0 - ws - wl
        if wr < 0 or wr > 1: 
            continue
        # aggregate metrics across folds
        mets = []
        for f in FOLDS:
            Y,_ = P_svc[f]
            S = ws*P_svc[f][1] + wl*P_lgbm[f][1] + wr*P_lr[f][1]
            mets.append(rank_metrics(Y, S))
        # average
        N3 = np.mean([m["NDCG@3"] for m in mets])
        H1 = np.mean([m["Hit@1"] for m in mets])
        # pick by NDCG@3, break ties with Hit@1
        score = (N3, H1)
        if (best is None) or (score > best[0]):
            best = (score, (ws, wl, wr))

(ws, wl, wr) = best[1]
print(f"Best weights (by NDCG@3, tie=Hit@1): w_svc={ws:.2f}, w_lgbm={wl:.2f}, w_lr={wr:.2f}")

# --- evaluate & save blended OOF ---
rank_rows = []
pred_rows = []
for f in FOLDS:
    Y,_ = P_svc[f]
    S = ws*P_svc[f][1] + wl*P_lgbm[f][1] + wr*P_lr[f][1]
    rk = rank_metrics(Y, S); rk["fold"]=f; rank_rows.append(rk)
    # save fold preds with columns target_i_prob_blend
    d = svc[svc.fold==f][["client_id","fold"]].copy()
    for i,t in enumerate(targets):
        d[f"{t}_prob_blend"] = S[:, i]
    pred_rows.append(d)

rkdf = pd.DataFrame(rank_rows)
summary_rank = rkdf.agg({"users_eval":"sum","Hit@1":"mean","Hit@3":"mean","NDCG@3":"mean","NDCG@5":"mean"}).to_frame("mean").T
print("\n=== Blended ranking (avg over folds) ===")
print(summary_rank.round(4).to_string(index=False))

cv_blend = pd.concat(pred_rows, ignore_index=True)
cv_path = os.path.join(OUT_DIR, "blend_cv_preds.parquet")
cv_blend.to_parquet(cv_path, index=False)
print("Saved blended CV preds:", cv_path)

# Top-3 per user
top3_path = os.path.join(OUT_DIR, "blend_cv_top3.jsonl")
with open(top3_path, "w") as f:
    for _, r in cv_blend.iterrows():
        probs = [(c.replace("_prob_blend",""), r[c]) for c in cv_blend.columns if c.endswith("_prob_blend")]
        probs.sort(key=lambda x: -x[1])
        f.write(json.dumps({"client_id": r["client_id"], "fold": int(r["fold"]), "top3": [k for k,_ in probs[:3]]}) + "\n")
print("Saved Top-3 per user:", top3_path)

# also write weights & metrics
pd.DataFrame([{"w_svc":ws, "w_lgbm":wl, "w_lr":wr}]).to_csv(os.path.join(MET_DIR,"blend_weights.csv"), index=False)
summary_rank.to_csv(os.path.join(MET_DIR,"blend_summary_ranking.csv"), index=False)

Best weights (by NDCG@3, tie=Hit@1): w_svc=0.10, w_lgbm=0.90, w_lr=0.00

=== Blended ranking (avg over folds) ===
 users_eval  Hit@1  Hit@3  NDCG@3  NDCG@5
     1039.0 0.5752 0.9809  0.8124  0.8214
Saved blended CV preds: /Users/tree/Projects/recommemdation_bank/outputs/predictions/blend/blend_cv_preds.parquet
Saved Top-3 per user: /Users/tree/Projects/recommemdation_bank/outputs/predictions/blend/blend_cv_top3.jsonl
