In [1]:
# 05_train_baseline.py — TF-IDF + LogisticRegression baseline with 5-fold CV

import os, json, math, re, glob, gc
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss

# ===================== CONFIG =====================
# Root directory under which every input and output of this script lives.
BASE_OUT = "/Users/tree/Projects/recommemdation_bank/outputs"

# Inputs: one JSON object per line (client_id + text), and the label table
# that also carries the fold assignment.
MM_JSONL = BASE_OUT + "/json/mm/json_balanced_mm.jsonl"
LABELS_PAR = BASE_OUT + "/balanced/labels_mm_folded.parquet"   # created in 04

# Output directory for the metric CSVs; created up front if missing.
RESULTS_DIR = BASE_OUT + "/metrics"
os.makedirs(RESULTS_DIR, exist_ok=True)

# Fold ids iterated by the cross-validation loop, and the seed handed to
# the classifier.
FOLDS = list(range(5))
RANDOM_STATE = 42

# TF-IDF settings — tuned for the tokenized text format (<TRX>/<GEO> rows)
def _normalize_text(s):
    """Lowercase *s* and replace '.' with '_'.

    Replacing the dot keeps codes such as ``a6.24`` together as the single
    token ``a6_24``. Lowercasing is done here because scikit-learn skips its
    built-in ``lowercase`` / ``strip_accents`` step whenever a custom
    ``preprocessor`` is supplied, so ``lowercase=True`` alone has no effect.
    A named function (rather than a lambda) also keeps a fitted vectorizer
    picklable.
    """
    return s.lower().replace('.', '_')


VEC_KW = dict(
    max_features=200_000,
    ngram_range=(1, 2),         # unigrams + bigrams
    min_df=2,                   # ignore extremely rare tokens
    max_df=0.995,
    lowercase=True,             # documents intent; enforced by the preprocessor
    preprocessor=_normalize_text,
)

# Keyword arguments passed to every per-target LogisticRegression.
LR_KW = {
    "solver": "liblinear",          # robust on small datasets & sparse mats
    "C": 1.0,
    "class_weight": "balanced",
    "max_iter": 300,
    "random_state": RANDOM_STATE,
}

# ===================== HELPERS =====================
def load_mm_texts(jsonl_path):
    """Read a JSON-lines file into a DataFrame of client texts.

    Every non-blank line must be a JSON object with ``"client_id"`` and
    ``"text"`` keys. ``client_id`` is converted with ``str()``; ``text`` is
    kept as stored.

    Args:
        jsonl_path: path of the JSONL file to read.

    Returns:
        DataFrame with columns ``["client_id", "text"]``, one row per
        non-blank input line, in file order.

    Raises:
        json.JSONDecodeError: a non-blank line is not valid JSON.
        KeyError: a record lacks ``"client_id"`` or ``"text"``.
    """
    rows = []
    # Read as UTF-8 explicitly instead of relying on the platform locale.
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            r = json.loads(line)
            rows.append((str(r["client_id"]), r["text"]))
    return pd.DataFrame(rows, columns=["client_id", "text"])

def ndcg_at_k(y_true_row, y_score_row, k):
    """Normalized DCG at cutoff *k* for one row of labels and scores.

    Args:
        y_true_row: 1D sequence of relevance labels ({0, 1} in this script),
            one entry per target. Lists and arrays are both accepted.
        y_score_row: 1D sequence of predicted scores, same length.
        k: cutoff; silently capped at the number of targets.

    Returns:
        DCG@k / IDCG@k using gain ``2**rel - 1`` and discount
        ``1 / log2(rank + 1)``; ``nan`` when the effective k is <= 0 or when
        the ideal DCG is 0 (no relevant item within the cutoff).
    """
    y_true = np.asarray(y_true_row)
    y_score = np.asarray(y_score_row, dtype=float)

    k = min(k, len(y_true))
    if k <= 0:
        return np.nan

    # Descending by score; a stable sort keeps tied scores in index order so
    # the metric is reproducible.
    order = np.argsort(-y_score, kind="stable")
    rel_k = y_true[order[:k]]

    # DCG of the predicted ranking
    discounts = 1.0 / np.log2(np.arange(2, k + 2))
    dcg = np.sum((2 ** rel_k - 1) * discounts)

    # IDCG: the same formula applied to the best possible ordering
    ideal = np.sort(y_true)[::-1][:k]
    idcg = np.sum((2 ** ideal - 1) * discounts)
    if idcg == 0:
        return np.nan
    return dcg / idcg

def hit_at_k(y_true_row, y_score_row, k):
    """Return 1.0 if any positive label is among the top-*k* scores, else 0.0.

    Args:
        y_true_row: 1D sequence of {0, 1} labels, one per target. Lists and
            arrays are both accepted.
        y_score_row: 1D sequence of predicted scores, same length.
        k: number of top-scored targets to inspect (expected to be >= 1).

    Returns:
        ``1.0`` or ``0.0``; ``nan`` when the labels sum to zero (no positives).
    """
    y_true = np.asarray(y_true_row)
    if y_true.sum() == 0:
        return np.nan
    # Stable sort: tied scores keep index order, so results are reproducible.
    scores = np.asarray(y_score_row, dtype=float)
    top_k = np.argsort(-scores, kind="stable")[:k]
    return 1.0 if y_true[top_k].sum() > 0 else 0.0

def safe_auc(y_true, y_prob):
    """ROC AUC of *y_prob* against *y_true*, or NaN if undefined.

    The score is undefined when *y_true* holds fewer than two distinct
    values; in that case ``nan`` is returned and ``roc_auc_score`` is not
    called.
    """
    labels = np.asarray(y_true)
    has_both_classes = np.unique(labels).size >= 2
    return roc_auc_score(labels, y_prob) if has_both_classes else np.nan

def safe_ap(y_true, y_prob):
    """Average precision of *y_prob* against *y_true*, or NaN if undefined.

    Returns ``nan`` without calling ``average_precision_score`` when
    *y_true* contains fewer than two distinct values.
    """
    truth = np.asarray(y_true)
    n_distinct = np.unique(truth).size
    if n_distinct < 2:
        return np.nan
    return average_precision_score(truth, y_prob)

def safe_logloss(y_true, y_prob):
    """Binary log loss of *y_prob* against *y_true*, or NaN if unavailable.

    Args:
        y_true: sequence of {0, 1} labels.
        y_prob: sequence of predicted probabilities for class 1.

    Returns:
        ``log_loss(y_true, clipped_prob, labels=[0, 1])``. ``nan`` is returned
        when *y_true* holds fewer than two distinct values, or when
        ``log_loss`` rejects the input with ``ValueError`` (a
        ``RuntimeWarning`` is emitted in that case so the failure is visible).

    Any other exception propagates: it signals a programming error rather
    than a degenerate fold and should not be hidden behind a NaN.
    """
    import warnings

    y = np.asarray(y_true)
    if len(np.unique(y)) < 2:
        return np.nan
    # Clip away exact 0/1 so a confident miss cannot yield an infinite loss.
    p = np.clip(y_prob, 1e-6, 1 - 1e-6)
    try:
        return log_loss(y, p, labels=[0, 1])
    except ValueError as err:
        warnings.warn(f"log_loss failed, returning NaN: {err}", RuntimeWarning)
        return np.nan

# ===================== LOAD DATA =====================
texts  = load_mm_texts(MM_JSONL)
labels = pd.read_parquet(LABELS_PAR)
# load_mm_texts returns client_id as str; cast here so the join keys match.
labels["client_id"] = labels["client_id"].astype(str)

# Inner join: only clients present in both the label table and the text file
# are kept.
df = labels.merge(texts, on="client_id", how="inner")
target_cols = [c for c in df.columns if c.startswith("target_")]

# Validate with explicit raises: `assert` is removed when Python runs with -O.
if "fold" not in df.columns:
    raise ValueError("labels_mm_folded.parquet must have a 'fold' column.")
if not target_cols:
    raise ValueError("labels_mm_folded.parquet must have at least one 'target_*' column.")

print("Data rows:", len(df), "| Targets:", target_cols)
print("Fold counts:", df["fold"].value_counts().sort_index().to_dict())

# ===================== CV TRAIN / EVAL =====================
all_fold_rows = []         # one record per (fold, target): classification metrics
all_rank_rows = []         # one record per fold: user-level ranking metrics

for fold in FOLDS:
    # Hold out the current fold for validation; train on all other rows.
    train = df[df["fold"] != fold].reset_index(drop=True)
    valid = df[df["fold"] == fold].reset_index(drop=True)

    print(f"\n=== Fold {fold} | train={len(train)} valid={len(valid)} ===")

    # The vectorizer is fitted on the training text only, then applied to
    # the held-out text.
    vec = TfidfVectorizer(**VEC_KW)
    X_tr = vec.fit_transform(train["text"])
    X_va = vec.transform(valid["text"])

    # Y_va holds the validation labels (one column per target); S_va is
    # filled column by column with the matching predicted scores.
    Y_va = valid[target_cols].values.astype(int)
    S_va = np.zeros_like(Y_va, dtype=float)

    # One independent binary classifier per target column.
    for ti, t in enumerate(target_cols):
        y_tr = train[t].to_numpy().astype(int)
        y_va = valid[t].to_numpy().astype(int)

        clf = LogisticRegression(**LR_KW)
        clf.fit(X_tr, y_tr)
        # Second predict_proba column — presumably P(label == 1), assuming
        # the fitted classes are [0, 1].
        pr_va = clf.predict_proba(X_va)[:, 1]
        S_va[:, ti] = pr_va

        all_fold_rows.append({
            "fold": fold,
            "target": t,
            "AUC": safe_auc(y_va, pr_va),
            "AP": safe_ap(y_va, pr_va),
            "LogLoss": safe_logloss(y_va, pr_va),
            "pos_valid": int(y_va.sum()),
            "n_valid": int(len(y_va)),
        })

    # Ranking-style metrics across targets per user; rows whose labels sum
    # to zero (no positive target) are left out.
    hits1, hits3, ndcg3, ndcg5 = [], [], [], []
    has_signal = Y_va.sum(axis=1) != 0
    for y_row, s_row in zip(Y_va[has_signal], S_va[has_signal]):
        hits1.append(hit_at_k(y_row, s_row, 1))
        hits3.append(hit_at_k(y_row, s_row, 3))
        ndcg3.append(ndcg_at_k(y_row, s_row, 3))
        ndcg5.append(ndcg_at_k(y_row, s_row, 5))

    # users_eval counts the users that contributed to the ranking metrics.
    rank_row = {"fold": fold, "users_eval": int(len(hits1))}
    per_user_scores = (
        ("Hit@1", hits1),
        ("Hit@3", hits3),
        ("NDCG@3", ndcg3),
        ("NDCG@5", ndcg5),
    )
    for metric_name, values in per_user_scores:
        # An empty list means no user qualified in this fold.
        rank_row[metric_name] = float(np.nanmean(values)) if values else np.nan
    all_rank_rows.append(rank_row)

    # Drop the fold's matrices and vectorizer before the next iteration.
    del X_tr, X_va, vec
    gc.collect()

# ===================== SUMMARIZE =====================
metrics_df = pd.DataFrame(all_fold_rows)
rank_df    = pd.DataFrame(all_rank_rows)

# Per-target mean and std of each metric across folds, plus total counts.
# Output column -> (source column, aggregation); insertion order fixes the
# column order of the summary.
target_aggs = {}
for out_prefix, src_col in (("AUC", "AUC"), ("AP", "AP"), ("LL", "LogLoss")):
    target_aggs[f"{out_prefix}_mean"] = (src_col, "mean")
    target_aggs[f"{out_prefix}_std"] = (src_col, "std")
target_aggs["pos_valid"] = ("pos_valid", "sum")
target_aggs["n_valid"] = ("n_valid", "sum")

summary_target = metrics_df.groupby("target", as_index=False).agg(**target_aggs)

# Macro row: mean and std of the per-target means. Note the *_std fields here
# measure spread across targets, not across folds.
macro_row = {"target": "MACRO"}
for out_prefix in ("AUC", "AP", "LL"):
    per_target_mean = summary_target[f"{out_prefix}_mean"]
    macro_row[f"{out_prefix}_mean"] = per_target_mean.mean()
    macro_row[f"{out_prefix}_std"] = per_target_mean.std()
for count_col in ("pos_valid", "n_valid"):
    macro_row[count_col] = int(summary_target[count_col].sum())

summary_target = pd.concat([summary_target, pd.DataFrame([macro_row])], ignore_index=True)

# Ranking summary: total evaluated users and the unweighted mean of each
# per-fold ranking metric, laid out as a single row.
rank_aggs = {
    "users_eval": "sum",
    "Hit@1": "mean",
    "Hit@3": "mean",
    "NDCG@3": "mean",
    "NDCG@5": "mean",
}
summary_rank = rank_df.agg(rank_aggs).to_frame(name="mean").T

# Console report: per-target means (std columns are saved to CSV but not
# printed), then the ranking summary.
print("\n=== Per-target metrics (mean±std across folds) ===")
shown_cols = ["target", "AUC_mean", "AP_mean", "LL_mean", "pos_valid", "n_valid"]
print(summary_target[shown_cols].round(4).to_string(index=False))

print("\n=== Ranking metrics (averaged across folds) ===")
print(summary_rank.round(4).to_string(index=False))

# ===================== SAVE =====================
# File name -> frame; every table is written to RESULTS_DIR without its index.
csv_outputs = {
    "fold_metrics_per_target.csv": metrics_df,
    "fold_metrics_ranking.csv": rank_df,
    "summary_per_target.csv": summary_target,
    "summary_ranking.csv": summary_rank,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(f"{RESULTS_DIR}/{csv_name}", index=False)

print("\nSaved metrics to:", RESULTS_DIR)

Data rows: 2127 | Targets: ['target_1', 'target_2', 'target_3', 'target_4']
Fold counts: {0: 425, 1: 423, 2: 419, 3: 446, 4: 414}

=== Fold 0 | train=1702 valid=425 ===

=== Fold 1 | train=1704 valid=423 ===

=== Fold 2 | train=1708 valid=419 ===

=== Fold 3 | train=1681 valid=446 ===

=== Fold 4 | train=1713 valid=414 ===

=== Per-target metrics (mean±std across folds) ===
  target  AUC_mean  AP_mean  LL_mean  pos_valid  n_valid
target_1    0.6596   0.4009   0.6350        556     2127
target_2    0.7477   0.3377   0.2790         33     2127
target_3    0.6448   0.2142   0.6115        276     2127
target_4    0.7389   0.3380   0.5253        237     2127
   MACRO    0.6977   0.3227   0.5127       1102     8508

=== Ranking metrics (averaged across folds) ===
 users_eval  Hit@1  Hit@3  NDCG@3  NDCG@5
     1039.0 0.4727 0.9526  0.7516  0.7742

Saved metrics to: /Users/tree/Projects/recommemdation_bank/outputs/metrics
