
# Recommendation Dataset Prep — Config & Pipelines

These notebooks mirror the MBD-mini preparation flow but are **path- and schema-flexible** for your project.  
They will:
- Create (or derive) a **balanced client list** (`mbd_targets_balanced.parquet`).
- Build per-client **TRX**, **GEO**, and **TRX+GEO** text JSONL datasets.
- Filter those JSONLs by the balanced IDs to produce `json_balanced*.jsonl`.
- Run quick **sanity checks**.

> Default settings match the repo's spirit: TRX cap=256, GEO cap=64 (with consecutive-duplicate collapse), `log10(amount)` formatting, and downsampling to equalize pos/neg per target and fold.



# 00 — Config & Balanced ID Builder

- Set your **paths** below (globs are OK).
- Choose how to **get targets**: from a targets file or derive from a **proxy column** in TRX (e.g., `event_type` or `event_subtype`) by picking **top-K categories**.
- Builds `mbd_targets_balanced.parquet` under `BASE_OUT/balanced/`.

**Your provided examples** (the defaults below use these same paths, with `fold=0` widened to `fold=*` so that every fold is matched):
- TRX example: `/Users/tree/Projects/recommemdation_bank/data/mbd_mini/detail/trx/fold=0/part-*.parquet`
- GEO example: `/Users/tree/Projects/recommemdation_bank/data/mbd_mini/detail/geo/fold=0/part-*.parquet`
- The targets path you mentioned pointed to a GEO file by mistake, so both ways of obtaining targets are supported below:
  1) **TARGETS_FROM_FILE**: if you have a proper targets parquet; or
  2) **TARGETS_FROM_PROXY**: derive multilabel targets from TRX `event_type` (or `event_subtype`) by top-K categories.


In [1]:

# ====== CONFIG ======
from pathlib import Path
import glob, os

# 1) File patterns (globs). Adjust only the "fold=*" root if needed.
TRX_GLOB = "/Users/tree/Projects/recommemdation_bank/data/mbd_mini/detail/trx/fold=*/part-*.parquet"
GEO_GLOB = "/Users/tree/Projects/recommemdation_bank/data/mbd_mini/detail/geo/fold=*/part-*.parquet"

# Glob of the targets parquet file(s), e.g. "/Users/tree/.../targets/part-*.parquet".
# Set to None when there is no targets file; targets are then derived from a
# TRX proxy column instead.
TARGETS_PATH = "/Users/tree/Projects/recommemdation_bank/data/mbd_mini/targets/fold=*/part-*.parquet"

# 2) Folds present
FOLDS = [0, 1, 2, 3, 4]  # five folds listed (0-4); adjust if your data differs

# 3) Output base
BASE_OUT = "/Users/tree/Projects/recommemdation_bank/outputs"  # change if you prefer
os.makedirs(BASE_OUT, exist_ok=True)

# 4) Targets derivation policy
# Both flags follow TARGETS_PATH so they can never disagree: a hard-coded
# TARGETS_FROM_FILE = True combined with TARGETS_PATH = None would try to glob
# on None instead of falling back to the proxy derivation.
TARGETS_FROM_FILE = TARGETS_PATH is not None
TARGETS_FROM_PROXY = not TARGETS_FROM_FILE  # derive from TRX

# If deriving from proxy:
PROXY_COLUMN_CANDIDATES = ["event_subtype", "event_type"]  # will use first one available in TRX
TOP_K = 4                     # number of categories -> multilabel targets (target_1..target_K)
MIN_CLIENTS_PER_CLASS = 200   # ignore very rare classes

# 5) Balancing policy
SEED = 42
POS_NEG_RATIO = 1.0  # 1.0 means pos:neg = 1:1 downsample; >1 keeps more negatives

BALANCED_DIR = f"{BASE_OUT}/balanced"
os.makedirs(BALANCED_DIR, exist_ok=True)

print("TRX_GLOB:", TRX_GLOB)
print("GEO_GLOB:", GEO_GLOB)
print("TARGETS_PATH:", TARGETS_PATH)
print("BASE_OUT:", BASE_OUT)
print("BALANCED_DIR:", BALANCED_DIR)


TRX_GLOB: /Users/tree/Projects/recommemdation_bank/data/mbd_mini/detail/trx/fold=*/part-*.parquet
GEO_GLOB: /Users/tree/Projects/recommemdation_bank/data/mbd_mini/detail/geo/fold=*/part-*.parquet
TARGETS_PATH: /Users/tree/Projects/recommemdation_bank/data/mbd_mini/targets/fold=*/part-*.parquet
BASE_OUT: /Users/tree/Projects/recommemdation_bank/outputs
BALANCED_DIR: /Users/tree/Projects/recommemdation_bank/outputs/balanced


In [2]:

# ====== BUILD TARGETS (from file OR derive from TRX proxy) ======
import pandas as pd
import numpy as np
from pathlib import Path

# Seeded NumPy generator; it is used further down in this cell to sample the
# balanced positive/negative client IDs, so a fixed SEED reproduces the sample.
rng = np.random.default_rng(SEED)

def read_parquets(pattern, columns=None, limit_files=None):
    """Load every parquet file matching a glob into a single DataFrame.

    Args:
        pattern: Glob pattern passed to ``glob.glob``; matches are read in
            sorted path order.
        columns: Optional column subset forwarded to ``pd.read_parquet``.
        limit_files: If not None, only the first ``limit_files`` sorted
            matches are read.

    Returns:
        The row-wise concatenation of all readable files, with a fresh index.

    Raises:
        FileNotFoundError: No path is left to read after globbing/limiting.
        RuntimeError: Paths were found but none of them could be read.
    """
    matched = sorted(glob.glob(pattern))
    if limit_files is not None:
        matched = matched[:limit_files]
    if len(matched) == 0:
        raise FileNotFoundError(f"No files matched: {pattern}")

    frames = []
    for path in matched:
        # Best effort: an unreadable file is reported and skipped, not fatal.
        try:
            frame = pd.read_parquet(path, columns=columns)
        except Exception as err:
            print(f"SKIP {path}: {err}")
        else:
            frames.append(frame)

    if len(frames) == 0:
        raise RuntimeError(f"No readable parquet files for {pattern}")
    return pd.concat(frames, ignore_index=True)

# Resolve the per-client targets table `targets` with columns
# client_id, fold, target_* -- either loaded from the targets parquet or
# derived from a categorical proxy column in TRX.

# 1) Targets from file
if TARGETS_FROM_FILE:
    tdf = read_parquets(TARGETS_PATH)
    # Expect client_id, fold, and columns like target_*
    target_cols = [c for c in tdf.columns if c.startswith("target_")]
    if not target_cols:
        raise ValueError("Targets file has no columns starting with 'target_'")
    # Explicit raise rather than `assert`: asserts are stripped under `python -O`.
    if "client_id" not in tdf.columns:
        raise ValueError("Targets must include 'client_id'")
    if "fold" not in tdf.columns:
        # No fold information available: put every client in a dummy fold.
        tdf["fold"] = -1
    # Keep the first row seen for each client.
    targets = tdf[["client_id", "fold"] + target_cols].drop_duplicates("client_id")
    print("Targets from file:", targets.shape, "with", len(target_cols), "target columns")
else:
    # 2) Derive targets from TRX proxy
    #    Strategy: pick PROXY_COLUMN from candidates; find top-K frequent values;
    #    build multilabel targets: target_i = 1 if client has that category anywhere in TRX.
    # dict.fromkeys de-duplicates while keeping a stable order for the printout.
    trx_cols_try = ["client_id", "fold"] + list(dict.fromkeys(PROXY_COLUMN_CANDIDATES))
    trx = read_parquets(TRX_GLOB, columns=None)  # read all columns; will subset later
    existing = [c for c in trx_cols_try if c in trx.columns]
    missing = [c for c in trx_cols_try if c not in trx.columns]
    print("Existing columns:", existing, "| Missing:", missing)
    if "client_id" not in trx.columns:
        raise ValueError("TRX must include 'client_id'")
    # choose the first available proxy column
    proxy_col = next((c for c in PROXY_COLUMN_CANDIDATES if c in trx.columns), None)
    if proxy_col is None:
        raise ValueError(f"None of proxy columns found: {PROXY_COLUMN_CANDIDATES} in TRX")
    print("Using proxy column:", proxy_col)

    # The fold fallback must run BEFORE the column subset below; selecting a
    # missing "fold" column would raise KeyError and the fallback would never
    # be reached.
    if "fold" not in trx.columns:
        trx["fold"] = -1

    # Keep only needed columns
    base_cols = ["client_id", "fold", proxy_col]
    trx = trx[base_cols].dropna(subset=["client_id", proxy_col])

    # Top-K classes with enough clients
    # Count unique clients per category
    cat_clients = trx.groupby(proxy_col)["client_id"].nunique().sort_values(ascending=False)
    cat_kept = cat_clients[cat_clients >= MIN_CLIENTS_PER_CLASS].head(TOP_K).index.tolist()
    if not cat_kept:
        raise ValueError("No proxy categories have enough clients. Lower MIN_CLIENTS_PER_CLASS or check data.")
    print("Top categories:", cat_kept)

    # Build multilabel targets per client: one indicator frame per kept
    # category, outer-merged on client_id.
    labels = []
    for idx, cat in enumerate(cat_kept, start=1):
        ids = trx.loc[trx[proxy_col] == cat, ["client_id"]].drop_duplicates()
        labels.append(ids.assign(**{f"target_{idx}": 1}))
    lab = labels[0]
    for extra in labels[1:]:
        lab = lab.merge(extra, on="client_id", how="outer")
    # The outer merge leaves NaN where a client never had the category; turn
    # those into 0 and keep every label column as an integer 0/1 flag.
    label_cols = [c for c in lab.columns if c.startswith("target_")]
    lab[label_cols] = lab[label_cols].fillna(0).astype(int)

    # Fold assignment per client: choose the most frequent fold seen in TRX
    fold_map = (trx.groupby(["client_id", "fold"]).size()
                  .reset_index(name="n")
                  .sort_values(["client_id", "n"], ascending=[True, False])
                  .drop_duplicates("client_id")[["client_id", "fold"]])
    targets = lab.merge(fold_map, on="client_id", how="left")
    # reorder columns
    target_cols = [c for c in targets.columns if c.startswith("target_")]
    targets = targets[["client_id", "fold"] + target_cols]
    print("Derived targets shape:", targets.shape, "| targets:", target_cols)

# Save raw targets for reference
raw_targets_path = f"{BALANCED_DIR}/targets_raw.parquet"
targets.to_parquet(raw_targets_path, index=False)
print("Saved raw targets to:", raw_targets_path)

# ====== Compute balanced client_ids ======
# For every (fold, target) pair, downsample so that roughly POS_NEG_RATIO
# negatives are kept per positive, then take the union of all sampled IDs.
keep_ids = set()
target_cols = [c for c in targets.columns if c.startswith("target_")]
if not target_cols:
    raise ValueError("No target_* columns found to balance on.")
if POS_NEG_RATIO <= 0:
    # Guard the division below and reject meaningless ratios early.
    raise ValueError(f"POS_NEG_RATIO must be > 0, got {POS_NEG_RATIO}")

for fold in sorted(targets["fold"].dropna().unique()):
    d = targets[targets["fold"] == fold]
    for t in target_cols:
        pos = d.loc[d[t] == 1, "client_id"].dropna().unique()
        neg = d.loc[d[t] == 0, "client_id"].dropna().unique()
        if len(pos) == 0 or len(neg) == 0:
            print(f"[WARN] fold={fold}, {t}: pos={len(pos)}, neg={len(neg)} -> skip")
            continue
        # Positives are capped so that enough negatives exist to honor the ratio.
        k = min(len(pos), int(len(neg) / POS_NEG_RATIO))
        if k == 0:
            print(f"[WARN] fold={fold}, {t}: computed k=0 -> skip")
            continue
        # Negatives are scaled by the ratio. Drawing k from both classes would
        # ignore POS_NEG_RATIO apart from shrinking k. With the default 1.0
        # this is still exactly k, so the sampled IDs are unchanged.
        n_neg = min(len(neg), max(1, int(k * POS_NEG_RATIO)))
        keep_ids.update(rng.choice(pos, k, replace=False))
        keep_ids.update(rng.choice(neg, n_neg, replace=False))

balanced_ids = pd.DataFrame({"client_id": sorted(keep_ids)})
balanced_path = f"{BALANCED_DIR}/mbd_targets_balanced.parquet"
balanced_ids.to_parquet(balanced_path, index=False)

print(f"Balanced IDs saved to: {balanced_path}")
print("Num balanced clients:", len(balanced_ids))


Targets from file: (100224, 6) with 4 target columns
Saved raw targets to: /Users/tree/Projects/recommemdation_bank/outputs/balanced/targets_raw.parquet
Balanced IDs saved to: /Users/tree/Projects/recommemdation_bank/outputs/balanced/mbd_targets_balanced.parquet
Num balanced clients: 2132
