
# 01 — TRX → JSONL (per client) + Balanced Filter

- Reads TRX parquet files across your folds.
- Builds one text block per client: keeps only the most recent 256 rows (ordered by `event_time`), renders each positive amount as `log10(amount)`, and wraps the block in `<TRX>...</TRX>`.
- Saves one JSONL file per fold (`mbd_fold_<fold>.jsonl`) plus a combined `mbd_all.jsonl`.
- Filters by `mbd_targets_balanced.parquet` (from notebook 00) to produce `json_balanced_trx.jsonl`.


In [None]:

# ====== CONFIG (keep in sync with 00) ======
import os, glob, json, math
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import re

# Glob for the transaction parquet parts, laid out as `fold=<k>/part-*.parquet`.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
TRX_GLOB = "/Users/tree/Projects/recommemdation_bank/data/mbd_mini/detail/trx/fold=*/part-*.parquet"
# Fold ids to export; folds discovered on disk but not listed here are skipped.
FOLDS = [0,1,2,3,4]
# Root directory under which the input balanced table and all outputs live.
BASE_OUT = "/Users/tree/Projects/recommemdation_bank/outputs"
# Balanced target table (per the notebook header, produced by notebook 00);
# only its `client_id` column is read here.
BALANCED_PATH = f"{BASE_OUT}/balanced/mbd_targets_balanced.parquet"

# Destination for the per-fold, combined and balanced JSONL files; created if missing.
OUT_DIR = f"{BASE_OUT}/json/trx"
os.makedirs(OUT_DIR, exist_ok=True)

def pretty_date(ts):
    """Format a timestamp-like value as ``YYYY-MM-DD``.

    Parameters
    ----------
    ts : pandas.Timestamp, datetime, int, float, numpy scalar, str or None
        Numeric values are treated as Unix epoch offsets: values above 1e10
        are read as milliseconds, everything else as seconds.  Non-numeric,
        non-datetime values are handed to ``pd.to_datetime``.

    Returns
    -------
    str
        The UTC calendar date for numeric input, the value's own calendar
        date for datetime-like input, or ``""`` when the value is missing,
        unparseable, or outside the range ``datetime`` can represent.
    """
    # Local import: the file-level import only brings in `datetime` itself.
    from datetime import timezone

    if pd.isna(ts):
        return ""

    # np.floating is included so that e.g. np.float32 epochs are not passed to
    # pd.to_datetime, which would interpret the number as nanoseconds.
    if isinstance(ts, (int, float, np.integer, np.floating)):
        seconds = float(ts)
        if seconds > 1e10:  # too large for a plausible seconds epoch -> milliseconds
            seconds /= 1000.0
        try:
            # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp.
            return datetime.fromtimestamp(seconds, tz=timezone.utc).strftime("%Y-%m-%d")
        except (OverflowError, ValueError, OSError):
            # inf or an epoch beyond the platform's supported range
            return ""

    # pd.Timestamp subclasses datetime, so this single check covers both.
    if isinstance(ts, datetime):
        return ts.strftime("%Y-%m-%d")

    try:
        return pd.to_datetime(ts).strftime("%Y-%m-%d")
    except Exception:
        # Best effort by design: anything unparseable renders as an empty date.
        return ""
def read_trx_with_inferred_fold(glob_pattern):
    """Load every parquet file matching ``glob_pattern`` into a single DataFrame.

    Files are read in sorted path order.  A file that has no ``fold`` column
    gets one filled with the integer parsed from ``fold=<digits>`` in its
    path, or ``-1`` when the path carries no such marker.  If the combined
    frame has a ``client_id`` column it is cast to ``str`` so ids can be
    matched against other tables.

    Raises
    ------
    FileNotFoundError
        If the pattern matches no files.
    """
    paths = glob.glob(glob_pattern)
    paths.sort()
    if len(paths) == 0:
        raise FileNotFoundError(f"No TRX parquet matched: {glob_pattern}")

    def _load_one(path):
        # Read one part and make sure it carries a `fold` column.
        frame = pd.read_parquet(path)
        if "fold" in frame.columns:
            return frame
        hit = re.search(r"fold=(\d+)", path)
        frame["fold"] = -1 if hit is None else int(hit.group(1))
        return frame

    combined = pd.concat([_load_one(path) for path in paths], ignore_index=True)

    # String ids keep joins/set lookups consistent regardless of the stored dtype.
    if "client_id" in combined.columns:
        combined["client_id"] = combined["client_id"].astype(str)
    return combined

def trx_to_text(df, max_rows=256):
    """Render one client's transactions as a ``<TRX>...</TRX>`` text block.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions of a single client.  Only the columns ``event_time``,
        ``event_type``, ``amount`` and ``src_type32`` are used, and each is
        optional.
    max_rows : int or None, default 256
        Keep only the last ``max_rows`` rows (after ordering by
        ``event_time`` when that column exists).  ``None`` disables the cap.

    Returns
    -------
    str
        ``"<TRX>\\n" + one line per kept row + "\\n</TRX>"``.  Each line is the
        space-joined tokens, in this order:

        * the date from ``pretty_date(event_time)`` (may be empty),
        * ``t<event_type>`` (bare ``t`` when the type is missing),
        * ``a<log10(amount)>`` with two decimals, only for positive amounts,
        * ``s<src_type32>``, only when present.
    """
    wanted = ["event_time", "event_type", "amount", "src_type32"]
    d = df[[c for c in wanted if c in df.columns]]

    if "event_time" in d.columns:
        # Stable sort keeps input order among equal timestamps; rows with a
        # missing timestamp go first so they cannot displace the genuinely
        # latest rows when the cap is applied.
        d = d.sort_values("event_time", kind="stable", na_position="first")
    if max_rows is not None:
        d = d.tail(max_rows)

    # One list of tokens per column; None marks a token that is omitted from
    # the line (an empty date string is still emitted, as before).
    token_cols = []
    if "event_time" in d.columns:
        token_cols.append([str(pretty_date(ts)) for ts in d["event_time"]])
    if "event_type" in d.columns:
        token_cols.append(
            [f"t{int(v)}" if pd.notna(v) else "t" for v in d["event_type"]]
        )
    if "amount" in d.columns:
        amounts = pd.to_numeric(d["amount"], errors="coerce")
        # log10 is undefined for non-positive values, so those amounts are skipped.
        token_cols.append(
            [None if pd.isna(x) or x <= 0 else f"a{math.log10(x):.2f}" for x in amounts]
        )
    if "src_type32" in d.columns:
        token_cols.append(
            [f"s{int(v)}" if pd.notna(v) else None for v in d["src_type32"]]
        )

    if token_cols:
        rows = [
            " ".join(tok for tok in row_tokens if tok is not None)
            for row_tokens in zip(*token_cols)
        ]
    else:
        # None of the known columns exist: still one (empty) line per row.
        rows = [""] * len(d)

    body = "\n".join(rows)
    return f"<TRX>\n{body}\n</TRX>"

def _dump_jsonl(path, records):
    """Write ``records`` (iterable of JSON-serialisable dicts) to ``path``, one object per line."""
    # Explicit UTF-8: json.dumps(ensure_ascii=False) may emit non-ASCII text,
    # and the default encoding of open() depends on the platform locale.
    with open(path, "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")


def write_jsonl_per_fold():
    """Export per-client transaction text to JSONL, per fold and combined.

    Reads all transactions matched by ``TRX_GLOB``, prints a short coverage
    report against the ids in ``BALANCED_PATH``, then for every discovered
    fold listed in ``FOLDS`` writes ``{OUT_DIR}/mbd_fold_<fold>.jsonl`` with
    one ``{"client_id", "text"}`` record per client.  All records are also
    written, in fold order, to ``{OUT_DIR}/mbd_all.jsonl``.

    Returns nothing; progress is reported via ``print``.
    """
    df = read_trx_with_inferred_fold(TRX_GLOB)

    # debug: show discovered folds and coverage vs balanced ids
    print("Discovered folds in TRX:", sorted(df["fold"].unique()))
    bal = pd.read_parquet(BALANCED_PATH)
    bal_ids = set(bal["client_id"].astype(str))
    tr_ids  = set(df["client_id"].astype(str))
    print("TRX unique client_ids:", len(tr_ids))
    print("Balanced∩TRX client_ids:", len(bal_ids & tr_ids))

    out_all = []
    for fold in sorted(df["fold"].unique()):
        if fold not in FOLDS:
            continue
        # The boolean-mask selection is already a new frame and is only read
        # from, so no defensive copy is needed.
        fold_rows = df[df["fold"] == fold]
        parts = [
            {"client_id": str(cid), "text": trx_to_text(g)}
            for cid, g in fold_rows.groupby("client_id")
        ]
        out_all.extend(parts)
        out_path = f"{OUT_DIR}/mbd_fold_{fold}.jsonl"
        _dump_jsonl(out_path, parts)
        print("Wrote", out_path, len(parts))

    # combined
    out_path_all = f"{OUT_DIR}/mbd_all.jsonl"
    _dump_jsonl(out_path_all, out_all)
    print("Wrote", out_path_all, len(out_all))

def filter_by_balanced():
    """Copy the records of balanced clients from the combined JSONL to a new file.

    Reads the ``client_id`` column of ``BALANCED_PATH`` (compared as strings),
    streams ``{OUT_DIR}/mbd_all.jsonl`` line by line and writes every record
    whose ``client_id`` is in that set, unchanged, to
    ``{OUT_DIR}/json_balanced_trx.jsonl``.  Prints the destination path and
    the number of records kept.  Returns nothing.
    """
    bal = pd.read_parquet(BALANCED_PATH)
    ids = set(bal["client_id"].astype(str))
    src = f"{OUT_DIR}/mbd_all.jsonl"
    dst = f"{OUT_DIR}/json_balanced_trx.jsonl"
    kept = 0
    # Explicit UTF-8 on both sides so the result does not depend on the
    # platform's default locale encoding.
    with open(src, "r", encoding="utf-8") as fin, open(dst, "w", encoding="utf-8") as fout:
        for line in fin:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # failing in json.loads.
                continue
            rec = json.loads(line)
            if rec["client_id"] in ids:
                fout.write(line)
                kept += 1
    print("Balanced TRX json written:", dst, "rows:", kept)

# Run the export end to end. Order matters: filter_by_balanced() reads the
# combined mbd_all.jsonl that write_jsonl_per_fold() produces.
write_jsonl_per_fold()
filter_by_balanced()


Discovered folds in TRX: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
TRX unique client_ids: 98721
Balanced∩TRX client_ids: 2118
Wrote /Users/tree/Projects/recommemdation_bank/outputs/json/trx/mbd_fold_0.jsonl 20032
Wrote /Users/tree/Projects/recommemdation_bank/outputs/json/trx/mbd_fold_1.jsonl 19598
Wrote /Users/tree/Projects/recommemdation_bank/outputs/json/trx/mbd_fold_2.jsonl 19652
Wrote /Users/tree/Projects/recommemdation_bank/outputs/json/trx/mbd_fold_3.jsonl 19697
Wrote /Users/tree/Projects/recommemdation_bank/outputs/json/trx/mbd_fold_4.jsonl 19742
Wrote /Users/tree/Projects/recommemdation_bank/outputs/json/trx/mbd_all.jsonl 98721
Balanced TRX json written: /Users/tree/Projects/recommemdation_bank/outputs/json/trx/json_balanced_trx.jsonl rows: 2118


In [4]:
# Debug cell: write_jsonl_per_fold() keeps its DataFrame local, so reload the
# transactions at notebook scope to inspect id coverage interactively.
df_debug = read_trx_with_inferred_fold(TRX_GLOB)

# Compare client ids as strings on both sides (the TRX loader already casts them).
bal = pd.read_parquet(BALANCED_PATH)
bal_ids = set(bal["client_id"].astype(str))
tr_ids  = set(df_debug["client_id"].astype(str))

# Overall coverage, then the number of distinct clients per fold.
print("TRX unique client_ids:", len(tr_ids))
print("Balanced ∩ TRX client_ids:", len(bal_ids & tr_ids))
print("Per-fold TRX client_ids:", df_debug.groupby("fold")["client_id"].nunique().to_dict())

TRX unique client_ids: 98721
Balanced ∩ TRX client_ids: 2118
Per-fold TRX client_ids: {0: 20032, 1: 19598, 2: 19652, 3: 19697, 4: 19742}
