
# 02 — GEO → JSONL (per client) + Balanced Filter

- Reads GEO parquet files across your folds.
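- Collapses **consecutive** rows with identical geohashes, then keeps only the last 64 rows per client.
- Emits one `<GEO>...</GEO>` block per client, with one line per row containing the date and the geohash levels.
- Filters the combined output by the client IDs in `mbd_targets_balanced.parquet` to produce `json_balanced_geo.jsonl`.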


In [2]:
# ====== CONFIG (keep in sync with 00) ======
import os, glob, json, re
import pandas as pd
import numpy as np
from datetime import datetime

# Glob matching every GEO parquet part file; the fold number is carried by the
# "fold=<n>" path segment and is parsed from it when the file has no fold column.
GEO_GLOB = "/Users/tree/Projects/recommemdation_bank/data/mbd_mini/detail/geo/fold=*/part-*.parquet"
# Folds to export; write_jsonl_per_fold() skips any discovered fold not listed here.
FOLDS = [0,1,2,3,4]
# Root directory for all outputs produced by this notebook.
BASE_OUT = "/Users/tree/Projects/recommemdation_bank/outputs"
# Parquet whose client_id column defines the balanced subset used for filtering.
BALANCED_PATH = f"{BASE_OUT}/balanced/mbd_targets_balanced.parquet"

# Destination for the per-fold, combined, and balanced JSONL files.
OUT_DIR = f"{BASE_OUT}/json/geo"
# Create the output directory up front (no error if it already exists).
os.makedirs(OUT_DIR, exist_ok=True)

def pretty_date(ts):
    """Format a timestamp-like value as a ``YYYY-MM-DD`` string.

    Accepted inputs:
      * missing values (``None``, ``NaN``, ``NaT``) -> ``""``
      * numeric epochs (Python or NumPy ints/floats): values above ``1e10``
        are treated as milliseconds, everything else as seconds; the date is
        computed in UTC
      * ``datetime`` / ``pd.Timestamp`` objects -> formatted directly
      * anything else (e.g. strings) -> parsed with ``pd.to_datetime``

    Returns ``""`` whenever the value cannot be converted, so callers never
    have to handle an exception for a single bad timestamp.
    """
    # Local import: the file only imports the `datetime` class itself.
    from datetime import timezone

    if pd.isna(ts):
        return ""

    # np.floating is included so that e.g. np.float32 epochs are not handed to
    # pd.to_datetime, which would interpret the bare number as nanoseconds.
    if isinstance(ts, (int, float, np.integer, np.floating)):
        seconds = float(ts)
        if seconds > 1e10:  # too large for a plausible epoch in seconds -> ms
            seconds /= 1000.0
        try:
            # Timezone-aware replacement for the deprecated utcfromtimestamp().
            moment = datetime.fromtimestamp(seconds, tz=timezone.utc)
        except (OverflowError, OSError, ValueError):
            # inf or out-of-range epochs: same "" result as an unparsable string
            return ""
        return moment.strftime("%Y-%m-%d")

    # pd.Timestamp subclasses datetime, so one check covers both.
    if isinstance(ts, datetime):
        return ts.strftime("%Y-%m-%d")

    try:
        return pd.to_datetime(ts).strftime("%Y-%m-%d")
    except Exception:
        # Deliberate best-effort: any unparsable value yields an empty date.
        return ""

def read_geo_with_inferred_fold(glob_pattern):
    """Load every parquet file matching ``glob_pattern`` into one DataFrame.

    Each file is first read with only the GEO columns this notebook uses; if
    that read fails for any reason the whole file is read instead. Files that
    carry no ``fold`` column get one parsed from the ``fold=<n>`` segment of
    their path (``-1`` when the path has no such segment). ``client_id`` is
    cast to ``str`` when present.

    Raises:
        FileNotFoundError: if the pattern matches no files.
    """
    wanted = ["client_id","event_time","geohash_4","geohash_5","geohash_6"]
    paths = sorted(glob.glob(glob_pattern))
    if len(paths) == 0:
        raise FileNotFoundError(f"No GEO parquet matched: {glob_pattern}")

    frames = []
    for path in paths:
        try:
            # column-pruned read where the file allows it
            frame = pd.read_parquet(path, columns=wanted)
        except Exception:
            frame = pd.read_parquet(path)

        if "fold" not in frame.columns:
            hit = re.search(r"fold=(\d+)", path)
            frame["fold"] = -1 if hit is None else int(hit.group(1))

        if "client_id" in frame.columns:
            frame["client_id"] = frame["client_id"].astype(str)

        frames.append(frame)

    return pd.concat(frames, ignore_index=True)

def _safe_tag(val, prefix):
    """Build a ``<prefix><value>`` tag, or ``""`` when ``val`` is missing.

    The value is rendered as an integer when ``int(val)`` succeeds and as
    ``str(val)`` otherwise.
    """
    if pd.isna(val):
        return ""
    try:
        suffix = int(val)
    except Exception:
        # not integer-convertible: fall back to the plain string form
        suffix = str(val)
    return f"{prefix}{suffix}"

def dedupe_consecutive(df, cols):
    """Drop rows whose ``cols`` values repeat the immediately preceding row.

    Works on THIS frame as-is (no grouping); geo_to_text() is invoked per
    client, so grouping is unnecessary here.

    Missing values compare equal to each other, so a run of rows that are
    identical including their missing entries collapses to its first row.
    The first row of the frame is always kept.

    Args:
        df: frame to filter; row order defines what "consecutive" means.
        cols: column names to compare. An empty list returns ``df`` unchanged.

    Returns:
        ``df`` itself when it is empty or ``cols`` is empty, otherwise the
        kept rows of ``df`` with their original index labels.
    """
    if df.empty or not cols:
        return df

    cur = df[cols]
    prev = cur.shift()

    # NaN != NaN, so missing-vs-missing has to be matched explicitly.
    both_missing = cur.isna() & prev.isna()
    # fillna(False): nullable dtypes yield <NA> for value-vs-missing comparisons.
    equal = cur.eq(prev).fillna(False).astype(bool)
    same_as_prev = (equal | both_missing).all(axis=1)

    keep = ~same_as_prev.to_numpy(dtype=bool)
    # The shifted-in row is all-missing; never let it "match" the first row.
    keep[0] = True
    return df.loc[keep]

def geo_to_text(df, max_rows=64):
    """Render one client's GEO rows as a ``<GEO>...</GEO>`` text block.

    Expects the rows of a single client. Of the columns ``event_time``,
    ``geohash_4``, ``geohash_5`` and ``geohash_6``, whichever are present are
    used. Processing steps:

      1. sort by ``event_time`` (stable, so ties keep their input order and
         the result is deterministic),
      2. collapse consecutive rows with identical geohash values,
      3. keep only the last ``max_rows`` rows,
      4. emit one line per row: the date followed by ``g4``/``g5``/``g6``
         tags; empty dates and missing geohashes are left out of the line.

    Args:
        df: rows of a single client.
        max_rows: maximum number of most-recent rows to keep (default 64).

    Returns:
        The block as a string: ``"<GEO>\\n" + lines + "\\n</GEO>"``.
    """
    levels = (("geohash_4", "g4"), ("geohash_5", "g5"), ("geohash_6", "g6"))
    wanted = ["event_time"] + [col for col, _ in levels]
    d = df[[c for c in wanted if c in df.columns]].copy()

    if "event_time" in d.columns:
        d = d.sort_values("event_time", kind="stable")

    # consecutive duplicate collapse on the geohash columns only
    present = [(col, prefix) for col, prefix in levels if col in d.columns]
    d = dedupe_consecutive(d, [col for col, _ in present])

    # cap to the most recent rows
    d = d.tail(max_rows)

    if "event_time" in d.columns:
        # assign() returns a new frame, so no write lands on a slice of an
        # earlier frame (avoids SettingWithCopyWarning).
        d = d.assign(date=d["event_time"].apply(pretty_date))

    lines = []
    for _, row in d.iterrows():
        parts = []
        # skip empty dates so the line never starts with a stray space
        if "date" in row and row["date"]:
            parts.append(str(row["date"]))
        for col, prefix in present:
            tag = _safe_tag(row[col], prefix)
            if tag:
                parts.append(tag)
        lines.append(" ".join(parts))

    body = "\n".join(lines)
    return f"<GEO>\n{body}\n</GEO>"

def _write_jsonl(path, records):
    """Write ``records`` to ``path`` as UTF-8 JSON Lines, one object per line."""
    # Explicit UTF-8: ensure_ascii=False may emit non-ASCII characters, and the
    # platform default encoding is not guaranteed to be able to encode them.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)


def write_jsonl_per_fold():
    """Convert the GEO parquet data into per-fold and combined JSONL files.

    Reads every file matching ``GEO_GLOB``, prints the discovered folds and
    the overlap between GEO client ids and the balanced set, then for each
    discovered fold that is listed in ``FOLDS`` writes
    ``OUT_DIR/mbd_fold_<fold>.jsonl`` with one ``{"client_id", "text"}``
    record per client. All records are also written to
    ``OUT_DIR/mbd_all.jsonl``.
    """
    df = read_geo_with_inferred_fold(GEO_GLOB)
    discovered = sorted(df["fold"].unique())

    # debug: show discovered folds and coverage vs balanced ids
    print("Discovered GEO folds:", discovered)
    bal = pd.read_parquet(BALANCED_PATH)
    bal_ids = set(bal["client_id"].astype(str))
    geo_ids = set(df["client_id"].astype(str))
    print("GEO unique client_ids:", len(geo_ids))
    print("Balanced∩GEO client_ids:", len(bal_ids & geo_ids))

    out_all = []
    for fold in discovered:
        if fold not in FOLDS:
            continue
        fold_rows = df[df["fold"] == fold]
        parts = [
            {"client_id": str(cid), "text": geo_to_text(g)}
            for cid, g in fold_rows.groupby("client_id")
        ]
        out_all.extend(parts)
        out_path = f"{OUT_DIR}/mbd_fold_{fold}.jsonl"
        _write_jsonl(out_path, parts)
        print("Wrote", out_path, len(parts))

    # combined
    out_path_all = f"{OUT_DIR}/mbd_all.jsonl"
    _write_jsonl(out_path_all, out_all)
    print("Wrote", out_path_all, len(out_all))

def filter_by_balanced():
    """Copy the records of balanced clients into ``json_balanced_geo.jsonl``.

    Streams ``OUT_DIR/mbd_all.jsonl`` line by line and writes every line whose
    ``client_id`` appears in the ``client_id`` column of ``BALANCED_PATH``
    (compared as strings) to ``OUT_DIR/json_balanced_geo.jsonl``. Kept lines
    are copied verbatim; blank lines are skipped. Prints the number of rows
    written.
    """
    bal = pd.read_parquet(BALANCED_PATH)
    ids = set(bal["client_id"].astype(str))
    src = f"{OUT_DIR}/mbd_all.jsonl"
    dst = f"{OUT_DIR}/json_balanced_geo.jsonl"
    kept = 0
    # Explicit UTF-8 on both sides: the source is written with
    # ensure_ascii=False, so it must not be decoded with a locale default.
    with open(src, "r", encoding="utf-8") as fin, open(dst, "w", encoding="utf-8") as fout:
        for line in fin:
            if not line.strip():
                # tolerate blank lines instead of failing in json.loads
                continue
            rec = json.loads(line)
            if rec["client_id"] in ids:
                fout.write(line)
                kept += 1
    print("Balanced GEO json written:", dst, "rows:", kept)

# Run the pipeline. Order matters: filter_by_balanced() reads the combined
# mbd_all.jsonl file that write_jsonl_per_fold() produces.
write_jsonl_per_fold()
filter_by_balanced()

Discovered GEO folds: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
GEO unique client_ids: 72573
Balanced∩GEO client_ids: 1623
Wrote /Users/tree/Projects/recommemdation_bank/outputs/json/geo/mbd_fold_0.jsonl 14863
Wrote /Users/tree/Projects/recommemdation_bank/outputs/json/geo/mbd_fold_1.jsonl 14409
Wrote /Users/tree/Projects/recommemdation_bank/outputs/json/geo/mbd_fold_2.jsonl 14396
Wrote /Users/tree/Projects/recommemdation_bank/outputs/json/geo/mbd_fold_3.jsonl 14380
Wrote /Users/tree/Projects/recommemdation_bank/outputs/json/geo/mbd_fold_4.jsonl 14525
Wrote /Users/tree/Projects/recommemdation_bank/outputs/json/geo/mbd_all.jsonl 72573
Balanced GEO json written: /Users/tree/Projects/recommemdation_bank/outputs/json/geo/json_balanced_geo.jsonl rows: 1623
