In [None]:
import json
from pathlib import Path
import sys
import shutil

# IMPORTANT: sys.path needs a DIRECTORY, not the .py file itself
STATS_DIR = Path(r"I:\project2026\llmagent\RQs").resolve()  # folder containing stats_utils.py
sys.path.insert(0, str(STATS_DIR))

from stats_utils import normalize_and_slim_record

# Input and output folders. Both are relative paths, so they are interpreted
# against the current working directory at run time.
# NOTE(review): IN_DIR climbs two levels while OUT_DIR climbs only one --
# confirm that this asymmetry is intended.
IN_DIR = Path(r"..\..\batch_results")  # scanned by main() for *.jsonl files
OUT_DIR = Path(r"..\batch_results_normalized")  # wiped and recreated by main()


def process_file(in_path: Path, out_path: Path) -> int:
    """Rewrite one JSONL file, passing each record through the normalizer.

    Every non-blank line of ``in_path`` is parsed as JSON. Lines whose parsed
    value is a ``dict`` are handed to ``normalize_and_slim_record`` and the
    result is written to ``out_path`` as one JSON document per line. Blank
    lines and non-dict values are skipped without being counted.

    Args:
        in_path: JSONL file to read (UTF-8).
        out_path: Destination file; created or truncated (UTF-8).

    Returns:
        The number of records written to ``out_path``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    written = 0
    with in_path.open("r", encoding="utf-8") as src, out_path.open("w", encoding="utf-8") as dst:
        for raw_line in src:
            text = raw_line.strip()
            if text:
                record = json.loads(text)
                # Only JSON objects are records; arrays/scalars are dropped.
                if isinstance(record, dict):
                    slimmed = normalize_and_slim_record(record)
                    # ensure_ascii=False keeps non-ASCII text readable as-is.
                    dst.write(f"{json.dumps(slimmed, ensure_ascii=False)}\n")
                    written += 1
    return written


def main() -> None:
    """Normalize every ``*.jsonl`` file in ``IN_DIR`` into a fresh ``OUT_DIR``.

    Input files are discovered first. If there are none, a message is printed
    and the function returns without touching ``OUT_DIR``, so a mistyped or
    empty ``IN_DIR`` cannot destroy results from a previous run. Otherwise
    ``OUT_DIR`` is removed (whether it is a directory or a plain file),
    recreated empty, and each input file is processed into a same-named file
    inside it. A per-file line and a final summary are printed.
    """
    # Look for input before doing anything destructive to the output folder.
    files = sorted(IN_DIR.glob("*.jsonl"))
    if not files:
        print(f"No .jsonl files found in: {IN_DIR.resolve()}")
        return

    # Delete OUT_DIR if it exists, then recreate it cleanly
    if OUT_DIR.exists():
        if OUT_DIR.is_dir():
            shutil.rmtree(OUT_DIR)
        else:
            OUT_DIR.unlink()

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    total_records = 0

    for fp in files:
        out_fp = OUT_DIR / fp.name
        n = process_file(fp, out_fp)
        print(f"{fp.name}: {n} records -> {out_fp}")
        total_records += n

    # Every discovered file is processed, so the file count is len(files).
    print(f"Done. Files: {len(files)}, Records: {total_records}")
    print(f"Output folder: {OUT_DIR.resolve()}")


# Run the batch normalization only when executed as a script (or notebook
# cell), not when this module is imported.
if __name__ == "__main__":
    main()

PII_A1_commerce_20260127T175911Z.jsonl: 5 records -> ..\batch_results_normalized\PII_A1_commerce_20260127T175911Z.jsonl
PII_A1_msgstore_20260127T180043Z.jsonl: 5 records -> ..\batch_results_normalized\PII_A1_msgstore_20260127T180043Z.jsonl
PII_A1_wa_20260127T180213Z.jsonl: 5 records -> ..\batch_results_normalized\PII_A1_wa_20260127T180213Z.jsonl
PII_A2_core_20260127T180339Z.jsonl: 5 records -> ..\batch_results_normalized\PII_A2_core_20260127T180339Z.jsonl
PII_A2_journal_20260127T180440Z.jsonl: 5 records -> ..\batch_results_normalized\PII_A2_journal_20260127T180440Z.jsonl
PII_A2_main_20260127T180710Z.jsonl: 5 records -> ..\batch_results_normalized\PII_A2_main_20260127T180710Z.jsonl
PII_A3_account1cache4_20260127T180745Z.jsonl: 5 records -> ..\batch_results_normalized\PII_A3_account1cache4_20260127T180745Z.jsonl
PII_A3_account2cache4_20260127T180821Z.jsonl: 5 records -> ..\batch_results_normalized\PII_A3_account2cache4_20260127T180821Z.jsonl
PII_A3_account3cache4_20260127T180857Z.jsonl: 