In [2]:
import json
from pathlib import Path
from typing import Any, Dict, Tuple

# Record fields that aggregate_jsonl_folder skips entirely when merging
# records (they are neither summed, concatenated, nor copied to the output).
IGNORE_FIELDS = {"Raw_rows_first_100", "Exploration_sql", "Extraction_sql", "PII_Prompt"}


def get_app_code(db_path: str) -> str:
    """
    Return the app code: the part of the database filename before its first "_".

    Both "\\" and "/" are treated as path separators regardless of the host
    OS, so Windows-style paths resolve the same way everywhere:

      selectedDBs\\A2_journal.db  -> A2
      selectedDBs/A1_msgstore.db -> A1

    A filename without "_" yields its whole stem; an empty path yields "".
    """
    # pathlib only splits on "\" when running on Windows; normalise first so
    # the result does not depend on the platform executing the script.
    stem = Path(db_path.replace("\\", "/")).stem  # A2_journal
    return stem.partition("_")[0]


def _dedupe_preserve_order(items):
    seen = set()
    out = []
    for x in items:
        key = json.dumps(x, sort_keys=True, ensure_ascii=False) if isinstance(x, (dict, list)) else x
        if key in seen:
            continue
        seen.add(key)
        out.append(x)
    return out


def prefix_source_columns(db_path: str, cols: list) -> list:
    """
    Prefix each source column with the database filename to avoid ambiguity
    after aggregating multiple DB files under the same app.

    Both "\\" and "/" in db_path are treated as path separators regardless of
    the host OS. Entries of cols that are not non-empty strings are dropped.

    Example:
      db_path = selectedDBs\\A1_msgstore.db
      col     = message.text_data
      -> A1_msgstore.db:message.text_data
    """
    # pathlib only splits on "\" when running on Windows; normalise first so
    # the prefix is the bare filename on every platform.
    db_file = Path(db_path.replace("\\", "/")).name  # includes extension
    return [f"{db_file}:{c}" for c in cols if isinstance(c, str) and c]


# Input fields that get dedicated handling instead of the generic merge.
_SPECIAL_INPUT_FIELDS = frozenset({"db_path", "PII_type", "PII", "Num_of_PII"})

# Output fields computed by the aggregator itself. Same-named input fields are
# not merged, so they cannot leak into (or be overwritten by) computed values.
_COMPUTED_OUTPUT_FIELDS = frozenset(
    {
        "PII_all",
        "PII_unique",
        "Num_of_PII_all",
        "Num_of_PII_unique",
        "Num_of_source_columns_unique",
    }
)


def _is_number(value: Any) -> bool:
    """Return True for int/float values, excluding bool (an int subclass)."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _merge_record(agg: Dict[str, Any], rec: Dict[str, Any], dbp: str) -> None:
    """Fold one input record into its group's running aggregate, in place."""
    # --- Special handling: PII + counts ---
    pii_list = rec.get("PII", [])
    if not isinstance(pii_list, list):
        pii_list = []
    agg["PII_all"].extend(pii_list)

    # Prefer the record's own count; fall back to the length of its PII list.
    n = rec.get("Num_of_PII")
    agg["Num_of_PII_all"] += int(n) if _is_number(n) else len(pii_list)

    # --- Aggregate other fields (arrays/numbers only) ---
    for k, v in rec.items():
        if k in IGNORE_FIELDS or k in _SPECIAL_INPUT_FIELDS or k in _COMPUTED_OUTPUT_FIELDS:
            continue

        # Prefix source_columns with db filename
        if k == "source_columns":
            v = prefix_source_columns(dbp, v if isinstance(v, list) else [])

        if isinstance(v, list):
            # Only extend when the field has been list-typed so far; a field
            # that changes kind between records keeps its first-seen kind.
            target = agg.setdefault(k, [])
            if isinstance(target, list):
                target.extend(v)
        elif _is_number(v):
            if _is_number(agg.setdefault(k, 0)):
                agg[k] += v
        # non-list, non-numeric values are ignored


def _finalize_group(agg: Dict[str, Any]) -> None:
    """Dedupe a group's list fields and fill in its computed unique counts."""
    agg["PII_unique"] = _dedupe_preserve_order(agg["PII_all"])
    agg["Num_of_PII_unique"] = len(agg["PII_unique"])

    # PII_all intentionally keeps duplicates; PII_unique was just computed.
    for k, v in list(agg.items()):
        if isinstance(v, list) and k not in ("PII_all", "PII_unique"):
            agg[k] = _dedupe_preserve_order(v)

    src = agg.get("source_columns", [])
    agg["Num_of_source_columns_unique"] = len(src) if isinstance(src, list) else 0


def aggregate_jsonl_folder(in_dir: str | Path, out_path: str | Path) -> Path:
    """
    Read all *.jsonl files directly under in_dir and aggregate records by:
      (app_code derived from db_path, PII_type)

    Records without a truthy PII_type and blank lines are skipped. If out_path
    itself lies in in_dir (e.g. left over from a previous run) it is not read
    as input. The parent directory of out_path is created if missing.

    Output per group (one JSON object per line, sorted by group key):
      - db_path: "selectedDBs\\<APP_CODE>"
      - PII_type
      - PII_all: with duplicates
      - PII_unique: deduped (exact match)
      - Num_of_PII_all: with duplicates (sum of per-record Num_of_PII or len(PII))
      - Num_of_PII_unique: len(PII_unique)
      - source_columns: deduped, prefixed with db filename
      - Num_of_source_columns_unique: len(source_columns) after dedupe
      - other list fields: deduped
      - other numeric fields: summed
      - ignores every field listed in IGNORE_FIELDS

    Returns:
      out_path as a Path.

    Raises:
      ValueError: if a non-blank line is not valid JSON or is not a JSON object.
    """
    in_dir = Path(in_dir)
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_resolved = out_path.resolve()

    grouped: Dict[Tuple[str, str], Dict[str, Any]] = {}

    for jsonl_file in sorted(in_dir.glob("*.jsonl")):
        # Never feed a previous output file back into the aggregation.
        if jsonl_file.resolve() == out_resolved:
            continue

        with jsonl_file.open("r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue

                try:
                    rec = json.loads(line)
                except json.JSONDecodeError as e:
                    raise ValueError(f"Bad JSON in {jsonl_file} line {line_no}: {e}") from e

                if not isinstance(rec, dict):
                    raise ValueError(
                        f"Expected a JSON object in {jsonl_file} line {line_no}, "
                        f"got {type(rec).__name__}"
                    )

                pii_type = rec.get("PII_type")
                if not pii_type:
                    continue

                # A null / non-string db_path is treated like a missing one.
                dbp = rec.get("db_path", "")
                if not isinstance(dbp, str):
                    dbp = ""

                app_code = get_app_code(dbp)
                key = (app_code, pii_type)

                if key not in grouped:
                    grouped[key] = {
                        "db_path": f"selectedDBs\\{app_code}",
                        "PII_type": pii_type,
                        "PII_all": [],
                        "PII_unique": [],
                        "Num_of_PII_all": 0,
                        "Num_of_PII_unique": 0,
                    }

                _merge_record(grouped[key], rec, dbp)

    # --- Finalize: dedupe lists + compute unique PII fields ---
    for agg in grouped.values():
        _finalize_group(agg)

    # --- Write aggregated JSONL ---
    with out_path.open("w", encoding="utf-8") as f:
        for key in sorted(grouped):
            f.write(json.dumps(grouped[key], ensure_ascii=False) + "\n")

    return out_path


if __name__ == "__main__":
    # Paths are built with the "/" operator rather than a hard-coded "\" so the
    # script resolves the same sibling folders on Windows and POSIX alike.
    OUT_DIR = Path(".")  # pick whatever folder you want

    # (input folder, output filename): GPT-4o results first, then ground truth.
    JOBS = [
        (Path("..") / "batch_results_gpt4o_normalized", "RQ2_app_level_gpt4o.jsonl"),
        (Path("..") / "ground_truth_normalized", "RQ2_app_level_ground_truth.jsonl"),
    ]

    for IN_DIR, out_name in JOBS:
        out_path = OUT_DIR / out_name
        # aggregate_jsonl_folder creates out_path's parent directory itself.
        out = aggregate_jsonl_folder(IN_DIR, out_path)
        print(f"Wrote: {out.resolve()}")



Wrote: I:\project2026\llmagent\RQs\RQ2\RQ2_app_level_gpt4o.jsonl
Wrote: I:\project2026\llmagent\RQs\RQ2\RQ2_app_level_ground_truth.jsonl
