# In [None]:  (notebook cell marker, kept as a comment so the file is valid Python)
import json
from pathlib import Path
from typing import Any, Dict, Tuple

IGNORE_FIELDS = {"Raw_rows_first_100", "Exploration_sql", "Extraction_sql", "PII_Prompt"}


def _dedupe_preserve_order(items):
    """
    Stable dedupe for lists that may contain scalars, dicts, or lists.
    """
    seen = set()
    out = []
    for x in items:
        key = json.dumps(x, sort_keys=True, ensure_ascii=False) if isinstance(x, (dict, list)) else x
        if key in seen:
            continue
        seen.add(key)
        out.append(x)
    return out


def prefix_source_columns(db_path: str, cols: list) -> list:
    """
    Prefix each source column with the database filename to avoid ambiguity
    after aggregating across many DBs.

    The filename is the last path component of db_path. Both "\\" and "/"
    are accepted as separators regardless of the operating system this runs
    on, so Windows-style paths recorded in the data also work on POSIX.

    Entries of cols that are not non-empty strings are dropped.
    If db_path is None or empty, the prefix is empty and the result has the
    form ":<column>".

    Example:
      db_path = selectedDBs\\A1_msgstore.db
      col     = message.text_data
      -> A1_msgstore.db:message.text_data
    """
    db_text = "" if db_path is None else str(db_path)
    # Path only treats "\" as a separator on Windows; normalize to "/" so the
    # basename is extracted correctly on every platform.
    db_file = Path(db_text.replace("\\", "/")).name
    return [f"{db_file}:{c}" for c in (cols or []) if isinstance(c, str) and c]


def aggregate_jsonl_folder_corpus_level(in_dir: str | Path, out_path: str | Path) -> Path:
    """
    Corpus-level aggregation across all *.jsonl files in in_dir, grouped ONLY by PII_type.

    Input records are expected to already be normalized (your batch_results_normalized),
    but this function still performs dedupe at aggregation time.

    Output per PII_type keeps:
      - PII_type
      - PII_all: concatenated across corpus (with duplicates)
      - PII_unique: deduped
      - Num_of_PII_all: total count with duplicates (sum of per-record Num_of_PII or len(PII))
      - Num_of_PII_unique: len(PII_unique)
      - source_columns: deduped, prefixed with db filename
      - Num_of_source_columns: len(source_columns)

    It ignores IGNORE_FIELDS and discards all other keys.

    Processing notes:
      - Input files are read in sorted filename order; blank lines are skipped.
      - Records without a truthy PII_type are skipped.
      - If out_path itself matches in_dir/*.jsonl it is not read as input, so
        re-running with the output stored in the input folder does not feed
        the previous result back into the aggregation.
      - A per-record Num_of_PII that is missing, non-numeric, boolean, NaN or
        infinite falls back to len(PII) for that record.
      - A missing or non-string db_path yields an empty filename prefix.
      - Output lines are written in sorted PII_type order, one JSON object per line.

    Returns:
      out_path as a Path. Parent directories are created if needed.

    Raises:
      ValueError: if a non-blank line is not valid JSON, or is valid JSON but
        not an object. The message names the file and line number.
    """
    in_dir = Path(in_dir)
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_resolved = out_path.resolve()

    grouped: Dict[str, Dict[str, Any]] = {}

    for jsonl_file in sorted(in_dir.glob("*.jsonl")):
        # Never ingest our own (previous) output file.
        if jsonl_file.resolve() == out_resolved:
            continue

        with jsonl_file.open("r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue

                try:
                    rec = json.loads(line)
                except json.JSONDecodeError as e:
                    raise ValueError(f"Bad JSON in {jsonl_file} line {line_no}: {e}") from e

                # A JSON array/scalar/null has no .get(); report it with
                # location instead of failing with a bare AttributeError.
                if not isinstance(rec, dict):
                    raise ValueError(
                        f"Expected a JSON object in {jsonl_file} line {line_no}, "
                        f"got {type(rec).__name__}"
                    )

                pii_type = rec.get("PII_type")
                if not pii_type:
                    continue

                agg = grouped.setdefault(
                    pii_type,
                    {
                        "PII_type": pii_type,
                        "PII_all": [],
                        "PII_unique": [],
                        "Num_of_PII_all": 0,
                        "Num_of_PII_unique": 0,
                        "source_columns": [],
                        "Num_of_source_columns": 0,
                    },
                )

                # --- PII + count (with-dup) ---
                pii_list = rec.get("PII", [])
                if isinstance(pii_list, list):
                    agg["PII_all"].extend(pii_list)

                n = rec.get("Num_of_PII")
                count = None
                if isinstance(n, (int, float)) and not isinstance(n, bool):
                    try:
                        count = int(n)
                    except (ValueError, OverflowError):
                        # json.loads accepts NaN / Infinity, which int() rejects.
                        count = None
                if count is None:
                    count = len(pii_list) if isinstance(pii_list, list) else 0
                agg["Num_of_PII_all"] += count

                # --- source_columns (with-dup) ---
                dbp = rec.get("db_path")
                if not isinstance(dbp, str):
                    # null / missing / wrong type: fall back to an empty prefix
                    dbp = ""
                cols = rec.get("source_columns", [])
                if isinstance(cols, list):
                    agg["source_columns"].extend(prefix_source_columns(dbp, cols))

                # ignore everything else (and IGNORE_FIELDS)

    # --- Finalize: dedupe lists + compute unique counts ---
    for agg in grouped.values():
        agg["PII_unique"] = _dedupe_preserve_order(agg["PII_all"])
        agg["Num_of_PII_unique"] = len(agg["PII_unique"])

        agg["source_columns"] = _dedupe_preserve_order(agg["source_columns"])
        agg["Num_of_source_columns"] = len(agg["source_columns"])

    # --- Write aggregated JSONL ---
    with out_path.open("w", encoding="utf-8") as f:
        for pii_type in sorted(grouped):
            f.write(json.dumps(grouped[pii_type], ensure_ascii=False) + "\n")

    return out_path


if __name__ == "__main__":
    # Each job is (input folder of normalized JSONL files, output JSONL file).
    # The model results are aggregated first, then the ground truth.
    jobs = (
        (r"..\batch_results_gpt4o_normalized", "RQ3_corpus_level_gpt4o.jsonl"),
        (r"..\ground_truth_normalized", "RQ3_corpus_level_ground_truth.jsonl"),
    )
    for src_dir, dest_name in jobs:
        out = aggregate_jsonl_folder_corpus_level(src_dir, dest_name)
        print(f"Wrote: {out.resolve()}")

# Cell output: Wrote: I:\project2026\llmagent\RQs\RQ3\RQ3_corpus_level_gpt4o.jsonl
