In [1]:
import json
from pathlib import Path
from typing import Dict, Set, List

# JSONL inputs read by load_pii_unique_sets(): one JSON object per line, each
# carrying a "PII_type" label and a "PII_unique" list of strings.
# NOTE(review): file names suggest GPT-4o system output vs. annotated ground
# truth -- confirm against the data pipeline.
SYSTEM_PATH = Path("RQ3_corpus_level_gpt4o.jsonl")
GT_PATH = Path("RQ3_corpus_level_ground_truth.jsonl")

# PII type keys (upper-case, matching the normalisation done by the loader) in
# the order their rows are emitted by make_rows().
PII_TYPE_ORDER = ["EMAIL", "PHONE", "USERNAME", "PERSON_NAME", "POSTAL_ADDRESS"]
# Human-readable label used in the first table column for each PII type key;
# make_rows() falls back to the raw key when a type has no entry here.
PII_TYPE_DISPLAY = {
    "EMAIL": "Email Address",
    "PHONE": "Phone Number",
    "USERNAME": "User Name",
    "PERSON_NAME": "Person Name",
    "POSTAL_ADDRESS": "Postal Address",
}

def load_pii_unique_sets(path: Path) -> Dict[str, Set[str]]:
    """Load per-type sets of unique PII strings from a JSONL file.

    Every non-blank line is parsed as JSON. A record contributes only if it is
    a JSON object whose "PII_type" is a non-empty string; the type is stripped
    and upper-cased before being used as the key. "PII_unique" may be a list,
    a single value, or null/absent; only string items are kept.

    Records that normalise to the same type are merged (set union) instead of
    the later record replacing the earlier one, so the result does not depend
    on record order.

    Args:
        path: Location of the UTF-8 encoded JSONL file.

    Returns:
        Mapping from normalised PII type to the set of its unique values.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    out: Dict[str, Set[str]] = {}
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rec = json.loads(line)
            if not isinstance(rec, dict):
                continue
            raw_type = rec.get("PII_type")
            # A non-string type (number, list, ...) cannot be normalised;
            # skip it like any other malformed record instead of crashing.
            if not isinstance(raw_type, str):
                continue
            t = raw_type.strip().upper()
            if not t:
                continue
            vals = rec.get("PII_unique", [])
            if not isinstance(vals, list):
                # Tolerate a bare scalar (or null) in place of a list.
                vals = [vals] if vals is not None else []
            # Union with anything already collected for this type.
            out.setdefault(t, set()).update(
                v for v in vals if isinstance(v, str)
            )
    return out

def fmt_pct(x: float) -> str:
    """Render a fraction as a one-decimal percentage with a LaTeX-escaped ``%``."""
    percent = x * 100
    return format(percent, ".1f") + "\\%"

def make_rows(gt_sets: Dict[str, Set[str]], sys_sets: Dict[str, Set[str]]) -> List[str]:
    """Build one LaTeX table row per PII type listed in PII_TYPE_ORDER.

    Each row holds the display name, the ground-truth set size, the system set
    size, the size of their intersection, recall (overlap / ground truth) and
    precision (overlap / system). A type missing from either mapping is
    treated as an empty set, and a ratio with a zero denominator is 0.0.

    Args:
        gt_sets: Ground-truth values keyed by PII type.
        sys_sets: System values keyed by PII type.

    Returns:
        Row strings in PII_TYPE_ORDER order, each ending with a LaTeX ``\\\\``.
    """

    def _ratio(hits: int, total: int) -> float:
        # Guard against division by zero for empty sets.
        return hits / total if total else 0.0

    rows: List[str] = []
    for pii_type in PII_TYPE_ORDER:
        truth = gt_sets.get(pii_type, set())
        predicted = sys_sets.get(pii_type, set())
        shared = len(truth & predicted)
        n_truth, n_pred = len(truth), len(predicted)

        recall_txt = fmt_pct(_ratio(shared, n_truth))
        precision_txt = fmt_pct(_ratio(shared, n_pred))
        label = PII_TYPE_DISPLAY.get(pii_type, pii_type)

        rows.append(
            f"{label}  & {n_truth}   & {n_pred}   & {shared}  & {recall_txt} & {precision_txt} \\\\"
        )
    return rows

if __name__ == "__main__":
    # Load the system output first, then the ground truth (same order as the
    # paths are declared), each as a {PII type -> set of unique values} map.
    sys_sets = load_pii_unique_sets(SYSTEM_PATH)
    gt_sets = load_pii_unique_sets(GT_PATH)

    # Table header: one cell per output line, closed by a horizontal rule.
    header_lines = (
        r"\textbf{PII Type} &",
        r"\textbf{GT} &",
        r"\textbf{System} &",
        r"\textbf{Overlap} &",
        r"\textbf{Recall} &",
        r"\textbf{Precision} \\",
        r"\hline",
    )
    for header_line in header_lines:
        print(header_line)

    # Body: every data row is followed by its own horizontal rule.
    for row in make_rows(gt_sets, sys_sets):
        print(row, r"\hline", sep="\n")


\textbf{PII Type} &
\textbf{GT} &
\textbf{System} &
\textbf{Overlap} &
\textbf{Recall} &
\textbf{Precision} \\
\hline
Email Address  & 10   & 10   & 10  & 100.0\% & 100.0\% \\
\hline
Phone Number  & 1050   & 1050   & 1050  & 100.0\% & 100.0\% \\
\hline
User Name  & 85   & 85   & 85  & 100.0\% & 100.0\% \\
\hline
Person Name  & 909   & 909   & 909  & 100.0\% & 100.0\% \\
\hline
Postal Address  & 2   & 2   & 2  & 100.0\% & 100.0\% \\
\hline
