In [1]:
import json
from pathlib import Path
from collections import defaultdict, OrderedDict
from typing import Dict, Tuple, List

# Folder holding the input ``*.jsonl`` result files.
# Built from path components instead of a "..\\name" literal: a backslash is
# only a separator on Windows, so the literal form names a single oddly-named
# entry in the current directory on POSIX systems.
IN_DIR = Path("..") / "batch_results_gpt4o_normalized"
# Destination of the generated LaTeX table (relative to the working directory).
OUT_TEX = Path("RQ1_t4.tex")

# App code -> application name as typeset in the LaTeX table.
# Two-word names are wrapped in a nested tabular so they break across two
# lines inside a single table cell. Insertion order is the row order.
APP_NAME = OrderedDict({
    "A1": "WhatsApp",
    "A2": "Snapchat",
    "A3": "Telegram",
    "A4": r"\begin{tabular}[c]{@{}l@{}}Google \\Maps\end{tabular}",
    "A5": r"\begin{tabular}[c]{@{}l@{}}Samsung \\Internet\end{tabular}",
    "I1": "WhatsApp",
    "I2": "Contacts",
    "I3": r"\begin{tabular}[c]{@{}l@{}}Apple \\Messages\end{tabular}",
    "I4": "Safari",
    "I5": "Calendar",
})

# Same app codes with single-line, markup-free names for console output.
APP_NAME_PLAIN = OrderedDict({
    "A1": "WhatsApp",
    "A2": "Snapchat",
    "A3": "Telegram",
    "A4": "Google Maps",
    "A5": "Samsung Internet",
    "I1": "WhatsApp",
    "I2": "Contacts",
    "I3": "Apple Messages",
    "I4": "Safari",
    "I5": "Calendar",
})

# PII type identifier (upper-case) -> human-readable column label.
PII_COLS = OrderedDict({
    "EMAIL": "Email",
    "PHONE": "Phone",
    "USERNAME": "User Name",
    "PERSON_NAME": "Person Name",
    "POSTAL_ADDRESS": "Postal Address",
})

# Longest database file name (in characters) shown before it is abbreviated.
MAX_DB_NAME_LEN = 20


def get_app_code(db_path: str) -> str:
    """Return the app code that prefixes the database file's stem.

    The code is everything before the first underscore of the stem. When the
    stem has no underscore, everything before the first hyphen is used
    instead, which is the whole stem if it has no hyphen either.
    """
    stem = Path(db_path).stem
    separator = "_" if "_" in stem else "-"
    return stem.partition(separator)[0]


def get_db_filename(db_path: str) -> str:
    """Return the file name of *db_path* without its leading app-code prefix.

    Everything up to and including the first underscore of the file name is
    dropped. A file name without an underscore is returned unchanged.
    """
    filename = Path(db_path).name
    _prefix, underscore, remainder = filename.partition("_")
    return remainder if underscore else filename


# Replacement for every character LaTeX treats specially in running text.
_LATEX_SPECIALS = str.maketrans({
    "\\": r"\textbackslash{}",
    "_": r"\_",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
})


def latex_escape(s: str) -> str:
    r"""Return *s* with LaTeX special characters escaped for use in text.

    Handles ``\ _ & % $ # { } ~ ^``. An unescaped ``&`` would open an extra
    table column and an unescaped ``%`` would comment out the rest of the row,
    so all of them are covered rather than only backslash and underscore.

    The substitution is done in a single pass so that the braces belonging to
    an inserted ``\textbackslash{}`` are not escaped a second time.
    """
    return s.translate(_LATEX_SPECIALS)


def shorten_db_name(db_file: str, max_len: int = MAX_DB_NAME_LEN) -> str:
    """Abbreviate a database file name to at most *max_len* characters.

    Surrounding whitespace is stripped first. A name that already fits is
    returned as is. Otherwise the stem is cut and followed by "..." while the
    file extension is kept, e.g. "averyveryverylongname.sqlite" ->
    "averyveryve....sqlite" for max_len=20. When there is no extension, or the
    extension leaves no room for any stem characters, the whole name is cut
    and "..." is appended.

    Args:
        db_file: File name to abbreviate.
        max_len: Maximum length of the returned string.

    Returns:
        The (possibly abbreviated) name, never longer than *max_len*.
    """
    ellipsis = "..."
    s = db_file.strip()
    if len(s) <= max_len:
        return s

    # Not even the ellipsis fits: a negative slice bound would otherwise
    # produce a result longer than max_len, so hard-truncate instead.
    if max_len < len(ellipsis):
        return s[: max(max_len, 0)]

    p = Path(s)
    suffix = p.suffix
    keep = max_len - len(suffix) - len(ellipsis)
    if not suffix or keep <= 0:
        return s[: max_len - len(ellipsis)] + ellipsis
    return p.stem[:keep] + ellipsis + suffix


def load_db_level_counts(folder: Path) -> Dict[Tuple[str, str], Dict[str, int]]:
    """Aggregate PII counts per (app code, database file) from JSONL files.

    Every ``*.jsonl`` file directly inside *folder* is read in sorted order,
    one JSON document per non-blank line. A record is skipped when it is not
    a JSON object, when its ``db_path`` is not a non-empty string, or when its
    ``PII_type`` (trimmed and upper-cased) is not a key of ``PII_COLS``.

    ``Num_of_PII`` is added to the counter of the record's PII type. Booleans,
    non-numeric values and non-finite floats (``json.loads`` accepts ``NaN``
    and ``Infinity``) count as 0; other floats are truncated by ``int()``.

    Args:
        folder: Directory containing the ``*.jsonl`` result files.

    Returns:
        A defaultdict mapping ``(app_code, db_file)`` to a dict with one
        integer counter per ``PII_COLS`` key. A database appears as soon as
        one accepted record refers to it, even if all its counts are 0.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    counts: Dict[Tuple[str, str], Dict[str, int]] = defaultdict(
        lambda: dict.fromkeys(PII_COLS, 0)
    )

    for fp in sorted(folder.glob("*.jsonl")):
        with fp.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                rec = json.loads(line)
                if not isinstance(rec, dict):
                    continue

                db_path = rec.get("db_path")
                raw_type = rec.get("PII_type")
                # Non-string values cannot be parsed as a path / type name;
                # skip the record instead of aborting the whole run.
                if not isinstance(db_path, str) or not isinstance(raw_type, str):
                    continue

                pii_type = raw_type.strip().upper()
                if not db_path or pii_type not in PII_COLS:
                    continue

                key = (get_app_code(db_path), get_db_filename(db_path))

                n = rec.get("Num_of_PII", 0)
                # bool is a subclass of int, so it has to be excluded explicitly.
                if isinstance(n, bool) or not isinstance(n, (int, float)):
                    n = 0
                try:
                    amount = int(n)
                except (ValueError, OverflowError):
                    # int() rejects NaN (ValueError) and +/-inf (OverflowError).
                    amount = 0
                counts[key][pii_type] += amount

    return counts


def format_row_tex(
    app_code: str,
    app_display: str,
    db_file: str,
    counts_for_db: Dict[str, int],
    is_first_row_for_app: bool,
    nrows_for_app: int,
) -> str:
    """Render one database row of the LaTeX table.

    The row carries the abbreviated, escaped database name, the five per-type
    counts and their bold total. The first row of an app additionally opens
    multirow cells spanning *nrows_for_app* rows for the app code and the app
    name; later rows leave those two cells empty.

    Args:
        app_code: Text for the ID cell (inserted as given).
        app_display: Text for the application cell (inserted as given, so it
            may contain LaTeX markup).
        db_file: Database file name; shortened and escaped here.
        counts_for_db: Counts keyed by EMAIL, PHONE, USERNAME, PERSON_NAME
            and POSTAL_ADDRESS.
        is_first_row_for_app: Whether this row opens the app's row group.
        nrows_for_app: Number of rows in the app's row group.

    Returns:
        The table row, terminated by a LaTeX row break.
    """
    per_type = [
        counts_for_db[key]
        for key in ("EMAIL", "PHONE", "USERNAME", "PERSON_NAME", "POSTAL_ADDRESS")
    ]
    grand_total = sum(per_type)

    db_cell = latex_escape(shorten_db_name(db_file, MAX_DB_NAME_LEN))

    # Cells shared by both row variants: database name, counts, bold total.
    cells = [db_cell, *map(str, per_type), rf"\textbf{{{grand_total}}}"]
    tail = " & ".join(cells) + r" \\"

    if not is_first_row_for_app:
        return " &  & " + tail

    span = rf"\multirow{{{nrows_for_app}}}{{*}}"
    return f"{span}{{{app_code}}} & {span}{{{app_display}}} & {tail}"


def build_table_tex(counts: Dict[Tuple[str, str], Dict[str, int]]) -> str:
    """Build the complete LaTeX ``table*`` listing PII counts per database.

    Rows are grouped by app. Apps listed in ``APP_NAME`` come first, in that
    mapping's order, followed by any other app codes in sorted order. Within
    an app, databases are sorted case-insensitively by file name. Rows of one
    app are separated by a partial rule over the database and count columns,
    and each app group ends with a full horizontal rule.

    Args:
        counts: Mapping of ``(app_code, db_file)`` to per-type counts.

    Returns:
        The LaTeX source of the table as a single newline-joined string.
    """
    by_app: Dict[str, List[Tuple[str, Dict[str, int]]]] = defaultdict(list)
    for (app, db_file), c in counts.items():
        by_app[app].append((db_file, c))

    for db_rows in by_app.values():
        db_rows.sort(key=lambda row: row[0].lower())

    lines: List[str] = [
        r"\begin{table*}[th]",
        r"\centering",
        r"\small",
        r"\caption{PII Discovered Per Application and Database (ChatGPT 4o-mini)}",
        r"\label{tab:pii_per_app_gpt_details}",
        r"\begin{tabular}{|c|l|l|r|r|r|r|r|r|}",
        r"\hline",
        (
            r"\textbf{ID} & \textbf{Application} & \textbf{Database} & "
            r"\textbf{Email} & \textbf{Phone} & \textbf{User Name} & \textbf{Person Name} & "
            r"\textbf{Postal Address} & \textbf{Total PII} \\"
        ),
        r"\hline",
    ]

    app_order = list(APP_NAME) + sorted(a for a in by_app if a not in APP_NAME)

    for app in app_order:
        if app not in by_app:
            continue
        db_rows = by_app[app]
        nrows = len(db_rows)

        # App codes come from file names, so escape them before they reach a
        # LaTeX cell. Names from APP_NAME are used verbatim because they
        # deliberately contain markup.
        app_tex = latex_escape(app)
        app_display = APP_NAME.get(app, app_tex)

        for i, (db_file, cdict) in enumerate(db_rows):
            lines.append(format_row_tex(app_tex, app_display, db_file, cdict, i == 0, nrows))
            # Partial rule inside an app group, full rule after its last row.
            lines.append(r"\cline{3-9}" if i < nrows - 1 else r"\hline")

    lines.append(r"\end{tabular}")
    lines.append(r"\end{table*}")
    return "\n".join(lines)


def build_plain_text_table(counts: Dict[Tuple[str, str], Dict[str, int]]) -> str:
    """Render the counts as an aligned plain-text table.

    There is one line per (app, database) pair, with no LaTeX and no merged
    cells. Apps listed in ``APP_NAME_PLAIN`` come first in that mapping's
    order, then any other app codes in sorted order; databases within an app
    are sorted case-insensitively. The three text columns are left-aligned,
    the numeric columns right-aligned, and a dashed rule follows the header.

    Args:
        counts: Mapping of ``(app_code, db_file)`` to per-type counts.

    Returns:
        The table as a single newline-joined string.
    """
    grouped: Dict[str, List[Tuple[str, Dict[str, int]]]] = defaultdict(list)
    for (app, db_file), c in counts.items():
        grouped[app].append((db_file, c))
    for entries in grouped.values():
        entries.sort(key=lambda entry: entry[0].lower())

    known_apps = list(APP_NAME_PLAIN.keys())
    other_apps = [a for a in sorted(grouped.keys()) if a not in APP_NAME_PLAIN]

    table: List[List[str]] = [
        ["ID", "Application", "Database", "Email", "Phone", "UserName", "PersonName", "PostalAddr", "Total"]
    ]
    for app in known_apps + other_apps:
        if app not in grouped:
            continue
        label = APP_NAME_PLAIN.get(app, app)
        for db_file, cdict in grouped[app]:
            values = [
                cdict[key]
                for key in ("EMAIL", "PHONE", "USERNAME", "PERSON_NAME", "POSTAL_ADDRESS")
            ]
            db_show = shorten_db_name(db_file, MAX_DB_NAME_LEN)
            table.append([app, label, db_show, *map(str, values), str(sum(values))])

    # Each column is as wide as its longest cell, header included.
    widths = [max(len(row[col]) for row in table) for col in range(len(table[0]))]

    def render(row: List[str]) -> str:
        # Columns 0-2 hold text (left-aligned); the rest hold numbers.
        return "  ".join(
            cell.ljust(width) if idx <= 2 else cell.rjust(width)
            for idx, (cell, width) in enumerate(zip(row, widths))
        )

    header_line = render(table[0])
    body_lines = [render(row) for row in table[1:]]
    return "\n".join([header_line, "-" * len(header_line), *body_lines])


# Script entry point: aggregate the JSONL results, write the LaTeX table to
# OUT_TEX, then show a plain-text preview of the same data on the console.
if __name__ == "__main__":
    # Per-(app, database) PII counts gathered from every *.jsonl file in IN_DIR.
    counts = load_db_level_counts(IN_DIR)

    # Write LaTeX (overwrites any existing file at OUT_TEX)
    tex = build_table_tex(counts)
    OUT_TEX.write_text(tex, encoding="utf-8")

    # Print a plain-text preview to screen, followed by the absolute output path
    print(build_plain_text_table(counts))
    print(f"\nWrote LaTeX: {OUT_TEX.resolve()}")


ID  Application       Database              Email  Phone  UserName  PersonName  PostalAddr  Total
-------------------------------------------------------------------------------------------------
A1  WhatsApp          commerce.db               0      0         0           0           0      0
A1  WhatsApp          msgstore.db               0      7         2          22           0     31
A1  WhatsApp          wa.db                     0     16         0          10           0     26
A2  Snapchat          core.db                   0      1        12           1           0     14
A2  Snapchat          journal.db                0      0         0           0           0      0
A2  Snapchat          main.db                   1     13         0          12           0     26
A3  Telegram          account1cache4.db         0      0         0           0           0      0
A3  Telegram          account2cache4.db         0      0         0           0           0      0
A3  Telegram        