In [None]:
import csv
import json
from pathlib import Path
from collections import OrderedDict
from typing import Dict

# Input: CSV read via its "app_code" and "total_columns" columns.
CSV_PATH = Path("app_total_columns.csv")
# Input: JSON Lines file with one JSON object per line.
JSONL_PATH = Path("RQ2_app_level_gpt4o.jsonl")
# Output: destination of the generated LaTeX table.
OUT_TEX = Path("RQ2_search_space_reduction_gpt4o.tex")

# App code -> display name. Insertion order is the row order of the tables.
# NOTE(review): the "A"/"I" prefixes presumably mean Android/iOS -- confirm.
APP_NAME_PLAIN = OrderedDict({
    "A1": "WhatsApp",
    "A2": "Snapchat",
    "A3": "Telegram",
    "A4": "Google Maps",
    "A5": "Samsung Internet",
    "I1": "WhatsApp",
    "I2": "Contacts",
    "I3": "Apple Messages",
    "I4": "Safari",
    "I5": "Calendar",
})


def get_app_code_from_db_path(db_path: str) -> str:
    """Derive the app code from a database path.

    The code is the part of the file stem (file name without its final
    suffix) that precedes the first ``_``. If the stem has no ``_``, the part
    before the first ``-`` is used instead. If it has neither, the whole stem
    is returned.

    Example: ``"dbs/A1_msgstore.db"`` -> ``"A1"``.
    """
    base = Path(db_path).stem
    # Order matters: an underscore anywhere in the stem wins over a hyphen.
    for delimiter in ("_", "-"):
        prefix, found, _rest = base.partition(delimiter)
        if found:
            return prefix
    return base


def read_candidate_totals(csv_path: Path) -> Dict[str, int]:
    """Load the candidate column total for each app from a CSV file.

    The CSV must have a header row; the ``app_code`` and ``total_columns``
    columns are read and any other columns are ignored. Rows whose app code
    or total is missing or blank are skipped. If an app code appears more
    than once, the last row wins.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        Mapping of app code (whitespace-stripped) to its integer total.

    Raises:
        ValueError: If a non-blank ``total_columns`` value is not an integer.
        OSError: If the file cannot be opened.
    """
    totals: Dict[str, int] = {}
    # "utf-8-sig" drops a leading byte-order mark if one is present. With
    # plain "utf-8" the BOM stays glued to the first header name, the
    # "app_code" lookup misses on every row, and the result is silently empty.
    with csv_path.open("r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            code = (row.get("app_code") or "").strip()
            raw_total = (row.get("total_columns") or "").strip()
            if not code or not raw_total:
                continue
            try:
                totals[code] = int(raw_total)
            except ValueError as e:
                raise ValueError(
                    f"Bad total_columns {raw_total!r} for app {code!r} "
                    f"in {csv_path} line {reader.line_num}"
                ) from e
    return totals


def read_scanned_cols_from_app_jsonl(jsonl_path: Path) -> Dict[str, int]:
    """Load the number of scanned columns for each app from a JSON Lines file.

    Each non-blank line must be a JSON value. Lines that are not JSON objects,
    or whose ``db_path`` is missing, empty, or not a string, are skipped. The
    app code is derived from ``db_path`` via ``get_app_code_from_db_path``.

    The count is taken from ``Num_of_source_columns`` when that is a real
    number. Otherwise it falls back to the length of the ``source_columns``
    list, or 0 if that is not a list. If several records map to the same app
    code, the last one wins.

    Args:
        jsonl_path: Path of the JSON Lines file to read.

    Returns:
        Mapping of app code to its integer scanned-column count.

    Raises:
        ValueError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    scanned: Dict[str, int] = {}
    # "utf-8-sig" drops a leading byte-order mark if one is present. With
    # plain "utf-8" the BOM survives str.strip() and json.loads rejects the
    # first line.
    with jsonl_path.open("r", encoding="utf-8-sig") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            try:
                rec = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Bad JSON in {jsonl_path} line {line_no}: {e}") from e

            if not isinstance(rec, dict):
                continue

            db_path = rec.get("db_path", "")
            if not isinstance(db_path, str) or not db_path:
                continue

            app = get_app_code_from_db_path(db_path)

            n_scanned = rec.get("Num_of_source_columns")
            # bool is a subclass of int, so it must be rejected explicitly;
            # None and every other non-number also take the fallback.
            if isinstance(n_scanned, bool) or not isinstance(n_scanned, (int, float)):
                cols = rec.get("source_columns", [])
                n_scanned = len(cols) if isinstance(cols, list) else 0

            scanned[app] = int(n_scanned)

    return scanned


def format_reduction(total: int, scanned: int) -> float:
    """Return the search-space reduction as a percentage in [0.0, 100.0].

    The value is ``(1 - scanned / total) * 100``, clamped to the range.
    A non-positive ``total`` yields 0.0. Otherwise a non-positive ``scanned``
    yields 100.0.
    """
    # The total check must come first: with no candidates there is nothing
    # to reduce, regardless of how many columns were scanned.
    if total <= 0:
        return 0.0
    if scanned <= 0:
        return 100.0

    pct = 100.0 * (1.0 - scanned / total)
    if pct < 0:
        # More columns scanned than candidates: report no reduction.
        return 0.0
    return 100.0 if pct > 100 else pct


# LaTeX text-mode special characters mapped to their escaped forms. Applied
# with str.translate, so each input character is rewritten exactly once and
# the backslashes introduced by one replacement are never re-escaped.
_LATEX_SPECIALS = str.maketrans({
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
})


def _latex_escape(text: str) -> str:
    """Return *text* with LaTeX special characters escaped for a table cell."""
    return str(text).translate(_LATEX_SPECIALS)


def build_latex_table(candidate_totals: Dict[str, int], scanned_cols: Dict[str, int]) -> str:
    """Render the search-space-reduction results as a LaTeX ``table``.

    Rows follow the order of ``APP_NAME_PLAIN``, followed by any other app
    codes found in either input, sorted. An app absent from both inputs gets
    no row; an app absent from only one input is shown with 0 for it.

    ID and app-name cells are LaTeX-escaped. Codes missing from
    ``APP_NAME_PLAIN`` come from the input files, and an unescaped ``&``,
    ``%`` or ``_`` in them would break the table.

    Args:
        candidate_totals: App code -> total number of candidate columns.
        scanned_cols: App code -> number of columns scanned.

    Returns:
        The LaTeX source, lines joined with ``"\\n"`` and no trailing newline.
    """
    lines = [
        r"\begin{table}[th]",
        r"\centering",
        r"\caption{Search space reduction during row-level PII extraction.}",
        r"\label{tab:search_space_reduction}",
        r"\small",
        r"\begin{tabular}{|l|l|p{1.3cm}|p{1.7cm}|p{1.0cm}|}",
        r"\hline",
        (
            r"\textbf{ID} & \textbf{Apps} & "
            r"\textbf{Candidate Cols (Total)} & "
            r"\textbf{Cols Scanned (Extraction)} & "
            r"\textbf{Reduc. (\%)} \\"
        ),
        r"\hline",
    ]

    app_order = list(APP_NAME_PLAIN.keys())
    extra = sorted((set(candidate_totals) | set(scanned_cols)) - set(app_order))
    app_order += extra

    for app in app_order:
        # Known apps that appear in neither input have no data to report.
        if app not in candidate_totals and app not in scanned_cols:
            continue

        app_name = APP_NAME_PLAIN.get(app, app)
        total = int(candidate_totals.get(app, 0))
        scanned = int(scanned_cols.get(app, 0))
        reduc = format_reduction(total, scanned)

        lines.append(
            f"{_latex_escape(app)} & {_latex_escape(app_name)} & "
            f"{total} & {scanned} & {reduc:.2f}\\% \\\\"
        )
        lines.append(r"\hline")

    lines.append(r"\end{tabular}")
    lines.append(r"\end{table}")
    return "\n".join(lines)


def build_plaintext_table(candidate_totals: Dict[str, int], scanned_cols: Dict[str, int]) -> str:
    """Render the search-space-reduction results as an aligned text table.

    Rows follow the order of ``APP_NAME_PLAIN``, followed by any other app
    codes found in either input, sorted. An app absent from both inputs gets
    no row; an app absent from only one input is shown with 0 for it.

    The first two columns are left-aligned, the rest right-aligned, with two
    spaces between columns and a dashed rule under the header.
    """
    known_codes = list(APP_NAME_PLAIN.keys())
    unknown_codes = sorted({*candidate_totals, *scanned_cols} - set(known_codes))

    table = [["ID", "Apps", "CandidateCols", "ColsScanned", "Reduc(%)"]]
    for code in known_codes + unknown_codes:
        if not (code in candidate_totals or code in scanned_cols):
            continue
        n_total = int(candidate_totals.get(code, 0))
        n_scanned = int(scanned_cols.get(code, 0))
        pct = format_reduction(n_total, n_scanned)
        table.append([
            code,
            APP_NAME_PLAIN.get(code, code),
            str(n_total),
            str(n_scanned),
            f"{pct:.2f}%",
        ])

    # Each column is as wide as its longest cell, header included.
    col_widths = [max(len(cell) for cell in column) for column in zip(*table)]

    def render(cells):
        # Text columns (ID, Apps) pad on the right; numeric columns on the left.
        padded = [
            cell.ljust(width) if idx < 2 else cell.rjust(width)
            for idx, (cell, width) in enumerate(zip(cells, col_widths))
        ]
        return "  ".join(padded)

    header_line = render(table[0])
    body_lines = [render(cells) for cells in table[1:]]
    return "\n".join([header_line, "-" * len(header_line), *body_lines])


# Script entry point: load both inputs, write the LaTeX table to OUT_TEX,
# and print the plaintext table to stdout.
if __name__ == "__main__":
    # Both inputs are keyed by app code.
    candidate_totals = read_candidate_totals(CSV_PATH)
    scanned_cols = read_scanned_cols_from_app_jsonl(JSONL_PATH)

    # Write LaTeX to file (write_text replaces any existing OUT_TEX content)
    tex = build_latex_table(candidate_totals, scanned_cols)
    OUT_TEX.write_text(tex, encoding="utf-8")

    # Print ONLY plaintext table to screen; the LaTeX source is not echoed
    print(build_plaintext_table(candidate_totals, scanned_cols))
    # resolve() reports the absolute location of the file that was written
    print(f"\nWrote LaTeX: {OUT_TEX.resolve()}")


ID  Apps              CandidateCols  ColsScanned  Reduc(%)
----------------------------------------------------------
A1  WhatsApp                   1637           14    99.14%
A2  Snapchat                    848            2    99.76%
A3  Telegram                   1197            0   100.00%
A4  Google Maps                  80            2    97.50%
A5  Samsung Internet            185           11    94.05%
I1  WhatsApp                    328            6    98.17%
I2  Contacts                     13           13     0.00%
I3  Apple Messages              186            0   100.00%
I4  Safari                       74            7    90.54%
I5  Calendar                    541            0   100.00%

Wrote LaTeX: I:\project2026\llmagent\RQs\RQ2\search_space_reduction_gpt4o.tex
