In [48]:
# ---------------------------------------------------------------------
# 0.  CONFIG  – change only these two paths
# ---------------------------------------------------------------------
PRED_CSV      = Path("/data1/home/srinivasana/PedXBench/data/outputs/llm_runs/llm_2stage_fc_predictions_o1.csv")
MANUAL_XLSX   = Path("/data1/home/srinivasana/PedXBench/data/outputs/manually_annotated_labels_100_final.xlsx")


PRED_LABELCOL = "resolved_label"      # in the LLM file
MAN_LABELCOL  = "resovled_label_A"    # in the manual file
# ---------------------------------------------------------------------

import re, pandas as pd
from sklearn.metrics import (
    confusion_matrix, classification_report,
    accuracy_score, f1_score
)

########################################################################
# 1.  helper – extract a canon-id like  "NDA_21505"
########################################################################
ID_RE   = re.compile(r"\b(?P<prefix>NDA|ANDA|BLA)\s*[-_/]?\s*(?P<num>\d{5,7})",
                     re.I)

def extract_canon(cell: str) -> str | None:
    if not isinstance(cell, str):
        return None
    m = ID_RE.search(cell)
    if m:
        return f"{m.group('prefix').upper()}_{int(m.group('num')):05d}"
    return None                   # nothing found → row will be dropped later




In [68]:
import re, json
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, f1_score, confusion_matrix,
                             classification_report)

# ───────────────────────────── paths ─────────────────────────────
PRED_CSV      = Path("/data1/home/srinivasana/PedXBench/data/processed/llm_2stage_fc_predictions_full.csv") ## or /data1/home/srinivasana/PedXBench/data/processed/llm_predictions_o3_v7_twostage.csv or /data1/home/srinivasana/PedXBench/data/outputs/llm_runs/llm_2stage_fc_predictions_gpt4omini.csv
MANUAL_XLSX   = Path("/data1/home/srinivasana/PedXBench/data/outputs/manually_annotated_labels_100_final.xlsx")


PRED_LABEL_COL   = "resolved_label"      # column in CSV with the LLM label
MANUAL_LABEL_COL = "resovled_label_A"    # column in XLSX with the gold label

ID_PAT = re.compile(r"\d{5,7}")          # 5–7 digit NDA/BLA/ANDA number

def canon(cell: str | float) -> str | None:
    """Return the first 5-7 digit number inside *cell*, else None."""
    if pd.isna(cell):
        return None
    m = ID_PAT.findall(str(cell))
    return m[0].lstrip("0") if m else None    # strip leading zeros for safety

LABEL_MAP = {
    "notextrapolated": "None",
    "none":            "None",
    "partial":         "Partial",
    "full":            "Full",
    "unlabeled":       "Unlabeled",
    "unlabelled":      "Unlabeled",
}

def norm(lbl):
    if not isinstance(lbl, str):
        return np.nan
    return LABEL_MAP.get(lbl.strip().lower(), lbl)

# ─────────────────────── load & clean ────────────────────────────
read_opts = dict(dtype=str, keep_default_na=False)

pred = (pd.read_csv(PRED_CSV, **read_opts)
          .assign(canon_id=lambda d:
                  d.get("canon_id", d.get("app_id")).map(canon))
          .rename(columns={PRED_LABEL_COL: "pred_label"})
          .assign(pred_label=lambda d: d["pred_label"].map(norm))
          .dropna(subset=["canon_id", "pred_label"]))

manual = (pd.read_excel(MANUAL_XLSX, engine="openpyxl", **read_opts)
            .assign(canon_id=lambda d:
                    d["FDA Application Number(s) "].map(canon))
            .rename(columns={MANUAL_LABEL_COL: "gold_label"})
            .assign(gold_label=lambda d: d["gold_label"].map(norm))
            .dropna(subset=["canon_id", "gold_label"]))

# ───────────────────── merge & evaluate ──────────────────────────
merged = pd.merge(manual, pred, on="canon_id", how="inner")
print(f"Merged rows: {len(merged)}  /  gold rows: {len(manual)}")

if merged.empty:
    raise SystemExit("❌ No overlapping IDs – check the canon-ID extraction.")

y_true, y_pred = merged["gold_label"], merged["pred_label"]
order = ["None", "Partial", "Full", "Unlabeled"]

print("\n=== Confusion matrix ===")
print(pd.DataFrame(confusion_matrix(y_true, y_pred, labels=order),
                   index=[f"gold_{l}" for l in order],
                   columns=[f"pred_{l}" for l in order]))

print("\n=== Classification report ===")
print(classification_report(y_true, y_pred, labels=order,
                            digits=3, zero_division=0))

acc  = accuracy_score(y_true, y_pred)
f1   = f1_score(y_true, y_pred, average="macro", zero_division=0)
print(f"Accuracy : {acc:.3f}")
print(f"Macro-F1 : {f1:.3f}")


Merged rows: 200  /  gold rows: 128

=== Confusion matrix ===
                pred_None  pred_Partial  pred_Full  pred_Unlabeled
gold_None             106             5          0              14
gold_Partial           20            34          0               3
gold_Full               0             0          1               1
gold_Unlabeled          9             0          0               7

=== Classification report ===
              precision    recall  f1-score   support

        None      0.785     0.848     0.815       125
     Partial      0.872     0.596     0.708        57
        Full      1.000     0.500     0.667         2
   Unlabeled      0.280     0.438     0.341        16

    accuracy                          0.740       200
   macro avg      0.734     0.595     0.633       200
weighted avg      0.772     0.740     0.745       200

Accuracy : 0.740
Macro-F1 : 0.633
