## first building a simple logistic regression 

In [2]:
# baseline_lr.ipynb  (cell 1)

import re, json, numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, confusion_matrix


## code to clean fda application id column

In [9]:
import re, pandas as pd, numpy as np
from pathlib import Path

# ---------------------------------------------------------------------
# 0.  Compile once – keeps leading zeros in root & supplement
# ---------------------------------------------------------------------
_ID_RE = re.compile(
    r'\b(?P<kind>NDA|BLA|ANDA)\s+'             # KIND
    r'(?P<root>\d{5,7})'                       # ROOT (keep zeros)
    r'(?:\s*/\s*(?P<sup>\d+))?'                # optional "/0086"
    r'(?:\s*-\s*Original\s*(?P<orig>\d+))?',   # optional "- Original 1"
    flags=re.I,
)

def _parse_ids(cell: str):
    """Return list of dicts with canon_id, kind, root, supplement."""
    hits = []
    for m in _ID_RE.finditer(str(cell)):
        kind = m.group('kind').upper()
        root = m.group('root')                 # keep leading zeros
        if m.group('sup'):
            sup   = m.group('sup')             # keep leading zeros
            canon = f"{kind}_{root}_{sup}"
        elif m.group('orig'):
            sup   = f"ORIG{m.group('orig')}"
            canon = f"{kind}_{root}_{sup}"
        else:
            sup   = np.nan
            canon = f"{kind}_{root}"
        hits.append(dict(kind=kind, root=root,
                         supplement=sup, canon_id=canon))
    return hits or [dict(kind=np.nan, root=np.nan,
                         supplement=np.nan, canon_id=np.nan)]

# ---------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------
def add_canon_ids(df: pd.DataFrame, id_col: str) -> pd.DataFrame:
    """
    • Explodes *id_col* if a cell contains several application numbers.
    • Adds:  canon_id, kind, root, supplement.
    • Returns a fresh DataFrame (original left untouched).
    """
    rows = []
    for _, row in df.iterrows():
        for d in _parse_ids(row[id_col]):
            rows.append({**row.to_dict(), **d})
    return pd.DataFrame(rows)




def clean_xlsx(src: str | Path,
               id_col: str,
               dst: str | Path | None = None,
               sheet: int | str = 0,
               **read_kw) -> pd.DataFrame:
    """
    Load an Excel sheet, preserve literal 'None' strings, add canon_id columns,
    and (optionally) write the cleaned file back out.

    Parameters
    ----------
    src      : path to the source .xlsx
    id_col   : exact header that contains the FDA Application Number(s)
    dst      : path to save the cleaned file (same workbook format). If None,
               the cleaned DataFrame is just returned.
    sheet    : sheet index or name passed to read_excel
    read_kw  : any extra kwargs for read_excel (they override defaults)
    """
    # ---- default options that keep 'None' as text -----------------
    defaults = dict(dtype=str, keep_default_na=False)
    defaults.update(read_kw)                     # allow caller override

    df = pd.read_excel(src, sheet_name=sheet, engine="openpyxl", **defaults)

    # your existing function that appends kind / root / supplement / canon_id
    df = add_canon_ids(df, id_col=id_col)

    if dst:
        dst = Path(dst).with_suffix(".xlsx")
        dst.parent.mkdir(parents=True, exist_ok=True)
        df.to_excel(dst, index=False)
        print("✔ cleaned file written to", dst)

    return df



In [4]:
dual_A = pd.read_excel(
    "/Users/srinivasana/Documents/peds_agents_local/peds_dual_annotated_sheet1.xlsx",
    engine="openpyxl",
    dtype=str,
    keep_default_na=False,
    na_values=[],
    na_filter=False
)

# 2.  append canon-ID columns
dual_A = add_canon_ids(dual_A, id_col="FDA Application Number(s)")

In [5]:
dual_B = pd.read_excel(
    "/Users/srinivasana/Documents/peds_agents_local/data/peds_filtered_10_each_B.xlsx",
    engine="openpyxl",
    dtype=str,
    keep_default_na=False,
    na_values=[],
    na_filter=False
)

# 2.  append canon-ID columns
dual_B = add_canon_ids(dual_B, id_col="FDA Application Number(s)")

In [73]:
# annotated_100_clean = clean_xlsx(
#     "/Users/srinivasana/Documents/peds_agents/data/plc_100_for_annotation_complete.xlsx",
#     id_col="FDA Application Number(s) ",                # exact header text
#     dst="/Users/srinivasana/Documents/peds_agents/data/plc_100_for_annotation_complete.xlsx"
# )

✔ cleaned file written to /Users/srinivasana/Documents/peds_agents/data/plc_100_for_annotation_complete.xlsx


In [None]:
# manual_annotated_labels= pd.read_excel(
#     "/Users/srinivasana/Documents/peds_agents_local/data/plc_100_for_annotation_complete.xlsx",
#     engine="openpyxl",
#     dtype=str,
#     keep_default_na=False,
#     na_values=[],
#     na_filter=False
# )

In [23]:
manual_annotated_labels= pd.read_excel(
    "/Users/srinivasana/Documents/peds_agents_local/data/plc_100_for_annotation_resolved_v3.xlsx",
    engine="openpyxl",
    dtype=str,
    keep_default_na=False,
    na_values=[],
    na_filter=False
)

In [24]:
manual_annotated_labels.columns

Index(['FDA Application Number(s) ', 'Pediatric Labeling Approval Date',
       'Trade Name', 'Generic Name', 'Type of Legislation', 'Indication',
       'Indication(s) Studied', 'Labeling Change Summary',
       'Therapeutic Category', 'Dosage Form(s)', 'Route(s) of Administration',
       'Pharmacological Class', 'Studied in Neonates', 'Indicated in Neonates',
       'Product Labeling Link', 'Study Number', 'Ages Studied', 'Study Type',
       'Study Design', 'Patients Enrolled', 'Patients Analyzed',
       'Number of Centers', 'Number of Countries',
       'Total # of Hispanic/Latino', 'Total # of Non-Hispanic/Non-Latino',
       'Total #  of Unknown Ethnicity', 'Total #  of Asian',
       'Total #  of Black', 'Total #  of White',
       'Total #  of Native Hawaiian or Pacific Islander',
       'Total #  of American Indian/Alaska Native', 'Total #  of Other Race',
       'Total #  of Unknown Race', 'Country Names', 'Notes', 'bucket',
       'Annotator', 'manual_label', 'eff_page_A',

In [25]:
manual_annotated_labels = add_canon_ids(manual_annotated_labels, id_col='FDA Application Number(s) ')

In [26]:
manual_annotated_labels['resovled_label_A']

0           None
1           None
2           None
3        Partial
4           None
         ...    
150         None
151         None
152         None
153    Unlabeled
154         None
Name: resovled_label_A, Length: 155, dtype: object

In [27]:
manual_annotated_labels['canon_id']

0      NDA_209949_0011
1      NDA_021505_0019
2      NDA_021035_0052
3      BLA_761046_0012
4      NDA_204275_0022
            ...       
150    NDA_204412_0006
151    NDA_019651_0024
152    NDA_019651_0025
153    NDA_021830_0006
154    NDA_204412_0003
Name: canon_id, Length: 155, dtype: object

In [91]:
# assuming `manual_annotated_labels` is your DataFrame
out_path = "/Users/srinivasana/Documents/peds_agents_local/data/manual_annotated_labels.xlsx"

manual_annotated_labels.to_excel(out_path, index=False, engine="openpyxl")
print("✓ written to", out_path)


✓ written to /Users/srinivasana/Documents/peds_agents_local/data/manual_annotated_labels.xlsx


In [100]:
# llm_labels = pd.read_csv(
#     "/Users/srinivasana/Documents/peds_agents_local/llm_predictions_o3-mini_v4_updated_full.csv",
#     dtype=str,              # read every column as strings
#     keep_default_na=False,  # turn off default NA list ("None","N/A",…)
#     na_values=[],           # no custom NA markers
#     na_filter=False         # skip the NA-inference pass entirely
# )


In [28]:
llm_labels = pd.read_csv(
    "/Users/srinivasana/Documents/peds_agents_local/llm_fc_predictions.csv",
    dtype=str,              # read every column as strings
    keep_default_na=False,  # turn off default NA list ("None","N/A",…)
    na_values=[],           # no custom NA markers
    na_filter=False         # skip the NA-inference pass entirely
)

In [29]:
llm_labels

Unnamed: 0,app_id,resolved_label,peds_study_type,efficacy_summary,pk_summary,lowest_age_band,highest_age_band,rationale,confidence,summary_json,txt_file
0,ANDA_214745,Partial,PK+Safety,There is no pediatric efficacy RCT evidence; t...,Population pharmacokinetic analysis in 87 pedi...,0.4 years,17.8 years,The pediatric data include both PK and safety ...,high,"{""PediatricSummary"":[{""section"":""Pharmacokinet...",/Users/srinivasana/Documents/peds_agents_local...
1,BLA_103976_5231,,RCT,Pediatric efficacy RCT evidence is available: ...,No separate PK/exposure modeling data were the...,6 years,17 years,Since at least one pediatric efficacy RCT is a...,high,"{""PediatricSummary"":[{""section"":""Asthma"",""summ...",/Users/srinivasana/Documents/peds_agents_local...
2,BLA_125160_0305,,RCT,"An open‐label, randomized, parallel‐group stud...",No pediatric PK/exposure modeling data provide...,6,17,"A pediatric efficacy RCT was performed, which ...",high,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",/Users/srinivasana/Documents/peds_agents_local...
3,BLA_125294_0045,Partial,PK+Safety,Evidence is based on adequate and well-control...,Pharmacokinetic and safety profiles in pediatr...,1 month,<17 years,The pediatric evidence does not include a rand...,high,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",/Users/srinivasana/Documents/peds_agents_local...
4,BLA_125477_0039,Partial,PK+Safety,"The single-arm, multicenter, open-label study ...",Pharmacokinetic data in pediatric patients wer...,1,16,There is pediatric data from an open-label tri...,high,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",/Users/srinivasana/Documents/peds_agents_local...
...,...,...,...,...,...,...,...,...,...,...,...
104,NDA_215650,Unlabeled,,The safety and effectiveness in females aged 1...,No pediatric pharmacokinetic or safety data ha...,<12 years,≥12 years,Since the product's efficacy and safety in the...,high,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",/Users/srinivasana/Documents/peds_agents_local...
105,NDA_216185,Partial,PK+Safety,Pediatric efficacy data are extrapolated from ...,Pharmacokinetic analyses and safety data from ...,Pediatric (≥50 kg),Adult,The indication for pediatric use in patients w...,high,"{""PediatricSummary"":[{""section"":""Indications a...",/Users/srinivasana/Documents/peds_agents_local...
106,NDA_217514_ORIG1,,RCT,Direct pediatric efficacy evidence from Study ...,"Weight-adjusted dosing was used, with exposure...",1 years,17 years,The presence of direct evidence of efficacy in...,high,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",/Users/srinivasana/Documents/peds_agents_local...
107,NDA_218550,Partial,PK+Safety,The pediatric labeling is supported by the ext...,Population pharmacokinetic studies in pediatri...,1 month,Older than 6 months (with specific dosing guid...,While clinical trials in pediatric patients pr...,high,"{""PediatricSummary"":[{""section"":""Indications a...",/Users/srinivasana/Documents/peds_agents_local...


In [30]:
llm_labels.shape

(109, 11)

In [31]:
llm_labels['app_id']

0           ANDA_214745
1       BLA_103976_5231
2       BLA_125160_0305
3       BLA_125294_0045
4       BLA_125477_0039
             ...       
104          NDA_215650
105          NDA_216185
106    NDA_217514_ORIG1
107          NDA_218550
108     NDA_761055_0012
Name: app_id, Length: 109, dtype: object

In [33]:
llm_labels = llm_labels.rename(columns={'app_id': 'canon_id'})

In [15]:
llm_labels

Unnamed: 0,canon_id,resolved_label,peds_study_type,efficacy_summary,pk_summary,lowest_age_band,highest_age_band,rationale,confidence,summary_json,txt_file
0,ANDA_214745,Partial,PK+Safety,There is no pediatric efficacy RCT evidence; t...,Population pharmacokinetic analysis in 87 pedi...,0.4 years,17.8 years,The pediatric data include both PK and safety ...,high,"{""PediatricSummary"":[{""section"":""Pharmacokinet...",/Users/srinivasana/Documents/peds_agents_local...
1,BLA_103976_5231,,RCT,Pediatric efficacy RCT evidence is available: ...,No separate PK/exposure modeling data were the...,6 years,17 years,Since at least one pediatric efficacy RCT is a...,high,"{""PediatricSummary"":[{""section"":""Asthma"",""summ...",/Users/srinivasana/Documents/peds_agents_local...
2,BLA_125160_0305,,RCT,"An open‐label, randomized, parallel‐group stud...",No pediatric PK/exposure modeling data provide...,6,17,"A pediatric efficacy RCT was performed, which ...",high,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",/Users/srinivasana/Documents/peds_agents_local...
3,BLA_125294_0045,Partial,PK+Safety,Evidence is based on adequate and well-control...,Pharmacokinetic and safety profiles in pediatr...,1 month,<17 years,The pediatric evidence does not include a rand...,high,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",/Users/srinivasana/Documents/peds_agents_local...
4,BLA_125477_0039,Partial,PK+Safety,"The single-arm, multicenter, open-label study ...",Pharmacokinetic data in pediatric patients wer...,1,16,There is pediatric data from an open-label tri...,high,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",/Users/srinivasana/Documents/peds_agents_local...
...,...,...,...,...,...,...,...,...,...,...,...
104,NDA_215650,Unlabeled,,The safety and effectiveness in females aged 1...,No pediatric pharmacokinetic or safety data ha...,<12 years,≥12 years,Since the product's efficacy and safety in the...,high,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",/Users/srinivasana/Documents/peds_agents_local...
105,NDA_216185,Partial,PK+Safety,Pediatric efficacy data are extrapolated from ...,Pharmacokinetic analyses and safety data from ...,Pediatric (≥50 kg),Adult,The indication for pediatric use in patients w...,high,"{""PediatricSummary"":[{""section"":""Indications a...",/Users/srinivasana/Documents/peds_agents_local...
106,NDA_217514_ORIG1,,RCT,Direct pediatric efficacy evidence from Study ...,"Weight-adjusted dosing was used, with exposure...",1 years,17 years,The presence of direct evidence of efficacy in...,high,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",/Users/srinivasana/Documents/peds_agents_local...
107,NDA_218550,Partial,PK+Safety,The pediatric labeling is supported by the ext...,Population pharmacokinetic studies in pediatri...,1 month,Older than 6 months (with specific dosing guid...,While clinical trials in pediatric patients pr...,high,"{""PediatricSummary"":[{""section"":""Indications a...",/Users/srinivasana/Documents/peds_agents_local...


## llm and manually annotated cohens K

In [87]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score, confusion_matrix

# ───────────────────────────────────────────────────────────────
# 1.  Join on canon_id
# ───────────────────────────────────────────────────────────────
merged = manual_annotated_labels.merge(
            llm_labels,
            on="canon_id",
            how="inner",
            suffixes=("_man", "_llm")
)

print(f"Rows with both annotations: {len(merged)}")

# ───────────────────────────────────────────────────────────────
# 2.  Normalise text labels
# ───────────────────────────────────────────────────────────────
def norm(lbl: str | float):
    if pd.isna(lbl):
        return pd.NA
    s = str(lbl).lower()
    if "full"     in s: return "Full"
    if "partial"  in s: return "Partial"
    if "none"     in s: return "None"
    return "Unlabeled"             # catch str like 'un-labelled', 'n/a', etc.

merged["lab_man"] = merged["manual_label"].apply(norm)
merged["lab_llm"] = merged["resolved_label"].apply(norm)

merged = merged.dropna(subset=["lab_man","lab_llm"])
labels = ["None","Partial","Full","Unlabeled"]

# ───────────────────────────────────────────────────────────────
# 3.  Cohen’s κ
# ───────────────────────────────────────────────────────────────
kappa = cohen_kappa_score(merged["lab_man"], merged["lab_llm"], labels=labels)
print(f"Cohen’s kappa: {kappa:.3f}")

# ───────────────────────────────────────────────────────────────
# 4.  Confusion matrix (optional)
# ───────────────────────────────────────────────────────────────
cm = confusion_matrix(merged["lab_man"], merged["lab_llm"], labels=labels)
cm_df = pd.DataFrame(cm, index=[f"MAN_{l}" for l in labels],
                         columns=[f"LLM_{l}" for l in labels])
print("\nConfusion matrix\n")
print(cm_df)


Rows with both annotations: 122
Cohen’s kappa: 0.406

Confusion matrix

               LLM_None  LLM_Partial  LLM_Full  LLM_Unlabeled
MAN_None             46           10         0              9
MAN_Partial          12           20         3              5
MAN_Full              0            0         0              1
MAN_Unlabeled         2            3         0             11


In [68]:
merged['root'].nunique()

85

In [19]:
llm_labels.columns

Index(['canon_id', 'resolved_label', 'peds_study_type', 'efficacy_summary',
       'pk_summary', 'lowest_age_band', 'highest_age_band', 'rationale',
       'confidence', 'summary_json', 'txt_file'],
      dtype='object')

In [35]:
llm_labels = llm_labels[llm_labels['confidence'] == "high"]

In [36]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score, confusion_matrix

# ───────────────────────────────────────────────────────────────
# 1.  Join on canon_id
# ───────────────────────────────────────────────────────────────
merged = manual_annotated_labels.merge(
            llm_labels,
            on="canon_id",
            how="inner",
            suffixes=("_man", "_llm")
)

print(f"Rows with both annotations: {len(merged)}")

# ───────────────────────────────────────────────────────────────
# 2.  Normalise text labels
# ───────────────────────────────────────────────────────────────
def norm(lbl: str | float):
    if pd.isna(lbl):
        return pd.NA
    s = str(lbl).lower()
    if "full"     in s: return "Full"
    if "partial"  in s: return "Partial"
    if "none"     in s: return "None"
    return "Unlabeled"             # catch str like 'un-labelled', 'n/a', etc.

merged["lab_man"] = merged["resovled_label_A"].apply(norm)
merged["lab_llm"] = merged["resolved_label"].apply(norm)

merged = merged.dropna(subset=["lab_man","lab_llm"])
labels = ["None","Partial","Full","Unlabeled"]

# ───────────────────────────────────────────────────────────────
# 3.  Cohen’s κ
# ───────────────────────────────────────────────────────────────
kappa = cohen_kappa_score(merged["lab_man"], merged["lab_llm"], labels=labels)
print(f"Cohen’s kappa: {kappa:.3f}")

# ───────────────────────────────────────────────────────────────
# 4.  Confusion matrix (optional)
# ───────────────────────────────────────────────────────────────
cm = confusion_matrix(merged["lab_man"], merged["lab_llm"], labels=labels)
cm_df = pd.DataFrame(cm, index=[f"MAN_{l}" for l in labels],
                         columns=[f"LLM_{l}" for l in labels])
print("\nConfusion matrix\n")
print(cm_df)


Rows with both annotations: 115
Cohen’s kappa: 0.715

Confusion matrix

               LLM_None  LLM_Partial  LLM_Full  LLM_Unlabeled
MAN_None             64            4         0              0
MAN_Partial          10           28         0              0
MAN_Full              0            1         0              1
MAN_Unlabeled         0            1         0              6


In [22]:
###############################################################
#  Assumptions
#  -----------------------------------------------------------
#  df            == merged dataframe containing BOTH labels
#  MANUAL_COL    == column with the human label
#  LLM_COL       == column with the LLM label
###############################################################
MANUAL_COL = "resovled_label_A"       # or "manual_label_A", etc.
LLM_COL    = "resolved_label"     # the LLM column

# 1) rows where the labels differ
mismatch = merged[merged[MANUAL_COL] != merged[LLM_COL]]

print(f"Mismatching rows: {len(mismatch)} of {len(merged)}")

# 2) show a concise table on-screen
cols_to_show = [
    "canon_id",                    # your unique ID
    MANUAL_COL, LLM_COL,           # both labels
    "Therapeutic Category",        # helpful context
    "Trade Name",                  # optional extra columns
]
display(mismatch[cols_to_show].reset_index(drop=True))

# 3) optionally save for manual review
mismatch.to_excel("/Users/srinivasana/Documents/peds_agents_local/data/mismatch_review.xlsx", index=False)
print("✔ wrote mismatch_review.xlsx")


Mismatching rows: 17 of 115


Unnamed: 0,canon_id,resovled_label_A,resolved_label,Therapeutic Category,Trade Name
0,NDA_215650,Full,Unlabeled,ANTIBACTERIALS,Xaciato vaginal gel
1,NDA_022187_0024,Partial,,ANTIVIRALS,Intelence
2,NDA_206829_0011,Partial,,ANTIBACTERIALS,Zerbaxa for injection
3,NDA_021652_0019,,Partial,ANTIVIRALS,Epzicom
4,BLA_761033_ORIG1,Partial,,ANTIASTHMATIC,Cinqair
5,BLA_125526_ORIG1,Partial,,ANTIASTHMATIC,Nucala
6,NDA_202811_0018,Unlabeled,Partial,IRRITABLE BOWEL SYNDROME,Linzess capsules
7,BLA_761122_ORIG1,,Partial,ANTIASTHMATIC,Nucala
8,NDA_208083_0008,Full,Partial,ANTIBACTERIALS,Clindamycin Phosphate injection
9,NDA_022200_0031,Partial,,TYPE 2 DIABETES,Bydureon injectable suspension


✔ wrote mismatch_review.xlsx


## sheet A vs sheet B cohens k

In [71]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score, confusion_matrix

# ───────────────────────────────────────────────────────────────
# 1 · inner-join on canon_id
# ───────────────────────────────────────────────────────────────
merged = (
    dual_A
    .merge(dual_B,
           on="canon_id",
           how="inner",
           suffixes=("_A", "_B"))
)
print(f"Rows annotated by both teams: {len(merged)}")

# ───────────────────────────────────────────────────────────────
# 2 · normalise label text to {Full, Partial, None, Unlabeled}
# ───────────────────────────────────────────────────────────────
def norm(lbl):
    if pd.isna(lbl):          # genuine blank → keep missing
        return pd.NA
    s = str(lbl).lower()
    if "full"     in s: return "Full"
    if "partial"  in s: return "Partial"
    if "none"     in s: return "None"
    # anything that says un-labelled / unlabeled / n/a / etc.
    return "Unlabeled"

merged["lab_A"] = merged["manual_label_A"].apply(norm)
merged["lab_B"] = merged["manual_label_B"].apply(norm)

merged = merged.dropna(subset=["lab_A", "lab_B"])   # keep only paired labels
labels = ["None", "Partial", "Full", "Unlabeled"]

# ───────────────────────────────────────────────────────────────
# 3 · Cohen’s κ
# ───────────────────────────────────────────────────────────────
kappa = cohen_kappa_score(merged["lab_A"], merged["lab_B"], labels=labels)
print(f"Cohen’s kappa (A vs B):  {kappa:.3f}")

# ───────────────────────────────────────────────────────────────
# 4 · Confusion matrix (optional)
# ───────────────────────────────────────────────────────────────
cm = confusion_matrix(merged["lab_A"], merged["lab_B"], labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"A_{l}" for l in labels],
    columns=[f"B_{l}" for l in labels]
)
print("\nConfusion matrix\n")
print(cm_df)


Rows annotated by both teams: 31
Cohen’s kappa (A vs B):  0.678

Confusion matrix

             B_None  B_Partial  B_Full  B_Unlabeled
A_None           13          2       0            0
A_Partial         2          9       1            0
A_Full            0          0       3            0
A_Unlabeled       1          0       0            0


In [26]:
dual_A.shape

(33, 52)

## creating more labels for manual annotation

In [33]:

path = ("/Users/srinivasana/Documents/peds_agents_local/data/web_fdaaa_clean.xlsx")

label_metadata = pd.read_excel(path, engine="openpyxl")


In [34]:
label_metadata.columns

Index(['FDA Application Number(s) ', 'Pediatric Labeling Approval Date',
       'Trade Name', 'Generic Name', 'Type of Legislation', 'Indication',
       'Indication(s) Studied', 'Labeling Change Summary',
       'Therapeutic Category', 'Dosage Form(s)', 'Route(s) of Administration',
       'Pharmacological Class', 'Studied in Neonates', 'Indicated in Neonates',
       'Product Labeling Link', 'Study Number', 'Ages Studied', 'Study Type',
       'Study Design', 'Patients Enrolled', 'Patients Analyzed',
       'Number of Centers', 'Number of Countries',
       'Total # of Hispanic/Latino', 'Total # of Non-Hispanic/Non-Latino',
       'Total #  of Unknown Ethnicity', 'Total #  of Asian',
       'Total #  of Black', 'Total #  of White',
       'Total #  of Native Hawaiian or Pacific Islander',
       'Total #  of American Indian/Alaska Native', 'Total #  of Other Race',
       'Total #  of Unknown Race', 'Country Names', 'Notes', 'kind', 'root',
       'supplement', 'canon_id'],
      dty

In [35]:
label_metadata

Unnamed: 0,FDA Application Number(s),Pediatric Labeling Approval Date,Trade Name,Generic Name,Type of Legislation,Indication,Indication(s) Studied,Labeling Change Summary,Therapeutic Category,Dosage Form(s),...,Total # of Native Hawaiian or Pacific Islander,Total # of American Indian/Alaska Native,Total # of Other Race,Total # of Unknown Race,Country Names,Notes,kind,root,supplement,canon_id
0,BLA 761039/0015,2023-12-22,Udenyca injection,pegfilgrastim-cbqv,PREA Only,Chemotherapy induced neutropenia,Decrease the incidence of infection as manifes...,- Safety and effectiveness have been establish...,Hematopoietic Growth Factors,INJECTABLE,...,,,,,,,BLA,761039.0,0015,BLA_761039_0015
1,NDA 204311,2023-12-22,Abacavir and Lamivudine Tablets for Oral Suspe...,Abacavir and Lamivudine,PREA Only,HIV – 1 infection (prevention of transmission ...,Treatment of HIV-1 infection in pediatric pati...,- Safety and effectiveness in combination with...,Antivirals,SUSPENSION,...,,,,,,,NDA,204311.0,,NDA_204311
2,NDA 217242,2023-12-15,Zoryve Foam,roflumilast,PREA Only,Seborrheic dermatitis,Seborrheic dermatitis in patients 9 years and ...,- Safety and effectiveness of for the treatmen...,Psoriasis Agents,FOAM,...,,,,,,,NDA,217242.0,,NDA_217242
3,NDA 213871/0004,2023-12-14,Cibinqo tablets,abrocitinib,PREA Only,Atopic dermatitis,Moderate-to-severe atopic dermatitis in pediat...,- The recommended dose is 100 mg once daily. I...,Dermatitis Agents,TABLET,...,,,,,,,NDA,213871.0,0004,NDA_213871_0004
4,BLA 761180/0001,2023-12-14,Adbry injection,tralokinumab-ldrm,PREA Only,Atopic dermatitis,Moderate-to-severe atopic dermatitis in patien...,- Safety and effectiveness have been establish...,Dermatitis Agents,INJECTABLE,...,,,,,,,BLA,761180.0,0001,BLA_761180_0001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1058,NDA 021866/0004_x000D_\nNDA 021729/0004_x000D_...,2007-10-29,Abilify,aripiprazole,BPCA Only,Schizophrenia,Schizophrenia,-\tExtended schizophrenia indication from adul...,Antipsychotics/Schizophrenia,INJECTABLE_x000D_\nSOLUTION_x000D_\nTABLET_x00...,...,2_x000D_\n0_x000D_\n1_x000D_\n1,0_x000D_\n0_x000D_\n0_x000D_\n1,36_x000D_\n1_x000D_\n44_x000D_\n45,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,,NDA,21866.0,0004,NDA_021866_0004
1059,NDA 021866/0004_x000D_\nNDA 021729/0004_x000D_...,2007-10-29,Abilify,aripiprazole,BPCA Only,Schizophrenia,Schizophrenia,-\tExtended schizophrenia indication from adul...,Antipsychotics/Schizophrenia,INJECTABLE_x000D_\nSOLUTION_x000D_\nTABLET_x00...,...,2_x000D_\n0_x000D_\n1_x000D_\n1,0_x000D_\n0_x000D_\n0_x000D_\n1,36_x000D_\n1_x000D_\n44_x000D_\n45,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,,NDA,21729.0,0004,NDA_021729_0004
1060,NDA 021866/0004_x000D_\nNDA 021729/0004_x000D_...,2007-10-29,Abilify,aripiprazole,BPCA Only,Schizophrenia,Schizophrenia,-\tExtended schizophrenia indication from adul...,Antipsychotics/Schizophrenia,INJECTABLE_x000D_\nSOLUTION_x000D_\nTABLET_x00...,...,2_x000D_\n0_x000D_\n1_x000D_\n1,0_x000D_\n0_x000D_\n0_x000D_\n1,36_x000D_\n1_x000D_\n44_x000D_\n45,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,,NDA,21713.0,0012,NDA_021713_0012
1061,NDA 021866/0004_x000D_\nNDA 021729/0004_x000D_...,2007-10-29,Abilify,aripiprazole,BPCA Only,Schizophrenia,Schizophrenia,-\tExtended schizophrenia indication from adul...,Antipsychotics/Schizophrenia,INJECTABLE_x000D_\nSOLUTION_x000D_\nTABLET_x00...,...,2_x000D_\n0_x000D_\n1_x000D_\n1,0_x000D_\n0_x000D_\n0_x000D_\n1,36_x000D_\n1_x000D_\n44_x000D_\n45,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,,NDA,21436.0,0017,NDA_021436_0017


In [42]:
# 1.  Count categories
vc = label_metadata["Therapeutic Category"].value_counts(dropna=True)

# 2.  Choose target categories
cats = ["Solid Tumor", "Antivirals", "Dermatitis Agents",
        "Type 2 Diabetes", "Ulcerative Colitis", "ADHD"]

# 3.  Sample 10 from each
to_annotate = (
    label_metadata[label_metadata["Therapeutic Category"].isin(cats)]
      .groupby("Therapeutic Category", group_keys=False)
      .apply(lambda g: g.sample(n=min(10, len(g)), random_state=42))
)

to_annotate.to_excel("/Users/srinivasana/Documents/peds_agents_local/data/next_manual_batch.xlsx", index=False)


  .apply(lambda g: g.sample(n=min(10, len(g)), random_state=42))


In [44]:
to_annotate.columns

Index(['FDA Application Number(s) ', 'Pediatric Labeling Approval Date',
       'Trade Name', 'Generic Name', 'Type of Legislation', 'Indication',
       'Indication(s) Studied', 'Labeling Change Summary',
       'Therapeutic Category', 'Dosage Form(s)', 'Route(s) of Administration',
       'Pharmacological Class', 'Studied in Neonates', 'Indicated in Neonates',
       'Product Labeling Link', 'Study Number', 'Ages Studied', 'Study Type',
       'Study Design', 'Patients Enrolled', 'Patients Analyzed',
       'Number of Centers', 'Number of Countries',
       'Total # of Hispanic/Latino', 'Total # of Non-Hispanic/Non-Latino',
       'Total #  of Unknown Ethnicity', 'Total #  of Asian',
       'Total #  of Black', 'Total #  of White',
       'Total #  of Native Hawaiian or Pacific Islander',
       'Total #  of American Indian/Alaska Native', 'Total #  of Other Race',
       'Total #  of Unknown Race', 'Country Names', 'Notes', 'kind', 'root',
       'supplement', 'canon_id'],
      dty

In [45]:
to_annotate = add_canon_ids(to_annotate, id_col='FDA Application Number(s) ')

In [46]:
to_annotate

Unnamed: 0,FDA Application Number(s),Pediatric Labeling Approval Date,Trade Name,Generic Name,Type of Legislation,Indication,Indication(s) Studied,Labeling Change Summary,Therapeutic Category,Dosage Form(s),...,Total # of Native Hawaiian or Pacific Islander,Total # of American Indian/Alaska Native,Total # of Other Race,Total # of Unknown Race,Country Names,Notes,kind,root,supplement,canon_id
0,NDA 021476/0026,2012-10-10,Lunesta,eszopiclone,BPCA + PREA,ADHD,Attention-Deficit Hyperactivity Disorder assoc...,-\tSafety and effectiveness have not been esta...,ADHD,TABLET,...,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,,NDA,021476,0026,NDA_021476_0026
1,NDA 021356/0033,2010-03-24,Viread,tenofovir disoproxil fumarate,BPCA + PREA,HIV – 1 infection (prevention of transmission ...,Treatment of HIV infection in combination with...,-\tExpanded indication from adults to pediatri...,Antivirals,TABLET,...,0,0,17,,,,NDA,021356,0033,NDA_021356_0033
2,NDA 212477 - Original 1,2019-08-28,Harvoni Oral Pellets,ledipasvir/sofosbuvir,BPCA + PREA,Hepatitis C,Treatment of chronic hepatitis C virus,-\tExpanded indication to pediatric patients 3...,Antivirals,PELLETS,...,,,,,,Descriptors for this labeling change are ident...,NDA,212477,ORIG1,NDA_212477_ORIG1
3,NDA 022257_x000D_\nNDA 021304/0007,2009-08-28,Valcyte,valganciclovir,BPCA + PREA,Other,Prevention of cytomegalovirus (CMV) disease in...,-\tUse in pediatric patients ≥ 4 months is bas...,Antivirals,FOR SOLUTION_x000D_\nTABLET,...,0_x000D_\n0_x000D_\n2_x000D_\n0,0_x000D_\n0_x000D_\n0_x000D_\n0,8_x000D_\n0_x000D_\n7_x000D_\n1,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,,NDA,022257,,NDA_022257
4,NDA 022257_x000D_\nNDA 021304/0007,2009-08-28,Valcyte,valganciclovir,BPCA + PREA,Other,Prevention of cytomegalovirus (CMV) disease in...,-\tUse in pediatric patients ≥ 4 months is bas...,Antivirals,FOR SOLUTION_x000D_\nTABLET,...,0_x000D_\n0_x000D_\n2_x000D_\n0,0_x000D_\n0_x000D_\n0_x000D_\n0,8_x000D_\n0_x000D_\n7_x000D_\n1,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,N/A_x000D_\nN/A_x000D_\nN/A_x000D_\nN/A,,NDA,021304,0007,NDA_021304_0007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,NDA 204412/0006,2015-09-09,Delzicol,mesalamine**,BPCA + PREA,Ulcerative Colitis,Treatment of mildly to moderately active ulcer...,-\tExpanded the indication to pediatric patien...,Ulcerative Colitis,"CAPSULE, DELAYED RELEASE",...,0,0,1,,,,NDA,204412,0006,NDA_204412_0006
60,NDA 019651/0024,2013-10-18,Asacol,mesalamine**,BPCA + PREA,Ulcerative Colitis,Treatment of mildly to moderately active ulcer...,-\tSafety and effectiveness in pediatric patie...,Ulcerative Colitis,"TABLET, DELAYED RELEASE",...,0_x000D_\nN/A_x000D_\nN/A,0_x000D_\nN/A_x000D_\nN/A,2_x000D_\nN/A_x000D_\nN/A,N/A_x000D_\nN/A_x000D_\nN/A,"United States,Canada,Croatia (Hrvatska),Poland...",,NDA,019651,0024,NDA_019651_0024
61,NDA 019651/0025,2015-05-27,Asacol,mesalamine**,BPCA Only,Ulcerative Colitis,Maintenance of remission of mildly to moderate...,-\tSafety and effectiveness for the maintenan...,Ulcerative Colitis,"TABLET, DELAYED RELEASE",...,NNPS,NNPS,NNPS,NNPS,NNPS,NNPS,NDA,019651,0025,NDA_019651_0025
62,NDA 021830/0006,2013-10-18,Asacol HD,mesalamine,PREA Only,Ulcerative Colitis,Postmarketing study using Asacol,-\tSafety and effectiveness of Asacol HD in pe...,Ulcerative Colitis,"TABLET, DELAYED RELEASE",...,N/A_x000D_\nN/A_x000D_\nN/A,N/A_x000D_\nN/A_x000D_\nN/A,N/A_x000D_\nN/A_x000D_\nN/A,N/A_x000D_\nN/A_x000D_\nN/A,N/A_x000D_\nN/A_x000D_\nN/A,,NDA,021830,0006,NDA_021830_0006


In [39]:
manual_annotated_labels['Therapeutic Category'].value_counts()

Therapeutic Category
ANTIASTHMATIC                   28
SEIZURES                        24
ANTIBACTERIALS                  17
ANTIVIRALS                      13
SOLID TUMOR                      8
LIPID ALTERING AGENTS            5
ULCERATIVE COLITIS               4
TYPE 2 DIABETES                  4
DERMATITIS AGENTS                3
HEMATOLOGIC MALIGNANCIES         3
IRRITABLE BOWEL SYNDROME         2
ANTIPSYCHOTICS/SCHIZOPHRENIA     2
CROHN’S DISEASE                  2
CNS STIMULANTS                   2
ULCERATIVE PROCTITIS             1
TYPE 1 DIABETES                  1
CONSTIPATION                     1
BRONCHOPULMONARY DYSPLASIA       1
OBESITY                          1
HEMATOPOIETIC GROWTH FACTORS     1
Name: count, dtype: int64

In [29]:
llm_labels.columns

Index(['canon_id', 'resolved_label', 'peds_study_type', 'efficacy_excerpt',
       'pk_excerpt', 'lowest_age_band', 'highest_age_band', 'rationale',
       'confidence', 'ambiguity_flag', 'notes', 'txt_file'],
      dtype='object')

## 

In [30]:
llm_labels

Unnamed: 0,canon_id,resolved_label,peds_study_type,efficacy_excerpt,pk_excerpt,lowest_age_band,highest_age_band,rationale,confidence,ambiguity_flag,notes,txt_file
0,ANDA_071961_ORIG1,Partial,PK+Safety,Efficacy in the pediatric population was estab...,"During the 30-minute blinded phase, patients w...",neonates,17 years,Adult efficacy is extrapolated to pediatric pa...,high,False,,/Users/srinivasana/Documents/peds_agents/txt/A...
1,ANDA_214745,,RCT,In pediatric use for the treatment of status e...,Population PK analysis in 87 pediatric patient...,3 months,17 years,The label contains a well-controlled pediatric...,high,False,,/Users/srinivasana/Documents/peds_agents/txt/A...
2,ANDA_999907_ORIG1,Partial,PK+Safety,,"Due to maturational changes in renal function,...",28 days,,The label lacks independent pediatric efficacy...,high,False,,/Users/srinivasana/Documents/peds_agents/txt/A...
3,BLA_101069_5846,,RCT,"Efficacy of measles, mumps, and rubella vaccin...",,12 months,6 years,The label includes direct pediatric studies (R...,high,False,,/Users/srinivasana/Documents/peds_agents/txt/B...
4,BLA_103000_5309,,RCT,Safety and effectiveness for the treatment of ...,,2 years,17 years,The label provides robust RCT data in pediatri...,high,False,,/Users/srinivasana/Documents/peds_agents/txt/B...
...,...,...,...,...,...,...,...,...,...,...,...,...
771,NDA_217927,Partial,PK+Safety,Safety and effectiveness of oxaprozin have bee...,A population pharmacokinetic study indicated n...,6 years,16 years,The label supports pediatric use based on an o...,high,False,,/Users/srinivasana/Documents/peds_agents/txt/N...
772,NDA_218550,Partial,PK+Safety,The safety and effectiveness of ROZLYTREK have...,In pediatric patients older than 6 months admi...,1 month,17 years,Adult efficacy is extrapolated to pediatrics b...,high,False,,/Users/srinivasana/Documents/peds_agents/txt/N...
773,NDA_761055_0012,,RCT,The safety and efficacy of DUPIXENT have been ...,"In adolescent patients with atopic dermatitis,...",12 years,17 years,The label includes dedicated adolescent RCTs w...,high,False,,/Users/srinivasana/Documents/peds_agents/txt/N...
774,NDA_999905_ORIG1,Unlabeled,,,,13 years,,The label specifies indication only for patien...,high,False,,/Users/srinivasana/Documents/peds_agents/txt/N...


In [47]:
import pandas as pd

# ❶  Inner-join on canon_id
overlap = to_annotate.merge(llm_labels[["canon_id"]], on="canon_id", how="inner")

print(f"Rows in to_annotate        : {len(to_annotate)}")
print(f"Rows in llm_labels         : {len(llm_labels)}")
print(f"Overlapping canon_id values: {len(overlap)}")

Rows in to_annotate        : 64
Rows in llm_labels         : 776
Overlapping canon_id values: 45


In [49]:
# rows present in to_annotate but NOT in llm_labels
missing = to_annotate[~to_annotate["canon_id"].isin(llm_labels["canon_id"])]

# ➊  Show the count
print(f"Non-overlapping rows: {len(missing)}")

# ➋  List the canon_id values
ids = missing["canon_id"].unique().tolist()
print("canon_id not yet in llm_labels:\n", ids)




Non-overlapping rows: 19
canon_id not yet in llm_labels:
 ['NDA_022257', 'NDA_021304_0007', 'NDA_021548_0028', 'NDA_022116_0012', 'NDA_214410_ORIG1', 'NDA_210854_0004', 'NDA_210854_0010', 'NDA_205786_ORIG1', 'NDA_203045_0009', 'NDA_022145_0031', 'BLA_103949_5171', 'NDA_215309', 'NDA_022271_0015', 'NDA_203414_0016']


## feature engineering

In [33]:
df.columns

Index(['FDA Application Number(s) ', 'Pediatric Labeling Approval Date',
       'Trade Name', 'Generic Name', 'Type of Legislation', 'Indication',
       'Indication(s) Studied', 'Labeling Change Summary',
       'Therapeutic Category', 'Dosage Form(s)', 'Route(s) of Administration',
       'Pharmacological Class', 'Studied in Neonates', 'Indicated in Neonates',
       'Product Labeling Link', 'Study Number', 'Ages Studied', 'Study Type',
       'Study Design', 'Patients Enrolled', 'Patients Analyzed',
       'Number of Centers', 'Number of Countries',
       'Total # of Hispanic/Latino', 'Total # of Non-Hispanic/Non-Latino',
       'Total #  of Unknown Ethnicity', 'Total #  of Asian',
       'Total #  of Black', 'Total #  of White',
       'Total #  of Native Hawaiian or Pacific Islander',
       'Total #  of American Indian/Alaska Native', 'Total #  of Other Race',
       'Total #  of Unknown Race', 'Country Names', 'Notes', 'app_id',
       'resolved_label', 'peds_study_type', 'effi

In [34]:
labels = ["None", "Partial", "Full", "Unlabeled"]
df = df[df.resolved_label.isin(labels)].copy()

In [36]:
df.to_csv("/Users/srinivasana/Documents/peds_agents/data/merged_df_for_lr.csv")