In [3]:


import re, numpy as np, pandas as pd
from pathlib import Path
import pandas as pd
from pathlib import Path



In [4]:
# Project locations, all derived from the parent of the current working directory.
REPO_ROOT = Path.cwd().parent
RAW_XLS = REPO_ROOT.joinpath("data", "raw", "web_fdaaa_clean.xlsx")   # input workbook
OUT_DIR = REPO_ROOT.joinpath("outputs")                               # output folder
OUT_CSV = OUT_DIR.joinpath("label_features.csv")


In [5]:

########################################################################
# 0.  LOAD
########################################################################
# Read every cell as text, then trim leading/trailing whitespace from the
# column headers (whitespace *inside* a header is left untouched).
raw = pd.read_excel(RAW_XLS, dtype=str)
raw = raw.rename(columns=lambda name: name.strip())



In [6]:
# Inspect the (stripped) column names of the loaded workbook.
raw.columns

Index(['FDA Application Number(s)', 'Pediatric Labeling Approval Date',
       'Trade Name', 'Generic Name', 'Type of Legislation', 'Indication',
       'Indication(s) Studied', 'Labeling Change Summary',
       'Therapeutic Category', 'Dosage Form(s)', 'Route(s) of Administration',
       'Pharmacological Class', 'Studied in Neonates', 'Indicated in Neonates',
       'Product Labeling Link', 'Study Number', 'Ages Studied', 'Study Type',
       'Study Design', 'Patients Enrolled', 'Patients Analyzed',
       'Number of Centers', 'Number of Countries',
       'Total # of Hispanic/Latino', 'Total # of Non-Hispanic/Non-Latino',
       'Total #  of Unknown Ethnicity', 'Total #  of Asian',
       'Total #  of Black', 'Total #  of White',
       'Total #  of Native Hawaiian or Pacific Islander',
       'Total #  of American Indian/Alaska Native', 'Total #  of Other Race',
       'Total #  of Unknown Race', 'Country Names', 'Notes', 'kind', 'root',
       'supplement', 'canon_id'],
      dtyp

In [7]:
########################################################################
# 1.  -------- total_studies ------------------------------------------
########################################################################
def max_study_num(cell) -> int:
    """Return the largest number mentioned in a ``Study Number`` cell.

    Parameters
    ----------
    cell : str | list | tuple | scalar | None
        Raw cell content.  Lists/tuples are joined with spaces before parsing.

    Returns
    -------
    int
        The maximum numeric token found (decimals are truncated), or ``0``
        when the cell is missing or contains no digits.
    """
    # Sequences must be handled *before* the missing-value check:
    # pd.isna() on a list returns an array, whose truth value is ambiguous
    # (ValueError) as soon as it holds more than one element.
    if isinstance(cell, (list, tuple)):
        txt = " ".join(map(str, cell))
    elif pd.isna(cell):                 # NaN, None, etc.
        return 0
    else:
        txt = str(cell)
    nums = re.findall(r"\d+(?:\.\d+)?", txt)
    return int(max(map(float, nums))) if nums else 0

# Row-level study count: the highest number found in each "Study Number" cell.
raw["total_studies"] = raw["Study Number"].apply(max_study_num)



In [8]:
########################################################################
# 2.  -------- Study-Type / Study-Design one-hots ----------------------
########################################################################
def join(series):                         # helper for groupby
    """Newline-join the values of *series* that are neither missing nor blank.

    Each kept value is converted with ``str()``; values that are NaN/None or
    whose string form is only whitespace are skipped.
    """
    kept = []
    for v in series:
        if pd.isna(v):
            continue
        text = str(v)
        if text.strip():
            kept.append(text)
    return "\n".join(kept)

# Collapse the study-level rows of `raw` to one row per canon_id.
by_canon = (raw
    .groupby("canon_id", as_index=False)
    .agg({
        "total_studies":        "max",
        "Study Type":           join,     # all study types, newline-joined
        "Study Design":         join,     # all study designs, newline-joined
        "Labeling Change Summary": "first",
        "Indication":           "first",
    })
)

# Case-insensitive regex per study-type flag (searched in the joined text).
type_buckets = {
    "Efficacy":        r"\befficacy\b",
    "Safety":          r"\bsafety\b",
    "Pharmacokinetic": r"\bpk\b|\bpharmacokinetic",
    "Pharmacodynamic": r"\bpd\b|\bpharmacodynamic",
    "Tolerability":    r"\btolerab",
}

for col, pat in type_buckets.items():
    by_canon[col] = by_canon["Study Type"].str.contains(pat, flags=re.I, regex=True).astype(int)

# 1 when none of the study-type flags above fired.
by_canon["Other_Type"] = (by_canon[list(type_buckets)].sum(axis=1) == 0).astype(int)

# Case-insensitive regex per study-design flag.
# Alternations use non-capturing groups `(?:...)`: pandas' str.contains emits a
# UserWarning ("has match groups") whenever the pattern contains capture groups.
design_buckets = {
    "Randomized_DoubleBlind": r"randomized.*double[- ]blind|double[- ]blind.*randomized",
    "Randomized_SingleBlind": r"randomized.*single[- ]blind|single[- ]blind.*randomized",
    "Open_Label":             r"open[- ]label",
    "Placebo_Control":        r"placebo(?!,? *run[- ]in)",   # ignore "placebo run-in"
    "Active_Comparator":      r"active +(?:control|comparator)",
    "Dose_Escalation":        r"dose +(?:escalation|ranging|finding)",
    "Population_PK":          r"population +pk|pop +pk|traditional +pk|trad +pk",
}

for col, pat in design_buckets.items():
    by_canon[col] = by_canon["Study Design"].str.contains(pat, flags=re.I, regex=True).astype(int)

# 1 when none of the study-design flags above fired.
by_canon["Other_Design"] = (by_canon[list(design_buckets)].sum(axis=1) == 0).astype(int)




  by_canon[col] = by_canon["Study Design"].str.contains(pat, flags=re.I, regex=True).astype(int)
  by_canon[col] = by_canon["Study Design"].str.contains(pat, flags=re.I, regex=True).astype(int)


In [9]:
########################################################################
# 3.  -------- Ages: lowest / highest per canon -----------------------
########################################################################
# Conversion factors from an (upper-cased) age unit to years.
# Units missing from this table fall back to a factor of 1 (i.e. years) in
# ages_to_years, so week-based units must be listed explicitly.
_unit = {"YEAR":1, "YEARS":1,
         "MONTH":1/12, "MONTHS":1/12, "MO":1/12, "MOS":1/12,
         "WEEK":7/365, "WEEKS":7/365, "WK":7/365, "WKS":7/365,
         "DAY":1/365, "DAYS":1/365,
         "HOUR":1/8760, "HOURS":1/8760}
# "<n> <unit> - <n> <unit>" or "<n> <unit> TO <n> <unit>"
rng  = re.compile(r"(\d+(?:\.\d+)?)\s*([A-Z]+)\s*(?:-|TO)\s*(\d+(?:\.\d+)?)\s*([A-Z]+)")
# "<n> <unit> ... AND OLDER"
older= re.compile(r"(\d+(?:\.\d+)?)\s*([A-Z]+).*AND\s+OLDER")

def ages_to_years(cell:str)->list[float]:
    """Extract every age mentioned in an ``Ages Studied`` cell, in years.

    Parameters
    ----------
    cell : str
        Free-text age description; anything that is not a ``str`` yields ``[]``.

    Returns
    -------
    list[float]
        Both bounds of every "<n> <unit> - <n> <unit>" / "... TO ..." range,
        followed by the lower bound of every "<n> <unit> AND OLDER" phrase.
        Units not present in ``_unit`` are treated as years.
    """
    if not isinstance(cell,str): return []
    # Normalise case and dash variants (en dash, em dash) to a plain hyphen.
    cell = cell.upper().replace("–","-").replace("—","-")
    out=[]
    for n1,u1,n2,u2 in rng.findall(cell):
        out += [float(n1)*_unit.get(u1,1), float(n2)*_unit.get(u2,1)]
    for n,u in older.findall(cell):
        out.append(float(n)*_unit.get(u,1))
    return out

def _age_bounds(ages):
    """Min / max of a list of ages as a two-field Series (NaN when empty)."""
    return pd.Series({
        "age_min": min(ages) if ages else np.nan,
        "age_max": max(ages) if ages else np.nan})

# Parse each row's age text into a list of ages (in years).
raw["__ages"] = raw["Ages Studied"].map(ages_to_years)

# Summing lists concatenates them, giving one flat list of ages per canon_id.
ages_per_canon = raw.groupby("canon_id")["__ages"].sum()
age_agg = ages_per_canon.apply(_age_bounds).reset_index()

by_canon = pd.merge(by_canon, age_agg, on="canon_id", how="left")



In [10]:
########################################################################
# 4.  -------- Boolean neonate flags -----------------------------------
########################################################################
for fld in ["Studied in Neonates", "Indicated in Neonates"]:
    # 1 when the first non-missing value for the canon_id is an "X"
    # (case- and whitespace-insensitive), else 0.
    flag_by_canon = (raw
        .groupby("canon_id")[fld]
        .first()
        .fillna("").str.strip().str.upper().eq("X")
        .astype(int))
    # Align on canon_id instead of assigning `.values` positionally, so the
    # result stays correct even if by_canon's row order or length differs
    # from the groupby output.
    by_canon[fld] = (by_canon["canon_id"]
        .map(flag_by_canon)
        .fillna(0)
        .astype(int))



In [11]:

# ------------------------------------------------------------------
# 0.  helper to convert messy cell → list[int]
# ------------------------------------------------------------------
def to_int_list(cell):
    """
    Extract every integer found in a cell.

    "90\n219"          -> [90, 219]
    "90_x000D_\n219"   -> [90, 219]   (Excel "_xHHHH_" escapes are ignored)
    "NNPS"             -> []
    "" / NaN / None    -> []
    686                -> [686]
    """
    if cell is None or (isinstance(cell, float) and np.isnan(cell)):
        return []
    txt = str(cell).strip().lower()
    if txt in {"nnps", "n/a", ""}:
        return []
    # Workbooks encode embedded control characters as "_xHHHH_" (e.g. the
    # carriage return "_x000D_"); strip them so their hex digits are not
    # mistaken for a count.  `txt` is already lower-cased at this point.
    txt = re.sub(r"_x[0-9a-f]{4}_", " ", txt)
    return list(map(int, re.findall(r"\d+", txt)))

# ------------------------------------------------------------------
# 1.  parse every count column into a list of ints per row
# ------------------------------------------------------------------
numeric_cols = [
    "Patients Enrolled",
    "Patients Analyzed",
    "Number of Centers",
    "Number of Countries",
    "Total # of Hispanic/Latino",
    "Total # of Non-Hispanic/Non-Latino",
    "Total #  of Unknown Ethnicity",
    "Total #  of Asian",
    "Total #  of Black",
    "Total #  of White",
    "Total #  of Native Hawaiian or Pacific Islander",
    "Total #  of American Indian/Alaska Native",
    "Total #  of Other Race",
    "Total #  of Unknown Race",
]

for c in numeric_cols:
    raw[c] = raw[c].map(to_int_list)

# ------------------------------------------------------------------
# 2.  one long (canon_id, value, metric) frame: explode each column
#     on its own, then stack the pieces
# ------------------------------------------------------------------
long_numeric = pd.concat(
    [
        raw[["canon_id", col]]
        .explode(col)                       # one row per list element
        .rename(columns={col: "value"})     # values → "value"
        .assign(metric=col)                 # remember the source column
        for col in numeric_cols
    ],
    ignore_index=True,
)
long_numeric["value"] = pd.to_numeric(long_numeric["value"], errors="coerce")

# ------------------------------------------------------------------
# 3.  aggregate per canon_id / metric
# ------------------------------------------------------------------
# default aggregation: sum everything
aggfunc = "sum"

pivot = pd.pivot_table(
    long_numeric.dropna(subset=["value"]),
    index="canon_id",
    columns="metric",
    values="value",
    aggfunc=aggfunc,
)

# centers / countries are overwritten with the per-canon maximum
for max_col in ["Number of Centers", "Number of Countries"]:
    is_metric = long_numeric["metric"] == max_col
    pivot[max_col] = long_numeric.loc[is_metric].groupby("canon_id")["value"].max()

# missing combinations count as 0; everything becomes an int
num_clean = pivot.fillna(0).astype(int).reset_index()

# ------------------------------------------------------------------
# 4.  attach the counts to the feature table
# ------------------------------------------------------------------
by_canon = pd.merge(by_canon, num_clean, on="canon_id", how="left")
by_canon[numeric_cols] = by_canon[numeric_cols].fillna(0).astype(int)



In [12]:
# Inspect the feature-table columns after the numeric counts were merged in.
by_canon.columns

Index(['canon_id', 'total_studies', 'Study Type', 'Study Design',
       'Labeling Change Summary', 'Indication', 'Efficacy', 'Safety',
       'Pharmacokinetic', 'Pharmacodynamic', 'Tolerability', 'Other_Type',
       'Randomized_DoubleBlind', 'Randomized_SingleBlind', 'Open_Label',
       'Placebo_Control', 'Active_Comparator', 'Dose_Escalation',
       'Population_PK', 'Other_Design', 'age_min', 'age_max',
       'Studied in Neonates', 'Indicated in Neonates', 'Number of Centers',
       'Number of Countries', 'Patients Analyzed', 'Patients Enrolled',
       'Total #  of American Indian/Alaska Native', 'Total #  of Asian',
       'Total #  of Black', 'Total #  of Native Hawaiian or Pacific Islander',
       'Total #  of Other Race', 'Total #  of Unknown Ethnicity',
       'Total #  of Unknown Race', 'Total #  of White',
       'Total # of Hispanic/Latino', 'Total # of Non-Hispanic/Non-Latino'],
      dtype='object')

In [13]:
#######################################################################
# 6.  -------- metadata carry-over -----------------------------
########################################################################
# Descriptive columns carried over from `raw`, one value per canon_id.
meta_cols = [
    'canon_id',
    'Pediatric Labeling Approval Date', 'Trade Name', 'Generic Name',
    'Type of Legislation', 'Indication', 'Indication(s) Studied',
    'Therapeutic Category', 'Dosage Form(s)', 'Route(s) of Administration',
    'Pharmacological Class', 'Product Labeling Link',
    'kind', 'root', 'supplement',
]

# Empty strings become NaN so that `first()` skips them and returns the
# first non-missing value per canon_id.
meta_one = raw[meta_cols].replace({'': np.nan})
meta_one = meta_one.groupby('canon_id', as_index=False).first()

# NOTE: 'Indication' already exists in by_canon, so this merge yields the
# suffixed pair 'Indication_x' / 'Indication_y'.
by_canon = pd.merge(by_canon, meta_one, on='canon_id', how='left')



In [14]:
# Display the per-canon_id feature table after the metadata merge.
by_canon

Unnamed: 0,canon_id,total_studies,Study Type,Study Design,Labeling Change Summary,Indication_x,Efficacy,Safety,Pharmacokinetic,Pharmacodynamic,...,Indication_y,Indication(s) Studied,Therapeutic Category,Dosage Form(s),Route(s) of Administration,Pharmacological Class,Product Labeling Link,kind,root,supplement
0,ANDA_071961_ORIG1,2,"Efficacy,Safety,Pharmacokinetic_x000D_\nEffica...","Double-Blind,Parallel Group,Dose Ranging_x000D...",-\tEfficacy in the pediatric population was es...,,1,1,1,0,...,,Immediate reduction of blood pressure in hyper...,Cardiac Drugs,INJECTABLE,INTRAVENOUS,,https://www.accessdata.fda.gov/drugsatfda_docs...,ANDA,071961,ORIG1
1,ANDA_072370,0,,,- Population PK analysis in 87 pediatric patie...,Partial-onset seizures,0,0,0,0,...,Partial-onset seizures,Status epilepticus in pediatric patients 3 mon...,Seizures,INJECTABLE,INTRAVENOUS,Benzodiazepine,https://cderoneanalytics.fda.gov/search360/?or...,ANDA,072370,
2,ANDA_214745,0,,,- Population PK analysis in 87 pediatric patie...,Partial-onset seizures,0,0,0,0,...,Partial-onset seizures,Status epilepticus in pediatric patients 3 mon...,Seizures,INJECTABLE,INTRAVENOUS,Benzodiazepine,https://www.fda.gov/media/167713/download?atta...,ANDA,214745,
3,ANDA_999907_ORIG1,1,"Safety,Pharmacokinetic","Multicenter,Open-Label",-\tLabeling revised to include neonatal dosing...,,0,1,1,0,...,,Neonatal dosing for meningitis and septicemia,Antibacterials,,,,https://www.fda.gov/media/127633/download,ANDA,999907,ORIG1
4,BLA_101069_5846,1,"Safety,Immunogenicity","Phase 3,Non-Inferiority",See Package Insert for new information on biol...,Other,0,1,0,0,...,Other,Active immunization for the prevention of meas...,Preventive Vaccine,INJECTABLE_x000D_\nINJECTABLE,SUBCUTANEOUS_x000D_\nINTRAMUSCULAR,Vaccine,https://www.fda.gov/media/75191/download,BLA,101069,5846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053,NDA_999847_ORIG1,0,,,-\tExpanded the indication from adults to pedi...,Other,0,0,0,0,...,Other,Control of serum phosphorus in children 6 year...,Chronic kidney disease on dialysis,,,,http://www.accessdata.fda.gov/drugsatfda_docs/...,NDA,999847,ORIG1
1054,NDA_999901_ORIG1,2,"Efficacy,Safety","Placebo,Single-Blind",-\tSafety and effectiveness for migraine preve...,,1,1,0,0,...,,Migraine Prophylaxis,Migraine,,,,http://www.accessdata.fda.gov/drugsatfda_docs/...,NDA,999901,ORIG1
1055,NDA_999902_ORIG1,2,"Efficacy,Safety","Placebo,Single-Blind",-\tSafety and effectiveness for migraine preve...,,1,1,0,0,...,,Migraine Prophylaxis,Migraine,,,,http://www.accessdata.fda.gov/drugsatfda_docs/...,NDA,999902,ORIG1
1056,NDA_999903_0049,1,"Efficacy,Safety",Open-Label,-\tInformation on postmarketing clinical study...,Influenza,1,1,0,0,...,Influenza,Prophylaxis of influenza,Antivirals,CAPSULE_x000D_\nFOR SUSPENSION,ORAL_x000D_\nORAL,Neuraminidase Inhibitor,http://www.accessdata.fda.gov/drugsatfda_docs/...,NDA,999903,0049


In [15]:
# Column names after the metadata merge (note the Indication_x / Indication_y pair).
by_canon.columns

Index(['canon_id', 'total_studies', 'Study Type', 'Study Design',
       'Labeling Change Summary', 'Indication_x', 'Efficacy', 'Safety',
       'Pharmacokinetic', 'Pharmacodynamic', 'Tolerability', 'Other_Type',
       'Randomized_DoubleBlind', 'Randomized_SingleBlind', 'Open_Label',
       'Placebo_Control', 'Active_Comparator', 'Dose_Escalation',
       'Population_PK', 'Other_Design', 'age_min', 'age_max',
       'Studied in Neonates', 'Indicated in Neonates', 'Number of Centers',
       'Number of Countries', 'Patients Analyzed', 'Patients Enrolled',
       'Total #  of American Indian/Alaska Native', 'Total #  of Asian',
       'Total #  of Black', 'Total #  of Native Hawaiian or Pacific Islander',
       'Total #  of Other Race', 'Total #  of Unknown Ethnicity',
       'Total #  of Unknown Race', 'Total #  of White',
       'Total # of Hispanic/Latino', 'Total # of Non-Hispanic/Non-Latino',
       'Pediatric Labeling Approval Date', 'Trade Name', 'Generic Name',
       'Type of 

In [16]:
# (rows, columns) of the per-canon_id feature table.
by_canon.shape

(1058, 52)

## Merge with the LLM-annotated and manually annotated datasets for the final dataset release

In [17]:
DATA_DIR   = REPO_ROOT / "data"
SPLIT_DIR  = DATA_DIR / "processed" / "splits"     # data/processed/splits/{train,dev,test}.csv
OUTPUT_DIR = REPO_ROOT / "outputs"

# Folder for the feature-augmented splits; created (with parents) if missing.
AUG_DIR = OUTPUT_DIR / "splits_with_feats"
AUG_DIR.mkdir(parents=True, exist_ok=True)

# ---------------------------------------------------------------
# 2   Load the original splits
# ---------------------------------------------------------------
# Everything is read as text, and empty cells stay "" instead of NaN.
read_opts = {"dtype": str, "na_filter": False}

df_train, df_dev, df_test = (
    pd.read_csv(SPLIT_DIR / fname, **read_opts)
    for fname in ("train.csv", "dev.csv", "test.csv")
)

# hygiene: trim whitespace around the join key
for d in (df_train, df_dev, df_test):
    d["canon_id"] = d["canon_id"].str.strip()

In [18]:
# ➋ augment with engineered features
#    (inner join: rows whose canon_id is absent from by_canon are dropped)
df_train, df_dev, df_test = (
    split.merge(by_canon, on="canon_id", how="inner")
    for split in (df_train, df_dev, df_test)
)



In [19]:
# ➌ write each augmented split to AUG_DIR (no index column)
for frame, fname in (
    (df_train, "train_full.csv"),
    (df_dev,   "dev_full.csv"),
    (df_test,  "test_full.csv"),
):
    frame.to_csv(AUG_DIR / fname, index=False)

In [20]:
# ➍ publish one combined file, handy for NeurIPS camera-ready release:
#    stack train, dev and test (in that order) with a fresh 0..n-1 index
combined = pd.concat([df_train, df_dev, df_test], ignore_index=True)


In [21]:
# Column names of the augmented training split (annotations + engineered features).
df_train.columns

Index(['canon_id', 'resolved_label', 'peds_study_type', 'efficacy_summary',
       'pk_summary', 'lowest_age_band', 'highest_age_band', 'rationale',
       'confidence', 'summary_json', 'txt_file', 'manual_label_resolved',
       'label', 'is_gold', 'total_studies', 'Study Type', 'Study Design',
       'Labeling Change Summary', 'Indication_x', 'Efficacy', 'Safety',
       'Pharmacokinetic', 'Pharmacodynamic', 'Tolerability', 'Other_Type',
       'Randomized_DoubleBlind', 'Randomized_SingleBlind', 'Open_Label',
       'Placebo_Control', 'Active_Comparator', 'Dose_Escalation',
       'Population_PK', 'Other_Design', 'age_min', 'age_max',
       'Studied in Neonates', 'Indicated in Neonates', 'Number of Centers',
       'Number of Countries', 'Patients Analyzed', 'Patients Enrolled',
       'Total #  of American Indian/Alaska Native', 'Total #  of Asian',
       'Total #  of Black', 'Total #  of Native Hawaiian or Pacific Islander',
       'Total #  of Other Race', 'Total #  of Unknown E

In [22]:
# Display the stacked train/dev/test table.
combined

Unnamed: 0,canon_id,resolved_label,peds_study_type,efficacy_summary,pk_summary,lowest_age_band,highest_age_band,rationale,confidence,summary_json,...,Indication_y,Indication(s) Studied,Therapeutic Category,Dosage Form(s),Route(s) of Administration,Pharmacological Class,Product Labeling Link,kind,root,supplement
0,BLA_103976_5231,NotExtrapolated,RCT,Asthma: Two controlled RCT trials in patients ...,Safety profile assessed in the asthma studies.,6,<12,Pediatric evidence for asthma includes two con...,high,"{""PediatricSummary"":[{""section"":""Asthma"",""summ...",...,Asthma,Postmarketing study,Antiasthmatic,INJECTABLE,SUBCUTANEOUS,,https://www.accessdata.fda.gov/drugsatfda_docs...,BLA,103976,5231
1,BLA_125294_0045,Partial,PK+Safety,"There is no pediatric efficacy RCT; rather, sa...",Pediatric pharmacokinetic data (geometric mean...,1 month,<17 years,The extrapolation is based on PK and safety ev...,high,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",...,Other,Reduce the duration of severe neutropenia in ...,Hematologic Malignancies,,,Leukocyte Growth Factor,https://www.accessdata.fda.gov/drugsatfda_docs...,BLA,125294,0045
2,BLA_125477_0039,Partial,PK+Safety,"A single-arm, open-label study in 23 pediatric...",Pharmacokinetic data in pediatric patients wer...,1,16,While there is available pediatric evidence de...,medium,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",...,Solid tumor,Relapsed or refractory solid tumors,Solid Tumor,INJECTABLE,INTRAVENOUS,Other,https://www.accessdata.fda.gov/drugsatfda_docs...,BLA,125477,0039
3,BLA_125526_ORIG1,NotExtrapolated,RCT,The clinical program included adolescents aged...,Adolescents showed a mean apparent clearance a...,12,17,Pediatric efficacy evidence exists from an RCT...,medium,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",...,,Treatment in severe asthma in patients 12 year...,Antiasthmatic,INJECTABLE,INTRAVENOUS,Interleukin-5 Antagonist,http://www.accessdata.fda.gov/drugsatfda_docs/...,BLA,125526,ORIG1
4,BLA_761039_0015,Partial,PK+Safety,Effectiveness in pediatric patients is extrapo...,Pediatric pharmacokinetic data and exposure mo...,Not specified,Not specified,No pediatric efficacy RCT was conducted; inste...,high,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",...,Chemotherapy induced neutropenia,Decrease the incidence of infection as manifes...,Hematopoietic Growth Factors,INJECTABLE,SUBCUTANEOUS,Leukocyte Growth Factor,https://www.accessdata.fda.gov/drugsatfda_docs...,BLA,761039,0015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
732,NDA_212477_ORIG1,NotExtrapolated,RCT,"Study 1116, an open‐label multicenter clinical...",Pharmacokinetic data supports weight‐based dos...,3 years,18 years,The availability of a pediatric efficacy trial...,high,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",...,Hepatitis C,Treatment of chronic hepatitis C virus,Antivirals,PELLETS,ORAL,Hepatitis C Virus NS5A Inhibitor; Hepatitis C ...,https://www.accessdata.fda.gov/drugsatfda_docs...,NDA,212477,ORIG1
733,NDA_212887_0006,Partial,PK+Safety,Safety and efficacy in adolescents have been e...,The MOCHA trial provided pharmacokinetic data ...,12 years,18 years,Pediatric evidence is limited to adolescents w...,high,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",...,HIV – 1 infection (prevention of transmission ...,HIV-1 treatment and pre-exposure prophylaxis,Antivirals,TABLET,ORAL,Human Immunodeficiency Virus Integrase Strand ...,https://www.accessdata.fda.gov/drugsatfda_docs...,NDA,212887,0006
734,NDA_213871_0004,NotExtrapolated,RCT,"Clinical studies, including Trial-AD04, evalua...",Although specific PK details were not provided...,12,18,The pediatric label is supported by at least o...,high,"{""PediatricSummary"":[{""section"":""8.4 Pediatric...",...,Atopic dermatitis,Moderate-to-severe atopic dermatitis in pediat...,Dermatitis Agents,TABLET,ORAL,Janus Kinase Inhibitor,https://www.accessdata.fda.gov/drugsatfda_docs...,NDA,213871,0004
735,NDA_761055_0012,NotExtrapolated,RCT,Atopic Dermatitis: Dupixent has been establish...,"In the asthma study, higher Dupilumab exposure...",12 years,17 years,The available pediatric data include randomize...,high,"{""PediatricSummary"":[{""section"":""Atopic Dermat...",...,Atopic dermatitis,Moderate-to-severe atopic dermatitis in pediat...,Dermatitis Agents,INJECTABLE,SUBCUTANEOUS,Interleukin-4 Receptor alpha Antagonist,https://www.accessdata.fda.gov/drugsatfda_docs...,NDA,761055,0012


In [23]:
# Release column layout, assembled from thematic groups.
_id_cols = ['canon_id', 'kind', 'root', 'supplement', 'Product Labeling Link']

_annotation_cols = ['label', 'is_gold', 'peds_study_type', 'efficacy_summary',
                    'pk_summary', 'rationale', 'confidence']

_metadata_cols = ['Pediatric Labeling Approval Date', 'Trade Name', 'Generic Name',
                  'Type of Legislation', 'Indication_y', 'Indication(s) Studied',
                  'Therapeutic Category', 'Dosage Form(s)',
                  'Route(s) of Administration', 'Pharmacological Class',
                  'Labeling Change Summary']

_study_cols = ['age_min', 'age_max', 'total_studies']

_type_flag_cols = ['Efficacy', 'Safety', 'Pharmacokinetic', 'Pharmacodynamic',
                   'Tolerability', 'Other_Type']

_design_flag_cols = ['Randomized_DoubleBlind', 'Randomized_SingleBlind',
                     'Open_Label', 'Placebo_Control', 'Active_Comparator',
                     'Dose_Escalation', 'Population_PK', 'Other_Design']

_neonate_cols = ['Studied in Neonates', 'Indicated in Neonates']

# Count columns keep the workbook's exact spelling (some have two spaces after '#').
_count_cols = ['Number of Centers', 'Number of Countries',
               'Patients Analyzed', 'Patients Enrolled',
               'Total #  of American Indian/Alaska Native',
               'Total #  of Asian',
               'Total #  of Black',
               'Total #  of Native Hawaiian or Pacific Islander',
               'Total #  of Other Race',
               'Total #  of Unknown Ethnicity',
               'Total #  of Unknown Race',
               'Total #  of White',
               'Total # of Hispanic/Latino',
               'Total # of Non-Hispanic/Non-Latino']

column_order = (_id_cols + _annotation_cols + _metadata_cols + _study_cols
                + _type_flag_cols + _design_flag_cols + _neonate_cols
                + _count_cols)

In [24]:
# Keep only the release columns, in the order given by column_order.
combined = combined[column_order]

In [25]:
# Display the reordered release table.
combined

Unnamed: 0,canon_id,kind,root,supplement,Product Labeling Link,label,is_gold,peds_study_type,efficacy_summary,pk_summary,...,Total # of American Indian/Alaska Native,Total # of Asian,Total # of Black,Total # of Native Hawaiian or Pacific Islander,Total # of Other Race,Total # of Unknown Ethnicity,Total # of Unknown Race,Total # of White,Total # of Hispanic/Latino,Total # of Non-Hispanic/Non-Latino
0,BLA_103976_5231,BLA,103976,5231,https://www.accessdata.fda.gov/drugsatfda_docs...,NotExtrapolated,1,RCT,Asthma: Two controlled RCT trials in patients ...,Safety profile assessed in the asthma studies.,...,0,0,0,0,0,0,0,0,0,0
1,BLA_125294_0045,BLA,125294,0045,https://www.accessdata.fda.gov/drugsatfda_docs...,Partial,1,PK+Safety,"There is no pediatric efficacy RCT; rather, sa...",Pediatric pharmacokinetic data (geometric mean...,...,0,0,0,0,0,0,0,50,0,50
2,BLA_125477_0039,BLA,125477,0039,https://www.accessdata.fda.gov/drugsatfda_docs...,Partial,1,PK+Safety,"A single-arm, open-label study in 23 pediatric...",Pharmacokinetic data in pediatric patients wer...,...,0,0,3,0,0,1,3,23,6,22
3,BLA_125526_ORIG1,BLA,125526,ORIG1,http://www.accessdata.fda.gov/drugsatfda_docs/...,Partial,1,RCT,The clinical program included adolescents aged...,Adolescents showed a mean apparent clearance a...,...,0,0,0,0,0,0,0,0,0,0
4,BLA_761039_0015,BLA,761039,0015,https://www.accessdata.fda.gov/drugsatfda_docs...,Partial,1,PK+Safety,Effectiveness in pediatric patients is extrapo...,Pediatric pharmacokinetic data and exposure mo...,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
732,NDA_212477_ORIG1,NDA,212477,ORIG1,https://www.accessdata.fda.gov/drugsatfda_docs...,Partial,1,RCT,"Study 1116, an open‐label multicenter clinical...",Pharmacokinetic data supports weight‐based dos...,...,0,0,0,0,0,0,0,0,0,0
733,NDA_212887_0006,NDA,212887,0006,https://www.accessdata.fda.gov/drugsatfda_docs...,Partial,1,PK+Safety,Safety and efficacy in adolescents have been e...,The MOCHA trial provided pharmacokinetic data ...,...,0,0,0,0,0,0,0,0,0,0
734,NDA_213871_0004,NDA,213871,0004,https://www.accessdata.fda.gov/drugsatfda_docs...,NotExtrapolated,1,RCT,"Clinical studies, including Trial-AD04, evalua...",Although specific PK details were not provided...,...,0,0,0,0,0,0,0,0,0,0
735,NDA_761055_0012,NDA,761055,0012,https://www.accessdata.fda.gov/drugsatfda_docs...,NotExtrapolated,1,RCT,Atopic Dermatitis: Dupixent has been establish...,"In the asthma study, higher Dupilumab exposure...",...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Drop the merge suffix: the retained 'Indication_y' column becomes 'Indication'.
combined = combined.rename(columns={'Indication_y': 'Indication'})

In [27]:
# (rows, columns) of the release table.
combined.shape

(737, 56)

In [28]:
# Rename the key column 'canon_id' to 'fda_application_id' for the released file.
combined = combined.rename(columns={'canon_id': 'fda_application_id'})

In [29]:
# Write the combined release table to OUT_DIR (no index column).
combined.to_csv(OUT_DIR  /"pedx_dataset.csv", index=False)

# Report the shape of each augmented split and where the combined file went.
print("— augmented split sizes —")
print("train", df_train.shape, "dev", df_dev.shape, "test", df_test.shape)
print(f"✓ combined dataset written → {OUT_DIR / 'pedx_dataset.csv'}")

— augmented split sizes —
train (687, 65) dev (16, 65) test (34, 65)
✓ combined dataset written → /data1/home/srinivasana/PedXBench/outputs/pedx_dataset.csv
