In [7]:
import re, pandas as pd, numpy as np

# Case-insensitive matcher for application identifiers. Named groups:
#   kind -> NDA, BLA or ANDA, followed by at least one whitespace character
#   root -> 5 to 7 digits, captured as text so leading zeros survive
#   sup  -> optional "/NNNN" suffix (1 to 4 digits), also captured as text
#   orig -> optional "- Original N" suffix (one or more digits)
ID_RE = re.compile(
    "".join((
        r'\b(?P<kind>NDA|BLA|ANDA)\s+',
        r'(?P<root>\d{5,7})',
        r'(?:\s*/\s*(?P<sup>\d{1,4}))?',
        r'(?:\s*-\s*Original\s*(?P<orig>\d+))?',
    )),
    re.IGNORECASE,
)

def parse_ids(cell:str):
    """
    Extract every application identifier found in one cell.

    The cell is converted to ``str`` and scanned with ``ID_RE``. Each match
    becomes a dict with the keys ``kind`` (upper-cased), ``root``,
    ``supplement`` and ``canon_id``. Digit groups are kept as text, so
    leading zeros are never stripped.

    ``supplement`` is the "/NNNN" suffix when one matched, otherwise
    ``"ORIG<n>"`` when a "- Original n" suffix matched, otherwise NaN.
    ``canon_id`` is ``KIND_ROOT`` or ``KIND_ROOT_SUPPLEMENT`` accordingly.

    A missing cell (anything ``pd.isna`` reports as null) yields ``[]``.
    """
    if pd.isna(cell):
        return []

    def _as_record(match):
        # Turn one regex match into the output dict.
        app_kind = match.group('kind').upper()
        app_root = match.group('root')
        slash_part = match.group('sup')
        original_no = match.group('orig')

        # The slash suffix wins over "- Original n" when both matched.
        if slash_part:
            supplement = slash_part
        elif original_no:
            supplement = f"ORIG{original_no}"
        else:
            supplement = None

        if supplement is None:
            return {"kind": app_kind, "root": app_root,
                    "supplement": np.nan,
                    "canon_id": f"{app_kind}_{app_root}"}
        return {"kind": app_kind, "root": app_root,
                "supplement": supplement,
                "canon_id": f"{app_kind}_{app_root}_{supplement}"}

    return [_as_record(found) for found in ID_RE.finditer(str(cell))]

# ---------- how to use on a dataframe --------------------------
def explode_ids(df, col="FDA Application Number(s) "):
    """
    Expand a frame to one row per application identifier found in ``col``.

    Each input row is copied once for every dict returned by
    ``parse_ids(row[col])`` and gains the columns ``kind``, ``root``,
    ``supplement`` and ``canon_id``. A row with no identifiers is kept once,
    with those four columns set to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the column holding the raw identifier text. The default
        ends with a trailing space.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh default index. Columns are the input columns
        in their original order followed by the four identifier columns.
        An input column that shares one of those names is overwritten in
        place. The column set is returned even when ``df`` has no rows.

    Raises
    ------
    KeyError
        If ``df`` has at least one row and ``col`` is not one of its columns.
    """
    id_fields = ("kind", "root", "supplement", "canon_id")
    no_id = dict.fromkeys(id_fields, np.nan)

    rows = []
    # One dict per input row, built once: avoids re-serialising the row for
    # every identifier and avoids the per-row Series that iterrows() creates.
    for record in df.to_dict("records"):
        ids = parse_ids(record[col])
        if ids:
            rows.extend({**record, **found} for found in ids)
        else:
            rows.append({**record, **no_id})

    # Passing the column list explicitly keeps the schema for an empty
    # input; dict.fromkeys de-duplicates while preserving order.
    out_cols = list(dict.fromkeys([*df.columns, *id_fields]))
    return pd.DataFrame(rows, columns=out_cols)

# Example run: load the first sheet of the raw export, expand it to one row
# per application identifier, and save the result as a new workbook.
df_raw = pd.read_excel(
    io="/Users/srinivasana/Documents/peds_agents/data/web_fdaaa_bpca_prea_pediatric_study_characteristics-_1_17_2024_1_53_pm_est_-_debbie.avantfda.hhs_.gov_.xlsx",
    sheet_name=0,
)
df_clean = explode_ids(df=df_raw)

df_clean.to_excel(
    excel_writer="/Users/srinivasana/Documents/peds_agents/data/web_fdaaa_clean.xlsx",
    index=False,
)


In [2]:
import pandas as pd

In [8]:
# Reload the cleaned workbook. The identifier columns are pinned to object
# dtype so read_excel returns them as stored instead of inferring a numeric
# dtype, which would drop the leading zeros kept in `root` / `supplement`.
df = pd.read_excel(
    "/Users/srinivasana/Documents/peds_agents/data/web_fdaaa_clean.xlsx",
    engine="openpyxl",
    dtype={"root": object, "supplement": object},
)