In [None]:
# Load required libraries: pandas, numpy, and pyreadstat (to read SAS files)
import os
from pathlib import Path

import numpy as np
import pandas as pd
import pyreadstat

In [None]:
# ------ Config (adjust paths/study id) ------
# Folder holding the raw .sas7bdat extracts. The default is the original local
# path; set the SDTM_RAW_DIR environment variable to point the notebook at a
# different folder without editing this cell.
Raw_Dir = Path(
    os.environ.get(
        "SDTM_RAW_DIR",
        "F:\\OneDrive\\Clinical Programming\\Data for Practice\\1508_DATASETS_SAS_14Jul2015",
    )
)
# Study identifier; it is written to STUDYID and prefixed onto SUBJID to form USUBJID.
STUDYID = "CLBG-1508"
# Output CSV for the final AE dataset (relative to the current working directory).
OUT_AE = Path("sdtm_AE.csv")

In [None]:
# ---------- Small helper functions ----------
# 1) Date parser: return the ISO date 'YYYY-MM-DD' for a value, or np.nan if it cannot be parsed.
def to_iso(x):
    """Format a single date-like value as an ISO calendar date.

    Parameters
    ----------
    x : any scalar accepted by ``pd.to_datetime``
        The raw value to parse.

    Returns
    -------
    str or float
        ``'YYYY-MM-DD'`` when the value parses; ``np.nan`` when it is missing
        or cannot be parsed (parsing errors are coerced, never raised).
    """
    parsed = pd.to_datetime(x, errors="coerce")
    if pd.isna(parsed):
        # Covers NaT from a failed/missing parse as well as a None pass-through.
        return np.nan
    return parsed.strftime("%Y-%m-%d")

In [None]:
# 2) Column finder: return the actual column name in df for the earliest candidate that matches (case-insensitive), or None if no candidate matches.

def first_col(df, candidates):
    """Find the real column name for the first candidate present in ``df``.

    Matching ignores case. Candidates are tried in the order given, and the
    name is returned exactly as it is spelled in ``df.columns``. When several
    columns differ only by case, the left-most one wins.

    Returns ``None`` when no candidate matches.
    """
    # Map each lower-cased name to the first column carrying it.
    actual_by_lower = {}
    for actual in df.columns:
        actual_by_lower.setdefault(actual.lower(), actual)

    for wanted in candidates:
        key = wanted.lower()
        if key in actual_by_lower:
            return actual_by_lower[key]
    return None

    
# 3) Study-day calculator: day of an event relative to the reference start date

def days_from_rf(event_iso, rf_iso):
    """Compute the study day of an event relative to a reference date.

    Follows the SDTM convention, which has no day 0:

    * event on or after the reference date -> ``(event - reference) + 1``
      (the reference date itself is day 1);
    * event before the reference date      -> ``event - reference``
      (the day before the reference date is day -1).

    Only the calendar-date part of each value is compared, so a time of day
    on either side cannot shift the result.

    Parameters
    ----------
    event_iso : date-like scalar
        Event date (for example an ISO ``'YYYY-MM-DD'`` string).
    rf_iso : date-like scalar
        Reference date.

    Returns
    -------
    int or pd.NA
        The study day, or ``pd.NA`` when either date is missing or cannot be
        parsed.
    """
    if pd.isna(event_iso) or pd.isna(rf_iso):
        return pd.NA
    e = pd.to_datetime(event_iso, errors="coerce")
    r = pd.to_datetime(rf_iso, errors="coerce")
    if pd.isna(e) or pd.isna(r):
        return pd.NA
    # Compare whole calendar days only.
    delta = int((e.normalize() - r.normalize()).days)
    # Skip day 0: the +1 applies only on or after the reference date.
    return delta + 1 if delta >= 0 else delta

# Progress marker printed once the helper definitions in this cell have run.
print("Block 2 complete: Helper functions defined")


In [None]:
# Names of the raw datasets this notebook works with; each one is looked up
# as "<NAME>.sas7bdat" inside Raw_Dir by the loading cell.
DATASETS = [
    "DM",
    "AE",
    "EX",
    "DS",
    "CM",
]

In [None]:
# ---------- load raw datasets ----------
# Read every dataset named in DATASETS into a dictionary keyed by dataset name,
# so later cells can pull out the frame they need. A dataset whose file is not
# present in Raw_Dir is stored as None.


def _clean_text(value):
    """Strip a string cell and turn a blank result into NaN; leave other values untouched."""
    if isinstance(value, str):
        return value.strip() or np.nan
    return value


datasets = {}

for name in DATASETS:
    p = Raw_Dir / f"{name.upper()}.sas7bdat"

    if not p.exists():
        datasets[name] = None
        continue

    df, _meta = pyreadstat.read_sas7bdat(str(p))
    df.columns = [str(c).lower().strip() for c in df.columns]

    # Clean text cells only. Casting whole columns with astype(str) would turn
    # missing values into the literal text "nan" and stringify non-string
    # objects that may sit in object columns.
    for col in df.select_dtypes(include=object):
        df[col] = df[col].map(_clean_text)

    # Store once per dataset, outside the column loop, so a frame that has no
    # object columns is still registered.
    datasets[name] = df

In [None]:
# Short aliases for the loaded raw frames (an entry is None when the loader
# stored None for a missing file).
dm_raw, ae_raw, ex_raw, ds_raw, cm_raw = [
    datasets[key] for key in ("DM", "AE", "EX", "DS", "CM")
]

In [None]:
# ---------- Block 3: Read raw AE SAS dataset ----------
# Reads AE.sas7bdat from Raw_Dir, lower-cases the column names and cleans the
# text cells. Fails fast when the file is missing.
ae_path = Raw_Dir / "AE.sas7bdat"
if not ae_path.exists():
    raise FileNotFoundError(f"AE file not found at {ae_path}")

ae_raw, meta = pyreadstat.read_sas7bdat(str(ae_path))

# normalize column names to lower-case
ae_raw.columns = [str(c).strip().lower() for c in ae_raw.columns]

# Strip string cells and turn blanks into pd.NA. Only str values are touched:
# casting the whole column with astype(str) would convert missing values into
# the literal text "nan", which would then flow into the AE output.
for col in ae_raw.select_dtypes(include="object"):
    ae_raw[col] = ae_raw[col].map(
        lambda v: (v.strip() or pd.NA) if isinstance(v, str) else v
    )

print(f"Block 3 complete: AE raw dataset loaded ({len(ae_raw)} rows, {len(ae_raw.columns)} columns)")
ae_raw.tail()


In [None]:
# ---------- Block 4: Build minimal subjects info ----------
# Builds `subjects`, a one-row-per-subject lookup (SUBJID, USUBJID, RFSTDTC)
# from the raw DM dataset. Block 6 left-merges it onto the AE records.
dm_path = Raw_Dir / "DM.sas7bdat"
if not dm_path.exists():
    raise FileNotFoundError(f"DM file not found at {dm_path}")

dm_raw, meta_dm = pyreadstat.read_sas7bdat(str(dm_path))
dm_raw.columns = [str(c).strip().lower() for c in dm_raw.columns]

# Determine subject column
subj_col = first_col(dm_raw, ["usubjid","subjid","subjectid","subject_id","subjno","subject"])
if subj_col is None:
    raise ValueError("No subject column found in DM dataset")

# Minimal subjects DataFrame (indexed like DM so scalar fallbacks fill every row)
subjects = pd.DataFrame(index=dm_raw.index)
subjects["SUBJID"] = dm_raw[subj_col].astype(str).str.strip()
subjects["USUBJID"] = STUDYID + "-" + subjects["SUBJID"]

# RFSTDTC column (study reference start date) - try to get from DM
rf_col = first_col(dm_raw, ["rfstdtc","rfstdt","rfstd"])
subjects["RFSTDTC"] = dm_raw[rf_col].apply(to_iso) if rf_col else pd.NA

# Keep one row per subject: a repeated SUBJID would multiply the AE records
# when this frame is left-merged onto them.
n_dm_rows = len(subjects)
subjects = subjects.drop_duplicates(subset="SUBJID", keep="first").reset_index(drop=True)
if len(subjects) < n_dm_rows:
    print(f"Warning: dropped {n_dm_rows - len(subjects)} duplicate subject row(s) from DM (kept first occurrence)")

print(f"Block 4 complete: subjects dataframe created ({len(subjects)} subjects)")
subjects.head()


In [None]:
# ---------- Block 5: Map AE columns ----------
# Builds `work`, the AE working frame: one row per raw AE record, with raw
# columns copied under their SDTM names. A target whose raw column cannot be
# found is filled with pd.NA.
df = ae_raw.copy()

# Locate each raw source column; candidates are tried in order, case-insensitively.
subj_col = first_col(df, ["usubjid","subjid","subjectid","subject_id","subjno","subject"])
term_col = first_col(df, ["aeterm","term","aeverbatim"])
decod_col = first_col(df, ["aedecod","ae_decod","pt"])
llt_col = first_col(df, ["aeterm_llt","aellt","aelltcd"])
hlt_col = first_col(df, ["aehlt","aehltcd"])
soc_col = first_col(df, ["aeterm_soc","aesoc","aebodsys"])
sev_col = first_col(df, ["aesev","severity"])
ser_col = first_col(df, ["aeser","serious"])
acn_col = first_col(df, ["aeacn","actiontaken"])
out_col = first_col(df, ["aeout","outcome"])
start_col = first_col(df, ["aestdtc","aestdt","aestd"])
end_col = first_col(df, ["aeendtc","aeendt","aeend"])

# Without a subject column the records cannot be linked to DM, so fail here
# (same handling as the DM subject column) instead of producing all-missing
# SUBJID/USUBJID values.
if subj_col is None:
    raise ValueError("No subject column found in AE dataset")

# Initialize work DataFrame on the AE index so scalar pd.NA fallbacks fill every row.
work = pd.DataFrame(index=df.index)
work["SUBJID"] = df[subj_col].astype(str).str.strip()

# Straight copies: (target column, raw source column or None)
for target, source in [
    ("AETERM", term_col),
    ("AEDECOD", decod_col),
    ("AELLT", llt_col),
    ("AEHLT", hlt_col),
    ("AESOC", soc_col),
    ("AESEV", sev_col),
    ("AESER", ser_col),
    ("AEACN", acn_col),
    ("AEOUT", out_col),
]:
    work[target] = df[source] if source else pd.NA

# Dates: present values go through to_iso ('YYYY-MM-DD'); missing ones stay pd.NA.
for target, source in [("AESTDTC", start_col), ("AEENDTC", end_col)]:
    if source:
        work[target] = df[source].apply(lambda x: to_iso(x) if pd.notna(x) else pd.NA)
    else:
        work[target] = pd.NA

print("Block 5 complete: AE columns mapped")
work.tail()


In [None]:
# ---------- Block 6: Merge subjects, compute study days ----------
# Adds USUBJID/RFSTDTC from `subjects`, derives AESTDY/AEENDY, and numbers the
# events within each subject (AESEQ).

# Make the cell safe to re-run: if USUBJID/RFSTDTC are already on `work`, a
# second merge would create *_x/*_y columns and the study days would silently
# become all missing. Drop them first (no-op on the first run).
work = work.drop(columns=["USUBJID", "RFSTDTC"], errors="ignore")
work = work.merge(subjects[["SUBJID","USUBJID","RFSTDTC"]], on="SUBJID", how="left")

# Study days relative to RFSTDTC, stored as nullable integers (missing -> <NA>).
work["AESTDY"] = pd.array(
    [days_from_rf(event, ref) for event, ref in zip(work["AESTDTC"], work["RFSTDTC"])],
    dtype="Int64",
)
work["AEENDY"] = pd.array(
    [days_from_rf(event, ref) for event, ref in zip(work["AEENDTC"], work["RFSTDTC"])],
    dtype="Int64",
)

# AESEQ per subject: order each subject's events by start date, then number them from 1.
work = work.sort_values(["SUBJID","AESTDTC"]).reset_index(drop=True)
work["AESEQ"] = work.groupby("SUBJID").cumcount() + 1
work["AESPID"] = pd.NA

print("Block 6 complete: Study days and sequence numbers calculated")
work.head()


In [None]:
# Display the AE working frame for a visual check before building the final dataset.
work

In [None]:
# ---------- Block 7: Final AE SDTM dataset ----------

# Assemble the output frame: pick the AE variables from `work` in their final
# order, then put the two constant identifier columns in front.
ae_out = work.loc[
    :,
    [
        "USUBJID",
        "AESEQ",
        "AESPID",
        "AETERM",
        "AELLT",
        "AEDECOD",
        "AEHLT",
        "AESOC",
        "AESEV",
        "AESER",
        "AEACN",
        "AEOUT",
        "AESTDTC",
        "AEENDTC",
        "AESTDY",
        "AEENDY",
    ],
].copy()
# Inserting DOMAIN first and STUDYID second at position 0 yields STUDYID, DOMAIN, ...
ae_out.insert(0, "DOMAIN", "AE")
ae_out.insert(0, "STUDYID", STUDYID)




In [None]:
# Write the final AE dataset to OUT_AE as CSV (row index omitted), then report
# how many rows were written and where.
ae_out.to_csv(path_or_buf=OUT_AE, index=False)
print(f"Block 7 complete: AE SDTM file saved ({len(ae_out)} rows) -> {OUT_AE}")