### !pip install pandas
!pip install pyreadstat
import pandas as pd        # pandas is used to read/write tables and to transform data
import numpy as np         # numpy for missing value constant np.nan
from pathlib import Path   # Path helps build file paths that work on any OS
import pyreadstat


In [2]:
# Load required Libraries  pandas, numpy,pyreadstat(to read sas files)
import pandas as pd
import numpy as np
import pyreadstat
from pathlib import Path

In [3]:
# Define File Path ,assign studyid
Raw_Dir=Path("F:\\OneDrive\\Clinical Programming\\Data for Practice\\1508_DATASETS_SAS_14Jul2015")
STUDYID = "CLBG-1508"
OUT_DM = Path("sdtm_DM.csv")

In [4]:
# Create a list of datasets that we will be working with
DATASETS= ["DM","AE","EX","DS","CM"]

In [5]:
# ---------- helper functions ----------

#1)-Date Parser= """Return ISO date 'YYYY-MM-DD' for a value or np.nan if not parseable."""

def to_iso(x):
    t=pd.to_datetime(x,errors="coerce")
    return t.strftime("%Y-%m-%d") if pd.notna(t) else np.nan

In [6]:

#   2) Return the actual column name in df matching the first candidate (case-insensitive), or None if none found.
def first_col(df, candidates):
    if df is None:
        return None
    cols = [c.lower() for c in df.columns]
    for cand in candidates:
        if cand.lower() in cols:
            return cand.lower()
    return None

In [10]:
# def pull_var(subjects, datasets, varname, candidates)

#Simple pull_var safe against missing columns/rows.

#The pull_var helper  safely searches across datasets for a variable. 
#Returns a Series aligned to subjects. index.

def pull_var(subjects, datasets, varname, candidates):
    subjects = subjects.copy()
    subjects["SUBJID"] = subjects["SUBJID"].astype(str).str.strip()
    
    # create empty column first
    subjects[varname] = pd.NA

    for ds_name in ["DM","CM","AE","EX","DS"]:
        df = datasets.get(ds_name)
        if df is None or df.empty:
            continue 

        # case-insensitive mapping
        cols_map = {c.lower(): c for c in df.columns}

        # find the first candidate column
        found_col = next((cols_map[c.lower()] for c in candidates if c.lower() in cols_map), None)
        if not found_col:
            continue

        # find subject column
        subj_col_candidates = ["usubjid","subjid","subjectid","subjno","subject","subject_id"]
        found_sub = next((cols_map[s.lower()] for s in subj_col_candidates if s.lower() in cols_map), None)
        if not found_sub:
            continue

        # build tmp and normalize
        tmp = df[[found_sub, found_col]].copy()
        tmp.rename(columns={found_sub: "SUBJID", found_col: varname}, inplace=True)
        tmp["SUBJID"] = tmp["SUBJID"].astype(str).str.strip()

        # merge safely
        subjects = subjects.merge(tmp[["SUBJID", varname]], on="SUBJID", how="left", suffixes=('', '_tmp'))

        # fill NA in original column with new values if available
        subjects[varname] = subjects[varname].combine_first(subjects.get(f"{varname}_tmp"))
        if f"{varname}_tmp" in subjects.columns:
            subjects.drop(columns=[f"{varname}_tmp"], inplace=True)

    return subjects[varname]


In [11]:
# ---------- load raw datasets ----------
#Assign them to a dictionary to automaticall pull required dataset

datasets={}

for name in DATASETS:

    p=Raw_Dir/f"{name.upper()}.sas7bdat"

    if p.exists():
        df,_=pyreadstat.read_sas7bdat(str(p))
        df.columns=[str(c).lower().strip() for c in df.columns]
        # strip string columns

        for col in df.select_dtypes(include=object):
            df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
            datasets[name]=df

    else:

        datasets[name]= None
        

  df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
  df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
  df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
  df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
  df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
  df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
  df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
  df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
  df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
  df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
  df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
  df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
  df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
  df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
  df[col]=df[col].astype(str).str.strip().replace({"":np.nan})
  df[col]=df[col].astype(str).str.strip().replace({"":n

In [12]:
dm_raw=datasets["DM"]

In [13]:
dm_raw = datasets["DM"]
ae_raw = datasets["AE"]
ex_raw = datasets["EX"]
ds_raw = datasets["DS"]
cm_raw = datasets["CM"]

In [16]:
# ---------- build subjects and site columns ----------

# Prefer DM for consistent subject list (typical in projects)

if datasets["DM"] is not None:
    subj_col=first_col(datasets["DM"],['usubjid','subjid','subjectid','subject_id','subjno'])
    site_col=first_col(datasets["DM"],['site','siteid','site_id','SITE','SITEID','SITE_ID'])
    subjects=pd.DataFrame({"SUBJID":datasets["DM"][subj_col].astype(str).str.strip()})
  


In [17]:
subjects["USUBJID"] = STUDYID + "-" + subjects["SUBJID"].astype(str)

In [18]:
#Function to pull all the date columns and append them into a single list 

def collect_dates(df):
    """
    Return list of dicts: {"SUBJID": str, "DATE": Timestamp} for valid parsed dates in df.
    Safe: handles missing subject column, lower-cases column checks, skips invalid parses.
    """
    if df is None:
        return []

    # find subject column (using your first_col helper which returns lowercased name)
    sub_col = first_col(df, ["usubjid","subjid","subjectid","subject_id","subjno","subject"])
    if sub_col is None:
        # nothing to do if no subject id column
        return []

    # detect likely date columns (case-insensitive)
    date_cols = [c for c in df.columns if any(k in c.lower() for k in ["date","dtc","dth","start","end","brthdt"])]
    if not date_cols:
        return []

    rows = []
    for _, r in df.iterrows():
        raw_sub = r.get(sub_col, None)
        if pd.isna(raw_sub):
            continue
        sub = str(raw_sub).strip()
        if not sub:
            continue

        for dc in date_cols:
            raw_val = r.get(dc, None)
            if pd.isna(raw_val):
                continue
            parsed = pd.to_datetime(raw_val, errors="coerce")
            if pd.isna(parsed):
                # skip invalid/partial that couldn't be parsed
                continue
            rows.append({"SUBJID": sub, "DATE": parsed, "SRC": getattr(df, "__name__", "") or dc})
    return rows

In [19]:
# Create a variable list called all dates to hold all dates as a list
all_dates=[]
for df in [dm_raw, ae_raw, ex_raw, ds_raw, cm_raw]:
    all_dates += collect_dates(df)

In [20]:
dates_df=pd.DataFrame(all_dates)

In [21]:
# From the dates list derive RFSDTC and RFENDTC by taking the min and max of the dates
if not dates_df.empty:
    minmax=dates_df.groupby("SUBJID")["DATE"].agg(["min","max"]).reset_index()
    minmax.rename(columns={"min":"RFSTDC","max":"RFENDTC"},inplace=True)
    minmax["RFSTDC"]=minmax["RFSTDC"].apply(to_iso)
    minmax["RFENDTC"]=minmax["RFENDTC"].apply(to_iso)
    subjects = subjects.merge(minmax, on="SUBJID", how="left")
else:
    subjects["RFSTDTC"] = np.nan
    subjects["RFENDTC"] = np.nan

In [22]:
# ---------- pull remaining DM variables ----------

subjects["AGEU"]= "YEARS"
subjects["DTHDTC"]=pull_var(subjects,datasets,"DTHDTC",["death","dth"]) #usually available in DD(death details dataset)


In [23]:
# Many studies store actual exposure dates in EX. If present we create RFXSTDTC and RFXENDTC
# by taking the first start and last end per subject (simple and common approach).
ex_raw 
if ex_raw is not None:
    sub_col = first_col(ex_raw, ["usubjid","subjid","subjectid","sub_id","subjno"])
    start_col = first_col(ex_raw, ["exstdt","exstdtc","exstart","start"])
    end_col = first_col(ex_raw, ["exendt","exendtc","exend","end"])
    ex_tmp = ex_raw[[c for c in [sub_col,start_col,end_col] if c]].rename(columns={sub_col:"SUBJID"})
    if start_col: 
        ex_tmp["RFXSTDTC"] = ex_tmp[start_col].apply(to_iso)
    if end_col: 
        ex_tmp["RFXENDTC"] = ex_tmp[end_col].apply(to_iso)
    
    agg = (
    ex_tmp.groupby("SUBJID")
          .agg({"RFXSTDTC": "first", "RFXENDTC": "last"})
          .reset_index()
)
subjects = subjects.merge(agg, on="SUBJID", how="left")


In [24]:
# ---------- pull remaining DM variables ----------
subjects["AGEU"] = "YEARS"
subjects["DTHDTC"]=pull_var(subjects,datasets,"DTHDTC",["death","dth"])
subjects["DTHFL"]=subjects["DTHDTC"].apply(lambda x:"Y" if pd.notna(x) else np.nan)            

In [25]:
subjects["SITEID"] = pull_var(subjects, datasets,"SITEID", ["site","siteid"])

In [26]:
subjects["INVNAM"] = pull_var(subjects, datasets, "INVNAM", ["investigator","inv_name"])

In [27]:
subjects["COUNTRY"] = pull_var(subjects, datasets, "COUNTRY", ["country","ctry"])

In [28]:
subjects["BRTHDTC"] = pull_var(subjects, datasets,"BRTHDTC", ["brthdt","birth_date","dob"])

In [29]:
subjects["AGE"]=pull_var(subjects,datasets,"AGE",["age","AGE"])

  subjects[varname] = subjects[varname].combine_first(subjects.get(f"{varname}_tmp"))


In [30]:
subjects["SEX"]=pull_var(subjects,datasets,"SEX",["SEX","sex","gender"]).astype(str).str.upper().replace({"MALE":"M","FEMALE":"F"})

In [31]:
subjects["RACE"]=pull_var(subjects,datasets,"RACE",["race","RACE"])

In [32]:
subjects["ETHNIC"]=pull_var(subjects,datasets,"ETHNIC", ["ethnic","ethnicity"])

In [33]:
subjects["ARM"]=pull_var(subjects,datasets,"ARM",["arm","treatment"])

In [34]:
subjects["ARMCD"] = subjects["ARM"].astype(str).str.upper().str.replace(r"\s+","_", regex=True)
subjects["ACTARM"] = subjects["ARM"]
subjects["ACTARMCD"] = subjects["ACTARM"].astype(str).str.upper().str.replace(r"\s+","_", regex=True)

In [35]:
dm_out = pd.DataFrame({
    "STUDYID": STUDYID,
    "DOMAIN": "DM",
    "USUBJID": subjects["USUBJID"],
    "SUBJID": subjects["SUBJID"],
    "RFSTDTC": subjects.get("RFSTDTC",np.nan),
    "RFENDTC": subjects.get("RFENDTC",np.nan),
    "RFXSTDTC": subjects.get("RFXSTDTC", np.nan),
    "RFXENDTC": subjects.get("RFXENDTC", np.nan),
    "RFICDTC": np.nan,
    "DTHDTC": subjects["DTHDTC"],
    "DTHFL": subjects["DTHFL"],
    "SITEID": subjects["SITEID"],
    "INVNAM": subjects["INVNAM"],
    "BRTHDTC": subjects["BRTHDTC"],
    "AGE": subjects["AGE"],
    "AGEU": subjects["AGEU"],
    "SEX": subjects["SEX"],
    "RACE": subjects["RACE"],
    "ETHNIC": subjects["ETHNIC"],
    "ARMCD": subjects["ARMCD"],
    "ARM": subjects["ARM"],
    "ACTARMCD": subjects["ACTARMCD"],
    "ACTARM": subjects["ACTARM"],
    "COUNTRY": subjects["COUNTRY"]
})

In [36]:
dm_out.to_csv(OUT_DM,index=False)