## 0. Generating a small synthetic mother-baby cohorts for the demonstration

In [1]:
import pandas as pd
import numpy as np

In [2]:
n = 500000
column_names = [
    'ENROLID_BABY','SEX','REGION','LOCATION','GESTATIONAL_AGE','AGE_MOM',
    'RDS_Baby','NAS_Baby','Postmaturity_Baby','ROP_Baby','SGA_Baby',
    'Anemia_Mom','Asthma_Mom','SUD_Alcohol_Mom','Anxiety_Mom','Bipolar_Disorder_Mom',
    'PTB_Mom','Autoimmune_Mom','APLS_Mom','STD_Mom','Hyperemesis_Gravidarum_Mom',
    'Headache_Mom','ADHD_Mom','Depression_Mom','Eclampsia_Mom','Epilepsy_Mom',
    'Infertility_Mom','GDM_Mom','Ondansetron, Oral','Sertraline, Oral','Oxycodone, Oral','Acetaminophen, Oral'
]

syn_df = pd.DataFrame(np.nan, index=range(n), columns=column_names)

In [3]:
syn_df

Unnamed: 0,ENROLID_BABY,SEX,REGION,LOCATION,GESTATIONAL_AGE,AGE_MOM,RDS_Baby,NAS_Baby,Postmaturity_Baby,ROP_Baby,...,ADHD_Mom,Depression_Mom,Eclampsia_Mom,Epilepsy_Mom,Infertility_Mom,GDM_Mom,"Ondansetron, Oral","Sertraline, Oral","Oxycodone, Oral","Acetaminophen, Oral"
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,,,,,,,,,,,...,,,,,,,,,,
499996,,,,,,,,,,,...,,,,,,,,,,
499997,,,,,,,,,,,...,,,,,,,,,,
499998,,,,,,,,,,,...,,,,,,,,,,


In [4]:
import uuid

syn_df['ENROLID_BABY'] = ['ENR_' + uuid.uuid4().hex[:5].upper() for _ in range(len(syn_df))]
syn_df['SEX'] = np.random.choice([1, 2], size=len(syn_df))
syn_df['REGION'] = np.random.choice([1, 2, 3, 4, 5], size=len(syn_df))
syn_df['LOCATION'] = np.random.choice(np.arange(1, 50), size=len(syn_df))


In [5]:
syn_df['GESTATIONAL_AGE'] = np.random.normal(loc=38.66, scale=1.52, size=500000)
syn_df['GESTATIONAL_AGE'] = syn_df['GESTATIONAL_AGE'].round().astype(int)
syn_df['AGE_MOM'] = (np.random.normal(loc=31.77, scale=0.68, size=len(syn_df))
                     .clip(20, 45)
                     .round()
                     .astype(int)
                     )

In [6]:
n = len(syn_df)  # 50001

# Defining positive counts per each column
pos_counts = pd.Series({
    "RDS_Baby": 17220,
    "NAS_Baby": 420,
    "Postmaturity_Baby": 15190,
    "ROP_Baby": 3600,
    "SGA_Baby": 6770,
    "Anemia_Mom": 37150,
    "Asthma_Mom": 14230,
    "SUD_Alcohol_Mom": 250,
    "Anxiety_Mom": 12760,
    "Bipolar_Disorder_Mom": 1410,
    "PTB_Mom": 40330,
    "Autoimmune_Mom": 2690,
    "APLS_Mom": 3230,
    "STD_Mom": 280,
    "Hyperemesis_Gravidarum_Mom": 6540,
    "Headache_Mom": 10440,
    "ADHD_Mom": 670,
    "Depression_Mom": 10430,
    "Eclampsia_Mom": 820,
    "Epilepsy_Mom": 1520,
    "Infertility_Mom": 620,
    "GDM_Mom": 70840,
    "Ondansetron, Oral": 8650,
    "Sertraline, Oral": 6530,
    "Oxycodone, Oral": 2870,
    "Acetaminophen, Oral": 21000
}).astype(int)

for col, k in pos_counts.items():
    if col not in syn_df.columns:
        pass
    else:
        syn_df[col] = 0  # set the column to 0 first

    # randomly select k indices to set to 1
    if k > 0:
        idx = np.random.choice(n, size=min(k, n), replace=False)
        syn_df.loc[idx, col] = 1

    syn_df[col] = syn_df[col].astype("Int8")



In [7]:
syn_df.sum()

ENROLID_BABY                  ENR_8AF1FENR_756BEENR_F24D4ENR_E1CBFENR_7D976E...
SEX                                                                      749653
REGION                                                                  1499933
LOCATION                                                               12498720
GESTATIONAL_AGE                                                        19329449
AGE_MOM                                                                15885090
RDS_Baby                                                                  17220
NAS_Baby                                                                    420
Postmaturity_Baby                                                         15190
ROP_Baby                                                                   3600
SGA_Baby                                                                   6770
Anemia_Mom                                                                37150
Asthma_Mom                              

In [8]:
# Add associations between RDS and Ondansetron
np.random.seed(42)

# Distinguish indices based on Ondansetron usage
ond_idx = syn_df.index[syn_df['Ondansetron, Oral'] == 1]
non_ond_idx = syn_df.index[syn_df['Ondansetron, Oral'] == 0]

# Set RDS rates based on Ondansetron usage
rds_rate_ond = 0.15  # 15%
rds_rate_non = 0.03  # 3%

# Randomly assign RDS_Baby based on the defined rates
syn_df.loc[ond_idx, 'RDS_Baby'] = np.random.binomial(1, rds_rate_ond, len(ond_idx))
syn_df.loc[non_ond_idx, 'RDS_Baby'] = np.random.binomial(1, rds_rate_non, len(non_ond_idx))

syn_df['RDS_Baby'] = syn_df['RDS_Baby'].astype('Int8')


In [9]:
syn_df.to_csv('synthetic_baby_mom_data.csv', index=False)