## 0. Generating a small synthetic mother-baby cohort for the demonstration

In [18]:
import pandas as pd
import numpy as np

In [29]:
# Number of rows in the synthetic cohort.
n = 50_001

# Column layout of the synthetic table, in output order.
column_names = [
    # identifier plus demographic / gestational fields
    'ENROLID_BABY', 'SEX', 'REGION', 'LOCATION', 'GESTATIONAL_AGE', 'AGE_MOM',
    # columns suffixed "_Baby"
    'RDS_Baby', 'NAS_Baby', 'Postmaturity_Baby', 'ROP_Baby', 'SGA_Baby',
    # columns suffixed "_Mom"
    'Anemia_Mom', 'Asthma_Mom', 'SUD_Alcohol_Mom', 'Anxiety_Mom',
    'Bipolar_Disorder_Mom', 'PTB_Mom', 'Autoimmune_Mom', 'APLS_Mom', 'STD_Mom',
    'Hyperemesis_Gravidarum_Mom', 'Headache_Mom', 'ADHD_Mom', 'Depression_Mom',
    'Eclampsia_Mom', 'Epilepsy_Mom', 'Infertility_Mom', 'GDM_Mom',
    # columns named after oral medications
    'Ondansetron, Oral', 'Sertraline, Oral', 'Oxycodone, Oral',
]

# Start from an all-NaN float frame; later cells overwrite every column.
syn_df = pd.DataFrame(
    np.full((n, len(column_names)), np.nan),
    index=range(n),
    columns=column_names,
)

In [30]:
# Display the frame as the cell output (Jupyter renders the last expression of a cell).
syn_df

Unnamed: 0,ENROLID_BABY,SEX,REGION,LOCATION,GESTATIONAL_AGE,AGE_MOM,RDS_Baby,NAS_Baby,Postmaturity_Baby,ROP_Baby,...,Headache_Mom,ADHD_Mom,Depression_Mom,Eclampsia_Mom,Epilepsy_Mom,Infertility_Mom,GDM_Mom,"Ondansetron, Oral","Sertraline, Oral","Oxycodone, Oral"
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49996,,,,,,,,,,,...,,,,,,,,,,
49997,,,,,,,,,,,...,,,,,,,,,,
49998,,,,,,,,,,,...,,,,,,,,,,
49999,,,,,,,,,,,...,,,,,,,,,,


In [31]:
import uuid  # no longer used in this cell; kept in case another cell relies on the import

# Seed NumPy's global generator so that "Restart Kernel -> Run All" reproduces
# the same synthetic cohort (this and every later np.random draw).
np.random.seed(42)

# Same ID format as before ('ENR_' + 5 uppercase hex digits), but the codes are
# sampled WITHOUT replacement. Five hex digits only allow 16**5 = 1,048,576
# distinct values, so independent random draws (e.g. uuid4().hex[:5]) are expected
# to yield on the order of a thousand duplicate IDs at ~50k rows.
# np.random.choice raises ValueError if the frame has more rows than 16**5.
syn_df['ENROLID_BABY'] = [
    f'ENR_{code:05X}'
    for code in np.random.choice(16**5, size=len(syn_df), replace=False).tolist()
]
assert syn_df['ENROLID_BABY'].is_unique, "ENROLID_BABY must identify rows uniquely"

# Categorical codes drawn uniformly at random.
syn_df['SEX'] = np.random.choice([1, 2], size=len(syn_df))
syn_df['REGION'] = np.random.choice([1, 2, 3, 4, 5], size=len(syn_df))
# np.arange(1, 50) covers the codes 1..49 (the upper bound 50 is exclusive).
syn_df['LOCATION'] = np.random.choice(np.arange(1, 50), size=len(syn_df))


In [32]:
# Gestational age (presumably in weeks -- TODO confirm): normal draw rounded to
# the nearest integer. The sample size follows the frame instead of a hard-coded
# 50001, so changing the cohort size no longer causes a length-mismatch error.
syn_df['GESTATIONAL_AGE'] = (np.random.normal(loc=38.66, scale=1.52, size=len(syn_df))
                             .round()
                             .astype(int)
                             )

# Maternal age: normal draw, clipped to [20, 45], rounded to an integer.
# NOTE(review): scale=0.68 keeps almost every value within a few units of 32;
# confirm this is meant to be a standard deviation and not a standard error.
syn_df['AGE_MOM'] = (np.random.normal(loc=31.77, scale=0.68, size=len(syn_df))
                     .clip(20, 45)
                     .round()
                     .astype(int)
                     )

In [33]:

n = len(syn_df)  # number of rows in the synthetic cohort

# Target number of positives (rows equal to 1) for each binary column.
# Some counts arrived as floats, hence the final cast to int.
pos_counts = pd.Series({
    "RDS_Baby": 1722,
    "NAS_Baby": 42,
    "Postmaturity_Baby": 1519,
    "ROP_Baby": 360,
    "SGA_Baby": 677,
    "Anemia_Mom": 3715,
    "Asthma_Mom": 1423,
    "SUD_Alcohol_Mom": 25,
    "Anxiety_Mom": 1276,
    "Bipolar_Disorder_Mom": 141,
    "PTB_Mom": 4033,
    "Autoimmune_Mom": 269,
    "APLS_Mom": 323,
    "STD_Mom": 28,
    "Hyperemesis_Gravidarum_Mom": 654,
    "Headache_Mom": 1044,
    "ADHD_Mom": 67,
    "Depression_Mom": 1043,
    "Eclampsia_Mom": 82,
    "Epilepsy_Mom": 152,
    "Infertility_Mom": 62,
    "GDM_Mom": 7084,
    "Ondansetron, Oral": 865.0,
    "Sertraline, Oral": 653.0,
    "Oxycodone, Oral": 287.0,
}).astype(int)

for col, k in pos_counts.items():
    # Only fill columns the frame actually defines. The previous `pass` fell
    # through, so an unknown name would have been created as a new column by the
    # assignment below; `continue` makes the guard effective.
    if col not in syn_df.columns:
        continue

    # Start from all zeros, then flip exactly min(k, n) randomly chosen rows to 1.
    flags = np.zeros(n, dtype=np.int8)
    if k > 0:
        idx = np.random.choice(n, size=min(k, n), replace=False)
        flags[idx] = 1

    # Store as pandas' nullable 8-bit integer dtype; assignment is positional,
    # so it does not depend on the frame's index labels.
    syn_df[col] = pd.array(flags, dtype="Int8")



In [34]:
# Write the synthetic cohort to disk; index=False leaves the row index out of the file.
syn_df.to_csv(path_or_buf='synthetic_baby_mom_data.csv', index=False)