## 1. Calculate Unadjusted Odds Ratios

Disclaimer:
- The dataset shared in this notebook is a **synthetic dataset** created for demonstration purposes only.
- All results presented in our paper are based on the Merative™ MarketScan® Databases, which contain de-identified real-world healthcare data and cannot be publicly shared.
- For details about the actual maternal and neonatal datasets, please refer to the **Data Availability** and **Methods** sections of our paper.


In [11]:
import os
import sys
import warnings
# Function to set thread limits for external libraries to avoid oversubscription in the shared server
def set_threads_for_external_libraries(n_threads=1):
    if ("numpy" in sys.modules) or ("scipy" in sys.modules) or ("sklearn" in sys.modules):
        warnings.warn("Call set_threads_for_external_libraries() before importing numpy/scipy/sklearn for full effect.")
    for k in ["OMP_NUM_THREADS","OPENBLAS_NUM_THREADS","MKL_NUM_THREADS","VECLIB_MAXIMUM_THREADS","NUMEXPR_NUM_THREADS"]:
        os.environ[k] = str(n_threads)
        
set_threads_for_external_libraries(n_threads=64)



In [12]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.tools.sm_exceptions import PerfectSeparationError
import time

In [13]:
## Load the dataset (This is a synthetic dataset for demonstration purposes. Please read "Data Availability" section, and replace it with your actual dataset.)
df = pd.read_csv('../0_data/synthetic_baby_mom_data.csv')
## make a dataframe for testing
baby_cols_sel = ['RDS_Baby','NAS_Baby','Postmaturity_Baby','ROP_Baby','SGA_Baby']
med_cols_sel = ['Ondansetron, Oral','Sertraline, Oral','Oxycodone, Oral']
df_sel = df[baby_cols_sel + med_cols_sel]
df_sel.shape

(50001, 8)

In [14]:
## Prepare a function to infer baby and medication columns
def infer_columns(df, outcome_suffix="_Baby", med_start_col=None):
    if med_start_col is not None:
        baby_cols = [i for i in df.columns if i.__contains__('_Baby')]
        start_idx = df.columns.get_loc(med_start_col)
        med_cols  = list(df.columns[start_idx:])
        odds_df_cols = baby_cols + med_cols
        odds_df = df[odds_df_cols]
    else:
        print("Please check the column names to infer the split between baby and medication columns.")
        pass
    return start_idx, list(baby_cols), list(med_cols), odds_df

In [15]:
med_start_loc, baby_cols, med_cols, odds_df = infer_columns(df_sel, outcome_suffix="_Baby", med_start_col='Ondansetron, Oral')
odds_df

Unnamed: 0,RDS_Baby,NAS_Baby,Postmaturity_Baby,ROP_Baby,SGA_Baby,"Ondansetron, Oral","Sertraline, Oral","Oxycodone, Oral"
0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
49996,0,0,0,0,0,0,0,0
49997,0,0,0,0,0,0,0,0
49998,0,0,0,0,0,0,0,0
49999,0,0,0,1,0,0,0,0


In [16]:
## Compute unadjusted odds ratios
def compute_unadjusted_or(odds_df, baby_cols, med_cols, med_start_loc, min_count=10, verbose_every=500):
    """
    Returns a DataFrame with Disease, Medication, OR, p-value, Count, 95% CI
    Assumes binary 0/1 columns.
    """
    results = []
    n_done = 0

    for med in med_cols:
        for dz in baby_cols:
            # co-occurrence count (both 1)
            count = int(((odds_df[dz] != 0) & (odds_df[med] != 0)).sum())
            if count <= min_count:
                continue

            # Logistic regression: dz ~ med (unadjusted)
            # Use Q() to safely reference arbitrary column names in formula
            formula = f"Q('{dz}') ~ Q('{med}')"
            print(formula)
            try:
                reg = smf.logit(formula, data=odds_df).fit(disp=0)
                pval = float(reg.pvalues.get(f"Q('{med}')", np.nan))
                coef = float(reg.params.get(f"Q('{med}')", np.nan))
                se   = float(reg.bse.get(f"Q('{med}')", np.nan))
                OR   = float(np.exp(coef)) if np.isfinite(coef) else np.nan

                # confint returns 2 x k DataFrame
                ci = reg.conf_int(alpha=0.05)
                if f"Q('{med}')" in ci.index:
                    ll, ul = ci.loc[f"Q('{med}')", :].tolist()
                    OR_LL, OR_UL = float(np.exp(ll)), float(np.exp(ul))
                    print(ll,ul,OR_LL,OR_UL)
                else:
                    OR_LL = OR_UL = np.nan

            except PerfectSeparationError:
                pval = OR = se = OR_LL = OR_UL = np.nan
            except Exception:
                pval = OR = se = OR_LL = OR_UL = np.nan

            results.append({
                "Disease": dz,
                "Medication": med,
                "odds ratio": OR,
                "p-val": pval,
                "Count": count,
                "95% CI (LL)": OR_LL,
                "95% CI (UL)": OR_UL,
            })

            n_done += 1
            if verbose_every and (n_done % verbose_every == 0):
                print(f"... processed {n_done} pairs")

    return pd.DataFrame(results)

In [17]:
unadjusted_odds = compute_unadjusted_or(odds_df, baby_cols, med_cols,med_start_loc, min_count=10, verbose_every=500)
unadjusted_odds


Q('RDS_Baby') ~ Q('Ondansetron, Oral')
-0.36001969280160995 0.37482862039020715 0.6976625870048346 1.4547420801233466
Q('Postmaturity_Baby') ~ Q('Ondansetron, Oral')
-0.31334359968347225 0.44676579309003295 0.7309986984913945 1.5632481328940253
Q('SGA_Baby') ~ Q('Ondansetron, Oral')
-0.5497936774964362 0.5999557206672496 0.5770688603907208 1.8220381199720863
Q('RDS_Baby') ~ Q('Sertraline, Oral')
-0.4508826975427133 0.40478069005577383 0.637065567151665 1.4989737241623307
Q('Postmaturity_Baby') ~ Q('Sertraline, Oral')
-0.4395760507110314 0.45660375336773057 0.6443095177514082 1.578703204467751
Q('RDS_Baby') ~ Q('Oxycodone, Oral')
-0.49285877721839144 0.7162186194704143 0.6108775327989427 2.0466792864102277


Unnamed: 0,Disease,Medication,odds ratio,p-val,Count,95% CI (LL),95% CI (UL)
0,RDS_Baby,"Ondansetron, Oral",1.007432,0.968493,30,0.697663,1.454742
1,Postmaturity_Baby,"Ondansetron, Oral",1.068987,0.730821,28,0.730999,1.563248
2,SGA_Baby,"Ondansetron, Oral",1.025398,0.931855,12,0.577069,1.822038
3,RDS_Baby,"Sertraline, Oral",0.977213,0.9159,22,0.637066,1.498974
4,Postmaturity_Baby,"Sertraline, Oral",1.00855,0.970294,20,0.64431,1.578703
5,RDS_Baby,"Oxycodone, Oral",1.118155,0.717296,11,0.610878,2.046679


In [18]:
def Benjamini_Hochberg_correction(df, p_value_column,p_value=0.05):
    """
    Benjamini-Hochberg correction for multiple hypothesis testing.
    
    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing p-values.
    p_value_column : str
        Column name of p-values.
    q_value_column : str
        Column name of q-values.
        
    Returns
    -------
    None
    """
    df = df.sort_values(by=p_value_column, ascending=True)
    df = df.reset_index().drop(columns=['index'],axis=1)
    df['k']=df.index+1
    df['m']=df.shape[0]
    df['a']=0.05
    df['B-H critical value']=df['k']*df['a']/df['m']
    df['BH-significance']=(df['p-val']<df['B-H critical value'])
    BH_true_df = df[df['BH-significance']==True]
    return BH_true_df

In [19]:
Benjamini_Hochberg_correction(unadjusted_odds,'p-val').to_csv('../0_data/results/unadjusted_odds_results.csv',index=False)
