## 1. Calculate Unadjusted Odds Ratios

Disclaimer:
- The dataset shared in this notebook is a **synthetic dataset** created for demonstration purposes only.
- All results presented in our paper are based on the Merative™ MarketScan® Databases, which contain de-identified real-world healthcare data and cannot be publicly shared.
- For details about the actual maternal and neonatal datasets, please refer to the **Data Availability** and **Methods** sections of our paper.


In [1]:
import os
import sys
import warnings
# Function to set thread limits for external libraries to avoid oversubscription in the shared server
def set_threads_for_external_libraries(n_threads=1):
    if ("numpy" in sys.modules) or ("scipy" in sys.modules) or ("sklearn" in sys.modules):
        warnings.warn("Call set_threads_for_external_libraries() before importing numpy/scipy/sklearn for full effect.")
    for k in ["OMP_NUM_THREADS","OPENBLAS_NUM_THREADS","MKL_NUM_THREADS","VECLIB_MAXIMUM_THREADS","NUMEXPR_NUM_THREADS"]:
        os.environ[k] = str(n_threads)
        
set_threads_for_external_libraries(n_threads=64)

In [2]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.tools.sm_exceptions import PerfectSeparationError
import time

In [3]:
## Load the dataset (This is a synthetic dataset for demonstration purposes. Please read "Data Availability" section, and replace it with your actual dataset.)
df = pd.read_csv('../0_data/synthetic_baby_mom_data.csv')
## make a dataframe for testing
baby_cols_sel = ['RDS_Baby','NAS_Baby','Postmaturity_Baby','ROP_Baby','SGA_Baby']
med_cols_sel = ['Ondansetron, Oral','Sertraline, Oral','Oxycodone, Oral']
df_sel = df[baby_cols_sel + med_cols_sel]
df_sel.shape

(500000, 8)

In [4]:
## Prepare a function to infer baby and medication columns
def infer_columns(df, outcome_suffix="_Baby", med_start_col=None):
    if med_start_col is not None:
        baby_cols = [i for i in df.columns if i.__contains__(outcome_suffix)]
        start_idx = df.columns.get_loc(med_start_col)
        med_cols  = list(df.columns[start_idx:])
        odds_df_cols = baby_cols + med_cols
        odds_df = df[odds_df_cols]
    else:
        print("Please check the column names to infer the split between baby and medication columns.")
        pass
    return start_idx, list(baby_cols), list(med_cols), odds_df

In [5]:
med_start_loc, baby_cols, med_cols, odds_df = infer_columns(df_sel, outcome_suffix="_Baby", med_start_col='Ondansetron, Oral')
odds_df

Unnamed: 0,RDS_Baby,NAS_Baby,Postmaturity_Baby,ROP_Baby,SGA_Baby,"Ondansetron, Oral","Sertraline, Oral","Oxycodone, Oral"
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
499995,0,0,0,0,0,0,0,0
499996,1,0,0,0,0,0,0,0
499997,0,0,0,0,0,0,0,0
499998,0,0,0,0,0,0,0,0


In [6]:
## Compute unadjusted odds ratios
def compute_unadjusted_or(odds_df, baby_cols, med_cols, med_start_loc, min_count=10, verbose_every=500):
    """
    Returns a DataFrame with Disease, Medication, OR, p-value, Count, 95% CI
    Assumes binary 0/1 columns.
    """
    results = []
    n_done = 0

    for med in med_cols:
        for dz in baby_cols:
            # co-occurrence count (both 1)
            count = int(((odds_df[dz] != 0) & (odds_df[med] != 0)).sum())
            if count <= min_count:
                continue

            # Logistic regression: dz ~ med (unadjusted)
            # Use Q() to safely reference arbitrary column names in formula
            formula = f"Q('{dz}') ~ Q('{med}')"
            print(formula)
            try:
                reg = smf.logit(formula, data=odds_df).fit(disp=0)
                pval = float(reg.pvalues.get(f"Q('{med}')", np.nan))
                coef = float(reg.params.get(f"Q('{med}')", np.nan))
                se   = float(reg.bse.get(f"Q('{med}')", np.nan))
                OR   = float(np.exp(coef)) if np.isfinite(coef) else np.nan

                # confint returns 2 x k DataFrame
                ci = reg.conf_int(alpha=0.05)
                if f"Q('{med}')" in ci.index:
                    ll, ul = ci.loc[f"Q('{med}')", :].tolist()
                    OR_LL, OR_UL = float(np.exp(ll)), float(np.exp(ul))
                    print(ll,ul,OR_LL,OR_UL)
                else:
                    OR_LL = OR_UL = np.nan

            except PerfectSeparationError:
                pval = OR = se = OR_LL = OR_UL = np.nan
            except Exception:
                pval = OR = se = OR_LL = OR_UL = np.nan

            results.append({
                "Disease": dz,
                "Medication": med,
                "odds ratio": OR,
                "p-val": pval,
                "Count": count,
                "95% CI (LL)": OR_LL,
                "95% CI (UL)": OR_UL,
            })

            n_done += 1
            if verbose_every and (n_done % verbose_every == 0):
                print(f"... processed {n_done} pairs")

    return pd.DataFrame(results)

In [7]:
unadjusted_odds = compute_unadjusted_or(odds_df, baby_cols, med_cols,med_start_loc, min_count=10, verbose_every=500)
unadjusted_odds


Q('RDS_Baby') ~ Q('Ondansetron, Oral')


1.6543060263617506 1.777860715032336 5.229449562497881 5.917184325336613
Q('Postmaturity_Baby') ~ Q('Ondansetron, Oral')
-0.16154705771075234 0.09021986562601657 0.8508264925763235 1.094414881467688
Q('ROP_Baby') ~ Q('Ondansetron, Oral')
-0.48156978705413905 0.07290198395271325 0.6178127956623125 1.0756251032267106
Q('SGA_Baby') ~ Q('Ondansetron, Oral')
-0.3954954891488951 0.007669002537672248 0.6733463207833594 1.0076984846555894
Q('RDS_Baby') ~ Q('Sertraline, Oral')
-0.07635060098590538 0.19377935105694508 0.9264913208395642 1.2138284234471775
Q('Postmaturity_Baby') ~ Q('Sertraline, Oral')
-0.10571784904123031 0.1743575034678847 0.8996784574073091 1.1904810908709273
Q('ROP_Baby') ~ Q('Sertraline, Oral')
-0.2891634690937996 0.28846872608982493 0.7488897750632683 1.3343826175082218
Q('SGA_Baby') ~ Q('Sertraline, Oral')
-0.22949870284050097 0.1963351763034379 0.7949319997907667 1.2169347246673492
Q('RDS_Baby') ~ Q('Oxycodone, Oral')
-0.19417226995223433 0.22026131407672275 0.82351602498

Unnamed: 0,Disease,Medication,odds ratio,p-val,Count,95% CI (LL),95% CI (UL)
0,RDS_Baby,"Ondansetron, Oral",5.562699,0.0,1269,5.22945,5.917184
1,Postmaturity_Baby,"Ondansetron, Oral",0.964965,0.57871,254,0.850826,1.094415
2,ROP_Baby,"Ondansetron, Oral",0.81519,0.148578,51,0.617813,1.075625
3,SGA_Baby,"Ondansetron, Oral",0.823729,0.059376,97,0.673346,1.007698
4,RDS_Baby,"Sertraline, Oral",1.060472,0.394203,221,0.926491,1.213828
5,Postmaturity_Baby,"Sertraline, Oral",1.034916,0.630986,205,0.899678,1.190481
6,ROP_Baby,"Sertraline, Oral",0.999653,0.998119,47,0.74889,1.334383
7,SGA_Baby,"Sertraline, Oral",0.983555,0.878682,87,0.794932,1.216935
8,RDS_Baby,"Oxycodone, Oral",1.01313,0.901805,93,0.823516,1.246402
9,Postmaturity_Baby,"Oxycodone, Oral",0.938385,0.571356,82,0.752922,1.169531


In [8]:
def Benjamini_Hochberg_correction(df, p_value_column,p_value=0.05):
    """
    Benjamini-Hochberg correction for multiple hypothesis testing.
    
    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing p-values.
    p_value_column : str
        Column name of p-values.
    q_value_column : str
        Column name of q-values.
        
    Returns
    -------
    None
    """
    df = df.sort_values(by=p_value_column, ascending=True)
    df = df.reset_index().drop(columns=['index'],axis=1)
    df['k']=df.index+1
    df['m']=df.shape[0]
    df['a']=0.05
    df['B-H critical value']=df['k']*df['a']/df['m']
    df['BH-significance']=(df[p_value_column]<df['B-H critical value'])
    BH_true_df = df[df['BH-significance']==True]
    return BH_true_df

In [9]:
Benjamini_Hochberg_correction(unadjusted_odds,'p-val')


Unnamed: 0,Disease,Medication,odds ratio,p-val,Count,95% CI (LL),95% CI (UL),k,m,a,B-H critical value,BH-significance
0,RDS_Baby,"Ondansetron, Oral",5.562699,0.0,1269,5.22945,5.917184,1,12,0.05,0.004167,True


In [10]:
## Save the result
Benjamini_Hochberg_correction(unadjusted_odds,'p-val').to_csv('../0_data/results/unadjusted_odds_results.csv',index=False)
