In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

In [3]:
all_data = pd.read_excel("De-identified PMAD data.xlsx")

In [4]:
# Extract relevant variables for model fitting
outcome = 'PHQ9_risk2'
data = all_data[['MOM_AGE','MOM_RACE','ETHNIC_GROUP','MARITAL_STATUS','FINANCIAL_CLASS',
                 'LBW','PTB',
                 'DELIVERY_METHOD','NICU_ADMIT','MFCU_ADMIT',
                 'PREE','GDM','GHTN',
                 'MOM_BMI','MOM_LOS','CHILD_LOS',
                 'HIST_ANXIETY','HIST_DEPRESS','HIST_BIPOLAR','HIST_PMAD','MENTAL_HEALTH_DX_CUTOFF',
                 'MED_PSYCH','MED_CARDIO',
                 outcome,'PHQ9_VALUE','PMAD_risk']]

In [5]:
data = data.dropna() # keep only complete data
data = data.drop(['PHQ9_VALUE','PMAD_risk'], axis=1)
print("N:",data.shape)

N: (11377, 24)


In [6]:
# get dummy variables
data = pd.get_dummies(data)

In [9]:
# split into X and y
X = data.drop([outcome], axis=1)
Y = data[[outcome]]

race = data[['MOM_RACE_Asian or Native Hawaiian or Other Pacific Islander',
             'MOM_RACE_Black or African American',
             'MOM_RACE_Multiracial',
             'MOM_RACE_Other',
             'MOM_RACE_Unknown',
             'MOM_RACE_White',
             'MOM_RACE_Hispanic White']]
strat_df = pd.concat([Y,race],axis=1)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.90, test_size=0.10, shuffle=True, stratify=strat_df, random_state=2024)

In [10]:
# Split the data into training and validation sets
race = X_train[['MOM_RACE_Asian or Native Hawaiian or Other Pacific Islander',
                'MOM_RACE_Black or African American',
                'MOM_RACE_Multiracial',
                'MOM_RACE_Other',
                'MOM_RACE_Unknown',
                'MOM_RACE_White',
                'MOM_RACE_Hispanic White']]
strat_df = pd.concat([y_train,race],axis=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.85, test_size=0.15, shuffle=True, stratify=strat_df, random_state=0)

In [12]:
print(X_train.head())

       MOM_AGE  FINANCIAL_CLASS  LBW  PTB  DELIVERY_METHOD  NICU_ADMIT  \
5471      33.0              0.0  0.0  0.0              0.0         0.0   
587       33.0              0.0  0.0  0.0              0.0         0.0   
9875      27.0              0.0  0.0  0.0              0.0         0.0   
14360     37.0              0.0  0.0  0.0              1.0         0.0   
13343     29.0              0.0  0.0  0.0              0.0         0.0   

       MFCU_ADMIT  PREE  GDM  GHTN  ...  ETHNIC_GROUP_Non-Hispanic  \
5471          0.0   0.0  0.0   0.0  ...                          1   
587           0.0   0.0  0.0   0.0  ...                          1   
9875          0.0   0.0  0.0   0.0  ...                          1   
14360         0.0   0.0  0.0   0.0  ...                          1   
13343         0.0   0.0  0.0   0.0  ...                          1   

       ETHNIC_GROUP_Unknown  MARITAL_STATUS_Divorced  \
5471                      0                        0   
587                   

In [19]:
rus = RandomUnderSampler(sampling_strategy = "auto", random_state=0)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
y_train_rus.value_counts()

PHQ9_risk2
0.0           383
1.0           383
dtype: int64

In [25]:
print(X_train.shape)
print(X_train_rus.shape)
print(X_val.shape)
print(X_test.shape)

(8703, 38)
(766, 38)
(1536, 38)
(1138, 38)


In [26]:
print(X_train.columns)

Index(['MOM_AGE', 'FINANCIAL_CLASS', 'LBW', 'PTB', 'DELIVERY_METHOD',
       'NICU_ADMIT', 'MFCU_ADMIT', 'PREE', 'GDM', 'GHTN', 'MOM_BMI', 'MOM_LOS',
       'CHILD_LOS', 'HIST_ANXIETY', 'HIST_DEPRESS', 'HIST_BIPOLAR',
       'HIST_PMAD', 'MENTAL_HEALTH_DX_CUTOFF', 'MED_PSYCH', 'MED_CARDIO',
       'MOM_RACE_Asian or Native Hawaiian or Other Pacific Islander',
       'MOM_RACE_Black or African American', 'MOM_RACE_Hispanic White',
       'MOM_RACE_Multiracial', 'MOM_RACE_Other', 'MOM_RACE_Unknown',
       'MOM_RACE_White', 'ETHNIC_GROUP_Hispanic', 'ETHNIC_GROUP_Non-Hispanic',
       'ETHNIC_GROUP_Unknown', 'MARITAL_STATUS_Divorced',
       'MARITAL_STATUS_Domestic Partner', 'MARITAL_STATUS_Legally Separated',
       'MARITAL_STATUS_Married', 'MARITAL_STATUS_Significant Other',
       'MARITAL_STATUS_Single', 'MARITAL_STATUS_Unknown',
       'MARITAL_STATUS_Widowed'],
      dtype='object')


In [28]:
print(y_train.head())
y = pd.Series(y_train)
print(y.head())

       PHQ9_risk2
5471          0.0
587           0.0
9875          0.0
14360         0.0
13343         0.0


ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().