In [1]:
import pandas as pd
import numpy as np

In [2]:
# read in the data
data = pd.read_csv('data_2018.csv')

In [3]:
from sklearn.ensemble import RandomForestClassifier
best_rf = RandomForestClassifier(bootstrap=True, max_depth=50, max_features='log2', 
                                 n_estimators=200,
                                 min_samples_leaf=4, 
                                 min_samples_split=8,
                                 random_state=99)

In [4]:
best_rf.fit(data.drop(columns=['mntlhlth', 'marital']), y=data['marital'])

In [5]:
data['propensity'] = best_rf.predict_proba(data.drop(columns=['mntlhlth', 'marital']))[:,1]
data.shape

(1393, 11)

In [6]:
# remove propensity score that is 1 or zero
data = data.loc[~data['propensity'].isin([1,0])]
data.shape

(1393, 11)

In [7]:
def ipw_cal(propensity_score, marital_status):
    '''
    Calculates IPW score for given propensity score and marital status
    :param propensity_score: propensity score
    :param marital_status: marital status
    :return: get the inverse propensity score weights
    '''
    if marital_status == 1:
        weighting = 1/propensity_score

    if marital_status == 0:
        weighting = 1/(1-propensity_score)

    return weighting

In [8]:
data['weighting'] = [ipw_cal(x, y) for x, y in zip(data['propensity'], data['marital'])]

In [9]:
# outcome model
import statsmodels.api as sm
Y = np.array(data['mntlhlth'],dtype=float)
X = np.array(data.drop(columns=['mntlhlth', 'propensity', 'weighting']),dtype=float)
weights = np.array(data['weighting'],dtype=float)
wls_model = sm.WLS(Y,X, weights=weights)
wls_model.exog_names[:] = list(data.drop(columns=['mntlhlth', 'propensity', 'weighting']).columns)
results = wls_model.fit()
results.params

array([ 1.60441640e-04, -1.33557685e+00, -7.64815883e-03,  1.77868411e+00,
        1.43603584e-01,  4.38308955e-01, -8.00892622e-01, -2.50706766e+00,
       -2.47362755e+00])

In [10]:
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.062
Model:,WLS,Adj. R-squared:,0.057
Method:,Least Squares,F-statistic:,11.46
Date:,"Sat, 04 May 2024",Prob (F-statistic):,7.61e-16
Time:,10:20:06,Log-Likelihood:,-4617.4
No. Observations:,1393,AIC:,9253.0
Df Residuals:,1384,BIC:,9300.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Unnamed: 0,0.0002,0.000,0.597,0.551,-0.000,0.001
marital,-1.3356,0.354,-3.775,0.000,-2.030,-0.642
degree,-0.0076,0.152,-0.050,0.960,-0.306,0.291
satfin,1.7787,0.251,7.081,0.000,1.286,2.271
neisafe,0.1436,0.279,0.514,0.607,-0.404,0.692
relpersn,0.4383,0.178,2.464,0.014,0.089,0.787
race_is_white,-0.8009,1.463,-0.547,0.584,-3.672,2.070
race_is_black,-2.5071,1.504,-1.667,0.096,-5.458,0.443
race_is_other,-2.4736,1.496,-1.654,0.098,-5.407,0.460

0,1,2,3
Omnibus:,723.483,Durbin-Watson:,2.023
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4022.249
Skew:,2.466,Prob(JB):,0.0
Kurtosis:,9.706,Cond. No.,58200.0
