In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

In [15]:
# Load the analysis table from the CSV that sits next to this notebook.
data = pd.read_csv(filepath_or_buffer='data_2018_hlthmntl_marital_trim.csv')

In [16]:
# Random-forest settings (plus recorded train/test scores) stored as JSON;
# presumably written by an earlier hyperparameter search -- confirm the source.
fname = '2018_marital_trim_PS.json'
with open(fname) as infile:  # reading in text mode is open()'s default
    param_dict = json.loads(infile.read())

param_dict

{'bootstrap': True,
 'max_depth': 60,
 'max_features': 'log2',
 'min_samples_leaf': 6,
 'min_samples_split': 3,
 'n_estimators': 200,
 'test_score': 0.6809,
 'train_score': 0.728}

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Rebuild the classifier from the stored settings. Only the six estimator
# hyperparameters are forwarded; any other keys in param_dict (e.g. the
# recorded scores) are not constructor arguments and are left out.
best_rf = RandomForestClassifier(
    random_state=99,  # fixed seed so the fitted forest is reproducible
    **{
        key: param_dict[key]
        for key in (
            'bootstrap',
            'max_depth',
            'max_features',
            'n_estimators',
            'min_samples_leaf',
            'min_samples_split',
        )
    },
)

In [18]:
# Fit the propensity model: predict 'marital' from the covariates, leaving out
# the outcome column ('hlthmntl') and the target itself.
# Later cells add 'propensity' and 'weighting' columns to `data`; they are
# dropped here as well (only if present) so that re-running this cell after
# those cells cannot leak them into the feature matrix.
best_rf.fit(
    data.drop(columns=['hlthmntl', 'marital'])
        .drop(columns=['propensity', 'weighting'], errors='ignore'),
    y=data['marital'],
)

In [19]:
# Propensity score = predicted probability of the class in column 1 of
# predict_proba (i.e. best_rf.classes_[1]; presumably marital == 1 -- confirm
# the label coding).
# The feature matrix must match the one used for fitting, so the derived
# 'propensity' / 'weighting' columns are dropped when they already exist;
# this keeps the cell safe to re-run.
# NOTE(review): these are in-sample predictions on the same rows the forest was
# fitted on; consider out-of-fold predictions if overfit scores are a concern.
data['propensity'] = best_rf.predict_proba(
    data.drop(columns=['hlthmntl', 'marital'])
        .drop(columns=['propensity', 'weighting'], errors='ignore')
)[:, 1]
data.shape

(1633, 10)

In [20]:
# Remove rows whose propensity score is exactly 0 or 1: the inverse-propensity
# weights 1/p and 1/(1-p) computed below are undefined for those values.
# .copy() makes the filtered frame an independent object, so adding the
# 'weighting' column later does not trigger pandas' SettingWithCopyWarning
# when rows were actually removed.
data = data.loc[~data['propensity'].isin([1, 0])].copy()
data.shape

(1633, 10)

In [21]:
def ipw_cal(propensity_score, marital_status):
    '''
    Calculates the inverse propensity weight (IPW) for a single observation.

    :param propensity_score: propensity score p of the observation
    :param marital_status: marital status indicator, must equal 1 or 0
    :return: 1/p when marital_status is 1, 1/(1-p) when marital_status is 0
    :raises ValueError: if marital_status is neither 1 nor 0 (e.g. NaN or
        another code); previously this surfaced as an UnboundLocalError
    '''
    if marital_status == 1:
        weighting = 1/propensity_score
    elif marital_status == 0:
        weighting = 1/(1-propensity_score)
    else:
        # Fail loudly instead of falling through with `weighting` undefined.
        raise ValueError(
            'marital_status must be 0 or 1, got {!r}'.format(marital_status)
        )

    return weighting

In [22]:
# Row-wise inverse-propensity weight: ipw_cal is applied to each
# (propensity, marital) pair in row order.
data['weighting'] = list(map(ipw_cal, data['propensity'], data['marital']))

In [23]:
# outcome model
import statsmodels.api as sm

# Regressors are every column except the outcome and the two IPW helper
# columns, so 'marital' stays in the design matrix.
design = data.drop(columns=['hlthmntl', 'propensity', 'weighting'])

Y = np.array(data['hlthmntl'], dtype=float)
X = np.array(design, dtype=float)
weights = np.array(data['weighting'], dtype=float)

# Weighted least squares using the inverse-propensity weights; no constant
# column is added to X in this cell.
wls_model = sm.WLS(Y, X, weights=weights)
# The model was built from bare arrays, so its regressor labels are replaced
# in place with the real column names for a readable summary.
wls_model.exog_names[:] = list(design.columns)
results = wls_model.fit()
results.params

array([-0.16519747, -0.08827853,  0.26719421,  0.03807991,  0.09608193,
        1.75357641,  1.70677639,  1.82883981])

In [24]:
# Show the full regression table for the fitted WLS model (coefficients,
# standard errors, confidence intervals and fit diagnostics).
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.085
Model:,WLS,Adj. R-squared:,0.081
Method:,Least Squares,F-statistic:,21.51
Date:,"Sat, 11 May 2024",Prob (F-statistic):,6.73e-28
Time:,16:01:33,Log-Likelihood:,-2269.3
No. Observations:,1633,AIC:,4555.0
Df Residuals:,1625,BIC:,4598.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
marital,-0.1652,0.046,-3.584,0.000,-0.256,-0.075
degree,-0.0883,0.020,-4.493,0.000,-0.127,-0.050
satfin,0.2672,0.033,8.107,0.000,0.203,0.332
neisafe,0.0381,0.036,1.049,0.294,-0.033,0.109
relpersn,0.0961,0.023,4.179,0.000,0.051,0.141
race_is_white,1.7536,0.115,15.279,0.000,1.528,1.979
race_is_black,1.7068,0.131,13.039,0.000,1.450,1.964
race_is_other,1.8288,0.131,13.962,0.000,1.572,2.086

0,1,2,3
Omnibus:,28.726,Durbin-Watson:,1.942
Prob(Omnibus):,0.0,Jarque-Bera (JB):,29.819
Skew:,0.329,Prob(JB):,3.35e-07
Kurtosis:,3.08,Cond. No.,37.3
