In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the analysis dataset from a CSV file located next to the notebook
# (relative path, resolved against the kernel's working directory).
data = pd.read_csv(filepath_or_buffer='data_2018.csv')

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Classifier used as the propensity model. The seed is fixed so that repeated
# fits on the same data give the same forest.
# NOTE(review): the name suggests these hyperparameters came from a tuning run
# that is not part of this notebook -- confirm their provenance.
best_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=50,
    max_features='log2',
    min_samples_split=8,
    min_samples_leaf=4,
    bootstrap=True,
    random_state=99,
)

In [4]:
# Fit the propensity model: the treatment indicator `marital` is predicted from
# the covariates. The outcome (`hlthmntl`) and the treatment itself are excluded
# from the features. The derived columns that later cells add to `data`
# (`propensity`, `weighting`) are dropped as well when they exist, so re-running
# this cell after those cells cannot leak the model's own output into training.
best_rf.fit(
    data.drop(columns=['hlthmntl', 'marital'])
        .drop(columns=['propensity', 'weighting'], errors='ignore'),
    y=data['marital'],
)

In [5]:
# Propensity score = predicted probability of marital == 1 given the covariates.
# The feature frame is rebuilt exactly as in the fitting cell; derived columns
# (`propensity`, `weighting`) are removed when present so the cell can be re-run
# without a feature mismatch.
covariates = (
    data.drop(columns=['hlthmntl', 'marital'])
        .drop(columns=['propensity', 'weighting'], errors='ignore')
)
# predict_proba orders its columns like `classes_`; look up the column of class 1
# instead of assuming it is the second one (raises ValueError if 1 is not a class).
treated_col = list(best_rf.classes_).index(1)
# NOTE(review): scores are predicted on the same rows the forest was fitted on;
# consider out-of-fold predictions if in-sample overfitting is a concern.
data['propensity'] = best_rf.predict_proba(covariates)[:, treated_col]
data.shape

(2296, 10)

In [6]:
# Drop rows whose propensity score is exactly 0 or 1: for those rows the inverse
# weight 1/p or 1/(1-p) computed later would divide by zero.
# `.copy()` makes the filtered result an independent frame, so adding columns to
# it in later cells does not raise pandas' SettingWithCopyWarning.
data = data.loc[~data['propensity'].isin([1, 0])].copy()
data.shape

(2296, 10)

In [7]:
def ipw_cal(propensity_score, marital_status):
    '''
    Calculates the inverse propensity weight (IPW) for a single observation.

    Treated observations (marital_status == 1) are weighted by 1 / p and
    untreated observations (marital_status == 0) by 1 / (1 - p), where p is
    the propensity score.

    :param propensity_score: estimated probability of marital_status == 1;
        callers are expected to have removed scores of exactly 0 or 1, since
        those make the corresponding weight a division by zero
    :param marital_status: treatment indicator, must equal 0 or 1
    :return: the inverse propensity score weight
    :raises ValueError: if marital_status is neither 0 nor 1 (e.g. NaN or a
        different coding), instead of failing on an unassigned local variable
    '''
    if marital_status == 1:
        return 1 / propensity_score
    elif marital_status == 0:
        return 1 / (1 - propensity_score)
    else:
        raise ValueError(
            'marital_status must be 0 or 1, got {!r}'.format(marital_status)
        )

In [8]:
# Row-wise inverse propensity weight: each row's propensity score is paired with
# its marital status and the pair is handed to ipw_cal.
data['weighting'] = list(map(ipw_cal, data['propensity'], data['marital']))

In [9]:
# outcome model
import statsmodels.api as sm
Y = np.array(data['hlthmntl'],dtype=float)
X = np.array(data.drop(columns=['hlthmntl', 'propensity', 'weighting']),dtype=float)
weights = np.array(data['weighting'],dtype=float)
wls_model = sm.WLS(Y,X, weights=weights)
wls_model.exog_names[:] = list(data.drop(columns=['hlthmntl', 'propensity', 'weighting']).columns)
results = wls_model.fit()
results.params

array([-0.16546256, -0.11795423,  0.23948252,  0.06691603,  0.05946957,
        1.90743082,  1.84701997,  1.9777008 ])

In [10]:
# Display the full regression table of the fitted weighted model (coefficients,
# standard errors, confidence intervals and fit diagnostics).
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.087
Model:,WLS,Adj. R-squared:,0.084
Method:,Least Squares,F-statistic:,31.19
Date:,"Sat, 04 May 2024",Prob (F-statistic):,1.6e-41
Time:,10:47:35,Log-Likelihood:,-3130.2
No. Observations:,2296,AIC:,6276.0
Df Residuals:,2288,BIC:,6322.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
marital,-0.1655,0.038,-4.303,0.000,-0.241,-0.090
degree,-0.1180,0.017,-7.113,0.000,-0.150,-0.085
satfin,0.2395,0.027,8.875,0.000,0.187,0.292
neisafe,0.0669,0.030,2.202,0.028,0.007,0.126
relpersn,0.0595,0.019,3.084,0.002,0.022,0.097
race_is_white,1.9074,0.095,20.162,0.000,1.722,2.093
race_is_black,1.8470,0.108,17.075,0.000,1.635,2.059
race_is_other,1.9777,0.111,17.800,0.000,1.760,2.196

0,1,2,3
Omnibus:,45.058,Durbin-Watson:,2.043
Prob(Omnibus):,0.0,Jarque-Bera (JB):,47.229
Skew:,0.347,Prob(JB):,5.55e-11
Kurtosis:,3.114,Cond. No.,36.6
