In [1]:
import pandas as pd

In [2]:
# Read data
data1 = pd.read_csv("chalfin_data-1.csv")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
data1.info()

# Drop rows where the dependent variable (dlogpc_murder) is missing
data1 = data1.dropna(subset=['dlogpc_murder'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 276 entries, 0 to 275
Data columns (total 60 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year             276 non-null    float64
 1   FMSANAME         276 non-null    object 
 2   STATE            276 non-null    object 
 3   instrument       276 non-null    float64
 4   fbnonmex         276 non-null    float64
 5   ushisp           276 non-null    float64
 6   black            276 non-null    float64
 7   age0_14          276 non-null    float64
 8   age15_24         276 non-null    float64
 9   age25_39         276 non-null    float64
 10  age40_54         276 non-null    float64
 11  age55            276 non-null    float64
 12  employed         276 non-null    float64
 13  educ1            276 non-null    float64
 14  educ2            276 non-null    float64
 15  educ3            276 non-null    float64
 16  educ4            276 non-null    float64
 17  dfbnonmex       

In [3]:
# Find Exog, Endog and Instrument Variables
# Correlation of every numeric column with `dins` (last 20 shown).
# numeric_only=True is required because the frame also holds text columns
# (FMSANAME, STATE): without it pandas 1.5 emits a FutureWarning and
# pandas 2.x raises when it tries to convert those strings to float.
data1.corr(numeric_only=True)['dins'].tail(20)

  data1.corr()['dins'].tail(20)


logpc_assault      0.119209
dlogpc_assault     0.086688
logpc_burglary     0.037386
dlogpc_burglary   -0.010226
logpc_larceny     -0.051306
dlogpc_larceny    -0.040081
logpc_motor        0.089838
dlogpc_motor      -0.064888
grp_1              0.238221
grp_2              0.208392
grp_3             -0.097372
grp_4             -0.086340
grp_5              0.108906
grp_6              0.111609
grp_7             -0.119369
grp_8             -0.126288
grp_9             -0.141909
grp_10            -0.149694
grp_11            -0.070029
grp_12            -0.089167
Name: dins, dtype: float64

In [4]:
# Stage 1 - weighted regression of the endogenous variable on the instrument
import statsmodels.api as sm

# Population weights, following the article
W = data1['popweight']

# dmexfb_alt is explained by dins together with the demographic,
# education, age, employment, births and grp_* controls
first_stage_formula = (
    'dmexfb_alt ~ 1 + dins + dfbnonmex + dushisp + dblack + deduc1 + deduc2 + '
    'deduc3 + deduc4 + dage0_14 + dage15_24 + dage25_39 + dage40_54 + demployed + '
    'dusbirths + grp_1 + grp_2 + grp_3 + grp_4 + grp_5 + grp_6 + grp_7 + grp_8 + grp_9 + '
    'grp_10 + grp_11'
)

# Rows with missing values are dropped before fitting
first_stage_model = sm.WLS.from_formula(
    first_stage_formula, data=data1, weights=W, missing='drop'
)
first_stage = first_stage_model.fit()

# Print the regression table
estimates = first_stage.summary()
print(estimates)

                            WLS Regression Results                            
Dep. Variable:             dmexfb_alt   R-squared:                       0.813
Model:                            WLS   Adj. R-squared:                  0.783
Method:                 Least Squares   F-statistic:                     27.07
Date:                Sat, 22 Apr 2023   Prob (F-statistic):           1.76e-44
Time:                        00:03:18   Log-Likelihood:                -234.66
No. Observations:                 182   AIC:                             521.3
Df Residuals:                     156   BIC:                             604.6
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -1.7871      0.909     -1.965      0.0

In [5]:
# Stage 2 - Second stage of the model

# Attach the first-stage fitted values by index alignment. `fittedvalues` is
# labelled with the rows the first stage actually kept, so any row it dropped
# (missing dependent variable, regressor or weight) becomes NaN here and is
# dropped again by missing='drop' below. Writing the predictions through a
# mask on `dmexfb_alt` alone would fail with a length mismatch as soon as
# another first-stage variable had a missing value.
predicted_values = first_stage.fittedvalues
data1['dmexfb_alt_predicted'] = predicted_values

# Second-stage model: same controls as stage 1, with the endogenous variable
# replaced by its first-stage prediction
second_stage = sm.WLS.from_formula('dlogpc_murder ~ dfbnonmex + dushisp + dblack + deduc1 + deduc2 + '
                                   'deduc3 + deduc4 + dage0_14 + dage15_24 + dage25_39 + dage40_54 +' 
                                   'demployed + dusbirths + dmexfb_alt_predicted + grp_1 + grp_2 + grp_3 +'
                                   'grp_4 + grp_5 + grp_6 + grp_7 + grp_8 + grp_9 + grp_10 + grp_11', 
                                    data = data1, weights = W, missing = 'drop').fit()

# Summary table.
# NOTE: only the coefficients are meaningful here. The standard errors,
# t-stats and p-values treat the predicted regressor as observed data and
# are therefore not valid 2SLS standard errors; use a dedicated IV estimator
# for inference.
estimates = second_stage.summary()
print(estimates)

                            WLS Regression Results                            
Dep. Variable:          dlogpc_murder   R-squared:                       0.622
Model:                            WLS   Adj. R-squared:                  0.561
Method:                 Least Squares   F-statistic:                     10.25
Date:                Sat, 22 Apr 2023   Prob (F-statistic):           5.59e-22
Time:                        00:03:19   Log-Likelihood:                -88.249
No. Observations:                 182   AIC:                             228.5
Df Residuals:                     156   BIC:                             311.8
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept                0.3457 

In [8]:
# IV-2SLS estimated in one step with the linearmodels package
from linearmodels.iv import IV2SLS

# Population weights, following the article
W = data1['popweight']

# Controls are listed outside the brackets; the bracketed term
# [dmexfb_alt ~ dins] names the endogenous regressor and its instrument
iv_formula = (
    'dlogpc_murder ~ 1 + dfbnonmex + dushisp + dblack + deduc1 + deduc2 + deduc3 + '
    'deduc4 + dage0_14 + dage15_24 + dage25_39 + dage40_54 + demployed + dusbirths +'
    'grp_1 + grp_2 + grp_3 + grp_4 + grp_5 + grp_6 + grp_7 + grp_8 + grp_9 + grp_10 + '
    'grp_11 + [dmexfb_alt ~ dins]'
)

# Specify the model, then fit it
iv_spec = IV2SLS.from_formula(formula=iv_formula, data=data1, weights=W)
model = iv_spec.fit()
print(model.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:          dlogpc_murder   R-squared:                      0.6134
Estimator:                    IV-2SLS   Adj. R-squared:                 0.5514
No. Observations:                 182   F-statistic:                    402.66
Date:                Sat, Apr 22 2023   P-value (F-stat)                0.0000
Time:                        00:03:57   Distribution:                 chi2(25)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      0.3457     0.4151     0.8328     0.4050     -0.4679      1.1593
dage0_14      -4.9722     5.1829    -0.9593     0.33