## Econ 430
### UCLA, Master of Quantitative Economics
#### Dr. Randall R. Rojas
Note: For more details on the code, please see our textbook: *Using Python for Introductory Econometrics* by F. Heiss & D. Brunner.

# 1) Instrumental Variables
## Example: Return to Education for Married Women

In [1]:
# Note: You may need to first install "linearmodels"
# pip install linearmodels

import wooldridge as woo
import numpy as np
import pandas as pd
import linearmodels.iv as iv
import statsmodels.formula.api as smf

# Load the Mroz data and keep only the rows where the log wage is observed.
mroz = woo.dataWoo('mroz').dropna(subset=['lwage'])

# Sample moments used by the closed-form slope formulas below
# (y = lwage, x = educ, z = fatheduc).
var_x = np.var(mroz['educ'], ddof=1)
cov_xy = np.cov(mroz['educ'], mroz['lwage'])[1, 0]
cov_xz = np.cov(mroz['educ'], mroz['fatheduc'])[1, 0]
cov_yz = np.cov(mroz['lwage'], mroz['fatheduc'])[1, 0]
x_bar = np.mean(mroz['educ'])
y_bar = np.mean(mroz['lwage'])

# OLS slope by hand: Cov(x, y) / Var(x).
b_ols_man = cov_xy / var_x
print(f'b_ols_man: {b_ols_man}\n')

# IV slope by hand: Cov(z, y) / Cov(z, x).
b_iv_man = cov_yz / cov_xz
print(f'b_iv_man: {b_iv_man}\n')

# Same OLS regression through statsmodels.
reg_ols = smf.ols(data=mroz, formula='np.log(wage) ~ educ')
results_ols = reg_ols.fit()

# Coefficient table: assemble the raw columns, then round all of them at once.
table_ols = pd.DataFrame({
    'b': results_ols.params,
    'se': results_ols.bse,
    't': results_ols.tvalues,
    'pval': results_ols.pvalues,
}).round(4)
print(f'table_ols: \n{table_ols}\n')

# Same IV regression through linearmodels; educ is instrumented by fatheduc.
reg_iv = iv.IV2SLS.from_formula(
    data=mroz,
    formula='np.log(wage) ~ 1 + [educ ~ fatheduc]',
)
results_iv = reg_iv.fit(debiased=True, cov_type='unadjusted')

# Coefficient table for the IV fit (linearmodels uses different attribute
# names than statsmodels for standard errors and t statistics).
table_iv = pd.DataFrame({
    'b': results_iv.params,
    'se': results_iv.std_errors,
    't': results_iv.tstats,
    'pval': results_iv.pvalues,
}).round(4)
print(f'table_iv: \n{table_iv}\n')


b_ols_man: 0.10864865517467516

b_iv_man: 0.05917347999936595

table_ols: 
                b      se       t   pval
Intercept -0.1852  0.1852 -0.9998  0.318
educ       0.1086  0.0144  7.5451  0.000

table_iv: 
                b      se       t    pval
Intercept  0.4411  0.4461  0.9888  0.3233
educ       0.0592  0.0351  1.6839  0.0929



## Example: College Proximity as an IV for Education

In [2]:
import wooldridge as woo
import numpy as np
import pandas as pd
import linearmodels.iv as iv
import statsmodels.formula.api as smf

# Load the Card data set.
card = woo.dataWoo('card')

# Relevance check: regress the endogenous regressor (educ) on the instrument
# (nearc4) and all exogenous controls.
reg_redf = smf.ols(
    data=card,
    formula='educ ~ nearc4 + exper + I(exper**2) + black + smsa +'
    'south + smsa66 + reg662 + reg663 + reg664 + reg665 + reg666 +'
    'reg667 + reg668 + reg669')
results_redf = reg_redf.fit()

# Coefficient table for the reduced form, rounded to four decimals.
table_redf = pd.DataFrame({
    'b': results_redf.params,
    'se': results_redf.bse,
    't': results_redf.tvalues,
    'pval': results_redf.pvalues,
}).round(4)
print(f'table_redf: \n{table_redf}\n')

# Wage equation estimated by OLS, treating educ as exogenous.
reg_ols = smf.ols(
    data=card,
    formula='np.log(wage) ~ educ + exper + I(exper**2) + black + smsa +'
    'south + smsa66 + reg662 + reg663 + reg664 + reg665 +'
    'reg666 + reg667 + reg668 + reg669')
results_ols = reg_ols.fit()

# Coefficient table for OLS.
table_ols = pd.DataFrame({
    'b': results_ols.params,
    'se': results_ols.bse,
    't': results_ols.tvalues,
    'pval': results_ols.pvalues,
}).round(4)
print(f'table_ols: \n{table_ols}\n')

# Wage equation estimated by IV, with educ instrumented by nearc4.
reg_iv = iv.IV2SLS.from_formula(
    data=card,
    formula='np.log(wage)~ 1 + exper + I(exper**2) + black + smsa + '
            'south + smsa66 + reg662 + reg663 + reg664 + reg665 +'
            'reg666 + reg667 + reg668 + reg669 + [educ ~ nearc4]')
results_iv = reg_iv.fit(debiased=True, cov_type='unadjusted')

# Coefficient table for IV.
table_iv = pd.DataFrame({
    'b': results_iv.params,
    'se': results_iv.std_errors,
    't': results_iv.tstats,
    'pval': results_iv.pvalues,
}).round(4)
print(f'table_iv: \n{table_iv}\n')


table_redf: 
                     b      se        t    pval
Intercept      16.6383  0.2406  69.1446  0.0000
nearc4          0.3199  0.0879   3.6408  0.0003
exper          -0.4125  0.0337 -12.2415  0.0000
I(exper ** 2)   0.0009  0.0017   0.5263  0.5987
black          -0.9355  0.0937  -9.9806  0.0000
smsa            0.4022  0.1048   3.8372  0.0001
south          -0.0516  0.1354  -0.3811  0.7032
smsa66          0.0255  0.1058   0.2409  0.8096
reg662         -0.0786  0.1871  -0.4203  0.6743
reg663         -0.0279  0.1834  -0.1524  0.8789
reg664          0.1172  0.2173   0.5394  0.5897
reg665         -0.2726  0.2184  -1.2481  0.2121
reg666         -0.3028  0.2371  -1.2773  0.2016
reg667         -0.2168  0.2344  -0.9250  0.3550
reg668          0.5239  0.2675   1.9587  0.0502
reg669          0.2103  0.2025   1.0386  0.2991

table_ols: 
                    b      se        t    pval
Intercept      4.6208  0.0742  62.2476  0.0000
educ           0.0747  0.0035  21.3510  0.0000
exper          0.

# 2) Two Stage Least Squares
## Example: Return to Education for Married Women

In [3]:
import wooldridge as woo
import numpy as np
import pandas as pd
import linearmodels.iv as iv
import statsmodels.formula.api as smf

# Load the Mroz data and keep only the rows where the log wage is observed.
mroz = woo.dataWoo('mroz').dropna(subset=['lwage'])

# Stage 1 (reduced form): educ on the exogenous regressors and both
# instruments; the fitted values are stored for use in stage 2.
reg_redf = smf.ols(
    data=mroz,
    formula='educ ~ exper + I(exper**2) + motheduc + fatheduc')
results_redf = reg_redf.fit()
mroz['educ_fitted'] = results_redf.fittedvalues

# Coefficient table for stage 1, rounded to four decimals.
table_redf = pd.DataFrame({
    'b': results_redf.params,
    'se': results_redf.bse,
    't': results_redf.tvalues,
    'pval': results_redf.pvalues,
}).round(4)
print(f'table_redf: \n{table_redf}\n')

# Stage 2: wage equation with educ replaced by its stage-1 fitted values.
reg_secstg = smf.ols(
    data=mroz,
    formula='np.log(wage) ~ educ_fitted + exper + I(exper**2)')
results_secstg = reg_secstg.fit()

# Coefficient table for stage 2.
table_secstg = pd.DataFrame({
    'b': results_secstg.params,
    'se': results_secstg.bse,
    't': results_secstg.tvalues,
    'pval': results_secstg.pvalues,
}).round(4)
print(f'table_secstg: \n{table_secstg}\n')

# Same model estimated in one step by linearmodels, for comparison with the
# manual two-step tables printed above.
reg_iv = iv.IV2SLS.from_formula(
    data=mroz,
    formula='np.log(wage) ~ 1 + exper + I(exper**2) +'
            '[educ  ~ motheduc + fatheduc]')
results_iv = reg_iv.fit(debiased=True, cov_type='unadjusted')

# Coefficient table for the one-step IV fit.
table_iv = pd.DataFrame({
    'b': results_iv.params,
    'se': results_iv.std_errors,
    't': results_iv.tstats,
    'pval': results_iv.pvalues,
}).round(4)
print(f'table_iv: \n{table_iv}\n')


table_redf: 
                    b      se        t    pval
Intercept      9.1026  0.4266  21.3396  0.0000
exper          0.0452  0.0403   1.1236  0.2618
I(exper ** 2) -0.0010  0.0012  -0.8386  0.4022
motheduc       0.1576  0.0359   4.3906  0.0000
fatheduc       0.1895  0.0338   5.6152  0.0000

table_secstg: 
                    b      se       t    pval
Intercept      0.0481  0.4198  0.1146  0.9088
educ_fitted    0.0614  0.0330  1.8626  0.0632
exper          0.0442  0.0141  3.1361  0.0018
I(exper ** 2) -0.0009  0.0004 -2.1344  0.0334

table_iv: 
                    b      se       t    pval
Intercept      0.0481  0.4003  0.1202  0.9044
exper          0.0442  0.0134  3.2883  0.0011
I(exper ** 2) -0.0009  0.0004 -2.2380  0.0257
educ           0.0614  0.0314  1.9530  0.0515



# 3) Testing for Exogeneity of the Regressors
## Example: Return to Education for Married Women

In [4]:
import wooldridge as woo
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Load the Mroz data and keep only the rows where the log wage is observed.
mroz = woo.dataWoo('mroz').dropna(subset=['lwage'])

# Stage 1 (reduced form): educ on the exogenous regressors and both
# instruments; the residuals are stored for the test regression.
reg_redf = smf.ols(
    data=mroz,
    formula='educ ~ exper + I(exper**2) + motheduc + fatheduc')
results_redf = reg_redf.fit()
mroz['resid'] = results_redf.resid

# Stage 2: the structural wage equation augmented with the stage-1 residual.
# The row for `resid` in the table below is the one read for the test.
reg_secstg = smf.ols(
    data=mroz,
    formula='np.log(wage)~ resid + educ + exper + I(exper**2)')
results_secstg = reg_secstg.fit()

# Coefficient table for the augmented regression, rounded to four decimals.
table_secstg = pd.DataFrame({
    'b': results_secstg.params,
    'se': results_secstg.bse,
    't': results_secstg.tvalues,
    'pval': results_secstg.pvalues,
}).round(4)
print(f'table_secstg: \n{table_secstg}\n')


table_secstg: 
                    b      se       t    pval
Intercept      0.0481  0.3946  0.1219  0.9030
resid          0.0582  0.0348  1.6711  0.0954
educ           0.0614  0.0310  1.9815  0.0482
exper          0.0442  0.0132  3.3363  0.0009
I(exper ** 2) -0.0009  0.0004 -2.2706  0.0237



# 4) Testing Overidentifying Restrictions
## Example: Return to Education for Married Women

In [5]:
import wooldridge as woo
import numpy as np
import pandas as pd
import linearmodels.iv as iv
import statsmodels.formula.api as smf
import scipy.stats as stats

mroz = woo.dataWoo('mroz')

# restrict to non-missing wage observations:
mroz = mroz.dropna(subset=['lwage'])

# IV regression (educ instrumented by motheduc and fatheduc):
reg_iv = iv.IV2SLS.from_formula(formula='np.log(wage) ~ 1 + exper + I(exper**2) +'
                                        '[educ ~ motheduc + fatheduc]', data=mroz)
results_iv = reg_iv.fit(cov_type='unadjusted', debiased=True)

# print regression table:
table_iv = pd.DataFrame({'b': round(results_iv.params, 4),
                         'se': round(results_iv.std_errors, 4),
                         't': round(results_iv.tstats, 4),
                         'pval': round(results_iv.pvalues, 4)})
print(f'table_iv: \n{table_iv}\n')

# auxiliary regression: IV residuals on all exogenous variables
# (included regressors plus both instruments):
mroz['resid_iv'] = results_iv.resids
reg_aux = smf.ols(formula='resid_iv ~ exper + I(exper**2) + motheduc + fatheduc',
                  data=mroz)
results_aux = reg_aux.fit()

# degrees of freedom of the test = number of overidentifying restrictions,
# i.e. instruments minus endogenous regressors (keep in sync with the formula):
n_instruments = 2  # motheduc, fatheduc
n_endog = 1        # educ
df_overid = n_instruments - n_endog

# calculations for test (statistic is n * R^2 of the auxiliary regression):
r2 = results_aux.rsquared
n = results_aux.nobs
teststat = n * r2
# survival function = 1 - cdf, but without the cancellation error that
# `1 - cdf` suffers when the p-value is very small:
pval = stats.chi2.sf(teststat, df_overid)

print(f'r2: {r2}\n')
print(f'n: {n}\n')
print(f'teststat: {teststat}\n')
print(f'pval: {pval}\n')


table_iv: 
                    b      se       t    pval
Intercept      0.0481  0.4003  0.1202  0.9044
exper          0.0442  0.0134  3.2883  0.0011
I(exper ** 2) -0.0009  0.0004 -2.2380  0.0257
educ           0.0614  0.0314  1.9530  0.0515

r2: 0.0008833444088022224

n: 428.0

teststat: 0.3780714069673512

pval: 0.5386371981604356



# 5) IV with Panel Data 
## Example: Job Training and Worker Productivity

In [6]:
import wooldridge as woo
import pandas as pd
import linearmodels.iv as iv

jtrain = woo.dataWoo('jtrain')

# define panel data (for 1987 and 1988 only), indexed by firm and year:
jtrain_87_88 = jtrain.loc[jtrain['year'].isin([1987, 1988]), :]
jtrain_87_88 = jtrain_87_88.set_index(['fcode', 'year'])

# manual computation of first differences within each firm
# (value in a year minus the same firm's value in the previous year;
# the first year of every firm is therefore NaN):
by_firm = jtrain_87_88.sort_values(['fcode', 'year']).groupby('fcode')
for col in ['lscrap', 'hrsemp', 'grant']:
    # assignment aligns on the (fcode, year) index, so the sorted order of
    # `by_firm` does not need to match the row order of `jtrain_87_88`
    jtrain_87_88[f'{col}_diff1'] = by_firm[col].diff()

# estimation sample: keep only rows where all three differences are observed.
# linearmodels would drop the same rows itself, but only with a warning
# ("Inputs contain missing values..."); doing it here makes the sample explicit.
diff_cols = ['lscrap_diff1', 'hrsemp_diff1', 'grant_diff1']
jtrain_fd = jtrain_87_88.dropna(subset=diff_cols)

# IV regression on the differenced data (hrsemp_diff1 instrumented by grant_diff1):
reg_iv = iv.IV2SLS.from_formula(
    formula='lscrap_diff1 ~ 1 + [hrsemp_diff1 ~ grant_diff1]',
    data=jtrain_fd)
results_iv = reg_iv.fit(cov_type='unadjusted', debiased=True)

# print regression table:
table_iv = pd.DataFrame({'b': round(results_iv.params, 4),
                         'se': round(results_iv.std_errors, 4),
                         't': round(results_iv.tstats, 4),
                         'pval': round(results_iv.pvalues, 4)})
print(f'table_iv: \n{table_iv}\n')

table_iv: 
                   b      se       t    pval
Intercept    -0.0327  0.1270 -0.2573  0.7982
hrsemp_diff1 -0.0142  0.0079 -1.7882  0.0808



Inputs contain missing values. Dropping rows with missing observations.
