# Chapter 16

In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
from linearmodels.iv import IV2SLS
from scipy import stats

In [2]:
# Exercise 1
smoke = pd.read_stata("./stata/SMOKE.DTA")
X = sm.add_constant(smoke[["cigs", "educ", "age", "agesq"]])
model = sm.OLS(smoke.lincome, X, missing="drop").fit()
model.summary()

0,1,2,3
Dep. Variable:,lincome,R-squared:,0.165
Model:,OLS,Adj. R-squared:,0.161
Method:,Least Squares,F-statistic:,39.61
Date:,"Thu, 19 May 2022",Prob (F-statistic):,2.68e-30
Time:,21:09:18,Log-Likelihood:,-798.5
No. Observations:,807,AIC:,1607.0
Df Residuals:,802,BIC:,1630.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.7954,0.170,45.741,0.000,7.461,8.130
cigs,0.0017,0.002,1.010,0.313,-0.002,0.005
educ,0.0604,0.008,7.642,0.000,0.045,0.076
age,0.0577,0.008,7.548,0.000,0.043,0.073
agesq,-0.0006,8.34e-05,-7.563,0.000,-0.001,-0.000

0,1,2,3
Omnibus:,264.025,Durbin-Watson:,1.908
Prob(Omnibus):,0.0,Jarque-Bera (JB):,960.743
Skew:,-1.531,Prob(JB):,2.38e-209
Kurtosis:,7.381,Cond. No.,18800.0


In [3]:
X = sm.add_constant(smoke[["educ", "age", "agesq", "lcigpric", "restaurn"]])
model = sm.OLS(smoke.cigs, X, missing="drop").fit()
model.summary()

0,1,2,3
Dep. Variable:,cigs,R-squared:,0.051
Model:,OLS,Adj. R-squared:,0.045
Method:,Least Squares,F-statistic:,8.61
Date:,"Thu, 19 May 2022",Prob (F-statistic):,5.86e-08
Time:,21:09:18,Log-Likelihood:,-3237.0
No. Observations:,807,AIC:,6486.0
Df Residuals:,801,BIC:,6514.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.5801,23.696,0.067,0.947,-44.933,48.093
educ,-0.4501,0.162,-2.785,0.005,-0.767,-0.133
age,0.8225,0.154,5.330,0.000,0.520,1.125
agesq,-0.0096,0.002,-5.711,0.000,-0.013,-0.006
lcigpric,-0.3513,5.766,-0.061,0.951,-11.669,10.966
restaurn,-2.7364,1.110,-2.466,0.014,-4.915,-0.558

0,1,2,3
Omnibus:,226.788,Durbin-Watson:,2.01
Prob(Omnibus):,0.0,Jarque-Bera (JB):,500.148
Skew:,1.543,Prob(JB):,2.48e-109
Kurtosis:,5.313,Cond. No.,131000.0


In [4]:
X = sm.add_constant(smoke[["educ", "age", "agesq"]])
iv = smoke[["lcigpric", "restaurn"]]
IV2SLS(smoke.lincome, X, smoke.cigs, iv).fit(cov_type="unadjusted")

0,1,2,3
Dep. Variable:,lincome,R-squared:,-0.5169
Estimator:,IV-2SLS,Adj. R-squared:,-0.5245
No. Observations:,807,F-statistic:,89.802
Date:,"Thu, May 19 2022",P-value (F-stat),0.0000
Time:,21:09:18,Distribution:,chi2(4)
Cov. Estimator:,unadjusted,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,7.7809,0.2292,33.955,0.0000,7.3318,8.2300
educ,0.0397,0.0162,2.4444,0.0145,0.0079,0.0715
age,0.0938,0.0238,3.9454,0.0001,0.0472,0.1404
agesq,-0.0011,0.0003,-3.8424,0.0001,-0.0016,-0.0005
cigs,-0.0421,0.0261,-1.6117,0.1070,-0.0934,0.0091


C1.i We are regressing log(income) and so the coefficient ($\beta_1$) is the income elasticity with respect to cigrettes. That is, the percent change in income per cigarette smoked per day.

C1.ii The price of a pack of ciagrettes would likely be negatively related to consumption, though they are addictive so this is not assured. We should expect either a negative or zero coefficient. Smoking restrictions are, by definition, intended to reduce smoking and so again we would expect negative or zero.

C1.iii We require at least one exogenous variable and so either the (log) price of a pack or the restaurant restriction should have a non-zero coefficient.

C1.iv The estimate on cigs is positive but small and insignificant.

C1.v The smoking restriction variable is significant in the reduced form (at the 5% level) but not the log price of a pack.

C1.vi The coefficient for cigs is now negative but is still not significant, even at the 10% level. The effect is somewhat large (at least 4 times more than the estimate from OLS in absolute terms. Also a 4% drop in income is large in general).

C1.vii Restaurant restrictions are likely to vary state by state as would incomes. This does not seem entirely in line with the assumption of exogeneity (that is, if restrictions on smoking are more or less likely in states with higher incomes, this is a problem).

In [5]:
# Exercise 2
mroz = pd.read_stata("./stata/MROZ.DTA")
mroz["lhours"] = np.log(mroz.hours)
mroz = mroz[["lhours", "lwage", "educ", "age", "kidslt6", "nwifeinc", "exper",
             "expersq", "motheduc", "fatheduc"]].dropna()
X = sm.add_constant(mroz[["educ", "age", "kidslt6", "nwifeinc"]])
iv = mroz[["exper", "expersq"]]
IV2SLS(mroz.lhours, X, mroz.lwage, iv).fit(cov_type="unadjusted")

  result = getattr(ufunc, method)(*inputs, **kwargs)


0,1,2,3
Dep. Variable:,lhours,R-squared:,-1.7762
Estimator:,IV-2SLS,Adj. R-squared:,-1.8091
No. Observations:,428,F-statistic:,24.395
Date:,"Thu, May 19 2022",P-value (F-stat),0.0002
Time:,21:09:18,Distribution:,chi2(5)
Cov. Estimator:,unadjusted,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,8.3702,0.6842,12.234,0.0000,7.0293,9.7112
educ,-0.2355,0.0704,-3.3459,0.0008,-0.3734,-0.0975
age,-0.0135,0.0112,-1.2111,0.2258,-0.0354,0.0084
kidslt6,-0.4654,0.2178,-2.1368,0.0326,-0.8924,-0.0385
nwifeinc,-0.0139,0.0079,-1.7653,0.0775,-0.0293,0.0015
lwage,1.9943,0.5603,3.5592,0.0004,0.8961,3.0926


In [6]:
X = sm.add_constant(mroz[["age", "kidslt6", "nwifeinc"]])
iv = mroz[["exper", "expersq", "motheduc", "fatheduc"]]
results = IV2SLS(mroz.lhours, X, mroz[["lwage", "educ"]], iv).fit(
    cov_type="unadjusted")
results

0,1,2,3
Dep. Variable:,lhours,R-squared:,-1.4820
Estimator:,IV-2SLS,Adj. R-squared:,-1.5114
No. Observations:,428,F-statistic:,26.282
Date:,"Thu, May 19 2022",P-value (F-stat),0.0001
Time:,21:09:19,Distribution:,chi2(5)
Cov. Estimator:,unadjusted,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,7.2608,1.0122,7.1731,0.0000,5.2768,9.2447
age,-0.0116,0.0105,-1.1037,0.2697,-0.0322,0.0090
kidslt6,-0.5432,0.2098,-2.5885,0.0096,-0.9545,-0.1319
nwifeinc,-0.0189,0.0087,-2.1677,0.0302,-0.0360,-0.0018
lwage,1.8109,0.4943,3.6638,0.0002,0.8422,2.7797
educ,-0.1286,0.0868,-1.4813,0.1385,-0.2988,0.0416


In [7]:
results.wooldridge_overid

Wooldridge's score test of overidentification
H0: Model is not overidentified.
Statistic: 0.6487
P-value: 0.7230
Distributed: chi2(2)
WaldTestStatistic, id: 0x7f20a3b4a9a0

In [8]:
overid_fit = sm.OLS(results.resids, mroz[["age", "kidslt6", "nwifeinc", 
                                          "exper", "expersq", "motheduc",
                                          "fatheduc"]]).fit()
overid_fit.summary()
print("Manual test p-value:", 
      stats.chi2.sf(overid_fit.rsquared * overid_fit.nobs, 2))

Manual test p-value: 0.7999839798648982


C2.i Labour supply elasticity raises now to about 1.99 (up from 1.26).

C2.ii Results above, estimate for lwage falls slightly to 1.81 but is still higher than before.

C2.iii The test fails to reject the null meaning we have not found evidence that any of our IVs are not exogenous. The IVs pass the test.