# Chapter 16

In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
from linearmodels.iv import IV2SLS
from scipy import stats

In [2]:
# Exercise 1
smoke = pd.read_stata("./stata/SMOKE.DTA")
X = sm.add_constant(smoke[["cigs", "educ", "age", "agesq"]])
model = sm.OLS(smoke.lincome, X, missing="drop").fit()
model.summary()

0,1,2,3
Dep. Variable:,lincome,R-squared:,0.165
Model:,OLS,Adj. R-squared:,0.161
Method:,Least Squares,F-statistic:,39.61
Date:,"Fri, 20 May 2022",Prob (F-statistic):,2.68e-30
Time:,16:05:38,Log-Likelihood:,-798.5
No. Observations:,807,AIC:,1607.0
Df Residuals:,802,BIC:,1630.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.7954,0.170,45.741,0.000,7.461,8.130
cigs,0.0017,0.002,1.010,0.313,-0.002,0.005
educ,0.0604,0.008,7.642,0.000,0.045,0.076
age,0.0577,0.008,7.548,0.000,0.043,0.073
agesq,-0.0006,8.34e-05,-7.563,0.000,-0.001,-0.000

0,1,2,3
Omnibus:,264.025,Durbin-Watson:,1.908
Prob(Omnibus):,0.0,Jarque-Bera (JB):,960.743
Skew:,-1.531,Prob(JB):,2.38e-209
Kurtosis:,7.381,Cond. No.,18800.0


In [3]:
X = sm.add_constant(smoke[["educ", "age", "agesq", "lcigpric", "restaurn"]])
model = sm.OLS(smoke.cigs, X, missing="drop").fit()
model.summary()

0,1,2,3
Dep. Variable:,cigs,R-squared:,0.051
Model:,OLS,Adj. R-squared:,0.045
Method:,Least Squares,F-statistic:,8.61
Date:,"Fri, 20 May 2022",Prob (F-statistic):,5.86e-08
Time:,16:05:38,Log-Likelihood:,-3237.0
No. Observations:,807,AIC:,6486.0
Df Residuals:,801,BIC:,6514.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.5801,23.696,0.067,0.947,-44.933,48.093
educ,-0.4501,0.162,-2.785,0.005,-0.767,-0.133
age,0.8225,0.154,5.330,0.000,0.520,1.125
agesq,-0.0096,0.002,-5.711,0.000,-0.013,-0.006
lcigpric,-0.3513,5.766,-0.061,0.951,-11.669,10.966
restaurn,-2.7364,1.110,-2.466,0.014,-4.915,-0.558

0,1,2,3
Omnibus:,226.788,Durbin-Watson:,2.01
Prob(Omnibus):,0.0,Jarque-Bera (JB):,500.148
Skew:,1.543,Prob(JB):,2.48e-109
Kurtosis:,5.313,Cond. No.,131000.0


In [4]:
X = sm.add_constant(smoke[["educ", "age", "agesq"]])
iv = smoke[["lcigpric", "restaurn"]]
IV2SLS(smoke.lincome, X, smoke.cigs, iv).fit(cov_type="unadjusted")

0,1,2,3
Dep. Variable:,lincome,R-squared:,-0.5169
Estimator:,IV-2SLS,Adj. R-squared:,-0.5245
No. Observations:,807,F-statistic:,89.802
Date:,"Fri, May 20 2022",P-value (F-stat),0.0000
Time:,16:05:39,Distribution:,chi2(4)
Cov. Estimator:,unadjusted,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,7.7809,0.2292,33.955,0.0000,7.3318,8.2300
educ,0.0397,0.0162,2.4444,0.0145,0.0079,0.0715
age,0.0938,0.0238,3.9454,0.0001,0.0472,0.1404
agesq,-0.0011,0.0003,-3.8424,0.0001,-0.0016,-0.0005
cigs,-0.0421,0.0261,-1.6117,0.1070,-0.0934,0.0091


C1.i We are regressing log(income) and so the coefficient ($\beta_1$) is the income elasticity with respect to cigrettes. That is, the percent change in income per cigarette smoked per day.

C1.ii The price of a pack of ciagrettes would likely be negatively related to consumption, though they are addictive so this is not assured. We should expect either a negative or zero coefficient. Smoking restrictions are, by definition, intended to reduce smoking and so again we would expect negative or zero.

C1.iii We require at least one exogenous variable and so either the (log) price of a pack or the restaurant restriction should have a non-zero coefficient.

C1.iv The estimate on cigs is positive but small and insignificant.

C1.v The smoking restriction variable is significant in the reduced form (at the 5% level) but not the log price of a pack.

C1.vi The coefficient for cigs is now negative but is still not significant, even at the 10% level. The effect is somewhat large (at least 4 times more than the estimate from OLS in absolute terms. Also a 4% drop in income is large in general).

C1.vii Restaurant restrictions are likely to vary state by state as would incomes. This does not seem entirely in line with the assumption of exogeneity (that is, if restrictions on smoking are more or less likely in states with higher incomes, this is a problem).

In [5]:
# Exercise 2
mroz = pd.read_stata("./stata/MROZ.DTA")
mroz["lhours"] = np.log(mroz.hours)
mroz = mroz[["lhours", "lwage", "educ", "age", "kidslt6", "nwifeinc", "exper",
             "expersq", "motheduc", "fatheduc"]].dropna()
X = sm.add_constant(mroz[["educ", "age", "kidslt6", "nwifeinc"]])
iv = mroz[["exper", "expersq"]]
IV2SLS(mroz.lhours, X, mroz.lwage, iv).fit(cov_type="unadjusted")

  result = getattr(ufunc, method)(*inputs, **kwargs)


0,1,2,3
Dep. Variable:,lhours,R-squared:,-1.7762
Estimator:,IV-2SLS,Adj. R-squared:,-1.8091
No. Observations:,428,F-statistic:,24.395
Date:,"Fri, May 20 2022",P-value (F-stat),0.0002
Time:,16:05:39,Distribution:,chi2(5)
Cov. Estimator:,unadjusted,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,8.3702,0.6842,12.234,0.0000,7.0293,9.7112
educ,-0.2355,0.0704,-3.3459,0.0008,-0.3734,-0.0975
age,-0.0135,0.0112,-1.2111,0.2258,-0.0354,0.0084
kidslt6,-0.4654,0.2178,-2.1368,0.0326,-0.8924,-0.0385
nwifeinc,-0.0139,0.0079,-1.7653,0.0775,-0.0293,0.0015
lwage,1.9943,0.5603,3.5592,0.0004,0.8961,3.0926


In [6]:
X = sm.add_constant(mroz[["age", "kidslt6", "nwifeinc"]])
iv = mroz[["exper", "expersq", "motheduc", "fatheduc"]]
results = IV2SLS(mroz.lhours, X, mroz[["lwage", "educ"]], iv).fit(
    cov_type="unadjusted")
results

0,1,2,3
Dep. Variable:,lhours,R-squared:,-1.4820
Estimator:,IV-2SLS,Adj. R-squared:,-1.5114
No. Observations:,428,F-statistic:,26.282
Date:,"Fri, May 20 2022",P-value (F-stat),0.0001
Time:,16:05:39,Distribution:,chi2(5)
Cov. Estimator:,unadjusted,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,7.2608,1.0122,7.1731,0.0000,5.2768,9.2447
age,-0.0116,0.0105,-1.1037,0.2697,-0.0322,0.0090
kidslt6,-0.5432,0.2098,-2.5885,0.0096,-0.9545,-0.1319
nwifeinc,-0.0189,0.0087,-2.1677,0.0302,-0.0360,-0.0018
lwage,1.8109,0.4943,3.6638,0.0002,0.8422,2.7797
educ,-0.1286,0.0868,-1.4813,0.1385,-0.2988,0.0416


In [7]:
results.wooldridge_overid

Wooldridge's score test of overidentification
H0: Model is not overidentified.
Statistic: 0.6487
P-value: 0.7230
Distributed: chi2(2)
WaldTestStatistic, id: 0x7f0ecb2dbee0

In [8]:
overid_fit = sm.OLS(results.resids, mroz[["age", "kidslt6", "nwifeinc", 
                                          "exper", "expersq", "motheduc",
                                          "fatheduc"]]).fit()
overid_fit.summary()
print("Manual test p-value:", 
      stats.chi2.sf(overid_fit.rsquared * overid_fit.nobs, 2))

Manual test p-value: 0.7999839798648982


C2.i Labour supply elasticity raises now to about 1.99 (up from 1.26).

C2.ii Results above, estimate for lwage falls slightly to 1.81 but is still higher than before.

C2.iii The test fails to reject the null meaning we have not found evidence that any of our IVs are not exogenous. The IVs pass the test.

In [9]:
# Exercise 3
openness = pd.read_stata("./stata/OPENNESS.DTA")
X = sm.add_constant(openness[["open"]])
sm.OLS(openness.inf, X).fit().summary()

0,1,2,3
Dep. Variable:,inf,R-squared:,0.045
Model:,OLS,Adj. R-squared:,0.037
Method:,Least Squares,F-statistic:,5.311
Date:,"Fri, 20 May 2022",Prob (F-statistic):,0.023
Time:,16:05:39,Log-Likelihood:,-520.9
No. Observations:,114,AIC:,1046.0
Df Residuals:,112,BIC:,1051.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,25.2342,4.102,6.152,0.000,17.106,33.362
open,-0.2150,0.093,-2.304,0.023,-0.400,-0.030

0,1,2,3
Omnibus:,162.333,Durbin-Watson:,2.106
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6623.109
Skew:,5.343,Prob(JB):,0.0
Kurtosis:,38.779,Cond. No.,81.8


In [10]:
iv = openness[["lland"]]
IV2SLS(openness.inf, np.ones(openness.shape[0]), openness.open, iv).fit(
    cov_type="unadjusted")

0,1,2,3
Dep. Variable:,inf,R-squared:,0.0316
Estimator:,IV-2SLS,Adj. R-squared:,0.0230
No. Observations:,114,F-statistic:,5.7259
Date:,"Fri, May 20 2022",P-value (F-stat),0.0167
Time:,16:05:40,Distribution:,chi2(1)
Cov. Estimator:,unadjusted,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
exog,29.607,5.6084,5.2790,0.0000,18.614,40.599
open,-0.3329,0.1391,-2.3929,0.0167,-0.6055,-0.0602


In [11]:
sm.OLS(openness.open, sm.add_constant(openness["land"])).fit().summary()

0,1,2,3
Dep. Variable:,open,R-squared:,0.095
Model:,OLS,Adj. R-squared:,0.087
Method:,Least Squares,F-statistic:,11.76
Date:,"Fri, 20 May 2022",Prob (F-statistic):,0.000848
Time:,16:05:40,Log-Likelihood:,-516.69
No. Observations:,114,AIC:,1037.0
Df Residuals:,112,BIC:,1043.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,40.4499,2.342,17.271,0.000,35.809,45.090
land,-1.128e-05,3.29e-06,-3.429,0.001,-1.78e-05,-4.76e-06

0,1,2,3
Omnibus:,72.146,Durbin-Watson:,2.09
Prob(Omnibus):,0.0,Jarque-Bera (JB):,361.21
Skew:,2.18,Prob(JB):,3.6700000000000004e-79
Kurtosis:,10.552,Cond. No.,784000.0


In [12]:
sm.OLS(openness.open, sm.add_constant(openness["lland"])).fit().summary()

0,1,2,3
Dep. Variable:,open,R-squared:,0.448
Model:,OLS,Adj. R-squared:,0.443
Method:,Least Squares,F-statistic:,90.9
Date:,"Fri, 20 May 2022",Prob (F-statistic):,3.93e-16
Time:,16:05:40,Log-Likelihood:,-488.51
No. Observations:,114,AIC:,981.0
Df Residuals:,112,BIC:,986.5
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,121.8385,9.044,13.472,0.000,103.919,139.758
lland,-7.6182,0.799,-9.534,0.000,-9.201,-6.035

0,1,2,3
Omnibus:,55.789,Durbin-Watson:,2.163
Prob(Omnibus):,0.0,Jarque-Bera (JB):,193.082
Skew:,1.747,Prob(JB):,1.18e-42
Kurtosis:,8.333,Cond. No.,62.1


In [13]:
sm.OLS(openness.open, sm.add_constant(openness[["land", "lland"]])
      ).fit().summary()

0,1,2,3
Dep. Variable:,open,R-squared:,0.457
Model:,OLS,Adj. R-squared:,0.448
Method:,Least Squares,F-statistic:,46.77
Date:,"Fri, 20 May 2022",Prob (F-statistic):,1.85e-15
Time:,16:05:40,Log-Likelihood:,-487.54
No. Observations:,114,AIC:,981.1
Df Residuals:,111,BIC:,989.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,129.2173,10.471,12.341,0.000,108.468,149.966
land,4.334e-06,3.14e-06,1.382,0.170,-1.88e-06,1.05e-05
lland,-8.3978,0.975,-8.609,0.000,-10.331,-6.465

0,1,2,3
Omnibus:,54.521,Durbin-Watson:,2.172
Prob(Omnibus):,0.0,Jarque-Bera (JB):,178.858
Skew:,1.729,Prob(JB):,1.45e-39
Kurtosis:,8.069,Cond. No.,4530000.0


In [14]:
X = sm.add_constant(openness[["oil", "lpcinc"]])
iv = openness[["lland"]]
IV2SLS(openness.inf, X, openness.open, iv).fit(
    cov_type="unadjusted")

0,1,2,3
Dep. Variable:,inf,R-squared:,0.0349
Estimator:,IV-2SLS,Adj. R-squared:,0.0086
No. Observations:,114,F-statistic:,6.2377
Date:,"Fri, May 20 2022",P-value (F-stat),0.1006
Time:,16:05:40,Distribution:,chi2(3)
Cov. Estimator:,unadjusted,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,24.009,15.752,1.5242,0.1275,-6.8638,54.882
oil,-6.5557,9.6279,-0.6809,0.4959,-25.426,12.315
lpcinc,0.8033,2.0804,0.3861,0.6994,-3.2743,4.8809
open,-0.3370,0.1419,-2.3743,0.0176,-0.6151,-0.0588


C3.i Results above. The result is not very different from the result reported in the book.

C3.ii The log land variable explains more variation than the untransformed variable (as indicated by $R^2$). When running a regression with both variables the transformed variable is significant.

C3.iii The estimate suggests a large negative relationship between oil and inflation (greater than 6%), but the result is not significant.

In [15]:
# Exercise 4
consump = pd.read_stata("./stata/consump.dta")
consump = consump[["gy", "r3", "gc", "gc_1", "gy_1", "r3_1"]].dropna()
X = consump[["gy", "r3"]]
iv = consump[["gc_1", "gy_1", "r3_1"]]
iv_results = IV2SLS(consump.gc, np.ones(consump.shape[0]), X, iv).fit()
iv_results

0,1,2,3
Dep. Variable:,gc,R-squared:,0.6779
Estimator:,IV-2SLS,Adj. R-squared:,0.6578
No. Observations:,35,F-statistic:,18.305
Date:,"Fri, May 20 2022",P-value (F-stat),0.0001
Time:,16:05:40,Distribution:,chi2(2)
Cov. Estimator:,robust,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
exog,0.0081,0.0034,2.3694,0.0178,0.0014,0.0147
gy,0.5862,0.1371,4.2761,0.0000,0.3175,0.8549
r3,-0.0003,0.0009,-0.2961,0.7671,-0.0021,0.0015


In [16]:
iv_results.wooldridge_overid

Wooldridge's score test of overidentification
H0: Model is not overidentified.
Statistic: 2.0386
P-value: 0.1534
Distributed: chi2(1)
WaldTestStatistic, id: 0x7f0ecb2db3d0

In [17]:
overid_fit = sm.OLS(iv_results.resids, sm.add_constant(iv)).fit()
overid_fit.summary()
print("Manual test p-value:", 
      stats.chi2.sf(overid_fit.rsquared * overid_fit.nobs, 1))

Manual test p-value: 0.14291384678903202


In [18]:
consump = pd.read_stata("./stata/consump.dta")
consump = consump[["gy", "r3", "gc", "gc_2", "gy_2", "r3_2"]].dropna()
X = consump[["gy", "r3"]]
iv = consump[["gc_2", "gy_2", "r3_2"]]
iv_results = IV2SLS(consump.gc, np.ones(consump.shape[0]), X, iv).fit()
iv_results

0,1,2,3
Dep. Variable:,gc,R-squared:,-0.1161
Estimator:,IV-2SLS,Adj. R-squared:,-0.1881
No. Observations:,34,F-statistic:,3.1322
Date:,"Fri, May 20 2022",P-value (F-stat),0.2089
Time:,16:08:52,Distribution:,chi2(2)
Cov. Estimator:,robust,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
exog,-0.0054,0.0162,-0.3358,0.7370,-0.0372,0.0263
gy,1.2042,0.7973,1.5103,0.1310,-0.3585,2.7669
r3,-0.0004,0.0025,-0.1674,0.8671,-0.0054,0.0046


In [19]:
sm.OLS(consump.gy, sm.add_constant(iv)).fit().summary()

0,1,2,3
Dep. Variable:,gy,R-squared:,0.014
Model:,OLS,Adj. R-squared:,-0.085
Method:,Least Squares,F-statistic:,0.139
Date:,"Fri, 20 May 2022",Prob (F-statistic):,0.936
Time:,16:09:47,Log-Likelihood:,88.045
No. Observations:,34,AIC:,-168.1
Df Residuals:,30,BIC:,-162.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0208,0.007,3.170,0.003,0.007,0.034
gc_2,-0.0702,0.469,-0.150,0.882,-1.029,0.888
gy_2,0.0937,0.330,0.284,0.778,-0.580,0.768
r3_2,0.0007,0.002,0.445,0.659,-0.003,0.004

0,1,2,3
Omnibus:,0.143,Durbin-Watson:,1.582
Prob(Omnibus):,0.931,Jarque-Bera (JB):,0.094
Skew:,0.102,Prob(JB):,0.954
Kurtosis:,2.843,Cond. No.,425.0


C4.i The test fails to reject the null at even the 10% level and so it passes. We do not have evidence to suggest our IVs are exogenous.

C4.ii The coefficient for gy is larger but no longer significant. This is a fairly substantial change since the PIH was rejected in 16.36.

C4.iii None of the individual coefficients are statistically significant. This is less important than the fact that the F-Statistic is small and so the second lags are not sufficiently correlated with gy. This is important because we expect our IVs to be correlated with the endogenous variable!

Note: Omitting Exercise 5. Subsequent reports (website goes back to 2010, others available through Federal Reserve) use a different base for GDP so in addition to adding > 25 years of data (100+ additional data points by hand), real values need to be updated. I note that the R package "wooldridge" which claims to be updated for the 7th edition only includes the consump data set from this edition and so it may have been dropped from later editions. I'll try to get back to this at a later date but I would rather work on new problems.

In [23]:
# Exercise 6
cement = pd.read_stata("./stata/cement.dta")
X = sm.add_constant(cement[["gcem", "gprcpet", "feb", "mar", "apr", "may",
                            "jun", "jul", "aug", "sep", "oct", "nov", "dec"]])
model = sm.OLS(cement.gprc, X, missing="drop").fit()
model.summary()

0,1,2,3
Dep. Variable:,gprc,R-squared:,0.386
Model:,OLS,Adj. R-squared:,0.358
Method:,Least Squares,F-statistic:,13.72
Date:,"Fri, 20 May 2022",Prob (F-statistic):,1.35e-23
Time:,19:02:56,Log-Likelihood:,924.74
No. Observations:,298,AIC:,-1821.0
Df Residuals:,284,BIC:,-1770.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0144,0.003,4.516,0.000,0.008,0.021
gcem,-0.0443,0.009,-4.866,0.000,-0.062,-0.026
gprcpet,0.0628,0.015,4.111,0.000,0.033,0.093
feb,-0.0034,0.005,-0.714,0.476,-0.013,0.006
mar,0.0009,0.005,0.158,0.875,-0.010,0.012
apr,0.0055,0.005,1.045,0.297,-0.005,0.016
may,-0.0087,0.004,-1.964,0.050,-0.017,1.88e-05
jun,-0.0109,0.005,-2.411,0.017,-0.020,-0.002
jul,-0.0111,0.004,-3.036,0.003,-0.018,-0.004

0,1,2,3
Omnibus:,160.517,Durbin-Watson:,1.918
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1624.081
Skew:,1.973,Prob(JB):,0.0
Kurtosis:,13.735,Cond. No.,25.3
