# Chapter 13

In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.compat import lzip
import statsmodels.stats.api as sms

In [2]:
# Exercise 1
fertil1 = pd.read_stata("./stata/FERTIL1.DTA")
X = sm.add_constant(fertil1[["educ", "age", "agesq", "black", "east", "northcen", "west", "farm", "othrural", "town", "smcity", "y74", "y76", "y78", "y80", "y82", "y84"]])
model = sm.OLS(fertil1.kids, X, missing = "drop").fit()
model.f_test("farm = othrural = town = smcity = 0")

  x = pd.concat(x[::order], 1)


<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[1.15876537]]), p=0.3274579660022206, df_denom=1.11e+03, df_num=4>

In [3]:
model.f_test("east = northcen = west = 0")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[3.01165586]]), p=0.02925801768068351, df_denom=1.11e+03, df_num=3>

In [4]:
X = sm.add_constant(fertil1[["y74", "y76", "y78", "y80", "y82", "y84"]])
y = model.resid ** 2
model = sm.OLS(y, X, missing = "drop").fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.015
Model:,OLS,Adj. R-squared:,0.01
Method:,Least Squares,F-statistic:,2.905
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,0.00816
Time:,01:25:36,Log-Likelihood:,-2940.1
No. Observations:,1129,AIC:,5894.0
Df Residuals:,1122,BIC:,5929.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.2873,0.263,12.512,0.000,2.772,3.803
y74,-1.0722,0.362,-2.959,0.003,-1.783,-0.361
y76,-0.7698,0.374,-2.058,0.040,-1.504,-0.036
y78,-0.9559,0.380,-2.516,0.012,-1.701,-0.210
y80,-1.2110,0.381,-3.182,0.002,-1.958,-0.464
y82,-0.9221,0.356,-2.588,0.010,-1.621,-0.223
y84,-1.3719,0.360,-3.807,0.000,-2.079,-0.665

0,1,2,3
Omnibus:,536.62,Durbin-Watson:,2.014
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2684.707
Skew:,2.233,Prob(JB):,0.0
Kurtosis:,9.093,Cond. No.,8.0


In [5]:
X = sm.add_constant(fertil1[["educ", "age", "agesq", "black", "east", "northcen", "west", "farm", "othrural", "town", "smcity", "y74", "y76", "y78", "y80", "y82", "y84", "y74educ", "y76educ", "y78educ", "y80educ", "y82educ", "y84educ"]])
model = sm.OLS(fertil1.kids, X, missing = "drop").fit()
model.f_test("y74educ = y76educ = y78educ = y80educ = y82educ = y84educ = 0")

  x = pd.concat(x[::order], 1)


<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[1.4835693]]), p=0.1803366345999419, df_denom=1.10e+03, df_num=6>

In [6]:
model.summary()

0,1,2,3
Dep. Variable:,kids,R-squared:,0.136
Model:,OLS,Adj. R-squared:,0.118
Method:,Least Squares,F-statistic:,7.593
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,3.4099999999999995e-23
Time:,01:25:36,Log-Likelihood:,-2086.7
No. Observations:,1129,AIC:,4221.0
Df Residuals:,1105,BIC:,4342.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.4773,3.126,-2.712,0.007,-14.612,-2.343
educ,-0.0225,0.054,-0.420,0.675,-0.128,0.083
age,0.5075,0.139,3.653,0.000,0.235,0.780
agesq,-0.0055,0.002,-3.519,0.000,-0.009,-0.002
black,1.0741,0.174,6.183,0.000,0.733,1.415
east,0.2061,0.133,1.548,0.122,-0.055,0.467
northcen,0.3483,0.121,2.876,0.004,0.111,0.586
west,0.1771,0.167,1.058,0.290,-0.151,0.506
farm,-0.0722,0.148,-0.489,0.625,-0.362,0.217

0,1,2,3
Omnibus:,10.028,Durbin-Watson:,2.013
Prob(Omnibus):,0.007,Jarque-Bera (JB):,10.214
Skew:,0.225,Prob(JB):,0.00606
Kurtosis:,2.883,Cond. No.,136000.0


C1.i Results printed above. F-Statistic is about 1.589 and p-value is about 0.328 (they are not jointly significant)

C1.ii Results printed above. The result has a p-value of 0.029 which is significant at the 5% level.

C1.iii The OLS summary provides us the F-test we are interested in with an F-statistic of 2.905 and p-value of 0.00816. We reject the null hypothesis at the 1% level, which is evidence of heteroskedasticity.

C1.iv Results printed above. The resulting p-value is about 0.180 and so the interaction terms are not jointly significant. These interactions allow the education level to vary for a given year and so the high p-value suggests it may not vary over time. However, some of the interactions are individually significant (78, 82, 84) with coefficients (generally) growing in absolute value over time. A growing link between education and a decrease in fertility seems intuitive, and we could explain the lack significance for the joint significance test through the fact that half the year dummies (generally the earlier half) are not significant.

In [7]:
# Exercise 2
cps78_85 = pd.read_stata("./stata/CPS78_85.DTA")
X = sm.add_constant(cps78_85[["y85", "educ", "y85educ", "exper", "expersq", "union", "female", "y85fem"]])
model = sm.OLS(cps78_85.lwage, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,lwage,R-squared:,0.426
Model:,OLS,Adj. R-squared:,0.422
Method:,Least Squares,F-statistic:,99.8
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,4.46e-124
Time:,01:25:36,Log-Likelihood:,-574.24
No. Observations:,1084,AIC:,1166.0
Df Residuals:,1075,BIC:,1211.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4589,0.093,4.911,0.000,0.276,0.642
y85,0.1178,0.124,0.952,0.341,-0.125,0.361
educ,0.0747,0.007,11.192,0.000,0.062,0.088
y85educ,0.0185,0.009,1.974,0.049,0.000,0.037
exper,0.0296,0.004,8.293,0.000,0.023,0.037
expersq,-0.0004,7.75e-05,-5.151,0.000,-0.001,-0.000
union,0.2021,0.030,6.672,0.000,0.143,0.262
female,-0.3167,0.037,-8.648,0.000,-0.389,-0.245
y85fem,0.0851,0.051,1.658,0.098,-0.016,0.186

0,1,2,3
Omnibus:,83.747,Durbin-Watson:,1.918
Prob(Omnibus):,0.0,Jarque-Bera (JB):,317.985
Skew:,-0.271,Prob(JB):,8.920000000000001e-70
Kurtosis:,5.597,Cond. No.,8770.0


In [8]:
cps78_85["y85educ_adj"] = (cps78_85["educ"] - 12) * cps78_85["y85"]
X = sm.add_constant(cps78_85[["y85", "educ", "y85educ_adj", "exper", "expersq", "union", "female", "y85fem"]])
sm.OLS(cps78_85.lwage, X, missing = "drop").fit().summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,lwage,R-squared:,0.426
Model:,OLS,Adj. R-squared:,0.422
Method:,Least Squares,F-statistic:,99.8
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,4.46e-124
Time:,01:25:36,Log-Likelihood:,-574.24
No. Observations:,1084,AIC:,1166.0
Df Residuals:,1075,BIC:,1211.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4589,0.093,4.911,0.000,0.276,0.642
y85,0.3393,0.034,9.977,0.000,0.273,0.406
educ,0.0747,0.007,11.192,0.000,0.062,0.088
y85educ_adj,0.0185,0.009,1.974,0.049,0.000,0.037
exper,0.0296,0.004,8.293,0.000,0.023,0.037
expersq,-0.0004,7.75e-05,-5.151,0.000,-0.001,-0.000
union,0.2021,0.030,6.672,0.000,0.143,0.262
female,-0.3167,0.037,-8.648,0.000,-0.389,-0.245
y85fem,0.0851,0.051,1.658,0.098,-0.016,0.186

0,1,2,3
Omnibus:,83.747,Durbin-Watson:,1.918
Prob(Omnibus):,0.0,Jarque-Bera (JB):,317.985
Skew:,-0.271,Prob(JB):,8.920000000000001e-70
Kurtosis:,5.597,Cond. No.,5910.0


In [9]:
cps78_85["rwage"] = np.exp(cps78_85["lwage"])
cps78_85.loc[cps78_85.y85 == 1, "rwage"] = cps78_85["rwage"] / 1.65
cps78_85["lrwage"] = np.log(cps78_85["rwage"])
X = sm.add_constant(cps78_85[["y85", "educ", "y85educ", "exper", "expersq", "union", "female", "y85fem"]])
model = sm.OLS(cps78_85.lrwage, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,lrwage,R-squared:,0.356
Model:,OLS,Adj. R-squared:,0.351
Method:,Least Squares,F-statistic:,74.35
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,1.88e-97
Time:,01:25:36,Log-Likelihood:,-574.24
No. Observations:,1084,AIC:,1166.0
Df Residuals:,1075,BIC:,1211.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4589,0.093,4.911,0.000,0.276,0.642
y85,-0.3830,0.124,-3.094,0.002,-0.626,-0.140
educ,0.0747,0.007,11.192,0.000,0.062,0.088
y85educ,0.0185,0.009,1.974,0.049,0.000,0.037
exper,0.0296,0.004,8.293,0.000,0.023,0.037
expersq,-0.0004,7.75e-05,-5.151,0.000,-0.001,-0.000
union,0.2021,0.030,6.672,0.000,0.143,0.262
female,-0.3167,0.037,-8.648,0.000,-0.389,-0.245
y85fem,0.0851,0.051,1.658,0.098,-0.016,0.186

0,1,2,3
Omnibus:,83.747,Durbin-Watson:,1.918
Prob(Omnibus):,0.0,Jarque-Bera (JB):,317.985
Skew:,-0.271,Prob(JB):,8.920000000000001e-70
Kurtosis:,5.597,Cond. No.,8770.0


In [10]:
print("Share of union membership in 1978", cps78_85.loc[cps78_85.y85 == 0, "union"].sum() / cps78_85[cps78_85.y85 == 0].shape[0])
print("Share of union membership in 1985", cps78_85.loc[cps78_85.y85 == 1, "union"].sum() / cps78_85[cps78_85.y85 == 1].shape[0])

Share of union membership in 1978 0.3054545454545455
Share of union membership in 1985 0.1797752808988764


In [11]:
X = sm.add_constant(cps78_85[["y85", "educ", "y85educ", "exper", "expersq", "union", "female", "y85fem", "y85union"]])
model = sm.OLS(cps78_85.lwage, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,lwage,R-squared:,0.426
Model:,OLS,Adj. R-squared:,0.421
Method:,Least Squares,F-statistic:,88.63
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,4.6100000000000004e-123
Time:,01:25:36,Log-Likelihood:,-574.24
No. Observations:,1084,AIC:,1168.0
Df Residuals:,1074,BIC:,1218.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4588,0.095,4.851,0.000,0.273,0.644
y85,0.1180,0.126,0.934,0.350,-0.130,0.366
educ,0.0747,0.007,11.158,0.000,0.062,0.088
y85educ,0.0185,0.009,1.968,0.049,5.58e-05,0.037
exper,0.0296,0.004,8.289,0.000,0.023,0.037
expersq,-0.0004,7.76e-05,-5.148,0.000,-0.001,-0.000
union,0.2023,0.039,5.176,0.000,0.126,0.279
female,-0.3167,0.037,-8.620,0.000,-0.389,-0.245
y85fem,0.0850,0.052,1.640,0.101,-0.017,0.187

0,1,2,3
Omnibus:,83.76,Durbin-Watson:,1.918
Prob(Omnibus):,0.0,Jarque-Bera (JB):,318.053
Skew:,-0.271,Prob(JB):,8.62e-70
Kurtosis:,5.598,Cond. No.,9010.0


C2.i y85 would be the change in wages for the base group in the end period (1985). The base group is males with no education which doesn't seem to be a demographic of particular interest.

C2.ii From the hint we replace $educ$ in the interaction with $educ - 12$. The confidence interval we want is in $y85$ which is (0.273, 0.406). The estimated percent increase in nominal wages is 34%.

C2.iii The $y85$ coefficient is the only one that has changed between the two equations.

C2.iv The hint tells us the residuals are the same. It must follow then that the total sum of squares between the regressions are different.

C2.v Union membership was almost cut in half between the two periods with 30.6% of observations belonging to a union in 1978 and 18% of observations belonging to a union in 1985.

C2.vi Adding an interaction for union membership with the $y85$ indicator produces a very small coefficient and wide confidence interval. There is no reason to think that the union premium has changed over time.

C2.vii There's no reason to think that a reduction in union membership should decrease the union premium.

In [12]:
# Exercise 3
kielmc = pd.read_stata("./stata/KIELMC.DTA")
X = sm.add_constant(kielmc[["y81", "ldist", "y81ldist"]])
model = sm.OLS(kielmc.lprice, X).fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,lprice,R-squared:,0.396
Model:,OLS,Adj. R-squared:,0.39
Method:,Least Squares,F-statistic:,69.22
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,1.87e-34
Time:,01:25:36,Log-Likelihood:,-109.24
No. Observations:,321,AIC:,226.5
Df Residuals:,317,BIC:,241.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8.0585,0.508,15.850,0.000,7.058,9.059
y81,-0.0113,0.805,-0.014,0.989,-1.595,1.573
ldist,0.3167,0.052,6.145,0.000,0.215,0.418
y81ldist,0.0482,0.082,0.589,0.556,-0.113,0.209

0,1,2,3
Omnibus:,10.892,Durbin-Watson:,1.395
Prob(Omnibus):,0.004,Jarque-Bera (JB):,16.703
Skew:,0.224,Prob(JB):,0.000236
Kurtosis:,4.024,Cond. No.,512.0


In [13]:
X = sm.add_constant(kielmc[["y81", "ldist", "y81ldist", "age", "agesq", "rooms", "baths", "lintst", "lland", "larea"]])
model = sm.OLS(kielmc.lprice, X).fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,lprice,R-squared:,0.787
Model:,OLS,Adj. R-squared:,0.78
Method:,Least Squares,F-statistic:,114.6
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,7.73e-98
Time:,01:25:37,Log-Likelihood:,58.114
No. Observations:,321,AIC:,-94.23
Df Residuals:,310,BIC:,-52.74
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.6739,0.502,15.300,0.000,6.687,8.661
y81,-0.2254,0.495,-0.456,0.649,-1.199,0.748
ldist,0.0009,0.045,0.021,0.984,-0.087,0.089
y81ldist,0.0625,0.050,1.242,0.215,-0.036,0.161
age,-0.0080,0.001,-5.650,0.000,-0.011,-0.005
agesq,3.57e-05,8.71e-06,4.099,0.000,1.86e-05,5.28e-05
rooms,0.0461,0.017,2.660,0.008,0.012,0.080
baths,0.1010,0.028,3.632,0.000,0.046,0.156
lintst,-0.0600,0.032,-1.891,0.060,-0.122,0.002

0,1,2,3
Omnibus:,66.092,Durbin-Watson:,1.691
Prob(Omnibus):,0.0,Jarque-Bera (JB):,350.051
Skew:,-0.716,Prob(JB):,9.72e-77
Kurtosis:,7.912,Cond. No.,261000.0


C3.i $\delta_1$ should be positive since they are further away from the incinerator (which reduces values). $\beta_1 > 0$ would mean that houses further from the incinerator are worth more even before it was placed.

C3.ii Results listed above. In this case $\delta_1 > 0$, is in line with our expectations. However, the t-statistic is small and so there is no evidence the incinerator had any effect on housing prices.

C3.iii Results listed above. After controlling for housing characteristics $\delta_1$ has not changed much in that it is not statistically significant (although the coefficient is higher).

C3.iv The coefficient for $log(dist)$ is likely significant in the simpler regression do to houses further from the site of the incinerator having more of the characteristics associated with high property values. Once the characteristics of the houses are taken into account, the distance from the eventual location of the incenerator does not have any effect (nor should we expect it to).

In [14]:
# Exercise 4
injury = pd.read_stata("./stata/INJURY.DTA")
injury_ky = injury[injury.ky == 1]
X = sm.add_constant(injury_ky[["afchnge", "highearn", "afhigh", "male", "married", "head", "neck", "upextr", "trunk", "lowback", "lowextr", "occdis", "manuf", "construc"]])
model = sm.OLS(injury_ky.ldurat, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,ldurat,R-squared:,0.041
Model:,OLS,Adj. R-squared:,0.039
Method:,Least Squares,F-statistic:,16.37
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,4.81e-40
Time:,01:25:37,Log-Likelihood:,-8778.2
No. Observations:,5349,AIC:,17590.0
Df Residuals:,5334,BIC:,17690.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.2459,0.106,11.735,0.000,1.038,1.454
afchnge,0.0106,0.045,0.237,0.813,-0.077,0.099
highearn,0.1758,0.052,3.397,0.001,0.074,0.277
afhigh,0.2309,0.070,3.321,0.001,0.095,0.367
male,-0.0979,0.045,-2.198,0.028,-0.185,-0.011
married,0.1221,0.039,3.121,0.002,0.045,0.199
head,-0.5139,0.129,-3.975,0.000,-0.767,-0.260
neck,0.2699,0.161,1.671,0.095,-0.047,0.586
upextr,-0.1785,0.101,-1.765,0.078,-0.377,0.020

0,1,2,3
Omnibus:,23.922,Durbin-Watson:,1.935
Prob(Omnibus):,0.0,Jarque-Bera (JB):,32.504
Skew:,-0.023,Prob(JB):,8.74e-08
Kurtosis:,3.379,Cond. No.,29.5


In [15]:
injury_mi = injury[injury.mi == 1]
X = sm.add_constant(injury_mi[["afchnge", "highearn", "afhigh"]])
model = sm.OLS(injury_mi.ldurat, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,ldurat,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.01
Method:,Least Squares,F-statistic:,6.049
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,0.00043
Time:,01:25:37,Log-Likelihood:,-2647.4
No. Observations:,1524,AIC:,5303.0
Df Residuals:,1520,BIC:,5324.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.4127,0.057,24.908,0.000,1.301,1.524
afchnge,0.0974,0.085,1.149,0.251,-0.069,0.264
highearn,0.1691,0.106,1.602,0.109,-0.038,0.376
afhigh,0.1920,0.154,1.245,0.213,-0.110,0.494

0,1,2,3
Omnibus:,21.618,Durbin-Watson:,1.954
Prob(Omnibus):,0.0,Jarque-Bera (JB):,33.331
Skew:,0.118,Prob(JB):,5.79e-08
Kurtosis:,3.685,Cond. No.,6.16


C4.i The interaction term is still statistically significant (at the 1% level) and the coefficient is higher.

C4.ii The small R-squared indicates that only a small amount of the variation has been explained by the equation. This does not mean the equation is useless since we have no reason to believe that the omitted variables are correlated with any of the variables we have included.

C4.iii The coefficient is not statistically significant. The coefficient is reasonably close. The difference in significance may be because Kentucky has three times as many observations as Michigan, which allowed us to more precisely estimate the interaction.

In [16]:
# Exercise 5
rental = pd.read_stata("./stata/RENTAL.DTA")
X = sm.add_constant(rental[["y90", "lpop", "lavginc", "pctstu"]])
model = sm.OLS(rental.lrent, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,lrent,R-squared:,0.861
Model:,OLS,Adj. R-squared:,0.857
Method:,Least Squares,F-statistic:,190.9
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,9.41e-52
Time:,01:25:37,Log-Likelihood:,86.161
No. Observations:,128,AIC:,-162.3
Df Residuals:,123,BIC:,-148.1
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.5688,0.535,-1.063,0.290,-1.628,0.490
y90,0.2622,0.035,7.543,0.000,0.193,0.331
lpop,0.0407,0.023,1.807,0.073,-0.004,0.085
lavginc,0.5714,0.053,10.762,0.000,0.466,0.677
pctstu,0.0050,0.001,4.949,0.000,0.003,0.007

0,1,2,3
Omnibus:,34.539,Durbin-Watson:,1.236
Prob(Omnibus):,0.0,Jarque-Bera (JB):,58.256
Skew:,1.255,Prob(JB):,2.24e-13
Kurtosis:,5.15,Cond. No.,1620.0


In [17]:
rental_panel = rental.loc[rental.y90 == 1].set_index("city") - rental.loc[rental.y90 == 0].set_index("city")
X = rental_panel[["y90", "lpop", "lavginc", "pctstu"]]
model = sm.OLS(rental_panel.lrent, X, missing = "drop").fit()
model.summary()

0,1,2,3
Dep. Variable:,lrent,R-squared:,0.322
Model:,OLS,Adj. R-squared:,0.288
Method:,Least Squares,F-statistic:,9.51
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,3.14e-05
Time:,01:25:37,Log-Likelihood:,65.272
No. Observations:,64,AIC:,-122.5
Df Residuals:,60,BIC:,-113.9
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
y90,0.3855,0.037,10.469,0.000,0.312,0.459
lpop,0.0722,0.088,0.818,0.417,-0.104,0.249
lavginc,0.3100,0.066,4.663,0.000,0.177,0.443
pctstu,0.0112,0.004,2.711,0.009,0.003,0.019

0,1,2,3
Omnibus:,2.653,Durbin-Watson:,1.655
Prob(Omnibus):,0.265,Jarque-Bera (JB):,2.335
Skew:,0.467,Prob(JB):,0.311
Kurtosis:,2.934,Cond. No.,23.0


In [18]:
# Testing to see if linearmodels can do the same thing
# What is up with this R^2?

from linearmodels import FirstDifferenceOLS
other_rental_panel = rental.set_index(["city", "year"])
X = other_rental_panel[["y90", "lpop", "lavginc", "pctstu"]]
FirstDifferenceOLS(other_rental_panel.lrent, X).fit()

  df.index = df.index.set_levels(final_levels, [0, 1])


0,1,2,3
Dep. Variable:,lrent,R-squared:,0.9765
Estimator:,FirstDifferenceOLS,R-squared (Between):,0.9391
No. Observations:,64,R-squared (Within):,0.9765
Date:,"Fri, Aug 13 2021",R-squared (Overall):,0.9392
Time:,01:25:38,Log-likelihood,65.272
Cov. Estimator:,Unadjusted,,
,,F-statistic:,624.15
Entities:,64,P-value,0.0000
Avg Obs:,2.0000,Distribution:,"F(4,60)"
Min Obs:,2.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
y90,0.3855,0.0368,10.469,0.0000,0.3119,0.4592
lpop,0.0722,0.0883,0.8178,0.4167,-0.1045,0.2490
lavginc,0.3100,0.0665,4.6627,0.0000,0.1770,0.4429
pctstu,0.0112,0.0041,2.7114,0.0087,0.0029,0.0195


In [19]:
model.get_robustcov_results("HC3").summary()

0,1,2,3
Dep. Variable:,lrent,R-squared:,0.322
Model:,OLS,Adj. R-squared:,0.288
Method:,Least Squares,F-statistic:,9.598
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,2.88e-05
Time:,01:25:38,Log-Likelihood:,65.272
No. Observations:,64,AIC:,-122.5
Df Residuals:,60,BIC:,-113.9
Df Model:,3,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
y90,0.3855,0.056,6.935,0.000,0.274,0.497
lpop,0.0722,0.074,0.980,0.331,-0.075,0.220
lavginc,0.3100,0.102,3.041,0.003,0.106,0.514
pctstu,0.0112,0.003,3.602,0.001,0.005,0.017

0,1,2,3
Omnibus:,2.653,Durbin-Watson:,1.655
Prob(Omnibus):,0.265,Jarque-Bera (JB):,2.335
Skew:,0.467,Prob(JB):,0.311
Kurtosis:,2.934,Cond. No.,23.0


In [20]:
FirstDifferenceOLS(other_rental_panel.lrent, X).fit(cov_type = "robust")

  df.index = df.index.set_levels(final_levels, [0, 1])


0,1,2,3
Dep. Variable:,lrent,R-squared:,0.9765
Estimator:,FirstDifferenceOLS,R-squared (Between):,0.9391
No. Observations:,64,R-squared (Within):,0.9765
Date:,"Fri, Aug 13 2021",R-squared (Overall):,0.9392
Time:,01:25:38,Log-likelihood,65.272
Cov. Estimator:,Robust,,
,,F-statistic:,624.15
Entities:,64,P-value,0.0000
Avg Obs:,2.0000,Distribution:,"F(4,60)"
Min Obs:,2.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
y90,0.3855,0.0487,7.9132,0.0000,0.2881,0.4830
lpop,0.0722,0.0697,1.0368,0.3040,-0.0671,0.2116
lavginc,0.3100,0.0893,3.4706,0.0010,0.1313,0.4886
pctstu,0.0112,0.0029,3.8159,0.0003,0.0053,0.0171


C5.i Results reported above. The $y90$ dummy is 0.262 and significant at the 1% level. This would mean that rents have increased over 10 years.

C5.ii Given that we started with an unobserved effects model it would suggest that we should not trust the standard errors. This is because the pooled OLS does not account for the unobserved effects which would mean there are endogeneity issues.

C5.iii the coefficient for $pctstu$ increases from 0.005 to 0.0112 and is still significant (though with a smaller t-statistic than the previous one). We may conclude that housing prices increase with the size of the student population, provided that there are not time varying factors that might be affecting $pctstu$ that we have not observed.

C5.iv The error becomes even smaller after adjusting for heteroskedasticity. This is generally encouraging for our results, but would not address the concern for time varying unobserved factors.

In [21]:
# Exercise 6
crime3 = pd.read_stata("./stata/CRIME3.DTA")

crime_diff = crime3.loc[crime3.d78 == 1].set_index("district") - crime3.loc[crime3.d78 == 0].set_index("district")

X = sm.add_constant(crime_diff[["d78", "clrprc1", "clrprc2"]])
model = sm.OLS(crime_diff.lcrime, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,lcrime,R-squared:,0.193
Model:,OLS,Adj. R-squared:,0.161
Method:,Least Squares,F-statistic:,5.992
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,0.00465
Time:,01:25:38,Log-Likelihood:,-17.194
No. Observations:,53,AIC:,40.39
Df Residuals:,50,BIC:,46.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
d78,0.0857,0.064,1.343,0.185,-0.042,0.214
clrprc1,-0.0040,0.005,-0.858,0.395,-0.014,0.005
clrprc2,-0.0132,0.005,-2.540,0.014,-0.024,-0.003

0,1,2,3
Omnibus:,3.032,Durbin-Watson:,2.203
Prob(Omnibus):,0.22,Jarque-Bera (JB):,2.071
Skew:,-0.344,Prob(JB):,0.355
Kurtosis:,3.681,Cond. No.,23.6


In [22]:
model.f_test("clrprc1 = clrprc2")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[1.15268996]]), p=0.2881410390959788, df_denom=50, df_num=1>

In [23]:
# Should be equivalent to the test above, using the hint from the textbook
crime_diff["equality_test"] = crime_diff["clrprc1"] + crime_diff["clrprc2"]
sm.OLS(crime_diff.lcrime, crime_diff[["d78", "clrprc1", "equality_test"]], missing = "drop").fit().summary()

0,1,2,3
Dep. Variable:,lcrime,R-squared:,0.193
Model:,OLS,Adj. R-squared:,0.161
Method:,Least Squares,F-statistic:,5.992
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,0.00465
Time:,01:25:38,Log-Likelihood:,-17.194
No. Observations:,53,AIC:,40.39
Df Residuals:,50,BIC:,46.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
d78,0.0857,0.064,1.343,0.185,-0.042,0.214
clrprc1,0.0091,0.009,1.074,0.288,-0.008,0.026
equality_test,-0.0132,0.005,-2.540,0.014,-0.024,-0.003

0,1,2,3
Omnibus:,3.032,Durbin-Watson:,2.203
Prob(Omnibus):,0.22,Jarque-Bera (JB):,2.071
Skew:,-0.344,Prob(JB):,0.355
Kurtosis:,3.681,Cond. No.,37.3


In [24]:
crime_diff["avgclr"] = (crime_diff["clrprc1"] + crime_diff["clrprc2"]) / 2
sm.OLS(crime_diff.lcrime, crime_diff[["d78", "avgclr"]], missing = "drop").fit().summary()

0,1,2,3
Dep. Variable:,lcrime,R-squared:,0.175
Model:,OLS,Adj. R-squared:,0.159
Method:,Least Squares,F-statistic:,10.8
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,0.00184
Time:,01:25:38,Log-Likelihood:,-17.798
No. Observations:,53,AIC:,39.6
Df Residuals:,51,BIC:,43.54
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
d78,0.0993,0.063,1.587,0.119,-0.026,0.225
avgclr,-0.0167,0.005,-3.286,0.002,-0.027,-0.006

0,1,2,3
Omnibus:,2.947,Durbin-Watson:,2.201
Prob(Omnibus):,0.229,Jarque-Bera (JB):,2.069
Skew:,-0.279,Prob(JB):,0.355
Kurtosis:,3.791,Cond. No.,16.4


C6.i Tests are above. There is no evidence of a difference between the two lags

C6.ii If $\beta_1 = \beta_2$ then the independent variables can become $\beta1(clrprc1 + clrprc2)$. Taking the average divides by two and so $\delta_1 = 2\beta_1$ maintains the equality.

C6.iii The $R^2$ is a smaller than the original model, but it would be hard to justify a model choice off of this alone. On the other hand, our justification for the second model is based on a failure to reject the null hypothesis in part (i) and so is as much of a statement of the data we're working with as it is the model. If pressed I'd likely choose the model from C6.ii, but I'd feel even better with more data.

In [25]:
# Exercise 7
gpa3 = pd.read_stata("./stata/GPA3.DTA")

X = sm.add_constant(gpa3[["spring", "sat", "hsperc", "female", "black", "white", "frstsem", "tothrs", "crsgpa", "season"]])
model = sm.OLS(gpa3.trmgpa, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,trmgpa,R-squared:,0.478
Model:,OLS,Adj. R-squared:,0.47
Method:,Least Squares,F-statistic:,65.91
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,8.549999999999999e-95
Time:,01:25:38,Log-Likelihood:,-597.97
No. Observations:,732,AIC:,1218.0
Df Residuals:,721,BIC:,1269.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.7528,0.348,-5.038,0.000,-2.436,-1.070
spring,-0.0580,0.048,-1.208,0.228,-0.152,0.036
sat,0.0017,0.000,11.367,0.000,0.001,0.002
hsperc,-0.0087,0.001,-8.358,0.000,-0.011,-0.007
female,0.3504,0.052,6.758,0.000,0.249,0.452
black,-0.2541,0.123,-2.068,0.039,-0.495,-0.013
white,-0.0233,0.117,-0.199,0.843,-0.254,0.207
frstsem,-0.0347,0.076,-0.456,0.649,-0.184,0.115
tothrs,-0.0003,0.001,-0.466,0.641,-0.002,0.001

0,1,2,3
Omnibus:,14.917,Durbin-Watson:,1.515
Prob(Omnibus):,0.001,Jarque-Bera (JB):,15.556
Skew:,-0.317,Prob(JB):,0.000419
Kurtosis:,3.329,Cond. No.,16300.0


In [26]:
gpa_diff = gpa3.loc[gpa3.term == 2].set_index("id") - gpa3.loc[gpa3.term == 1].set_index("id")

X = sm.add_constant(gpa_diff[["spring", "frstsem", "tothrs", "crsgpa", "season"]])
model = sm.OLS(gpa_diff.trmgpa, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,trmgpa,R-squared:,0.208
Model:,OLS,Adj. R-squared:,0.199
Method:,Least Squares,F-statistic:,23.7
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,2.02e-17
Time:,01:25:38,Log-Likelihood:,-316.43
No. Observations:,366,AIC:,642.9
Df Residuals:,361,BIC:,662.4
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
spring,-0.2366,0.206,-1.149,0.252,-0.642,0.168
frstsem,0.0191,0.069,0.276,0.782,-0.117,0.155
tothrs,0.0122,0.014,0.845,0.399,-0.016,0.040
crsgpa,1.1364,0.119,9.564,0.000,0.903,1.370
season,-0.0645,0.043,-1.517,0.130,-0.148,0.019

0,1,2,3
Omnibus:,12.745,Durbin-Watson:,1.98
Prob(Omnibus):,0.002,Jarque-Bera (JB):,20.732
Skew:,-0.222,Prob(JB):,3.15e-05
Kurtosis:,4.078,Cond. No.,97.1


C7.i Results listed above. The coefficient is negative and so it implies when a sport is in season the student athlete's GPA is 0.027 less than it would be otherwise, however, this result is not statistically significant.

C7.ii This comes back to the basic concern of omitted variables. If ability is correlated with season then our estimates are biased (pooling does not remove omitted variable bias).

C7.iii Results reported above. sat, hsperc, female, black, and white all need to be removed because they are fixed and so will be differenced away. The in season effect is larger in magnitude, with the athlete's GPA falling 0.65, but the result is still not statistically significant at any level we have considered so far.

C7.iv The most obvious example of a time varying variable not considered is workload. It stands to reason students may take lighter courseloads in season.

In [27]:
# Exercise 8
vote2 = pd.read_stata("./stata/VOTE2.DTA")

X = sm.add_constant(vote2[["clinexp", "clchexp", "cincshr"]])
model = sm.OLS(vote2.cvote, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,cvote,R-squared:,0.244
Model:,OLS,Adj. R-squared:,0.229
Method:,Least Squares,F-statistic:,16.43
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,2.63e-09
Time:,01:25:38,Log-Likelihood:,-541.49
No. Observations:,157,AIC:,1091.0
Df Residuals:,153,BIC:,1103.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.5559,0.631,-4.051,0.000,-3.802,-1.310
clinexp,-1.2915,1.382,-0.934,0.352,-4.022,1.439
clchexp,-0.5985,0.711,-0.841,0.401,-2.004,0.807
cincshr,0.1559,0.064,2.448,0.016,0.030,0.282

0,1,2,3
Omnibus:,70.428,Durbin-Watson:,1.83
Prob(Omnibus):,0.0,Jarque-Bera (JB):,506.142
Skew:,1.42,Prob(JB):,1.24e-110
Kurtosis:,11.325,Cond. No.,45.6


In [28]:
model.f_test("clinexp = clchexp = 0")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[1.51264339]]), p=0.22360306890216736, df_denom=153, df_num=2>

In [29]:
X = sm.add_constant(vote2[["cincshr"]])
model = sm.OLS(vote2.cvote, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,cvote,R-squared:,0.229
Model:,OLS,Adj. R-squared:,0.224
Method:,Least Squares,F-statistic:,45.97
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,2.38e-10
Time:,01:25:38,Log-Likelihood:,-543.02
No. Observations:,157,AIC:,1090.0
Df Residuals:,155,BIC:,1096.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.6811,0.625,-4.288,0.000,-3.916,-1.446
cincshr,0.2176,0.032,6.780,0.000,0.154,0.281

0,1,2,3
Omnibus:,64.366,Durbin-Watson:,1.844
Prob(Omnibus):,0.0,Jarque-Bera (JB):,438.178
Skew:,1.283,Prob(JB):,7.09e-96
Kurtosis:,10.772,Cond. No.,19.7


In [30]:
vote2_repeats = vote2[vote2.rptchall == 1]

X = sm.add_constant(vote2_repeats[["cincshr"]])
model = sm.OLS(vote2_repeats.cvote, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,cvote,R-squared:,0.037
Model:,OLS,Adj. R-squared:,0.006
Method:,Least Squares,F-statistic:,1.189
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,0.284
Time:,01:25:38,Log-Likelihood:,-103.08
No. Observations:,33,AIC:,210.2
Df Residuals:,31,BIC:,213.2
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.2498,0.999,-2.253,0.031,-4.286,-0.213
cincshr,0.0924,0.085,1.090,0.284,-0.080,0.265

0,1,2,3
Omnibus:,0.727,Durbin-Watson:,1.434
Prob(Omnibus):,0.695,Jarque-Bera (JB):,0.25
Skew:,0.206,Prob(JB):,0.882
Kurtosis:,3.111,Cond. No.,11.9


C8.i Results above. Only the differenced share of the incumbent's spending is significant.

C8.ii The other two terms are not jointly significant at any level we have considered.

C8.iii When using the income share as the only variable the coefficient is larger and more precise. A 10 percentage point increase in the incumbent's share of spending translates into about a 2 percentage point increase in the share of the final vote.

C8.iv Results above. The significance vanishes and the coefficient is smaller.

In [31]:
# Exercise 9
crime4 = pd.read_stata("./stata/CRIME4.DTA")

crime4_diff = pd.concat([crime4[crime4.year == 87].set_index("county") - crime4[crime4.year == 86].set_index("county"),
                         crime4[crime4.year == 86].set_index("county") - crime4[crime4.year == 85].set_index("county"),
                         crime4[crime4.year == 85].set_index("county") - crime4[crime4.year == 84].set_index("county"),
                         crime4[crime4.year == 84].set_index("county") - crime4[crime4.year == 83].set_index("county"),
                         crime4[crime4.year == 83].set_index("county") - crime4[crime4.year == 82].set_index("county"),
                         crime4[crime4.year == 82].set_index("county") - crime4[crime4.year == 81].set_index("county")])
X = sm.add_constant(crime4_diff[["d83", "d84", "d85", "d86", "d87", "lprbarr", "lprbconv", "lprbpris", "lavgsen", "lpolpc", "lwcon", "lwtuc", "lwtrd", "lwfir", "lwser", "lwmfg", "lwfed", "lwsta", "lwloc"]])
model = sm.OLS(crime4_diff.lcrmrte, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,lcrmrte,R-squared:,0.445
Model:,OLS,Adj. R-squared:,0.424
Method:,Least Squares,F-statistic:,21.9
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,1.42e-54
Time:,01:25:39,Log-Likelihood:,254.26
No. Observations:,540,AIC:,-468.5
Df Residuals:,520,BIC:,-382.7
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0199,0.021,0.959,0.338,-0.021,0.061
d83,-0.1109,0.027,-4.135,0.000,-0.164,-0.058
d84,-0.1483,0.044,-3.408,0.001,-0.234,-0.063
d85,-0.1489,0.060,-2.462,0.014,-0.268,-0.030
d86,-0.1174,0.078,-1.509,0.132,-0.270,0.035
d87,-0.0785,0.096,-0.816,0.415,-0.268,0.110
lprbarr,-0.3231,0.030,-10.763,0.000,-0.382,-0.264
lprbconv,-0.2403,0.018,-13.168,0.000,-0.276,-0.204
lprbpris,-0.1694,0.026,-6.473,0.000,-0.221,-0.118

0,1,2,3
Omnibus:,62.332,Durbin-Watson:,2.054
Prob(Omnibus):,0.0,Jarque-Bera (JB):,444.147
Skew:,-0.119,Prob(JB):,3.59e-97
Kurtosis:,7.437,Cond. No.,26.9


In [32]:
# Test for linearmodels
# Note: Our solution has a constant. FirstDifferenceOLS rejects presence of
# a cosnstant, but the constant is equivalent to "d82" for our purposes.
crime4_panel = crime4.set_index(["county", "year"])
X = crime4_panel[["d82", "d83", "d84", "d85", "d86", "d87", "lprbarr", "lprbconv", "lprbpris", "lavgsen", "lpolpc", "lwcon", "lwtuc", "lwtrd", "lwfir", "lwser", "lwmfg", "lwfed", "lwsta", "lwloc"]]
FirstDifferenceOLS(crime4_panel.lcrmrte, X).fit()

  df.index = df.index.set_levels(final_levels, [0, 1])


0,1,2,3
Dep. Variable:,lcrmrte,R-squared:,0.4446
Estimator:,FirstDifferenceOLS,R-squared (Between):,0.9675
No. Observations:,540,R-squared (Within):,0.4395
Date:,"Fri, Aug 13 2021",R-squared (Overall):,0.9664
Time:,01:25:39,Log-likelihood,254.26
Cov. Estimator:,Unadjusted,,
,,F-statistic:,20.813
Entities:,90,P-value,0.0000
Avg Obs:,7.0000,Distribution:,"F(20,520)"
Min Obs:,7.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
d82,0.0199,0.0207,0.9592,0.3379,-0.0208,0.0605
d83,-0.0712,0.0330,-2.1548,0.0316,-0.1360,-0.0063
d84,-0.0887,0.0435,-2.0416,0.0417,-0.1741,-0.0033
d85,-0.0695,0.0582,-1.1928,0.2335,-0.1838,0.0449
d86,-0.0181,0.0697,-0.2599,0.7950,-0.1551,0.1189
d87,0.0406,0.0813,0.4996,0.6176,-0.1190,0.2002
lprbarr,-0.3231,0.0300,-10.763,0.0000,-0.3821,-0.2641
lprbconv,-0.2403,0.0182,-13.168,0.0000,-0.2761,-0.2044
lprbpris,-0.1694,0.0262,-6.4725,0.0000,-0.2208,-0.1180


In [33]:
model.f_test("lwcon = lwtuc = lwtrd = lwfir = lwser = lwmfg = lwfed = lwsta = lwloc = 0")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[1.25118035]]), p=0.261249436040526, df_denom=520, df_num=9>

C9.i The justice variables from Example 13.9 appear to be relatively unchanged (both in magnitude and significance).

C9.ii The signs for the wage variables appear to be all over the place and not always consistent with expectations (higher wages increasing crime). Only one variable is significant at the 10% level (lwtuc, wage for transportation, utilities, and communication) and the wage variables are not jointly significant.

In [34]:
# Exercise 10
jtrain = pd.read_stata("./stata/JTRAIN.DTA")

X = sm.add_constant(jtrain[["d89", "cgrant", "cgrant_1", "clemploy"]])
model = sm.OLS(jtrain.chrsemp, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,chrsemp,R-squared:,0.476
Model:,OLS,Adj. R-squared:,0.467
Method:,Least Squares,F-statistic:,55.8
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,1.92e-33
Time:,01:25:39,Log-Likelihood:,-1098.3
No. Observations:,251,AIC:,2207.0
Df Residuals:,246,BIC:,2224.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.7400,1.942,-0.381,0.703,-4.565,3.084
d89,5.4232,2.649,2.047,0.042,0.205,10.641
cgrant,32.6011,2.968,10.983,0.000,26.755,38.447
cgrant_1,1.9969,5.555,0.359,0.720,-8.944,12.938
clemploy,0.7440,4.868,0.153,0.879,-8.844,10.332

0,1,2,3
Omnibus:,127.706,Durbin-Watson:,2.391
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1537.543
Skew:,1.695,Prob(JB):,0.0
Kurtosis:,14.641,Cond. No.,5.73


In [35]:
jtrain[["fcode", "year", "d89", "cgrant", "cgrant_1", "clemploy"]].dropna().fcode.unique().size

146

In [36]:
jtrain["fcode"].unique().size

157

C10.i Results from estimation are above. 146 firms are used to produce the results above. There are a total of 157 firms in the original data set and so if we assume that there were no missing variables one year would be dropped because of the differencing and the estimation would be run on 314 (2 * 157) observations.

C10.ii The coefficient on grant is large and significant. It means that companies that recieved the grant increased training by about 32.6 hours (compared to the base case).

C10.iii It is not particularly surprising that the lagged grant should be insignificant. Training decisions are likely to be made based on resources available today.

C10.iv Among the variables we are using, lemploy is the one associated with firm size. The coefficient is small and not statistically significant (in fact, it has the smallest absolute value for a t-statistic) which suggests to us that larger firms do not train their employees any more or less on average. 

In [37]:
# Excercise 11
mathpnl = pd.read_stata("./stata/mathpnl.dta")

math1998 = mathpnl[mathpnl.year == 1998].set_index("distid") - mathpnl[mathpnl.year == 1997].set_index("distid")
math1997 = mathpnl[mathpnl.year == 1997].set_index("distid") - mathpnl[mathpnl.year == 1996].set_index("distid")
math1996 = mathpnl[mathpnl.year == 1996].set_index("distid") - mathpnl[mathpnl.year == 1995].set_index("distid")
math1995 = mathpnl[mathpnl.year == 1995].set_index("distid") - mathpnl[mathpnl.year == 1994].set_index("distid")
math1994 = mathpnl[mathpnl.year == 1994].set_index("distid") - mathpnl[mathpnl.year == 1993].set_index("distid")
math1993 = mathpnl[mathpnl.year == 1993].set_index("distid") - mathpnl[mathpnl.year == 1992].set_index("distid")

for var in ["y98", "y97", "y96", "y95", "y94", "y93"]:
    math1998[var] = 0
    math1997[var] = 0
    math1996[var] = 0
    math1995[var] = 0
    math1994[var] = 0
    math1993[var] = 0
    if var == "y98":
        math1998[var] = 1
    elif var == "y97":
        math1997[var] = 1
    elif var == "y96":
        math1996[var] = 1
    elif var == "y95":
        math1995[var] = 1
    elif var == "y94":
        math1994[var] = 1
    elif var == "y93":
        math1993[var]
        
math1998["year"] = 1998
math1997["year"] = 1997
math1996["year"] = 1996
math1995["year"] = 1995
math1994["year"] = 1994
math1993["year"] = 1993
        
mathpnl_diff = pd.concat([math1998, math1997, math1996, math1995, math1994, math1993])

X = sm.add_constant(mathpnl_diff[["y94", "y95", "y96", "y97", "y98", "lrexpp", "lenrol", "lunch"]])
model = sm.OLS(mathpnl_diff.math4, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,math4,R-squared:,0.208
Model:,OLS,Adj. R-squared:,0.206
Method:,Least Squares,F-statistic:,108.0
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,1.57e-160
Time:,01:25:39,Log-Likelihood:,-12851.0
No. Observations:,3300,AIC:,25720.0
Df Residuals:,3291,BIC:,25770.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.9550,0.518,11.491,0.000,4.939,6.971
y94,0.5211,0.728,0.715,0.474,-0.907,1.949
y95,6.8124,0.779,8.749,0.000,5.286,8.339
y96,-5.2349,0.727,-7.200,0.000,-6.661,-3.809
y97,-8.4885,0.722,-11.754,0.000,-9.904,-7.072
y98,8.9678,0.719,12.469,0.000,7.558,10.378
lrexpp,-3.4473,2.760,-1.249,0.212,-8.859,1.964
lenrol,0.6345,1.029,0.617,0.537,-1.382,2.651
lunch,0.0251,0.055,0.452,0.651,-0.084,0.134

0,1,2,3
Omnibus:,296.891,Durbin-Watson:,1.972
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2008.298
Skew:,0.038,Prob(JB):,0.0
Kurtosis:,6.821,Cond. No.,53.1


In [38]:
X = sm.add_constant(mathpnl_diff[["y95", "y96", "y97", "y98", "lrexpp", "lrexpp_1", "lenrol", "lunch"]])
model = sm.OLS(mathpnl_diff.math4, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,math4,R-squared:,0.238
Model:,OLS,Adj. R-squared:,0.235
Method:,Least Squares,F-statistic:,106.8
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,2.14e-155
Time:,01:25:39,Log-Likelihood:,-10751.0
No. Observations:,2750,AIC:,21520.0
Df Residuals:,2741,BIC:,21570.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.1586,0.551,11.171,0.000,5.078,7.240
y95,5.7047,0.774,7.367,0.000,4.186,7.223
y96,-6.7959,0.790,-8.606,0.000,-8.344,-5.248
y97,-8.9894,0.738,-12.186,0.000,-10.436,-7.543
y98,8.4530,0.744,11.369,0.000,6.995,9.911
lrexpp,-1.4107,3.037,-0.464,0.642,-7.367,4.545
lrexpp_1,11.0403,2.786,3.963,0.000,5.578,16.503
lenrol,2.1400,1.177,1.818,0.069,-0.168,4.448
lunch,0.0728,0.061,1.184,0.236,-0.048,0.193

0,1,2,3
Omnibus:,221.494,Durbin-Watson:,1.938
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1251.628
Skew:,0.095,Prob(JB):,1.63e-272
Kurtosis:,6.3,Cond. No.,54.0


In [39]:
model = sm.OLS(mathpnl_diff.math4, X, missing = "drop").fit(cov_type = "HC3")
model.summary()

0,1,2,3
Dep. Variable:,math4,R-squared:,0.238
Model:,OLS,Adj. R-squared:,0.235
Method:,Least Squares,F-statistic:,107.4
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,3.24e-156
Time:,01:25:39,Log-Likelihood:,-10751.0
No. Observations:,2750,AIC:,21520.0
Df Residuals:,2741,BIC:,21570.0
Df Model:,8,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,6.1586,0.588,10.470,0.000,5.006,7.312
y95,5.7047,0.800,7.132,0.000,4.137,7.272
y96,-6.7959,0.847,-8.025,0.000,-8.456,-5.136
y97,-8.9894,0.755,-11.912,0.000,-10.468,-7.510
y98,8.4530,0.775,10.913,0.000,6.935,9.971
lrexpp,-1.4107,4.409,-0.320,0.749,-10.052,7.231
lrexpp_1,11.0403,4.530,2.437,0.015,2.162,19.918
lenrol,2.1400,1.442,1.484,0.138,-0.685,4.966
lunch,0.0728,0.146,0.498,0.618,-0.214,0.359

0,1,2,3
Omnibus:,221.494,Durbin-Watson:,1.938
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1251.628
Skew:,0.095,Prob(JB):,1.63e-272
Kurtosis:,6.3,Cond. No.,54.0


In [40]:
model = sm.OLS(mathpnl_diff.math4, X, missing = "drop").fit(cov_type = "HAC", cov_kwds = {"maxlags": 1})
model.summary()

0,1,2,3
Dep. Variable:,math4,R-squared:,0.238
Model:,OLS,Adj. R-squared:,0.235
Method:,Least Squares,F-statistic:,103.0
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,1.78e-150
Time:,01:25:39,Log-Likelihood:,-10751.0
No. Observations:,2750,AIC:,21520.0
Df Residuals:,2741,BIC:,21570.0
Df Model:,8,,
Covariance Type:,HAC,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,6.1586,0.576,10.688,0.000,5.029,7.288
y95,5.7047,0.784,7.278,0.000,4.168,7.241
y96,-6.7959,0.845,-8.038,0.000,-8.453,-5.139
y97,-8.9894,0.753,-11.942,0.000,-10.465,-7.514
y98,8.4530,0.776,10.894,0.000,6.932,9.974
lrexpp,-1.4107,4.328,-0.326,0.744,-9.894,7.072
lrexpp_1,11.0403,4.343,2.542,0.011,2.528,19.553
lenrol,2.1400,1.398,1.531,0.126,-0.600,4.880
lunch,0.0728,0.141,0.517,0.605,-0.203,0.349

0,1,2,3
Omnibus:,221.494,Durbin-Watson:,1.938
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1251.628
Skew:,0.095,Prob(JB):,1.63e-272
Kurtosis:,6.3,Cond. No.,54.0


In [41]:
mathpnl_diff = mathpnl_diff.set_index([mathpnl_diff.index, mathpnl_diff["year"]])
X = sm.add_constant(mathpnl_diff[["y95", "y96", "y97", "y98", "lrexpp", "lrexpp_1", "lenrol", "lunch"]])
model = sm.OLS(mathpnl_diff.math4, X, missing = "drop").fit(cov_type = "HAC", cov_kwds = {"maxlags": 1})
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,math4,R-squared:,0.238
Model:,OLS,Adj. R-squared:,0.235
Method:,Least Squares,F-statistic:,103.0
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,1.78e-150
Time:,01:25:39,Log-Likelihood:,-10751.0
No. Observations:,2750,AIC:,21520.0
Df Residuals:,2741,BIC:,21570.0
Df Model:,8,,
Covariance Type:,HAC,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,6.1586,0.576,10.688,0.000,5.029,7.288
y95,5.7047,0.784,7.278,0.000,4.168,7.241
y96,-6.7959,0.845,-8.038,0.000,-8.453,-5.139
y97,-8.9894,0.753,-11.942,0.000,-10.465,-7.514
y98,8.4530,0.776,10.894,0.000,6.932,9.974
lrexpp,-1.4107,4.328,-0.326,0.744,-9.894,7.072
lrexpp_1,11.0403,4.343,2.542,0.011,2.528,19.553
lenrol,2.1400,1.398,1.531,0.126,-0.600,4.880
lunch,0.0728,0.141,0.517,0.605,-0.203,0.349

0,1,2,3
Omnibus:,221.494,Durbin-Watson:,1.938
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1251.628
Skew:,0.095,Prob(JB):,1.63e-272
Kurtosis:,6.3,Cond. No.,54.0


In [42]:
resid_1 = model.resid[model.resid.index.get_level_values(1) > 1994].reset_index().iloc[:,-1]
resid_0 = model.resid[model.resid.index.get_level_values(1) < 1998].reset_index().iloc[:,-1]

# Test for AR(1)
sm.OLS(resid_1, resid_0, missing="drop").fit().summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.211
Model:,OLS,Adj. R-squared (uncentered):,0.211
Method:,Least Squares,F-statistic:,588.3
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,2.3e-115
Time:,01:25:39,Log-Likelihood:,-8338.6
No. Observations:,2200,AIC:,16680.0
Df Residuals:,2199,BIC:,16680.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.4629,0.019,-24.255,0.000,-0.500,-0.425

0,1,2,3
Omnibus:,170.974,Durbin-Watson:,1.903
Prob(Omnibus):,0.0,Jarque-Bera (JB):,948.611
Skew:,0.005,Prob(JB):,1.03e-206
Kurtosis:,6.217,Cond. No.,1.0


In [43]:
model.f_test("lenrol = lunch = 0")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[1.42387952]]), p=0.24095613690732742, df_denom=2.74e+03, df_num=2>

C11.i Given that we have a level on the left hand side (math4) and a log on the right hand side (rexpp), the coefficient can be interpreted as a percentage change (from table 2.3 $\Delta y = (\beta_1 / 100)\% \Delta x$) and so dividing by 10 instead of 100 is the percentage point change in math4 in response to a 10% increase in real per student spending.

C11.ii Results above. Every 1% increase in spending apparently decreases the math pass rate by 0.035 percentage points (though the result is not significant).

C11.iii Results from the lag are above. Current spending still isn't significant (and still negative), but the lagged spending is large and significant. It implies a 1% increase in spending in the previous period increases the pass rate by about increases the pass rate by about .11 percentage points.

C11.iv Estimation with HC3 errors are above. Spending is broadly the same (what isn't significant remains so, significant variables remain significant).

C11.v The HAC estimation is above. The lagged spending variable continues to lose a little significance but remains significant (at the 5% level). No other major changes.

C11.vi Using the AR(1) test from the previous chapter we see the residuals are negative and statistically significant. The HAC errors are appropriate given the strong negative serial correlation.

C11.vii The F-test for lenrol and lunch returns a p-value of 0.24, suggesting that they are not jointly significant and so not necessary for the model.

In [44]:
# Exercise 12
murder = pd.read_stata("./stata/MURDER.DTA")
murder_limit = murder[murder.year.isin([90, 93])]

X = sm.add_constant(murder_limit[["d93", "exec", "unem"]])
model = sm.OLS(murder_limit.mrdrte, X, missing="drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,mrdrte,R-squared:,0.102
Model:,OLS,Adj. R-squared:,0.074
Method:,Least Squares,F-statistic:,3.695
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,0.0144
Time:,01:25:39,Log-Likelihood:,-379.81
No. Observations:,102,AIC:,767.6
Df Residuals:,98,BIC:,778.1
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-5.2780,4.428,-1.192,0.236,-14.065,3.509
d93,-2.0674,2.145,-0.964,0.337,-6.323,2.189
exec,0.1277,0.263,0.485,0.629,-0.395,0.650
unem,2.5289,0.782,3.235,0.002,0.978,4.080

0,1,2,3
Omnibus:,148.824,Durbin-Watson:,1.062
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5022.506
Skew:,5.38,Prob(JB):,0.0
Kurtosis:,35.649,Cond. No.,28.0


In [45]:
murder_limit_diff = murder_limit[murder_limit.year == 93].drop("state", axis=1).set_index("id") - murder_limit[murder_limit.year == 90].drop("state", axis=1).set_index("id")
X = sm.add_constant(murder_limit_diff[["exec", "unem"]])
model = sm.OLS(murder_limit_diff.mrdrte, X, missing="drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,mrdrte,R-squared:,0.11
Model:,OLS,Adj. R-squared:,0.073
Method:,Least Squares,F-statistic:,2.959
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,0.0614
Time:,01:25:39,Log-Likelihood:,-74.693
No. Observations:,51,AIC:,155.4
Df Residuals:,48,BIC:,161.2
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4133,0.209,1.974,0.054,-0.008,0.834
exec,-0.1038,0.043,-2.392,0.021,-0.191,-0.017
unem,-0.0666,0.159,-0.420,0.677,-0.386,0.252

0,1,2,3
Omnibus:,0.134,Durbin-Watson:,2.223
Prob(Omnibus):,0.935,Jarque-Bera (JB):,0.05
Skew:,0.067,Prob(JB):,0.976
Kurtosis:,2.927,Cond. No.,5.71


In [46]:
name = ['Lagrange multiplier statistic', 'p-value',
        'f-value', 'f p-value']

test = sms.het_breuschpagan(model.resid, model.model.exog)
lzip(name, test)

[('Lagrange multiplier statistic', 1.2367893292585759),
 ('p-value', 0.5388087123676443),
 ('f-value', 0.5964836975371867),
 ('f p-value', 0.5547754076830905)]

In [47]:
test = sms.het_white(model.resid, model.model.exog)
lzip(name, test)

[('Lagrange multiplier statistic', 2.1472675227170255),
 ('p-value', 0.8284153514682561),
 ('f-value', 0.39558499032658484),
 ('f p-value', 0.8492596753586841)]

In [48]:
model = sm.OLS(murder_limit_diff.mrdrte, X, missing="drop").fit(cov_type="HC3")
model.summary()

0,1,2,3
Dep. Variable:,mrdrte,R-squared:,0.11
Model:,OLS,Adj. R-squared:,0.073
Method:,Least Squares,F-statistic:,3.543
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,0.0367
Time:,01:25:39,Log-Likelihood:,-74.693
No. Observations:,51,AIC:,155.4
Df Residuals:,48,BIC:,161.2
Df Model:,2,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.4133,0.204,2.029,0.042,0.014,0.812
exec,-0.1038,0.040,-2.616,0.009,-0.182,-0.026
unem,-0.0666,0.159,-0.419,0.675,-0.378,0.245

0,1,2,3
Omnibus:,0.134,Durbin-Watson:,2.223
Prob(Omnibus):,0.935,Jarque-Bera (JB):,0.05
Skew:,0.067,Prob(JB):,0.976
Kurtosis:,2.927,Cond. No.,5.71


C12.i Results above. The coefficient is positive but not statistically significant and so there is no evidence (in the Pooled OLS) that there is a deterrent effect.

C12.ii Results above. The FD estimates produce a deterrent effect with a negative coefficient that is significant at the 5% level.

C12.iii Both tests reported above. Neither test gives any evidence for heteroskedasticity.

C12.iv Results with HC3 errors above. The result for exec is now significant at the 1% level.

C12.v There is no evidence that we would need to use the robust standard errors. In addition, the ordinary standard errors are the more conservative of the two and so and still show a significant effect. It would be both more consistent and convincing to use the regular errors.

In [49]:
# Exercise 13
wagepan = pd.read_stata("./stata/wagepan.dta")

wagepan1987 = wagepan[wagepan.year == 1987].set_index("nr") - wagepan[wagepan.year == 1986].set_index("nr")
wagepan1986 = wagepan[wagepan.year == 1986].set_index("nr") - wagepan[wagepan.year == 1985].set_index("nr")
wagepan1985 = wagepan[wagepan.year == 1985].set_index("nr") - wagepan[wagepan.year == 1984].set_index("nr")
wagepan1984 = wagepan[wagepan.year == 1984].set_index("nr") - wagepan[wagepan.year == 1983].set_index("nr")
wagepan1983 = wagepan[wagepan.year == 1983].set_index("nr") - wagepan[wagepan.year == 1982].set_index("nr")
wagepan1982 = wagepan[wagepan.year == 1982].set_index("nr") - wagepan[wagepan.year == 1981].set_index("nr")
wagepan1981 = wagepan[wagepan.year == 1981].set_index("nr") - wagepan[wagepan.year == 1980].set_index("nr")

for var in ["d87", "d86", "d85", "d84", "d83", "d82", "d81"]:
    wagepan1987[var] = 0
    wagepan1986[var] = 0
    wagepan1985[var] = 0
    wagepan1984[var] = 0
    wagepan1983[var] = 0
    wagepan1982[var] = 0
    wagepan1981[var] = 0
    if var == "d87":
        wagepan1987[var] = 1
    elif var == "d86":
        wagepan1986[var] = 1
    elif var == "d85":
        wagepan1985[var] = 1
    elif var == "d84":
        wagepan1984[var] = 1
    elif var == "d83":
        wagepan1983[var] = 1
    elif var == "d82":
        wagepan1982[var]
    elif var == "d81":
        wagepan1981[var]
        
wagepan1987["year"] = 1987
wagepan1986["year"] = 1986
wagepan1985["year"] = 1985
wagepan1984["year"] = 1984
wagepan1983["year"] = 1983
wagepan1982["year"] = 1982
wagepan1981["year"] = 1981

wagepan1987["educ"] = wagepan[wagepan.year == 1987].set_index("nr").educ
wagepan1986["educ"] = wagepan[wagepan.year == 1986].set_index("nr").educ
wagepan1985["educ"] = wagepan[wagepan.year == 1985].set_index("nr").educ
wagepan1984["educ"] = wagepan[wagepan.year == 1984].set_index("nr").educ
wagepan1983["educ"] = wagepan[wagepan.year == 1983].set_index("nr").educ
wagepan1982["educ"] = wagepan[wagepan.year == 1982].set_index("nr").educ
wagepan1981["educ"] = wagepan[wagepan.year == 1981].set_index("nr").educ

for data in [wagepan1987, wagepan1986, wagepan1985, wagepan1984, wagepan1983, wagepan1982, wagepan1981]:
    data["d87educ"] = data["d87"] * data["educ"]
    data["d86educ"] = data["d86"] * data["educ"]
    data["d85educ"] = data["d85"] * data["educ"]
    data["d84educ"] = data["d84"] * data["educ"]
    data["d83educ"] = data["d83"] * data["educ"]
    data["d82educ"] = data["d82"] * data["educ"]
    data["d81educ"] = data["d81"] * data["educ"]
    
    data["d87union"] = data["d87"] * data["union"]
    data["d86union"] = data["d86"] * data["union"]
    data["d85union"] = data["d85"] * data["union"]
    data["d84union"] = data["d84"] * data["union"]
    data["d83union"] = data["d83"] * data["union"]
    data["d82union"] = data["d82"] * data["union"]
    data["d81union"] = data["d81"] * data["union"]
        
wagepan_diff = pd.concat([wagepan1987, wagepan1986, wagepan1985, wagepan1984, wagepan1983, wagepan1982, wagepan1981])
X = sm.add_constant(wagepan_diff[["d82", "d83", "d84", "d85", "d86", "d87", "d82educ", "d83educ", "d84educ", "d85educ", "d86educ", "d87educ", "union"]])
model = sm.OLS(wagepan_diff.lwage, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,lwage,R-squared:,0.003
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.9137
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,0.526
Time:,01:25:40,Log-Likelihood:,-2308.5
No. Observations:,3815,AIC:,4641.0
Df Residuals:,3803,BIC:,4716.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0890,0.013,6.619,0.000,0.063,0.115
d82,4.609e-16,7.17e-16,0.643,0.521,-9.45e-16,1.87e-15
d83,-0.0674,0.130,-0.517,0.605,-0.323,0.188
d84,-0.0114,0.130,-0.087,0.930,-0.267,0.244
d85,-0.1347,0.130,-1.034,0.301,-0.390,0.121
d86,-0.0732,0.130,-0.562,0.574,-0.329,0.182
d87,-0.0554,0.130,-0.425,0.671,-0.311,0.200
d82educ,1.209e-16,4.95e-16,0.244,0.807,-8.49e-16,1.09e-15
d83educ,0.0022,0.011,0.206,0.836,-0.019,0.024

0,1,2,3
Omnibus:,1027.896,Durbin-Watson:,2.029
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49893.014
Skew:,0.479,Prob(JB):,0.0
Kurtosis:,20.691,Cond. No.,1.74e+17


In [50]:
model.f_test("d82educ = d83educ = d84educ = d85educ = d86educ = d87educ")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[0.15781758]]), p=0.9777097151613591, df_denom=3.8e+03, df_num=5>

In [51]:
model = sm.OLS(wagepan_diff.lwage, X, missing = "drop").fit(cov_type = "HAC", cov_kwds = {"maxlags": 1})
model.summary()



0,1,2,3
Dep. Variable:,lwage,R-squared:,0.003
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.88
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,0.56
Time:,01:25:40,Log-Likelihood:,-2308.5
No. Observations:,3815,AIC:,4641.0
Df Residuals:,3803,BIC:,4716.0
Df Model:,11,,
Covariance Type:,HAC,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0890,0.015,5.781,0.000,0.059,0.119
d82,4.609e-16,6.48e-16,0.711,0.477,-8.1e-16,1.73e-15
d83,-0.0674,0.094,-0.719,0.472,-0.251,0.116
d84,-0.0114,0.112,-0.102,0.919,-0.231,0.208
d85,-0.1347,0.104,-1.292,0.196,-0.339,0.070
d86,-0.0732,0.127,-0.576,0.565,-0.322,0.176
d87,-0.0554,0.121,-0.457,0.648,-0.293,0.182
d82educ,1.209e-16,4.35e-16,0.278,0.781,-7.31e-16,9.73e-16
d83educ,0.0022,0.008,0.289,0.772,-0.013,0.017

0,1,2,3
Omnibus:,1027.896,Durbin-Watson:,2.029
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49893.014
Skew:,0.479,Prob(JB):,0.0
Kurtosis:,20.691,Cond. No.,1.74e+17


In [52]:
model.f_test("d82educ = d83educ = d84educ = d85educ = d86educ = d87educ")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[0.21285934]]), p=0.9571916711168037, df_denom=3.8e+03, df_num=5>

In [53]:
X = sm.add_constant(wagepan_diff[["d82", "d83", "d84", "d85", "d86", "d87", "d82educ", "d83educ", "d84educ", "d85educ", "d86educ", "d87educ", "union", "d82union","d83union", "d84union", "d85union", "d86union", "d87union"]])
model = sm.OLS(wagepan_diff.lwage, X, missing = "drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,lwage,R-squared:,0.004
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.8533
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,0.625
Time:,01:25:40,Log-Likelihood:,-2306.7
No. Observations:,3815,AIC:,4647.0
Df Residuals:,3798,BIC:,4754.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0889,0.013,6.609,0.000,0.063,0.115
d82,3.941e-16,1.27e-15,0.311,0.756,-2.09e-15,2.88e-15
d83,-0.0656,0.130,-0.504,0.614,-0.321,0.190
d84,-0.0111,0.130,-0.085,0.932,-0.267,0.245
d85,-0.1325,0.131,-1.015,0.310,-0.388,0.123
d86,-0.0736,0.130,-0.565,0.572,-0.329,0.182
d87,-0.0460,0.131,-0.352,0.725,-0.302,0.210
d82educ,-1.123e-17,1.34e-16,-0.084,0.933,-2.73e-16,2.51e-16
d83educ,0.0021,0.011,0.189,0.850,-0.019,0.023

0,1,2,3
Omnibus:,1027.407,Durbin-Watson:,2.026
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49766.559
Skew:,0.479,Prob(JB):,0.0
Kurtosis:,20.668,Cond. No.,8.91e+17


In [54]:
model.f_test("d82union = d83union = d84union = d85union = d86union = d87union")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[0.72093951]]), p=0.6076511689737495, df_denom=3.8e+03, df_num=5>

C13.i Constants can't be estimated using first differences and so the base year ($\beta_0$) and educ ($\beta_1$) aren't available. The remaining parameters can be estimated using FD.

C13.ii Results above. Returns to education do not appear to have changed over time.

C13.iii Results above. The conclusion that results have not changed over time does not change.

C13.iv The base is 0.0814 while the end period is 0.0814 + (-0.0953). This is a large and statistically significant difference.

C13.v The test for differences does not indicate change over time. This is because most of the interactions are not significant and so the big difference is overwhelmed by the smaller changes over time.

In [55]:
# Exercise 14
jtrain3 = pd.read_stata("./stata/jtrain3.dta")
jtrain3["cre"] = jtrain3["re78"] - jtrain3["re75"]

X = sm.add_constant(jtrain3[["train"]])
model = sm.OLS(jtrain3.re78, X, missing="drop").fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,re78,R-squared:,0.061
Model:,OLS,Adj. R-squared:,0.061
Method:,Least Squares,F-statistic:,173.4
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,2.03e-38
Time:,01:25:40,Log-Likelihood:,-11066.0
No. Observations:,2675,AIC:,22140.0
Df Residuals:,2673,BIC:,22150.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,21.5539,0.304,70.985,0.000,20.959,22.149
train,-15.2048,1.155,-13.169,0.000,-17.469,-12.941

0,1,2,3
Omnibus:,715.201,Durbin-Watson:,0.919
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2698.169
Skew:,1.278,Prob(JB):,0.0
Kurtosis:,7.205,Cond. No.,3.96


In [56]:
model = sm.OLS(jtrain3.cre, X, missing="drop").fit()
model.summary()

0,1,2,3
Dep. Variable:,cre,R-squared:,0.003
Model:,OLS,Adj. R-squared:,0.003
Method:,Least Squares,F-statistic:,8.172
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,0.00429
Time:,01:25:40,Log-Likelihood:,-10130.0
No. Observations:,2675,AIC:,20260.0
Df Residuals:,2673,BIC:,20280.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.4906,0.214,11.637,0.000,2.071,2.910
train,2.3265,0.814,2.859,0.004,0.731,3.922

0,1,2,3
Omnibus:,842.068,Durbin-Watson:,1.81
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25035.791
Skew:,0.868,Prob(JB):,0.0
Kurtosis:,17.886,Cond. No.,3.96


In [57]:
model = sm.OLS(jtrain3.cre, X, missing="drop").fit(cov_type="HC3")
model.summary()

0,1,2,3
Dep. Variable:,cre,R-squared:,0.003
Model:,OLS,Adj. R-squared:,0.003
Method:,Least Squares,F-statistic:,12.91
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,0.000333
Time:,01:25:40,Log-Likelihood:,-10130.0
No. Observations:,2675,AIC:,20260.0
Df Residuals:,2673,BIC:,20280.0
Df Model:,1,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.4906,0.217,11.466,0.000,2.065,2.916
train,2.3265,0.648,3.593,0.000,1.057,3.596

0,1,2,3
Omnibus:,842.068,Durbin-Watson:,1.81
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25035.791
Skew:,0.868,Prob(JB):,0.0
Kurtosis:,17.886,Cond. No.,3.96


C14.i Results above. The coefficient for training is negative and so training would appear to have had a negative effect on earnings in 1978 (the result is statistically significant)

C14.ii Results above. The effect is now positive and significant. The difference likely accounts for individual effects (such as low earning workers being more likely to take training).

C14.iii The confidence interval with the normal standard errors is 0.731 to 3.922 but shrinks with the HC3 errors to 1.057 to 3.596. 

In [58]:
# Exercise 15
happiness = pd.read_stata("./stata/happiness.dta")
happiness["highinc"] = 0
happiness.loc[happiness.income == "$25000 or more", "highinc"] = 1

In [59]:
happiness.year.value_counts()

2006    2986
1994    2977
1996    2885
1998    2806
2000    2777
2002    1369
2004    1337
Name: year, dtype: int64

In [60]:
happiness.vhappy.value_counts()

0    11877
1     5260
Name: vhappy, dtype: int64

In [61]:
happiness.vhappy.value_counts(normalize=True)

0    0.693062
1    0.306938
Name: vhappy, dtype: float64

In [62]:
X = sm.add_constant(happiness[["y96", "y98", "y00", "y02", "y04", "y06"]])
model = sm.OLS(happiness.vhappy, X, missing="drop").fit(cov_type="HC3")
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,vhappy,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,1.429
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,0.199
Time:,01:25:40,Log-Likelihood:,-11050.0
No. Observations:,17137,AIC:,22110.0
Df Residuals:,17130,BIC:,22170.0
Df Model:,6,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.2879,0.008,34.679,0.000,0.272,0.304
y96,0.0161,0.012,1.351,0.177,-0.007,0.039
y98,0.0297,0.012,2.453,0.014,0.006,0.053
y00,0.0294,0.012,2.423,0.015,0.006,0.053
y02,0.0153,0.015,1.021,0.307,-0.014,0.045
y04,0.0255,0.015,1.682,0.093,-0.004,0.055
y06,0.0202,0.012,1.708,0.088,-0.003,0.043

0,1,2,3
Omnibus:,61877.616,Durbin-Watson:,1.939
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3201.568
Skew:,0.837,Prob(JB):,0.0
Kurtosis:,1.702,Cond. No.,7.34


In [63]:
model.f_test("y96 = y98 = y00 = y02 = y04 = y06")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[0.45490927]]), p=0.8099909353074722, df_denom=1.71e+04, df_num=5>

In [64]:
X = sm.add_constant(happiness[["y96", "y98", "y00", "y02", "y04", "y06", "occattend", "regattend"]])
model = sm.OLS(happiness.vhappy, X, missing="drop").fit(cov_type="HC3")
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,vhappy,R-squared:,0.007
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,13.57
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,8.94e-20
Time:,01:25:40,Log-Likelihood:,-10816.0
No. Observations:,16864,AIC:,21650.0
Df Residuals:,16855,BIC:,21720.0
Df Model:,8,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.2713,0.009,30.515,0.000,0.254,0.289
y96,0.0167,0.012,1.392,0.164,-0.007,0.040
y98,0.0279,0.012,2.293,0.022,0.004,0.052
y00,0.0313,0.012,2.557,0.011,0.007,0.055
y02,0.0157,0.015,1.050,0.294,-0.014,0.045
y04,0.0252,0.015,1.659,0.097,-0.005,0.055
y06,0.0222,0.012,1.866,0.062,-0.001,0.045
occattend,0.0043,0.008,0.531,0.595,-0.011,0.020
regattend,0.1122,0.011,9.847,0.000,0.090,0.135

0,1,2,3
Omnibus:,37858.965,Durbin-Watson:,1.941
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3083.136
Skew:,0.831,Prob(JB):,0.0
Kurtosis:,1.726,Cond. No.,7.71


In [65]:
X = sm.add_constant(happiness[["y96", "y98", "y00", "y02", "y04", "y06", "occattend", "regattend", "highinc", "unem10", "educ", "teens"]])
model = sm.OLS(happiness.vhappy, X, missing="drop").fit(cov_type="HC3")
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,vhappy,R-squared:,0.028
Model:,OLS,Adj. R-squared:,0.027
Method:,Least Squares,F-statistic:,27.6
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,1.24e-62
Time:,01:25:40,Log-Likelihood:,-6964.8
No. Observations:,11070,AIC:,13960.0
Df Residuals:,11057,BIC:,14050.0
Df Model:,12,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.1924,0.022,8.662,0.000,0.149,0.236
y96,0.0108,0.015,0.732,0.464,-0.018,0.040
y98,0.0189,0.015,1.275,0.202,-0.010,0.048
y00,0.0271,0.015,1.792,0.073,-0.003,0.057
y02,-0.0198,0.018,-1.095,0.273,-0.055,0.016
y04,-0.0032,0.018,-0.171,0.864,-0.039,0.033
y06,-0.0061,0.014,-0.419,0.675,-0.034,0.022
occattend,-0.0078,0.010,-0.797,0.425,-0.027,0.011
regattend,0.1071,0.014,7.670,0.000,0.080,0.135

0,1,2,3
Omnibus:,11936.767,Durbin-Watson:,1.945
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1889.694
Skew:,0.81,Prob(JB):,0.0
Kurtosis:,1.786,Cond. No.,97.0


In [66]:
X = sm.add_constant(happiness[["y96", "y98", "y00", "y02", "y04", "y06", "occattend", "regattend", "highinc", "unem10", "educ", "teens", "black", "female", "blackfemale"]])
model = sm.OLS(happiness.vhappy, X, missing="drop").fit(cov_type="HC3")
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,vhappy,R-squared:,0.03
Model:,OLS,Adj. R-squared:,0.028
Method:,Least Squares,F-statistic:,23.74
Date:,"Fri, 13 Aug 2021",Prob (F-statistic):,1.54e-65
Time:,01:25:40,Log-Likelihood:,-6955.4
No. Observations:,11070,AIC:,13940.0
Df Residuals:,11054,BIC:,14060.0
Df Model:,15,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.2037,0.023,8.773,0.000,0.158,0.249
y96,0.0115,0.015,0.780,0.435,-0.017,0.040
y98,0.0201,0.015,1.365,0.172,-0.009,0.049
y00,0.0283,0.015,1.876,0.061,-0.001,0.058
y02,-0.0182,0.018,-1.004,0.315,-0.054,0.017
y04,-0.0029,0.018,-0.156,0.876,-0.039,0.033
y06,-0.0047,0.014,-0.323,0.747,-0.033,0.024
occattend,-0.0043,0.010,-0.435,0.663,-0.023,0.015
regattend,0.1115,0.014,7.941,0.000,0.084,0.139

0,1,2,3
Omnibus:,11733.675,Durbin-Watson:,1.946
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1878.403
Skew:,0.807,Prob(JB):,0.0
Kurtosis:,1.788,Cond. No.,103.0


In [67]:
model.f_test("black = female = blackfemale = 0")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[7.21967945]]), p=7.754963310751361e-05, df_denom=1.11e+04, df_num=3>

C15.i 2006 has the largest number of observations (2986), 2004 has the smallest (1337). 5260 observations are listed as very happy which is about 31% of the sample.

C15.ii The p-value for the F-test is about 0.8. We do not find any evidence that the proportion of very happy people has changed over time.

C15.iii The proportion of very happy people appears to increase among regular churchgoers due to its positive and significant coefficient (but not occasional churchgoers. Negative coefficient but not significant). The base is non-churchgoers.

C15.iv After adding the other factors the coefficient for regular churchgoing is slightly smaller but still statistically significant.

C15.v All of the new variables but teens are significant. High income and education are positive, which follows intuitions (cliches about money not buying happiness notwithstanding). The negative coefficient for unemployment also has an intuitive interpretation  and is significant. The teens coefficient is negative but not significant. The negative value doesn't have a particularly intuitive interpretation, though it may possibly have a correlation with poverty (larger families generally having lower income and the income variable only accounting for the highest income level).

C15.vi The coefficient for black is negative an right on the border of significance (using HC3 errors). The female coefficient is small and positive but not statistically significant, and the interaction between race and gender is not significant. A test for joint significance among these variables rejects the null and so we should conclude that there is a difference (at the very least by race).