# Chapter 9

In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

In [2]:
from statsmodels.stats.diagnostic import linear_reset

# Exercise 1
# Load the CEO salary data and flag firms whose return on sales is negative.
ceosal1 = pd.read_stata("stata/CEOSAL1.DTA")
ceosal1["rosneg"] = ceosal1["ros"].lt(0).astype("int32")

# Regress log salary on log sales, return on equity and the rosneg dummy.
regressors = ["lsales", "roe", "rosneg"]
y = ceosal1["lsalary"]
X = sm.add_constant(ceosal1[regressors])
model = sm.OLS(y, X).fit()

# RESET test on the fitted model, using the library's default settings.
reset_result = linear_reset(model)
print(reset_result)

<Wald test (chi2): statistic=[[2.66707016]], p-value=0.26354396347186887, df_denom=2>


In [3]:
# Repeat the RESET test on the same fitted model with an HC3 covariance.
# The call stays the last expression so the notebook displays its result.
robust_cov = "HC3"
linear_reset(model, cov_type=robust_cov)

<class 'statsmodels.stats.contrast.ContrastResults'>
<Wald test (chi2): statistic=[[2.99859337]], p-value=0.22328714637257752, df_denom=2>

C1.i The p-value for the RESET test is about 0.26 which could not be considered evidence of functional form misspecification.

C1.ii When considering a heteroskedasticity-robust RESET the p-value falls to about 0.22, but this is far from any value we would consider evidence of misspecification.

In [4]:
# Exercise 2
wage2 = pd.read_stata("stata/WAGE2.DTA")

# Interactions of education with IQ and with KWW.
# Only educ_kww enters the regression fitted in this cell.
wage2["educ_iq"] = wage2["educ"] * wage2["IQ"]
wage2["educ_kww"] = wage2["educ"] * wage2["KWW"]

# Log wage on the controls, KWW and the educ x KWW interaction.
regressors = [
    "educ", "exper", "tenure", "married", "south", "urban", "black",
    "KWW", "educ_kww",
]
y = wage2["lwage"]
X = sm.add_constant(wage2[regressors])
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.259
Model:                            OLS   Adj. R-squared:                  0.252
Method:                 Least Squares   F-statistic:                     35.97
Date:                Sun, 31 May 2020   Prob (F-statistic):           8.91e-55
Time:                        23:56:54   Log-Likelihood:                -377.34
No. Observations:                 935   AIC:                             774.7
Df Residuals:                     925   BIC:                             823.1
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.3557      0.114     47.113      0.0

In [5]:
# Product of the two scores, and that product interacted with education.
wage2["combined"] = wage2["IQ"] * wage2["KWW"]
wage2["educ_combined"] = wage2["combined"] * wage2["educ"]

# Same dependent variable (y) as the previous cell; IQ and KWW both enter.
regressors = [
    "educ", "exper", "tenure", "married", "south", "urban", "black",
    "IQ", "KWW", "educ_combined",
]
X = sm.add_constant(wage2[regressors])
model = sm.OLS(y, X).fit()
print(model.summary())

# Joint test that the IQ and KWW coefficients are both zero.
joint_test = model.f_test("(IQ = 0), (KWW = 0)")
print(joint_test)

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.266
Model:                            OLS   Adj. R-squared:                  0.258
Method:                 Least Squares   F-statistic:                     33.53
Date:                Sun, 31 May 2020   Prob (F-statistic):           7.48e-56
Time:                        23:56:54   Log-Likelihood:                -372.88
No. Observations:                 935   AIC:                             767.8
Df Residuals:                     924   BIC:                             821.0
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             5.1756      0.128     40.486

C2.i The estimated return to education when using KWW as a proxy for ability is about 5.8%

C2.ii The return to education falls to about 4.9%

C2.iii Both IQ and KWW are significant at the 5% level (in fact, IQ is significant at the 1% level) individually. The F-test provides strong evidence that they are jointly significant.

In [6]:
# Exercise 3
jtrain = pd.read_stata("stata/JTRAIN.DTA")

# Keep the 1988 rows with both lscrap and grant observed.
is_1988 = jtrain["year"] == 1988
jtrain_1988 = jtrain.loc[is_1988, ["lscrap", "grant"]].dropna()
y = jtrain_1988["lscrap"]
X = sm.add_constant(jtrain_1988["grant"])

# Simple regression of lscrap on grant, reported first with the default
# standard errors and then with HC3 robust standard errors.
model = sm.OLS(y, X).fit()
for fitted in (model, model.get_robustcov_results(cov_type="HC3")):
    print(fitted.summary())

                            OLS Regression Results                            
Dep. Variable:                 lscrap   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.019
Method:                 Least Squares   F-statistic:                   0.01948
Date:                Sun, 31 May 2020   Prob (F-statistic):              0.890
Time:                        23:56:55   Log-Likelihood:                -94.660
No. Observations:                  54   AIC:                             193.3
Df Residuals:                      52   BIC:                             197.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.4085      0.241      1.698      0.0

In [7]:
# Lagged log scrap rate. The lag is taken within each firm, after ordering by
# year, so every row receives that firm's previous observation regardless of
# how the file happens to be sorted. A plain shift(1) over the whole frame is
# only correct when rows are already ordered by firm and then year.
# NOTE(review): assumes the firm identifier column is named "fcode" — confirm.
jtrain["lscrap_lag"] = (
    jtrain.sort_values(["fcode", "year"])
    .groupby("fcode")["lscrap"]
    .shift(1)
)
jtrain_1988 = jtrain.loc[jtrain["year"] == 1988, ["lscrap", "lscrap_lag", "grant"]].dropna()
y = jtrain_1988.lscrap
X = sm.add_constant(jtrain_1988[["grant", "lscrap_lag"]])

model = sm.OLS(y, X).fit()
model_hc3 = model.get_robustcov_results(cov_type="HC3")

# Locate grant by column name instead of a hard-coded position. Going through
# np.asarray works whether pvalues is a labelled Series or a plain array, and
# avoids positional integer indexing on a label-indexed Series (deprecated in
# recent pandas).
grant_pos = X.columns.get_loc("grant")
for fitted in (model, model_hc3):
    print(fitted.summary())
    # Halving the two-sided p-value gives the one-sided p-value only when the
    # estimate has the sign of the alternative (negative for grant here).
    print("One sided p-value for grant is: ", (np.asarray(fitted.pvalues)[grant_pos] / 2))

                            OLS Regression Results                            
Dep. Variable:                 lscrap   R-squared:                       0.873
Model:                            OLS   Adj. R-squared:                  0.868
Method:                 Least Squares   F-statistic:                     174.9
Date:                Sun, 31 May 2020   Prob (F-statistic):           1.47e-23
Time:                        23:56:55   Log-Likelihood:                -39.000
No. Observations:                  54   AIC:                             84.00
Df Residuals:                      51   BIC:                             89.97
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0212      0.089      0.238      0.8

In [8]:
# Test H0: coefficient on lscrap_lag equals 1, first with the default
# standard errors and then with the HC3 robust ones.
for fitted in (model, model_hc3):
    print(fitted.t_test("lscrap_lag = 1"))

                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0             0.8312      0.044     -3.799      0.000       0.742       0.920
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0             0.8312      0.088     -1.914      0.061       0.654       1.008


C3.i Quite a lot depends on how grants are assigned. It would be fair to say that grants can be assigned to over or under performing firms, depending on who is administering the program. In either case the factors leading to higher or lower productivity are unobserved but would be correlated with grant.

C3.ii The simple regression does not provide evidence that receiving a job training grant significantly lowers a firm's scrap rate. With a positive coefficient, a significant result would mean the grant increased the scrap rate!

C3.iii Adding the lag turns the coefficient for grant negative. It is statistically significant at the 5% level with a p-value of 0.045 against the one sided test $\beta_{grant} < 0$.

C3.iv The t-test is listed above and the p-value is incredibly small (0.000).

C3.v Robust results are reported below the standard results. The results from part (ii) do not substantively change. While the question did not ask to check part (iii), grant is now only significant at the 10% level when tested against the one-sided alternative. For the test in part (iv), the robust standard error is quite a bit larger (0.088 versus 0.044), so the hypothesis that the coefficient on the lag equals 1 is now rejected only at the 10% level (p-value 0.061).

In [9]:
# Exercise 4
infmrt = pd.read_stata("stata/infmrt.dta")

# Restrict to the 1990 rows.
infmrt_1990 = infmrt.loc[infmrt["year"] == 1990]

# Infant mortality on lpcinc, lphysic, lpopul and the DC dummy.
regressors = ["lpcinc", "lphysic", "lpopul", "DC"]
y = infmrt_1990["infmort"]
X = sm.add_constant(infmrt_1990[regressors])

model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                infmort   R-squared:                       0.691
Model:                            OLS   Adj. R-squared:                  0.664
Method:                 Least Squares   F-statistic:                     25.71
Date:                Sun, 31 May 2020   Prob (F-statistic):           3.15e-11
Time:                        23:56:55   Log-Likelihood:                -80.968
No. Observations:                  51   AIC:                             171.9
Df Residuals:                      46   BIC:                             181.6
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         23.9548     12.419      1.929      0.0

C4.i The coefficient indicates that once income, number of physicians, and population are accounted for, DC suffers 16 more infant deaths per 1000 live births. This is a large and significant effect.

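C4.ii The coefficients are identical to 9.44 (on page 331). Including the dummy accounts for the distortion created by the outlier: a dummy that equals one for a single observation has the same effect on the other coefficients as dropping that observation. This equivalence holds only because the dummy picks out a single observation.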

In [10]:
# Exercise 5
rdchem = pd.read_stata("stata/RDCHEM.DTA")

# Divide sales by 1000 and rebuild the squared term on the rescaled values.
rdchem["sales"] = rdchem["sales"].div(1000)
rdchem["salessq"] = np.power(rdchem["sales"], 2)

# Restricted sample: rows whose rescaled sales are below 20.
rdchem_limit = rdchem.loc[rdchem["sales"] < 20]

# Fit the same OLS specification on the full sample, then on the restricted
# sample. After the loop y, X and model refer to the restricted-sample fit.
regressors = ["sales", "salessq", "profmarg"]
for sample in (rdchem, rdchem_limit):
    y = sample["rdintens"]
    X = sm.add_constant(sample[regressors])

    model = sm.OLS(y, X).fit()
    print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               rdintens   R-squared:                       0.190
Model:                            OLS   Adj. R-squared:                  0.104
Method:                 Least Squares   F-statistic:                     2.196
Date:                Sun, 31 May 2020   Prob (F-statistic):              0.111
Time:                        23:56:55   Log-Likelihood:                -61.610
No. Observations:                  32   AIC:                             131.2
Df Residuals:                      28   BIC:                             137.1
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.0590      0.626      3.288      0.0

In [11]:
# Median (q = 0.5) regression with the same specification, first on the full
# sample and then on the restricted sample. After the loop y, X and model
# refer to the restricted-sample fit.
regressors = ["sales", "salessq", "profmarg"]
for sample in (rdchem, rdchem_limit):
    y = sample["rdintens"]
    X = sm.add_constant(sample[regressors])
    model = sm.QuantReg(y, X).fit(q=.5)
    print(model.summary())

                         QuantReg Regression Results                          
Dep. Variable:               rdintens   Pseudo R-squared:              0.09743
Model:                       QuantReg   Bandwidth:                       2.034
Method:                 Least Squares   Sparsity:                        4.266
Date:                Sun, 31 May 2020   No. Observations:                   32
Time:                        23:56:55   Df Residuals:                       28
                                        Df Model:                            3
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.4043      0.753      1.865      0.073      -0.138       2.947
sales          0.2635      0.167      1.578      0.126      -0.079       0.605
salessq       -0.0060      0.004     -1.343      0.190      -0.015       0.003
profmarg       0.1140      0.053      2.144      0.0

C5.i The largest changes are in sales, with sales changing by 0.05 and the quadratic term falling a further 0.003. More notably, sales was significant at the 5% level and the quadratic was significant at the 10% level while removing the outlier removes all significance.

C5.ii Removing the outlier flips the sign of the sales coefficients! Standard errors grow, but none of the results were significant in the first place.

C5.iii The text introduced LAD as being resilient to outliers. I am not at all confident in a method that flips signs depending on the inclusion of a single observation, but one expectation was that we would have a large sample, and we are falling from 32 to 31 observations. Based on these results, I would say OLS is more resilient to outliers.

In [12]:
# Exercise 6
meap93 = pd.read_stata("stata/MEAP93.DTA")

# Keep rows with bensal above 0.01 and report how many rows that removes.
meap93_limit = meap93.loc[meap93["bensal"] > 0.01]
dropped_rows = len(meap93) - len(meap93_limit)
print(dropped_rows)

4


In [13]:
# Log salary regression on the restricted sample.
regressors = ["bensal", "lenroll", "lstaff", "droprate", "gradrate"]
y = meap93_limit["lsalary"]
X = sm.add_constant(meap93_limit[regressors])

model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                lsalary   R-squared:                       0.354
Model:                            OLS   Adj. R-squared:                  0.346
Method:                 Least Squares   F-statistic:                     43.55
Date:                Sun, 31 May 2020   Prob (F-statistic):           8.75e-36
Time:                        23:56:55   Log-Likelihood:                 272.94
No. Observations:                 404   AIC:                            -533.9
Df Residuals:                     398   BIC:                            -509.9
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         10.7053      0.260     41.119      0.0

C6.i Four of the observations have a benefits/salary ratio of less than 0.01

C6.ii The magnitude of bensal (b/s in the text) is quite a bit smaller than in Example 4.10 (page 156), shrinking from -0.589 to -0.421. The standard error has also grown, though the result is still significant at the 5% level. The remaining coefficients are consistent.

In [14]:
# Exercise 7
loanapp = pd.read_stata("stata/loanapp.dta")

# Number of rows with obrat above 40.
high_obrat = loanapp.loc[loanapp["obrat"] > 40]
print(len(high_obrat))

205


In [15]:
# Regressors for the approval model, listed once and reused below.
loan_regressors = [
    "white", "hrat", "obrat", "loanprc", "unem", "male", "married", "dep",
    "sch", "cosign", "chist", "pubrec", "mortlat1", "mortlat2", "vr",
]

# Rows with obrat of at most 40, complete on approve and every regressor.
within_limit = loanapp["obrat"] <= 40
loanapp_reg = loanapp.loc[within_limit, ["approve"] + loan_regressors].dropna()

y = loanapp_reg["approve"]
X = sm.add_constant(loanapp_reg[loan_regressors])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                approve   R-squared:                       0.129
Model:                            OLS   Adj. R-squared:                  0.122
Method:                 Least Squares   F-statistic:                     17.35
Date:                Sun, 31 May 2020   Prob (F-statistic):           2.63e-43
Time:                        23:56:55   Log-Likelihood:                -249.57
No. Observations:                1768   AIC:                             531.1
Df Residuals:                    1752   BIC:                             618.8
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7872      0.055     14.377      0.0

C7.i 205 observations have other debt obligations in excess of 40% of their income

C7.ii The coefficient for white is consistent, moving from 0.1288 to 0.1286. The t statistic shrinks from 6.529 to 6.508. These are not large changes.

C7.iii It does not appear that the estimate for $\beta_{white}$ is overly sensitive to the sample used (at least with regards to the debt obligations)

In [16]:
# Exercise 8
twoyear = pd.read_stata("stata/twoyear.dta")

# Sample mean and standard deviation of stotal.
stotal_mean = twoyear["stotal"].mean()
stotal_std = twoyear["stotal"].std()
print("The sample mean of stotal is:", stotal_mean)
print("The standard deviation is:", stotal_std)

The sample mean of stotal is: 0.04748285189270973
The standard deviation is: 0.8535629510879517


In [17]:
# Simple regressions of stotal on jc and then on univ, one at a time.
# After the loop X and model refer to the univ regression.
y = twoyear["stotal"]
for schooling in ("jc", "univ"):
    X = sm.add_constant(twoyear[schooling])
    model = sm.OLS(y, X).fit()
    print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 stotal   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.032
Date:                Sun, 31 May 2020   Prob (F-statistic):              0.310
Time:                        23:56:55   Log-Likelihood:                -8524.3
No. Observations:                6763   AIC:                         1.705e+04
Df Residuals:                    6761   BIC:                         1.707e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0429      0.011      3.781      0.0

In [18]:
# Log wage on jc, univ, exper and stotal.
regressors = ["jc", "univ", "exper", "stotal"]
y = twoyear["lwage"]
X = sm.add_constant(twoyear[regressors])

model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.228
Model:                            OLS   Adj. R-squared:                  0.228
Method:                 Least Squares   F-statistic:                     500.2
Date:                Sun, 31 May 2020   Prob (F-statistic):               0.00
Time:                        23:56:55   Log-Likelihood:                -3862.5
No. Observations:                6763   AIC:                             7735.
Df Residuals:                    6758   BIC:                             7769.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.4953      0.021     70.473      0.0

In [19]:
# Add the square of stotal to the wage equation (y carries over from the
# previous cell).
twoyear["stotal_sq"] = np.power(twoyear["stotal"], 2)

regressors = ["jc", "univ", "exper", "stotal", "stotal_sq"]
X = sm.add_constant(twoyear[regressors])

model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.228
Model:                            OLS   Adj. R-squared:                  0.228
Method:                 Least Squares   F-statistic:                     400.2
Date:                Sun, 31 May 2020   Prob (F-statistic):               0.00
Time:                        23:56:56   Log-Likelihood:                -3862.4
No. Observations:                6763   AIC:                             7737.
Df Residuals:                    6757   BIC:                             7778.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.4940      0.021     69.635      0.0

In [20]:
# Interact stotal with each schooling variable (y carries over from the
# earlier wage cell).
for schooling in ("jc", "univ"):
    twoyear["stotal_" + schooling] = twoyear["stotal"] * twoyear[schooling]

regressors = ["jc", "univ", "exper", "stotal", "stotal_jc", "stotal_univ"]
X = sm.add_constant(twoyear[regressors])

model = sm.OLS(y, X).fit()
print(model.summary())

# Joint test that both interaction coefficients are zero.
joint_test = model.f_test("(stotal_jc = 0, stotal_univ = 0)")
print(joint_test)

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.229
Model:                            OLS   Adj. R-squared:                  0.228
Method:                 Least Squares   F-statistic:                     334.2
Date:                Sun, 31 May 2020   Prob (F-statistic):               0.00
Time:                        23:56:56   Log-Likelihood:                -3860.5
No. Observations:                6763   AIC:                             7735.
Df Residuals:                    6756   BIC:                             7783.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           1.4959      0.021     70.419      

C8.i The mean is 0.047 and the standard deviation is 0.854.

C8.ii Only univ is statistically related to stotal. We know this because of the significant relationship between univ and stotal (while the coefficient on jc is small and not significant).

C8.iii The effect of univ is smaller and more precise.

C8.iv Adding $stotal^2$ produces a small coefficient that is not significantly different from 0.

C8.v The interaction terms are not jointly significant even at the 10% level.

C8.vi The model from part iii should be enough. We did not find any evidence that any of the alternatives explored in iv or v added anything to the analysis.

In [21]:
# Exercise 9

k401subs = pd.read_stata("stata/401ksubs.dta")

# nettfa on income, age (each with its square), male and e401k.
regressors = ["inc", "incsq", "age", "agesq", "male", "e401k"]
y = k401subs["nettfa"]
X = sm.add_constant(k401subs[regressors])

model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 nettfa   R-squared:                       0.202
Model:                            OLS   Adj. R-squared:                  0.202
Method:                 Least Squares   F-statistic:                     391.6
Date:                Sun, 31 May 2020   Prob (F-statistic):               0.00
Time:                        23:56:56   Log-Likelihood:                -50681.
No. Observations:                9275   AIC:                         1.014e+05
Df Residuals:                    9268   BIC:                         1.014e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         21.1978      9.992      2.121      0.0

In [22]:
import statsmodels.stats.diagnostic as smd

# Breusch-Pagan test using the OLS residuals and the design matrix X from
# the previous cell; the returned tuple is printed as-is.
bp_result = smd.het_breuschpagan(model.resid, X)
print(bp_result)

(346.6439627872171, 8.107902005744488e-72, 59.97177669516518, 3.338601184163747e-73)


In [23]:
# Median (q = 0.5) regression on the same y and X as the OLS fit above.
median_regression = sm.QuantReg(y, X)
model = median_regression.fit(q=.5)
print(model.summary())

                         QuantReg Regression Results                          
Dep. Variable:                 nettfa   Pseudo R-squared:               0.1088
Model:                       QuantReg   Bandwidth:                       2.514
Method:                 Least Squares   Sparsity:                        16.54
Date:                Sun, 31 May 2020   No. Observations:                 9275
Time:                        23:56:56   Df Residuals:                     9268
                                        Df Model:                            6
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         12.4912      1.446      8.637      0.000       9.656      15.326
inc           -0.2616      0.011    -24.221      0.000      -0.283      -0.240
incsq          0.0071    8.5e-05     83.381      0.000       0.007       0.007
age           -0.7227      0.070    -10.327      0.0

C9.i e401k indicates that on average people who are eligible for a 401k have \$9,713 higher financial assets.

C9.ii There is strong evidence of heteroskedasticity. The p-value is near zero (F statistic of 59.97)

C9.iii LAD produces a noticeably smaller coefficient for e401k. This would mean the median individual eligible for a 401k has about \$3,737 more in financial assets.

C9.iv The difference between the two marks the difference between mean and median. Given that we are considering financial assets it makes sense that the mean would be higher than the median.