## Chapter 7

In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

In [2]:
# Exercise 1
gpa1 = pd.read_stata("stata/GPA1.DTA")

y = gpa1.colGPA
X = sm.add_constant(gpa1[["PC", "hsGPA", "ACT", "mothcoll", "fathcoll"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 colGPA   R-squared:                       0.222
Model:                            OLS   Adj. R-squared:                  0.193
Method:                 Least Squares   F-statistic:                     7.713
Date:                Mon, 25 May 2020   Prob (F-statistic):           2.08e-06
Time:                        18:38:37   Log-Likelihood:                -42.541
No. Observations:                 141   AIC:                             97.08
Df Residuals:                     135   BIC:                             114.8
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.2556      0.335      3.744      0.0

In [3]:
model.f_test("(mothcoll = 0), (fathcoll = 0)")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[0.24455167]]), p=0.783401666560902, df_denom=135, df_num=2>

In [4]:
gpa1["hsGPAsq"] = gpa1.hsGPA ** 2

X = sm.add_constant(gpa1[["PC", "hsGPA", "hsGPAsq", "ACT", "mothcoll", "fathcoll"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 colGPA   R-squared:                       0.236
Model:                            OLS   Adj. R-squared:                  0.202
Method:                 Least Squares   F-statistic:                     6.904
Date:                Mon, 25 May 2020   Prob (F-statistic):           2.09e-06
Time:                        18:38:37   Log-Likelihood:                -41.266
No. Observations:                 141   AIC:                             96.53
Df Residuals:                     134   BIC:                             117.2
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.0403      2.443      2.063      0.0

In [5]:
model.f_test("(hsGPA = 0), (hsGPAsq = 0)")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[12.74698384]]), p=8.553737864007102e-06, df_denom=134, df_num=2>

C1.i PC decreases slightly and is still statistically significant

C1.ii The two variables are not jointly significant with a p-value of 0.78

C1.iii The quadratic term is not itself significant (though the two are jointly significance) and the variation is not particularly better explained with it.

In [6]:
# Exercise 2
wage2 = pd.read_stata("stata/WAGE2.DTA")

y = wage2.lwage
X = sm.add_constant(wage2[["educ", "exper", "tenure", "married", "black", "south", "urban"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.253
Model:                            OLS   Adj. R-squared:                  0.247
Method:                 Least Squares   F-statistic:                     44.75
Date:                Mon, 25 May 2020   Prob (F-statistic):           1.16e-54
Time:                        18:38:37   Log-Likelihood:                -381.55
No. Observations:                 935   AIC:                             779.1
Df Residuals:                     927   BIC:                             817.8
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.3955      0.113     47.653      0.0

In [7]:
wage2["expersq"] = wage2.exper.astype("int32") ** 2
wage2["tenuresq"] = wage2.tenure.astype("int32") ** 2

X = sm.add_constant(wage2[["educ", "exper", "expersq", "tenure", "tenuresq", "married", "black", "south", "urban"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.255
Model:                            OLS   Adj. R-squared:                  0.248
Method:                 Least Squares   F-statistic:                     35.17
Date:                Mon, 25 May 2020   Prob (F-statistic):           1.22e-53
Time:                        18:38:37   Log-Likelihood:                -380.05
No. Observations:                 935   AIC:                             780.1
Df Residuals:                     925   BIC:                             828.5
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.3587      0.126     42.558      0.0

In [8]:
model.f_test("(expersq = 0), (tenuresq = 0)")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[1.4898061]]), p=0.22595672984182205, df_denom=925, df_num=2>

In [9]:
wage2["blackeduc"] = wage2.black * wage2.educ

X = sm.add_constant(wage2[["educ", "blackeduc", "exper", "expersq", "tenure", "tenuresq", "married", "black", "south", "urban"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.256
Model:                            OLS   Adj. R-squared:                  0.248
Method:                 Least Squares   F-statistic:                     31.78
Date:                Mon, 25 May 2020   Prob (F-statistic):           4.22e-53
Time:                        18:38:37   Log-Likelihood:                -379.45
No. Observations:                 935   AIC:                             780.9
Df Residuals:                     924   BIC:                             834.2
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.3377      0.127     41.902      0.0

In [10]:
wage2["marrblack"] = wage2.married * wage2.black

X = sm.add_constant(wage2[["educ", "exper", "tenure", "married", "black", "marrblack", "south", "urban"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.253
Model:                            OLS   Adj. R-squared:                  0.246
Method:                 Least Squares   F-statistic:                     39.17
Date:                Mon, 25 May 2020   Prob (F-statistic):           6.78e-54
Time:                        18:38:37   Log-Likelihood:                -381.37
No. Observations:                 935   AIC:                             780.7
Df Residuals:                     926   BIC:                             824.3
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.4038      0.114     47.351      0.0

In [11]:
model.params[5] + model.params[6]

-0.17946627597397202

C2.i The approximate difference between blacks and nonblacks is about 18.8% and is statistically significant

C2.ii The quadratic terms are not jointly significant at the 20% level

C2.iii Extended but not significant

C2.iv The estimated difference between married blacks and married nonblacks is about 18%

In [12]:
# Exercise 3
mlb1 = pd.read_stata("stata/MLB1.DTA")

y = mlb1.lsalary
X = sm.add_constant(mlb1[["years", "gamesyr", "bavg", "hrunsyr", "rbisyr", "runsyr", "fldperc", "allstar", "frstbase", "scndbase", "thrdbase", "shrtstop", "catcher"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                lsalary   R-squared:                       0.654
Model:                            OLS   Adj. R-squared:                  0.640
Method:                 Least Squares   F-statistic:                     49.19
Date:                Mon, 25 May 2020   Prob (F-statistic):           6.45e-70
Time:                        18:38:38   Log-Likelihood:                -372.46
No. Observations:                 353   AIC:                             772.9
Df Residuals:                     339   BIC:                             827.1
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         11.1296      2.304      4.830      0.0

In [13]:
model.f_test("(frstbase = 0), (scndbase = 0), (thrdbase = 0), (shrtstop = 0), (catcher = 0)")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[1.77743354]]), p=0.11682998340619842, df_denom=339, df_num=5>

C3.i $H_{catch} = 0$ vs $H_{catch} \ne 0$. The null hypothesis is only rejected at the 10% level (if true, catchers make 25% more)

C3.ii The test that all positions make the same is not even rejected at the 10% level.

C3.iii The results appear to be consistent

In [14]:
# Exercise 4
gpa2 = pd.read_excel("excel/gpa2.xls", 
                     names = ["sat", "tothrs", "colgpa", "athlete", "verbmath", "hsize", "hsrank", "hsperc", "female", "white", "black", "hsizesq"],
                     sep = " ", header = None)

y = gpa2.colgpa
X = sm.add_constant(gpa2[["hsize", "hsizesq", "hsperc", "sat", "female", "athlete"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 colgpa   R-squared:                       0.293
Model:                            OLS   Adj. R-squared:                  0.291
Method:                 Least Squares   F-statistic:                     284.6
Date:                Mon, 25 May 2020   Prob (F-statistic):          8.63e-306
Time:                        18:38:38   Log-Likelihood:                -3426.3
No. Observations:                4137   AIC:                             6867.
Df Residuals:                    4130   BIC:                             6911.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.2414      0.079     15.616      0.0

In [15]:
X = sm.add_constant(gpa2[["hsize", "hsizesq", "hsperc", "female", "athlete"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 colgpa   R-squared:                       0.189
Model:                            OLS   Adj. R-squared:                  0.188
Method:                 Least Squares   F-statistic:                     191.9
Date:                Mon, 25 May 2020   Prob (F-statistic):          2.47e-184
Time:                        18:38:38   Log-Likelihood:                -3710.0
No. Observations:                4137   AIC:                             7432.
Df Residuals:                    4131   BIC:                             7470.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.0477      0.033     92.594      0.0

In [16]:
print(gpa2.sat[gpa2.athlete == 1].mean())
print(gpa2.sat[gpa2.athlete == 0].mean())

914.020618556701
1036.0537661678925


In [17]:
gpa2["femaleathlete"] = gpa2.athlete * gpa2.female
gpa2["male"] = 1 - gpa2.female
gpa2["maleathlete"] = gpa2.male * gpa2.athlete

X = sm.add_constant(gpa2[["hsize", "hsizesq", "hsperc", "sat", "male", "maleathlete", "femaleathlete"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 colgpa   R-squared:                       0.293
Model:                            OLS   Adj. R-squared:                  0.291
Method:                 Least Squares   F-statistic:                     243.9
Date:                Mon, 25 May 2020   Prob (F-statistic):          1.51e-304
Time:                        18:38:38   Log-Likelihood:                -3426.3
No. Observations:                4137   AIC:                             6869.
Df Residuals:                    4129   BIC:                             6919.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             1.3962      0.076     18.478

In [18]:
gpa2["femalesat"] = gpa2.sat * gpa2.female

X = sm.add_constant(gpa2[["hsize", "hsizesq", "hsperc", "sat", "male", "maleathlete", "femaleathlete", "femalesat"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 colgpa   R-squared:                       0.293
Model:                            OLS   Adj. R-squared:                  0.291
Method:                 Least Squares   F-statistic:                     213.4
Date:                Mon, 25 May 2020   Prob (F-statistic):          2.25e-303
Time:                        18:38:39   Log-Likelihood:                -3426.2
No. Observations:                4137   AIC:                             6870.
Df Residuals:                    4128   BIC:                             6927.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             1.3643      0.108     12.636

C4.ii 0.17 points on GPA. This is one of the largest effects and is statistically significant

C4.iii The effect is much smaller and no longer statistically significant. Athletes appear to have lower SAT scores than non-athletes and so removing this control makes it harder to identify the effects of being an athlete

C4.iv The base case is female non-athletes. Given that the coefficient for female athletes is significant, we 

C4.v We do not finde evidence to suggest that SAT scores are different by gender

In [19]:
# Exercise 5
ceosal1 = pd.read_stata("stata/CEOSAL1.DTA")
ceosal1["rosneg"] = (ceosal1.ros < 0).astype('int32')

y = ceosal1.lsalary
X = sm.add_constant(ceosal1[["lsales", "roe", "rosneg"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                lsalary   R-squared:                       0.297
Model:                            OLS   Adj. R-squared:                  0.286
Method:                 Least Squares   F-statistic:                     28.81
Date:                Mon, 25 May 2020   Prob (F-statistic):           1.37e-15
Time:                        18:38:39   Log-Likelihood:                -140.47
No. Observations:                 209   AIC:                             288.9
Df Residuals:                     205   BIC:                             302.3
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.2976      0.293     14.655      0.0

C5.i Constant shifts with the value of PC. NoPC becomes negative.

C5.ii R-squared is the same

C5.iii Both should not due to collinearity

In [20]:
# Exercise 6
sleep75 = pd.read_stata("stata/SLEEP75.DTA")
sleep75_men = sleep75[sleep75.male == 1]
sleep75_women = sleep75[sleep75.male == 0]

y = sleep75.sleep
X = sm.add_constant(sleep75[["totwrk", "educ", "age", "agesq", "yngkid"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)
ssrp = model.ssr
n = model.nobs
k = 5

y = sleep75_men.sleep
X = sm.add_constant(sleep75_men[["totwrk", "educ", "age", "agesq", "yngkid"]])
model_m = sm.OLS(y, X).fit()
model_summary_m = model_m.summary()
print(model_summary_m)
ssr1 = model_m.ssr

y = sleep75_women.sleep
X = sm.add_constant(sleep75_women[["totwrk", "educ", "age", "agesq", "yngkid"]])
model_f = sm.OLS(y, X).fit()
model_summary_f = model_f.summary()
print(model_summary_f)
ssr2 = model_f.ssr

chow_statistic = ((ssrp - (ssr1 + ssr2)) / (ssr1 + ssr2)) * ((n - (2*(k + 1))) / (k+1))
print(chow_statistic)

y = sleep75.sleep
X = sm.add_constant(sleep75[["male", "totwrk", "educ", "age", "agesq", "yngkid"]])
model = sm.OLS(y, X).fit()
ssrp = model.ssr
chow_statistic = ((ssrp - (ssr1 + ssr2)) / (ssr1 + ssr2)) * ((n - (2*(k + 1))) / k)

print(chow_statistic)

                            OLS Regression Results                            
Dep. Variable:                  sleep   R-squared:                       0.115
Model:                            OLS   Adj. R-squared:                  0.108
Method:                 Least Squares   F-statistic:                     18.14
Date:                Mon, 25 May 2020   Prob (F-statistic):           6.00e-17
Time:                        18:38:39   Log-Likelihood:                -5262.6
No. Observations:                 706   AIC:                         1.054e+04
Df Residuals:                     700   BIC:                         1.056e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3825.3749    240.258     15.922      0.0

In [21]:
sleep75["totwrkmale"] = sleep75.totwrk * sleep75.male
sleep75["educmale"] = sleep75.educ * sleep75.male
sleep75["agemale"] = sleep75.age * sleep75.male
sleep75["agesqmale"] = sleep75.agesq * sleep75.male
sleep75["yngkidmale"] = sleep75.yngkid * sleep75.male

y = sleep75.sleep
X = sm.add_constant(sleep75[["male", "totwrk", "totwrkmale", "educ", "educmale", "age", "agemale", "agesq", "agesqmale", "yngkid", "yngkidmale"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                  sleep   R-squared:                       0.131
Model:                            OLS   Adj. R-squared:                  0.117
Method:                 Least Squares   F-statistic:                     9.479
Date:                Mon, 25 May 2020   Prob (F-statistic):           4.95e-16
Time:                        18:38:39   Log-Likelihood:                -5256.2
No. Observations:                 706   AIC:                         1.054e+04
Df Residuals:                     694   BIC:                         1.059e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4238.7293    367.852     11.523      0.0

In [22]:
model.f_test("(male = 0), (totwrkmale = 0), (educmale = 0), (agemale = 0), (agesqmale = 0), (yngkidmale = 0)")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[2.11635063]]), p=0.04949224486431429, df_denom=694, df_num=6>

In [23]:
model.f_test("(totwrkmale = 0), (educmale = 0), (agemale = 0), (agesqmale = 0), (yngkidmale = 0)")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[1.25579217]]), p=0.28135192851645163, df_denom=694, df_num=5>

In [24]:
print(ssrp, ssr1, ssr2)

122147777.1993219 63763978.98930618 57288575.940988846


C6.i There appear to be differences between the two but it is difficult to tell because they are not significant

C6.ii We reject the null at the 5% level for the first method (all interaction terms)

C6.iii We fail to reject the null once we allow different intercepts between groups

C6.iv From ii and iii we should use a model which includes a term for male, but not include interactions

In [25]:
# Exercise 7
wage1 = pd.read_stata("stata/WAGE1.DTA")
wage1["educfemale"] = wage1.educ * wage1.female
wage1["experfemale"] = wage1.exper * wage1.female

y = wage1.lwage
X = sm.add_constant(wage1[["female", "educ", "educfemale", "exper", "expersq", "tenure", "tenursq"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.441
Model:                            OLS   Adj. R-squared:                  0.433
Method:                 Least Squares   F-statistic:                     58.37
Date:                Mon, 25 May 2020   Prob (F-statistic):           1.67e-61
Time:                        18:38:39   Log-Likelihood:                -260.49
No. Observations:                 526   AIC:                             537.0
Df Residuals:                     518   BIC:                             571.1
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3888      0.119      3.276      0.0

In [26]:
(model.params[1] + (model.params[2] * 12.5) + (model.params[3] * 12.5)) - (model.params[2] * 12.5)

-0.2963450285759721

In [27]:
wage1["adjustedfemaleeduc"] = (wage1.educ - 12.5) * wage1.female

y = wage1.lwage
X = sm.add_constant(wage1[["female", "educ", "adjustedfemaleeduc", "exper", "expersq", "tenure", "tenursq"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.441
Model:                            OLS   Adj. R-squared:                  0.433
Method:                 Least Squares   F-statistic:                     58.37
Date:                Mon, 25 May 2020   Prob (F-statistic):           1.67e-61
Time:                        18:38:39   Log-Likelihood:                -260.49
No. Observations:                 526   AIC:                             537.0
Df Residuals:                     518   BIC:                             571.1
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                  0.3888      0

C7.i Differential listed above. Women make even less when educated at 12.5

C7.ii The coefficient on female is now interpreted as women with 12.5 years of education

C7.iii Female is now statistically significant. Women make less.

In [28]:
# Exercise 8
loanapp = pd.read_stata("stata/loanapp.dta")

loanapp_reg = loanapp[["approve", "white", "hrat", "obrat", "loanprc", "unem", "male", "married", "dep", "sch", "cosign", "chist", "pubrec", "mortlat1", "mortlat2", "vr"]].dropna()

y = loanapp_reg.approve
X = sm.add_constant(loanapp_reg[["white", "hrat", "obrat", "loanprc", "unem", "male", "married", "dep", "sch", "cosign", "chist", "pubrec", "mortlat1", "mortlat2", "vr"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                approve   R-squared:                       0.166
Model:                            OLS   Adj. R-squared:                  0.159
Method:                 Least Squares   F-statistic:                     25.86
Date:                Mon, 25 May 2020   Prob (F-statistic):           1.84e-66
Time:                        18:38:40   Log-Likelihood:                -429.26
No. Observations:                1971   AIC:                             890.5
Df Residuals:                    1955   BIC:                             979.9
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9367      0.053     17.763      0.0

In [29]:
loanapp_reduced = loanapp[["approve", "white", "hrat", "obrat", "loanprc", "unem", "male", "married", "dep", "sch", "cosign", "chist", "pubrec", "mortlat1", "mortlat2", "vr"]].dropna()

y = loanapp_reduced.approve
X = sm.add_constant(loanapp_reduced[["white", "hrat", "obrat", "loanprc", "unem", "male", "married", "dep", "sch", "cosign", "chist", "pubrec", "mortlat1", "mortlat2", "vr"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                approve   R-squared:                       0.166
Model:                            OLS   Adj. R-squared:                  0.159
Method:                 Least Squares   F-statistic:                     25.86
Date:                Mon, 25 May 2020   Prob (F-statistic):           1.84e-66
Time:                        18:38:40   Log-Likelihood:                -429.26
No. Observations:                1971   AIC:                             890.5
Df Residuals:                    1955   BIC:                             979.9
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9367      0.053     17.763      0.0

In [30]:
loanapp_reduced["obratwhite"] = loanapp_reduced.obrat * loanapp_reduced.white

X = sm.add_constant(loanapp_reduced[["white", "hrat", "obrat", "obratwhite", "loanprc", "unem", "male", "married", "dep", "sch", "cosign", "chist", "pubrec", "mortlat1", "mortlat2", "vr"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                approve   R-squared:                       0.171
Model:                            OLS   Adj. R-squared:                  0.164
Method:                 Least Squares   F-statistic:                     25.17
Date:                Mon, 25 May 2020   Prob (F-statistic):           2.37e-68
Time:                        18:38:40   Log-Likelihood:                -422.99
No. Observations:                1971   AIC:                             880.0
Df Residuals:                    1954   BIC:                             974.9
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.1806      0.087     13.601      0.0

In [31]:
(model.params[1] + (model.params[3] * 32) + (model.params[4] * 32)) - (model.params[3] * 32)

0.11283818312387683

C8.i We would expect a positive sign

C8.ii It is statistically significant and suggests white borrowers are 20% more likely to get approved

C8.iii The coefficient on white is smaller but still significant (and practically large at 10%). This still indicates discrimination

C8.iv The interaction term is sgnificant.

C8.v Being white means you are 11% more likely to get a loan

In [32]:
# Exercise 9
k401subs = pd.read_stata("stata/401ksubs.dta")

k401subs[k401subs.e401k == 1].shape[0] / k401subs.shape[0]

0.39212938005390835

In [33]:
y = k401subs.e401k
X = sm.add_constant(k401subs[["inc", "incsq", "age", "agesq", "male"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                  e401k   R-squared:                       0.094
Model:                            OLS   Adj. R-squared:                  0.094
Method:                 Least Squares   F-statistic:                     193.0
Date:                Mon, 25 May 2020   Prob (F-statistic):          3.41e-196
Time:                        18:38:40   Log-Likelihood:                -6051.5
No. Observations:                9275   AIC:                         1.211e+04
Df Residuals:                    9269   BIC:                         1.216e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.5063      0.081     -6.243      0.0

In [34]:
print(model.fittedvalues.max())
print(model.fittedvalues.min())

0.6971898835393978
0.029917158729936277


In [35]:
k401subs["predfit"] = 0 + (model.fittedvalues >= 0.5).astype('int32')

k401subs[k401subs.predfit == 1].shape[0] / k401subs.shape[0]

0.2652291105121294

In [36]:
print(k401subs[(k401subs.predfit == 0) & (k401subs.e401k == 0)].shape[0] / k401subs[k401subs.e401k == 0].shape[0])
print(k401subs[(k401subs.predfit == 1) & (k401subs.e401k == 0)].shape[0] / k401subs[k401subs.e401k == 1].shape[0])

0.8171337353671515
0.2834753918064339


In [37]:
X = sm.add_constant(k401subs[["inc", "incsq", "age", "agesq", "male", "pira"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                  e401k   R-squared:                       0.095
Model:                            OLS   Adj. R-squared:                  0.094
Method:                 Least Squares   F-statistic:                     161.3
Date:                Mon, 25 May 2020   Prob (F-statistic):          1.35e-195
Time:                        18:38:40   Log-Likelihood:                -6050.2
No. Observations:                9275   AIC:                         1.211e+04
Df Residuals:                    9268   BIC:                         1.216e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.5019      0.081     -6.186      0.0

C9.i 39% of families are eligible for for participation

C9.iii Income and age are significant and so it is hard to how we could support this claim. Gender does not have evidence that it is any different from zero with a small standard error and so we are more secure in the claim that it is unrelated to gender.

C9.iv No fitted values are greater than one or less than zero

C9.v 26.5% are predicted to be eligible

C9.vi 81.7% are predicted not to be eligible (of the ones not eligible). 28% of the families that are eligible are predicted to be eligible.

C9.vii This does not provide any information that the model much more reliably predicts people not eligible than eligible.

C9.viii pira makes a family about 2% more likely to be in a 401k plan, but it is not significant at even the 10% level

In [38]:
# Exercise 10
nbasal = pd.read_stata("stata/nbasal.dta")

y = nbasal.points
X = sm.add_constant(nbasal[["exper", "expersq", "guard", "forward"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 points   R-squared:                       0.091
Model:                            OLS   Adj. R-squared:                  0.077
Method:                 Least Squares   F-statistic:                     6.606
Date:                Mon, 25 May 2020   Prob (F-statistic):           4.43e-05
Time:                        18:38:40   Log-Likelihood:                -845.86
No. Observations:                 269   AIC:                             1702.
Df Residuals:                     264   BIC:                             1720.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.7608      1.179      4.039      0.0

In [39]:
X = sm.add_constant(nbasal[["exper", "expersq", "guard", "forward", "marr"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 points   R-squared:                       0.093
Model:                            OLS   Adj. R-squared:                  0.076
Method:                 Least Squares   F-statistic:                     5.401
Date:                Mon, 25 May 2020   Prob (F-statistic):           9.53e-05
Time:                        18:38:40   Log-Likelihood:                -845.54
No. Observations:                 269   AIC:                             1703.
Df Residuals:                     263   BIC:                             1725.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.7029      1.182      3.980      0.0

In [40]:
nbasal["marrexper"] = nbasal.exper * nbasal.marr
nbasal["marrexpersq"] = nbasal.expersq * nbasal.marr

X = sm.add_constant(nbasal[["exper", "expersq", "guard", "forward", "marr", "marrexper", "marrexpersq"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 points   R-squared:                       0.106
Model:                            OLS   Adj. R-squared:                  0.082
Method:                 Least Squares   F-statistic:                     4.413
Date:                Mon, 25 May 2020   Prob (F-statistic):           0.000119
Time:                        18:38:40   Log-Likelihood:                -843.64
No. Observations:                 269   AIC:                             1703.
Df Residuals:                     261   BIC:                             1732.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           5.8162      1.349      4.312      

In [41]:
model.f_test("(marr = 0), (marrexper = 0), (marrexpersq = 0)")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[1.4441516]]), p=0.23034807856107176, df_denom=261, df_num=3>

In [42]:
y = nbasal.assists
X = sm.add_constant(nbasal[["exper", "expersq", "guard", "forward", "marr"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                assists   R-squared:                       0.350
Model:                            OLS   Adj. R-squared:                  0.338
Method:                 Least Squares   F-statistic:                     28.31
Date:                Mon, 25 May 2020   Prob (F-statistic):           6.24e-23
Time:                        18:38:40   Log-Likelihood:                -521.96
No. Observations:                 269   AIC:                             1056.
Df Residuals:                     263   BIC:                             1077.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.2258      0.355     -0.636      0.5

C10.ii Because I'm not stupid (dummy variable trap)

C10.iii Guards score more than centers by about 2 points. The difference is statistically significant.

C10.iv Married players do not appear to score more than unmarried players when experience and position are taken into account

C10.v Marriage is still not significant when interacted (testing both individual coefficients and jointly)

C10.vi Experience appears to affect assists less. This may be because experienced/senior players are deferred to or otherwise given more opportunities.

In [43]:
# Exercise 11
k401subs = pd.read_stata("stata/401ksubs.dta")

print(k401subs.nettfa.mean())
print(k401subs.nettfa.std())
print(k401subs.nettfa.min())
print(k401subs.nettfa.max())

19.071678161621094
63.96397399902344
-502.302001953125
1536.7979736328125


In [44]:
y = k401subs.nettfa
X = sm.add_constant(k401subs[["e401k"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 nettfa   R-squared:                       0.021
Model:                            OLS   Adj. R-squared:                  0.021
Method:                 Least Squares   F-statistic:                     196.2
Date:                Mon, 25 May 2020   Prob (F-statistic):           3.93e-44
Time:                        18:38:40   Log-Likelihood:                -51631.
No. Observations:                9275   AIC:                         1.033e+05
Df Residuals:                    9273   BIC:                         1.033e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         11.6768      0.843     13.851      0.0

In [45]:
X = sm.add_constant(k401subs[["inc", "incsq", "age", "agesq", "e401k"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 nettfa   R-squared:                       0.202
Model:                            OLS   Adj. R-squared:                  0.201
Method:                 Least Squares   F-statistic:                     468.7
Date:                Mon, 25 May 2020   Prob (F-statistic):               0.00
Time:                        18:38:40   Log-Likelihood:                -50683.
No. Observations:                9275   AIC:                         1.014e+05
Df Residuals:                    9269   BIC:                         1.014e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         23.0852      9.960      2.318      0.0

In [46]:
k401subs["agee401kdemean"] = (k401subs.age - 41) * k401subs.e401k
k401subs["agesq401kdemean"] = ((k401subs.age - 41) ** 2) * k401subs.e401k

X = sm.add_constant(k401subs[["inc", "incsq", "age", "agesq", "agee401kdemean", "agesq401kdemean", "e401k"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 nettfa   R-squared:                       0.204
Model:                            OLS   Adj. R-squared:                  0.204
Method:                 Least Squares   F-statistic:                     340.2
Date:                Mon, 25 May 2020   Prob (F-statistic):               0.00
Time:                        18:38:40   Log-Likelihood:                -50668.
No. Observations:                9275   AIC:                         1.014e+05
Df Residuals:                    9267   BIC:                         1.014e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              30.1681     10.180     

In [47]:
model.f_test("(agee401kdemean = 0), (agesq401kdemean = 0)")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[15.26945974]]), p=2.395856118129334e-07, df_denom=9.27e+03, df_num=2>

In [48]:
k401subs["fsize1"] = (k401subs.fsize == 1).astype("int32")
k401subs["fsize2"] = (k401subs.fsize == 2).astype("int32")
k401subs["fsize3"] = (k401subs.fsize == 3).astype("int32")
k401subs["fsize4"] = (k401subs.fsize == 4).astype("int32")
k401subs["fsize5"] = (k401subs.fsize > 4).astype("int32")

X = sm.add_constant(k401subs[["inc", "incsq", "age", "agesq", "e401k", "fsize1", "fsize2", "fsize4", "fsize5"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

k401subs["fsize1inc"] = k401subs.inc * k401subs.fsize1
k401subs["fsize1incsq"] = k401subs.incsq * k401subs.fsize1
k401subs["fsize1age"] = k401subs.age * k401subs.fsize1
k401subs["fsize1agesq"] = k401subs.agesq * k401subs.fsize1
k401subs["fsize1e401k"] = k401subs.e401k * k401subs.fsize1

k401subs["fsize2inc"] = k401subs.inc * k401subs.fsize2
k401subs["fsize2incsq"] = k401subs.incsq * k401subs.fsize2
k401subs["fsize2age"] = k401subs.age * k401subs.fsize2
k401subs["fsize2agesq"] = k401subs.agesq * k401subs.fsize2
k401subs["fsize2e401k"] = k401subs.e401k * k401subs.fsize2

k401subs["fsize4inc"] = k401subs.inc * k401subs.fsize4
k401subs["fsize4incsq"] = k401subs.incsq * k401subs.fsize4
k401subs["fsize4age"] = k401subs.age * k401subs.fsize4
k401subs["fsize4agesq"] = k401subs.agesq * k401subs.fsize4
k401subs["fsize4e401k"] = k401subs.e401k * k401subs.fsize4

k401subs["fsize5inc"] = k401subs.inc * k401subs.fsize5
k401subs["fsize5incsq"] = k401subs.incsq * k401subs.fsize5
k401subs["fsize5age"] = k401subs.age * k401subs.fsize5
k401subs["fsize5agesq"] = k401subs.agesq * k401subs.fsize5
k401subs["fsize5e401k"] = k401subs.e401k * k401subs.fsize5

X = sm.add_constant(k401subs[["inc", "incsq", "age", "agesq", "e401k", "fsize1", "fsize2", "fsize4", "fsize5", "fsize1inc", "fsize1incsq", "fsize1age", "fsize1agesq", "fsize1e401k", "fsize2inc", "fsize2incsq", "fsize2age", "fsize2agesq", "fsize2e401k", "fsize4inc", "fsize4incsq", "fsize4age", "fsize4agesq", "fsize4e401k", "fsize5inc", "fsize5incsq", "fsize5age", "fsize5agesq", "fsize5e401k"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 nettfa   R-squared:                       0.204
Model:                            OLS   Adj. R-squared:                  0.203
Method:                 Least Squares   F-statistic:                     263.3
Date:                Mon, 25 May 2020   Prob (F-statistic):               0.00
Time:                        18:38:41   Log-Likelihood:                -50672.
No. Observations:                9275   AIC:                         1.014e+05
Df Residuals:                    9265   BIC:                         1.014e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         11.6714     10.292      1.134      0.2

In [49]:
print(model.f_test("(fsize1 = 0), (fsize2 = 0), (fsize4 = 0), (fsize5 = 0), (fsize1inc = 0), (fsize1incsq = 0), (fsize1age = 0), (fsize1agesq = 0), (fsize1e401k = 0), (fsize2inc = 0), (fsize2incsq = 0), (fsize2age = 0), (fsize2agesq = 0), (fsize2e401k = 0), (fsize4inc = 0), (fsize4incsq = 0), (fsize4age = 0), (fsize4agesq = 0), (fsize4e401k = 0), (fsize5inc = 0), (fsize5incsq = 0), (fsize5age = 0), (fsize5agesq = 0), (fsize5e401k = 0)"))
print(model.f_test("(fsize1inc = 0), (fsize1incsq = 0), (fsize1age = 0), (fsize1agesq = 0), (fsize1e401k = 0), (fsize2inc = 0), (fsize2incsq = 0), (fsize2age = 0), (fsize2agesq = 0), (fsize2e401k = 0), (fsize4inc = 0), (fsize4incsq = 0), (fsize4age = 0), (fsize4agesq = 0), (fsize4e401k = 0), (fsize5inc = 0), (fsize5incsq = 0), (fsize5age = 0), (fsize5agesq = 0), (fsize5e401k = 0)"))

<F test: F=array([[3.86409524]]), p=5.785826963870373e-10, df_denom=9.24e+03, df_num=24>
<F test: F=array([[3.54267375]]), p=1.4249267096183198e-07, df_denom=9.24e+03, df_num=20>


C11.iii e401k is half

C11.iv Age is significant (not quadratic)

C11.v The difference isn't substantial.

C11.vi None of the family dummies are significant at the 1% level

C11.vii The Chow test rejects both sets of terms. It suggests we should include not just alternative intecepts but interactions.

In [50]:
# Exercise 12
beauty = pd.read_stata("stata/beauty.dta")

print(beauty[((beauty.abvavg == 1) & (beauty.female == 1))].shape[0] / beauty[beauty.female == 1].shape[0])
print(beauty[((beauty.abvavg == 1) & (beauty.female == 0))].shape[0] / beauty[beauty.female == 0].shape[0])

print(beauty[((beauty.belavg == 1) & (beauty.female == 1))].shape[0] / beauty[beauty.female == 1].shape[0])
print(beauty[((beauty.belavg == 1) & (beauty.female == 0))].shape[0] / beauty[beauty.female == 0].shape[0])

0.3302752293577982
0.29004854368932037
0.1353211009174312
0.11650485436893204


In [51]:
y = beauty.abvavg
X = sm.add_constant(beauty[["female"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 abvavg   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     2.181
Date:                Mon, 25 May 2020   Prob (F-statistic):              0.140
Time:                        18:38:41   Log-Likelihood:                -808.26
No. Observations:                1260   AIC:                             1621.
Df Residuals:                    1258   BIC:                             1631.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2900      0.016     18.102      0.0

In [52]:
beauty_men = beauty[beauty.female == 0]
beauty_women = beauty[beauty.female == 1]

y = beauty_men.lwage
X = sm.add_constant(beauty_men[["belavg", "abvavg"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

y = beauty_women.lwage
X = sm.add_constant(beauty_women[["belavg", "abvavg"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     5.529
Date:                Mon, 25 May 2020   Prob (F-statistic):            0.00412
Time:                        18:38:41   Log-Likelihood:                -655.76
No. Observations:                 824   AIC:                             1318.
Df Residuals:                     821   BIC:                             1332.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.8839      0.024     77.541      0.0

In [53]:
y = beauty_men.lwage
X = sm.add_constant(beauty_men[["belavg", "abvavg", "educ", "exper", "expersq", "union", "goodhlth", "black", "married", "south", "bigcity", "smllcity", "service"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)
ssrm = model.ssr

y = beauty_women.lwage
X = sm.add_constant(beauty_women[["belavg", "abvavg", "educ", "exper", "expersq", "union", "goodhlth", "black", "married", "south", "bigcity", "smllcity", "service"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)
ssrw = model.ssr

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.308
Model:                            OLS   Adj. R-squared:                  0.297
Method:                 Least Squares   F-statistic:                     27.79
Date:                Mon, 25 May 2020   Prob (F-statistic):           1.71e-56
Time:                        18:38:41   Log-Likelihood:                -509.31
No. Observations:                 824   AIC:                             1047.
Df Residuals:                     810   BIC:                             1113.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3580      0.119      3.017      0.0

In [54]:
y = beauty.lwage
X = sm.add_constant(beauty[["belavg", "abvavg", "educ", "exper", "expersq", "union", "goodhlth", "black", "married", "south", "bigcity", "smllcity", "service"]])
model = sm.OLS(y, X).fit()

chow = ((model.ssr - (ssrm + ssrw))/(ssrm + ssrw)) * ((model.nobs - (2 * (model.df_model + 1))) / model.df_model)

print(chow)

16.62216554950696


C12.i More people are rated as being above average

C12.ii We fail to reject the null

C12.iii The hypothesis is that below average is associated with lower wages p-values are above if you divide by 2

C12.iv There is no evidence to suggest the coefficient is different from 0

C12.v No substantial change it appears

C12.vi The Chow test rejects the null of equality suggesting the slopes are different between men and women

In [55]:
# Exercise 13
apple = pd.read_stata("stata/APPLE.DTA")

apple["ecobuy"] = (apple.ecolbs > 0).astype("int32")

apple[apple.ecobuy == 1].shape[0] / apple.shape[0]

0.6242424242424243

In [56]:
y = apple.ecobuy
X = sm.add_constant(apple[["ecoprc", "regprc", "faminc", "hhsize", "educ", "age"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 ecobuy   R-squared:                       0.110
Model:                            OLS   Adj. R-squared:                  0.102
Method:                 Least Squares   F-statistic:                     13.43
Date:                Mon, 25 May 2020   Prob (F-statistic):           2.18e-14
Time:                        18:38:42   Log-Likelihood:                -419.60
No. Observations:                 660   AIC:                             853.2
Df Residuals:                     653   BIC:                             884.6
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.4237      0.165      2.568      0.0

In [57]:
model.f_test("(faminc = 0), (hhsize = 0), (educ = 0), (age = 0)")

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[4.42755104]]), p=0.0015443199731425277, df_denom=653, df_num=4>

In [58]:
apple["lfaminc"] = np.log(apple.faminc)

X = sm.add_constant(apple[["ecoprc", "regprc", "lfaminc", "hhsize", "educ", "age"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 ecobuy   R-squared:                       0.112
Model:                            OLS   Adj. R-squared:                  0.103
Method:                 Least Squares   F-statistic:                     13.67
Date:                Mon, 25 May 2020   Prob (F-statistic):           1.16e-14
Time:                        18:38:42   Log-Likelihood:                -418.94
No. Observations:                 660   AIC:                             851.9
Df Residuals:                     653   BIC:                             883.3
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3038      0.179      1.697      0.0

In [59]:
print(model.fittedvalues[model.fittedvalues < 0].shape[0])
print(model.fittedvalues[model.fittedvalues > 1].shape[0])
print(model.fittedvalues[model.fittedvalues > 1])

0
2
167    1.050653
493    1.049140
dtype: float64


In [60]:
print(model.fittedvalues[(model.fittedvalues > 0.5) & (apple.ecobuy == 1)].shape[0] / apple[apple.ecobuy == 1].shape[0])
print(model.fittedvalues[(model.fittedvalues < 0.5) & (apple.ecobuy == 0)].shape[0] / apple[apple.ecobuy == 0].shape[0])

0.8252427184466019
0.4112903225806452


C13.i 62% of families claim they would buy ecolabeled apples

C13.iii The nonprice variables are jointly significant in the LPM, almost certainly due to educ. It seems to make sense since education may indicate a willingness to hear out the argument for eco labeling

C13.iv The log does not appear to change anything substantively.

C13.v No negative values but some greater than 1. Only 2 observations, so not a major concern.

C13.vi Predicting 1 is more accurate than 0

In [61]:
# Exercise 14
charity = pd.read_stata("stata/charity.dta")

y = charity.respond
X = sm.add_constant(charity[["resplast", "avggift"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                respond   R-squared:                       0.110
Model:                            OLS   Adj. R-squared:                  0.110
Method:                 Least Squares   F-statistic:                     263.8
Date:                Mon, 25 May 2020   Prob (F-statistic):          9.88e-109
Time:                        18:38:42   Log-Likelihood:                -2761.6
No. Observations:                4268   AIC:                             5529.
Df Residuals:                    4265   BIC:                             5548.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2821      0.009     31.918      0.0

In [62]:
X = sm.add_constant(charity[["resplast", "avggift", "propresp"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                respond   R-squared:                       0.202
Model:                            OLS   Adj. R-squared:                  0.202
Method:                 Least Squares   F-statistic:                     360.8
Date:                Mon, 25 May 2020   Prob (F-statistic):          8.64e-209
Time:                        18:38:42   Log-Likelihood:                -2527.8
No. Observations:                4268   AIC:                             5064.
Df Residuals:                    4264   BIC:                             5089.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0023      0.015      0.152      0.8

In [63]:
X = sm.add_constant(charity[["resplast", "avggift", "propresp", "mailsyear"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                respond   R-squared:                       0.210
Model:                            OLS   Adj. R-squared:                  0.209
Method:                 Least Squares   F-statistic:                     282.5
Date:                Mon, 25 May 2020   Prob (F-statistic):          1.03e-215
Time:                        18:38:42   Log-Likelihood:                -2508.8
No. Observations:                4268   AIC:                             5028.
Df Residuals:                    4263   BIC:                             5059.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.1172      0.025     -4.782      0.0

C14.ii Avg gift is significant but the value is quite small and so no

C14.iii The likelihood that someone who has responded to all previous mailings will respond to this one

C14.iv resplast's magnitude fell. This makes sense because propresp more directly responds someone's past ability to respond

C14.v Mailings per year is smaller than the others, but meaningful and significant. The model focuses a lot on past behaviour and treats all mailings as the same.

In [64]:
# Exercise 15
fertil2 = pd.read_stata("stata/FERTIL2.DTA")

print(fertil2.children.min())
print(fertil2.children.max())
print(fertil2.children.mean())
print(fertil2[fertil2.children == fertil2.children.mean()])

0
13
2.2678284797064894
Empty DataFrame
Columns: [mnthborn, yearborn, age, electric, radio, tv, bicycle, educ, ceb, agefbrth, children, knowmeth, usemeth, monthfm, yearfm, agefm, idlnchld, heduc, agesq, urban, urb_educ, spirit, protest, catholic, frsthalf, educ0, evermarr]
Index: []

[0 rows x 27 columns]


In [65]:
fertil2[fertil2.electric == 1].shape[0] / fertil2.shape[0]

0.14010548039440496

In [66]:
print(fertil2[fertil2.electric == 1].children.mean())
print(fertil2[fertil2.electric == 0].children.mean())

fertil2_reg = fertil2[["children", "electric"]].dropna()

y = fertil2_reg.children
X = sm.add_constant(fertil2_reg[["electric"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

1.8985270049099836
2.327728849746464
                            OLS Regression Results                            
Dep. Variable:               children   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     19.69
Date:                Mon, 25 May 2020   Prob (F-statistic):           9.35e-06
Time:                        18:38:42   Log-Likelihood:                -9652.7
No. Observations:                4358   AIC:                         1.931e+04
Df Residuals:                    4356   BIC:                         1.932e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          

In [67]:
fertil2_reg = fertil2[["children", "age", "agesq", "educ", "electric", "urban", "catholic", "spirit", "protest"]].dropna()

y = fertil2_reg.children
X = sm.add_constant(fertil2_reg[["age", "agesq", "educ", "electric", "urban", "catholic", "spirit", "protest"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:               children   R-squared:                       0.574
Model:                            OLS   Adj. R-squared:                  0.573
Method:                 Least Squares   F-statistic:                     732.6
Date:                Mon, 25 May 2020   Prob (F-statistic):               0.00
Time:                        18:38:43   Log-Likelihood:                -7803.0
No. Observations:                4358   AIC:                         1.562e+04
Df Residuals:                    4349   BIC:                         1.568e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -4.3147      0.243    -17.731      0.0

In [68]:
fertil2_reg["educelectric"] = fertil2_reg.educ * fertil2_reg.electric
fertil2_reg["educelectricdemean"] = (fertil2_reg.educ - 7) * fertil2_reg.electric

X = sm.add_constant(fertil2_reg[["age", "agesq", "educ", "educelectric", "electric", "urban", "catholic", "spirit", "protest"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:               children   R-squared:                       0.574
Model:                            OLS   Adj. R-squared:                  0.573
Method:                 Least Squares   F-statistic:                     651.5
Date:                Mon, 25 May 2020   Prob (F-statistic):               0.00
Time:                        18:38:43   Log-Likelihood:                -7802.1
No. Observations:                4358   AIC:                         1.562e+04
Df Residuals:                    4348   BIC:                         1.569e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -4.3599      0.246    -17.741   

In [69]:
X = sm.add_constant(fertil2_reg[["age", "agesq", "educ", "educelectricdemean", "electric", "urban", "catholic", "spirit", "protest"]])
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:               children   R-squared:                       0.574
Model:                            OLS   Adj. R-squared:                  0.573
Method:                 Least Squares   F-statistic:                     651.5
Date:                Mon, 25 May 2020   Prob (F-statistic):               0.00
Time:                        18:38:43   Log-Likelihood:                -7802.1
No. Observations:                4358   AIC:                         1.562e+04
Df Residuals:                    4348   BIC:                         1.569e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -4.3599      0

C15.iv Electricity doesn't get people pregnant. The simple regression does not account for factors which affect pregnancy, even if electricity is related to pregnancy

C15.v Electricity is smaller but still significant

C15.vi Electricity ceases to be significant

C15.vii The effect of education is now longer concerned with effect of education at 0, and so electricity can now be properly measured seperate from the interaction with education. Electric is larger and significant.