# This is my notebook on Chapter 6 from Wooldridge
### I did 4 computer exercises

In [46]:
import wooldridge
import statsmodels.formula.api as smf
import statsmodels.api as sm
import numpy as np
import pandas as pd
from scipy import stats
import plotly.express as px
from IPython.display import display, Latex

import warnings
# Install a global 'ignore' filter: no warning of any category is shown after this line.
# NOTE(review): presumably done to keep cell outputs tidy, but it also hides
# deprecation and numerical warnings raised by the libraries used below — confirm
# that this is intended.
warnings.filterwarnings('ignore')

# Task 1

In [6]:
# Load the KIELMC dataset and keep only the rows recorded for 1981.
kielmc_all_years = wooldridge.data('KIELMC')
data = kielmc_all_years.loc[kielmc_all_years['year'] == 1981]
# Quick sanity check of the filtered sample size (rows, columns).
data.shape

(142, 25)

In [17]:
# Simple log-log regression of price on dist, fitted on the 1981 subsample.
simple_formula = 'np.log(price) ~ np.log(dist)'
model = smf.ols(formula=simple_formula, data=data)
fitted = model.fit()
# Report R-squared together with the coefficients significant at the 5% level.
fitted.rsquared, fitted.pvalues.loc[lambda p: p < 0.05]

(0.18025880137091477,
 Intercept       1.997514e-24
 np.log(dist)    1.394998e-07
 dtype: float64)

In [21]:
# Same dependent variable, now with intst, area, land, rooms, baths and age added.
# The formula is split with implicit string concatenation; the resulting text is
# identical to the single-line version.
controls_formula = (
    'np.log(price) ~ np.log(dist) + np.log(intst)+'
    'np.log(area) + np.log(land) + rooms + baths + age'
)
model = smf.ols(formula=controls_formula, data=data)
fitted = model.fit()
# Report R-squared together with the coefficients that are NOT significant at 5%.
fitted.rsquared, fitted.pvalues.loc[lambda p: p > 0.05]

(0.7475421851715305,
 np.log(dist)     0.338153
 np.log(intst)    0.451261
 np.log(land)     0.053908
 rooms            0.134588
 dtype: float64)

In [23]:
# Previous specification plus the square of log(intst).
squared_formula = (
    'np.log(price) ~ np.log(dist) + np.log(intst)+'
    'np.log(area) + np.log(land) + rooms + baths + age + np.square(np.log(intst))'
)
model = smf.ols(formula=squared_formula, data=data)
fitted = model.fit()
# Report R-squared together with the coefficients that are NOT significant at 5%.
fitted.rsquared, fitted.pvalues.loc[lambda p: p > 0.05]

(0.7775494537560258,
 Intercept    0.212022
 rooms        0.154933
 dtype: float64)

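### As we can see, adding the squared term is crucial: once $[\log(intst)]^2$ is included, $\log(dist)$, $\log(intst)$ and $\log(land)$ all become statistically significant at the 5% level (only the intercept and $rooms$ remain insignificant), and $R^2$ rises from 0.748 to 0.778.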

# Task 2

In [24]:
# Rebind `data` to the WAGE1 dataset; the cells below read it under this name.
data = wooldridge.data('WAGE1')
# Peek at the first row only, to see which columns are available.
data.iloc[:1]

Unnamed: 0,wage,educ,exper,tenure,nonwhite,female,married,numdep,smsa,northcen,...,trcommpu,trade,services,profserv,profocc,clerocc,servocc,lwage,expersq,tenursq
0,3.1,11,2,0,0,1,0,2,1,0,...,0,0,0,0,0,0,0,1.131402,4,0


In [68]:
# log(wage) regressed on educ and a quadratic in exper.
quadratic_formula = 'np.log(wage) ~ educ + exper + np.square(exper)'
model = smf.ols(formula=quadratic_formula, data=data)
fitted = model.fit()

### $$log(wage) = \beta_0 + \beta_1 \cdot educ + \beta_2 \cdot exper + \beta_3 \cdot exper^2 + u$$
$$\frac{\partial \log(wage)}{\partial exper} = \beta_2 + 2 \cdot \beta_3 \cdot exper.$$

In [43]:
# Unpack the four coefficients positionally.
# NOTE(review): assumes `fitted` is still the quadratic model and that params come
# back as (Intercept, educ, exper, exper^2) — confirm before re-running out of order.
b0, b1, b2, b3 = fitted.params
# Slope of log(wage) with respect to exper, evaluated at exper = 4 and scaled to
# percent; the trailing `* 1` is the one-year change in exper.
fifth_year_return = round(100 * (b2 + 2 * b3 * 4) * 1, 2)
f'Estimated return to the fifth year of experience is {fifth_year_return}%'

'Estimated return to the fifth year of experience is 3.53%'

In [44]:
# Slope of log(wage) with respect to exper, evaluated at exper = 19 (moving from
# 19 to 20 years, i.e. the twentieth year) and scaled to percent; the trailing
# `* 1` is the one-year change in exper.
# The label previously said "fifth year", copied from the cell above.
twentieth_year_return = round(100 * (b2 + 2 * b3 * 19) * 1, 2)
f'Estimated return to the twentieth year of experience is {twentieth_year_return}%'

'Estimated return to the twentieth year of experience is 1.39%'

In [57]:
# Turning point of the quadratic in exper: the value of exper at which the
# estimated slope b2 + 2 * b3 * exper equals zero.
turning_point = -b2 / (2 * b3)
parabola_vertex = round(turning_point, 2)
f'Parabola (of exper) vertex is at point: -b / 2a. In our case it is {parabola_vertex}'

'Parabola (of exper) vertex is at point: -b / 2a. In our case it is 28.74'

Let's check what share of the sample has exper above the turning point (exper > 28.74).

In [67]:
# Count the observations whose exper exceeds the turning point, then express the
# count as a whole-number percentage of the full sample.
above_vertex = data['exper'] > parabola_vertex
n_above = int(above_vertex.sum())
high_exper = round(100 * n_above / len(data))
f'There are {high_exper}% of cases with exper > bound. It"s too much.'

'There are 23% of cases with exper > bound. It"s too much.'

# Task 3

In [74]:
# log(wage) on educ, exper and their interaction; in formula syntax `educ*exper`
# expands to educ + exper + educ:exper, as the summary table's rows show.
interaction_formula = 'np.log(wage) ~ educ*exper'
model = smf.ols(formula=interaction_formula, data=data)
fitted = model.fit()
fitted.summary()

0,1,2,3
Dep. Variable:,np.log(wage),R-squared:,0.25
Model:,OLS,Adj. R-squared:,0.245
Method:,Least Squares,F-statistic:,57.91
Date:,"Sat, 07 May 2022",Prob (F-statistic):,2.5e-32
Time:,10:03:42,Log-Likelihood:,-337.88
No. Observations:,526,AIC:,683.8
Df Residuals:,522,BIC:,700.8
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1532,0.167,0.916,0.360,-0.176,0.482
educ,0.1030,0.013,8.090,0.000,0.078,0.128
exper,0.0133,0.006,2.197,0.028,0.001,0.025
educ:exper,-0.0002,0.000,-0.500,0.617,-0.001,0.001

0,1,2,3
Omnibus:,8.116,Durbin-Watson:,1.79
Prob(Omnibus):,0.017,Jarque-Bera (JB):,9.941
Skew:,0.174,Prob(JB):,0.00694
Kurtosis:,3.576,Cond. No.,2150.0


### Model: $$log(wage) = \beta_0 + \beta_1 \cdot educ + \beta_2 \cdot exper + \beta_3 \cdot educ\cdot exper + u$$
### We use the fact that: $$\frac{\partial \log(wage)}{\partial educ} = \beta_1 + \beta_3 \cdot exper.$$