# This is my notebook on Chapter 5 from Wooldridge
### I did 4 computer exercises

In [210]:
import wooldridge
import statsmodels.formula.api as smf
import statsmodels.api as sm
import numpy as np
import pandas as pd
from scipy import stats
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

## Task 1

In [163]:
data = wooldridge.data('WAGE1')

In [164]:
model = smf.ols(data=data, formula='wage ~ educ + exper + tenure')
fitted = model.fit()
fitted.rsquared

0.3064224368841789

In [165]:
model_log = smf.ols(data=data, formula='np.log(wage) ~ educ + exper + tenure')
fitted_log = model_log.fit()
fitted_log.rsquared

0.3160133195452939

In [166]:
residuals, residuals_log = fitted.resid, fitted_log.resid
total = pd.concat([residuals, residuals_log], keys=['level', 'log'], names=['model', 'id'])
df = pd.DataFrame(total).reset_index().rename(columns={0: 'residuals'})
df.head(1)

Unnamed: 0,model,id,residuals
0,level,0,-0.66056


In [167]:
fig = px.histogram(data_frame=df, x='residuals', color='model', marginal="box",
                   barmode='overlay', opacity=.7, nbins=250)
fig.show()

As we can see, residuals are more normally distributed in log-level model. In level-level equation residuals are right-skewed and more spread out.

## Task 2

In [168]:
data = wooldridge.data('GPA2')
FIRST, ALL = 2070, data.shape[0]
data.shape

(4137, 12)

In [169]:
model = smf.ols(data=data.iloc[:, :], formula='colgpa ~ hsperc + sat')
fitted = model.fit()
se1 = fitted.bse['hsperc']
se1

0.0005494660278445033

In [170]:
model = smf.ols(data=data.iloc[:FIRST, :], formula='colgpa ~ hsperc + sat')
fitted = model.fit()
se2 = fitted.bse['hsperc']
se2

0.0007185234883026612

In [171]:
se2 / se1, np.sqrt(ALL / FIRST)

(1.3076759105951508, 1.4137010732957944)

## Task 3

In this task I would test motheduc + fatheduc significance, using Lagrange Multiplier.

In [231]:
data = wooldridge.data('BWGHT').dropna(axis=0).reset_index()
model = smf.ols(data=data, formula='bwght ~ cigs + parity + faminc')
fitted = model.fit()
residuals = fitted.resid

In [254]:
LM_model = sm.OLS(residuals, data[['cigs', 'parity', 'faminc', 'motheduc', 'fatheduc']])
LM_fitted = LM_model.fit()
LM = LM_fitted.nobs * LM_fitted.rsquared
LM

2.8176100483733233

In [255]:
critical = stats.chi2.ppf(0.95, 2)
critical

5.991464547107979

In [256]:
p_value = 1 - stats.chi2.cdf(LM, 2)
p_value

0.24443520285554854

We see, that LM-statistics is around $2.817$, but the critical value for $\chi^2_{2}$ is 6. So we cannot reject the $H_0$ hypothesis (motheduc and fatheduc are jointly significant). The p_value for this test is $\approx 0.25$.

## Task 4

In this task, I would compare skewness of distribution of some variable.

In [281]:
data = wooldridge.data('401KSUBS')
data = data[data['fsize'] == 1]
data.shape

(2017, 11)

In [317]:
inc = data.inc.reset_index(drop=True)
log_inc = np.log(inc)
inc = (inc - inc.mean()) / inc.std()
inc_skew = (inc ** 3).sum() / 2016
inc_skew, stats.skew(inc)

(1.863182464872014, 1.863644506407914)

In [321]:
log_inc = (log_inc - log_inc.mean()) / log_inc.std()
log_inc_skew = (log_inc ** 3).sum() / 2016
log_inc_skew, stats.skew(log_inc)

(0.3607819431388949, 0.3608714116940555)

In [295]:
incs = pd.concat([inc, log_inc], keys=['inc', 'log(inc)']).reset_index()
incs
fig = px.histogram(data_frame=incs, x='inc', color='level_0', marginal="box",
                   barmode='overlay', opacity=.7, nbins=150)
fig.show()