In [1]:
# import all packages 
import numpy as np
import pandas as pd
import wooldridge as woo
import statsmodels.formula.api as smf 
import patsy as pt
import scipy.stats as stats
import matplotlib.pyplot as plt
import linearmodels as plm

In [2]:
# ch10 q3
# when the derivative of y* is taken with respect to z*, the result is the sum of the
# delta coefficients, so a change ∆z* in z* changes y* by (sum of the deltas)*∆z*.
# the sum of the deltas is equal to the LRP (z' above is shorthand for ∆z*),
# therefore, the change in y* due to a change in z* is equal to LRP*∆z*.

In [3]:
# ch10 q5
# housing = B0 + B1*interest + B2*income + B3*winter + B4*spring + B5*summer + u
# with an intercept in the model only three of the four seasonal dummies can be included
# (fall is the base season here); including all four would be perfectly collinear with B0.
# the seasonal dummies allow the regression to account for changes in real estate patterns
# as the time of year changes. for example, there might be a higher demand to move during
# summer when kids are not in school, so housing activity might be higher then, compared
# to the other three seasons.

In [4]:
# ch10 c7
# consumption-growth regressions on the CONSUMP data: each specification below is
# estimated by OLS and its summary table printed, in exercise order (i), (ii), (iii).
c = woo.dataWoo('CONSUMP')

specifications = (
    'gc ~ gy',              # (i)   static model
    'gc ~ gy + gy_1',       # (ii)  adds lagged income growth
    'gc ~ gy + gy_1 + i3',  # (iii) adds the interest rate
)
for spec in specifications:
    reg = smf.ols(formula=spec, data=c)
    results = reg.fit()
    print(f'results.summary(): \n{results.summary()}\n')

# (i) the regression implies that when real per capita disposable income growth increases
# by one percentage point, real per capita consumption growth increases by about .57 points.
# both the intercept and gy are significant: their reported p-values round to 0, which is < .05.

# (ii) the lag gy_1 has a p-value of .173, so it is not statistically significant;
# there is little evidence of adjustment lags in consumption growth.

# (iii) i3 has a p-value of .092, so the interest rate is not significant at the 5% level
# (only at 10%); the evidence that it affects consumption growth is weak.

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:                     gc   R-squared:                       0.679
Model:                            OLS   Adj. R-squared:                  0.669
Method:                 Least Squares   F-statistic:                     71.81
Date:                Fri, 01 Apr 2022   Prob (F-statistic):           6.75e-10
Time:                        11:19:26   Log-Likelihood:                 127.22
No. Observations:                  36   AIC:                            -250.4
Df Residuals:                      34   BIC:                            -247.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0081      0.002

In [14]:
# ch10 c12
# static phillips curve on the full PHILLIPS sample, then on a "recent" and an "early"
# subsample, and finally a comparison of the full-sample slope with the
# observation-weighted average of the two subsample slopes.
p = woo.dataWoo('PHILLIPS')
p['inflation'] = p['inf']
reg = smf.ols(formula='inflation ~ unem', data=p)
results = reg.fit()
print(f'results.summary(): \n{results.summary()}\n')
slope_all = results.params['unem']

# (i) now i have 56 observations. 

# (ii) adding the extra three years does increase the regressions adjusted r^2 by .004, 
# so it very slightly helps in obtaining an estimated tradeoff between inflation and
# unemployement. 

# split the sample on the data's own `year` column. the previous version used
# `len(p - 46)` (which is just len(p): subtracting from a DataFrame does not shorten it),
# relabelled the index with made-up years, and then selected `year <= 1996` and
# `year <= 1985`, i.e. two overlapping early samples rather than a recent/early split.
# assumes one row per calendar year -- TODO confirm.
N_RECENT_YEARS = 10  # matches the 10/56 and 46/56 weights used in part (iv)
is_recent = p['year'] > p['year'].max() - N_RECENT_YEARS

reg = smf.ols(formula='inflation ~ unem', data=p, subset=is_recent)
results = reg.fit()
print(f'results.summary(): \n{results.summary()}\n')
slope_recent, n_recent = results.params['unem'], int(results.nobs)

# (iii) NOTE(review): the numbers previously quoted here (r^2 of .053, both coefficients
# insignificant) came from the `year <= 1996` sample, not from the most recent ten years.
# re-read the table printed above before concluding whether the recent-years estimates
# are precise enough to draw any firm conclusions.

reg = smf.ols(formula='inflation ~ unem', data=p, subset=~is_recent)
results = reg.fit()
print(f'results.summary(): \n{results.summary()}\n')
slope_early, n_early = results.params['unem'], int(results.nobs)

# (iv) compare the full-sample slope with the weighted average of the subsample slopes,
# using weights n_recent/n and n_early/n, computed from the fits instead of by hand.
n_total = n_recent + n_early
slope_weighted = slope_recent * n_recent / n_total + slope_early * n_early / n_total
print(f'slope_all: {slope_all}')
print(f'slope_weighted: {slope_weighted}\n')

# if the two printed numbers differ noticeably, we cannot expect the slope estimate using
# all observations to be roughly equal to a weighted average of the slope estimates on the
# early and later subsamples.

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:              inflation   R-squared:                       0.062
Model:                            OLS   Adj. R-squared:                  0.045
Method:                 Least Squares   F-statistic:                     3.579
Date:                Sun, 03 Apr 2022   Prob (F-statistic):             0.0639
Time:                        13:11:31   Log-Likelihood:                -139.43
No. Observations:                  56   AIC:                             282.9
Df Residuals:                      54   BIC:                             286.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.0536      1.548

In [95]:
# ch11 q1
# corr[x(t), x(t+h)] = {cov[x(t), x(t+h)]} / {std[x(t)]*std[x(t+h)]}
# = {gamma(h)} / {sqrt[gamma(0)]*sqrt[gamma(0)]}
# = gamma(h) / gamma(0)

In [6]:
# ch11 q2
# (i) E[x(t)] = E[e(t)] - (1/2)E[e(t-1)] + (1/2)E[e(t-2)]
# = 0 - (1/2)(0) + (1/2)(0)
# = 0 
# V[x(t)] = V[e(t)] + (1/4)V[e(t-1)] + (1/4)V[e(t-2)]
# = 1 + (1/4)(1) + (1/4)(1)
# = 3/2 
# therefore, E[x(t)] and V[x(t)] are both not dependent on t. 

# (ii) corr[x(t), x(t+1)] = {cov[x(t), x(t+1)]} / {sqrt[var(x(t))]*sqrt[var(x(t+1))]}
# = {-1/2*E[e(t)^2] - 1/4*E[e(t-1)^2]} / sqrt(3/2)*sqrt{V[e(t+1)] + (1/4)V[e(t)] + (1/4)V[e(t-1)]}
# = {-1/2*1 - 1/4*1} / sqrt(3/2)*sqrt{1 + (1/4)(1) + (1/4)(1)}
# = (-3/4) / sqrt(3/2)*sqrt(3/2)
# = (-3/4) / (3/2) 
# = -1/2
# corr[x(t), x(t+2)] = {cov[x(t), x(t+2)]} / {sqrt[var(x(t))]*sqrt[var(x(t+2))]}
# = {1/2*E[e(t)^2]} / sqrt(3/2)*sqrt{V[e(t+2)] + (1/4)V[e(t+1)] + (1/4)V[e(t)]}
# = {1/2*1} / sqrt(3/2)*sqrt{1 + (1/4)(1) + (1/4)(1)}
# = (1/2) / sqrt(3/2)*sqrt(3/2)
# = (1/2) / (3/2) 
# = 1/3

# (iii) for h > 2: corr[x(t), x(t+h)] = {cov[x(t), x(t+h)]} / {sqrt[var(x(t))]*sqrt[var(x(t+h))]}
# since E[x(t)] = 0, cov[x(t), x(t+h)] = E[x(t)*x(t+h)]
# = E{[e(t) - (1/2)*e(t-1) + (1/2)*e(t-2)][e(t+h) - (1/2)*e(t+h-1) + (1/2)*e(t+h-2)]}
# = E[e(t)*e(t+h)] - (1/2)*E[e(t)*e(t+h-1)] + (1/2)*E[e(t)*e(t+h-2)] - (1/2)*E[e(t-1)*e(t+h)]
# + (1/4)*E[e(t-1)*e(t+h-1)] - (1/4)*E[e(t-1)*e(t+h-2)] + (1/2)*E[e(t-2)*e(t+h)] 
# - (1/4)*E[e(t-2)*e(t+h-1)] + (1/4)*E[e(t-2)*e(t+h-2)]
# = 0, because when h > 2 every product pairs two e's from different periods, and the
# e's are uncorrelated across time with mean zero.
# if cov[x(t), x(t+h)] = 0 then, corr[x(t), x(t+h)] must also = 0. 

# (iv) since corr[x(t), x(t+h)] equals zero when h is greater than 2, x(t) and x(t+h) 
# show no signs of correlation, and x(t) is an asymptotically uncorrelated process. 

In [7]:
# ch11 c11
# okun's law check on the OKUN data: regress pcrgdp on cunem, then test the slope
# against -2, the intercept against 3, and both restrictions jointly.
o = woo.dataWoo('OKUN')
reg = smf.ols(formula='pcrgdp ~ cunem', data=o)
results = reg.fit()
print(f'results.summary(): \n{results.summary()}\n')

# (i) no, as i expected, the intercept and slope are slightly off at 3.34 and -1.89. 

# np.squeeze is used on the test statistics because their container shape is not stable:
# hard-coded [0][0] indexing raises as soon as statsmodels hands back a scalar.
hypothesis = ['cunem = -2']
ttest = results.t_test(hypothesis)
tstat = float(np.squeeze(ttest.statistic))
tpval = ttest.pvalue
print(f'tstat: {tstat}')
print(f'tpval1: {tpval}\n')

# (ii) since the t p-value is .55, we fail to reject the null hypothesis: the data are
# consistent with B1 = -2 (failing to reject does not prove that B1 equals -2).

hypothesis = ['Intercept = 3']
ttest = results.t_test(hypothesis)
tstat = float(np.squeeze(ttest.statistic))
tpval = ttest.pvalue
print(f'tstat: {tstat}')
print(f'tpval1: {tpval}\n')

# (iii) now the t p-value is .04, so we reject the null hypothesis at the 5% level and
# conclude the intercept is not 3. it would not be rejected at the 1% level, so i would
# not say this is a strong rejection.

hypothesis = ['cunem = -2, Intercept = 3']
ftest = results.f_test(hypothesis)
fstat = float(np.squeeze(ftest.statistic))
fpval = ftest.pvalue
print(f'fstat: {fstat}')
print(f'fpval1: {fpval}\n')

# (iv) with the f p-value being .1, we fail to reject the null hypothesis at the 10% level.
# overall, i'd say the data tends to support okun's law. 

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:                 pcrgdp   R-squared:                       0.710
Model:                            OLS   Adj. R-squared:                  0.704
Method:                 Least Squares   F-statistic:                     107.9
Date:                Fri, 01 Apr 2022   Prob (F-statistic):           2.04e-13
Time:                        11:19:27   Log-Likelihood:                -68.769
No. Observations:                  46   AIC:                             141.5
Df Residuals:                      44   BIC:                             145.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.3444      0.163

In [23]:
# ch11 c13
# beveridge-curve data: print the full correlation matrix (the urate/urate_1 and
# vrate/vrate_1 entries are the first-order autocorrelations), then regress the
# unemployment rate on the vacancy rate in levels and in first differences.
b = woo.dataWoo('BEVERIDGE')
correlation = b.corr()
print(f'correlation:\n{correlation}\n')

# levels first, then changes
for spec in ('urate ~ vrate', 'curate ~ cvrate'):
    reg = smf.ols(formula=spec, data=b)
    results = reg.fit()
    print(f'results.summary(): \n{results.summary()}\n')

# (i) the correlation between urate and urate_1 equals 0.995754, which points more
# toward a unit root.

# (ii) similarly, the correlation between vrate and vrate_1 is 0.944582, so vrate also
# points more toward a unit root.

# (iii) in levels you do find a negative relationship, as vrate's coefficient is -3.74.

# (iv) we cannot trust the confidence intervals produced by OLS because both
# urate and vrate have unit roots.

# (v) in first differences the estimated slope coefficient is now -.03, and it is not
# statistically different from zero with a p-value of .797.

[[ 1. nan]
 [nan nan]]
correlation:
            month     urate     vrate         t   urate_1   vrate_1    curate  \
month    1.000000  0.717791 -0.449912  1.000000  0.715513 -0.451964 -0.034882   
urate    0.717791  1.000000 -0.843242  0.717789  0.995754 -0.855374  0.023361   
vrate   -0.449912 -0.843242  1.000000 -0.449903 -0.832384  0.944582 -0.104697   
t        1.000000  0.717789 -0.449903  1.000000  0.715516 -0.451968 -0.034939   
urate_1  0.715513  0.995754 -0.832384  0.715516  1.000000 -0.844823 -0.068767   
vrate_1 -0.451964 -0.855374  0.944582 -0.451968 -0.844823  1.000000 -0.095128   
curate  -0.034882  0.023361 -0.104697 -0.034939 -0.068767 -0.095128  1.000000   
cvrate   0.083505  0.088364  0.104747  0.083560  0.090240 -0.227529 -0.022378   

           cvrate  
month    0.083505  
urate    0.088364  
vrate    0.104747  
t        0.083560  
urate_1  0.090240  
vrate_1 -0.227529  
curate  -0.022378  
cvrate   1.000000  

[[ 1. nan]
 [nan nan]]
results.summary(): 
          

In [10]:
# ch12 q1
# in the presence of serial correlation, the variance of the OLS estimator is not just
# the usual formula based on sigma^2: it picks up additional terms that depend on rho.
# the regular standard errors fail to account for those terms, and rho is usually
# positive, so the regular standard error is usually smaller than the robust standard
# error. this is not the case 100% of the time, but it is typical.

In [11]:
# ch12 q3
# (i) because presidential elections only take place every four years,
# the number of periods is low and they are not nearly as closely connected
# as time series data points that are daily or monthly in frequency.

# (ii) the Durbin-Watson test statistic can be approximated by 2(1 - rho_hat),
# which in this case means that DW is approximately 2, which means that we fail
# to reject the null hypothesis and assume that there is no serial
# correlation.

# (iii) time series data sets are usually small, and the DW test still
# tends to work with very small rho values even in small samples.

In [44]:
# ch12 c10
# static phillips curve on the full PHILLIPS sample, an AR(1) test on its residuals,
# the through-1996 OLS fit for comparison, and an iterated feasible-GLS (AR(1)) fit.
p = woo.dataWoo('PHILLIPS')
inflation = p['inf']  # picked up by the formula from the calling namespace
reg = smf.ols(formula='inflation ~ unem', data=p)
results = reg.fit()
print(f'results.summary(): \n{results.summary()}\n')

# (i) the resulting regression equation is inflation = 1.05 + .50*unem + u.

# AR(1) test: regress the residuals on their own first lag (first row is NaN and dropped).
u_hat = results.resid
u_hatlag = u_hat.shift(1)
reg = smf.ols(formula='u_hat ~ u_hatlag', data=p)
results = reg.fit()
print(f'results.summary(): \n{results.summary()}\n')

# (ii) rho is 0.5725 and has a p-value of zero, so therefore is significant, meaning
# there is strong evidence of serial correlation.

# OLS on the years through 1996, plus the same AR(1) residual regression.
# the subset is selected with the `year` column, so no index relabelling is needed
# (the earlier `T = len(p - 1)` was just len(p) and the relabelled years were off by one).
yt96 = (p['year'] <= 1996)
reg_s = smf.ols(formula='Q("inf") ~ unem', data=p, subset=yt96)
results_s = reg_s.fit()
print(f'results_s.summary(): \n{results_s.summary()}\n')

p['resid_s'] = results_s.resid  # aligned on the index; rows after 1996 stay NaN
p['resid_s_lag1'] = p['resid_s'].shift(1)
reg = smf.ols(formula='resid_s ~ resid_s_lag1', data=p)
results = reg.fit()
print(f'results.summary(): \n{results.summary()}\n')

# iterated feasible GLS with AR(1) errors on the full sample. GLSAR quasi-differences
# the data and drops the first observation, i.e. a cochrane-orcutt style estimator.
reg_co = smf.glsar(formula='Q("inf") ~ unem', data=p, rho=1)
results_co = reg_co.iterative_fit(maxiter=100)
print(f'reg_co.rho: {reg_co.rho}\n')
print(f'results_co.summary(): \n{results_co.summary()}\n')

# (iii) NOTE(review): the earlier answer compared two plain OLS fits and called one of
# them prais-winsten. no prais-winsten estimate is computed in this cell (GLSAR above is
# the cochrane-orcutt variant); compare the unem coefficient from results_co with the
# through-1996 estimate before answering whether adding the later years matters.

# (iv) NOTE(review): the earlier "PW and CO are identical at 0.4676" came from running the
# same through-1996 OLS regression twice, so it says nothing about PW vs CO. use
# reg_co.rho and the results_co table for the CO side of the comparison.

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:              inflation   R-squared:                       0.062
Model:                            OLS   Adj. R-squared:                  0.045
Method:                 Least Squares   F-statistic:                     3.579
Date:                Fri, 01 Apr 2022   Prob (F-statistic):             0.0639
Time:                        14:48:37   Log-Likelihood:                -139.43
No. Observations:                  56   AIC:                             282.9
Df Residuals:                      54   BIC:                             286.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.0536      1.548

In [67]:
# ch12 c13
# okun's law regression with (i) an AR(1) test on its residuals, (ii) a regression of the
# squared residuals on cunem as a heteroskedasticity check, and (iii) the main equation
# re-fitted with heteroskedasticity-robust (HC1) standard errors.
ok = woo.dataWoo('OKUN')
reg_main = smf.ols(formula='pcrgdp ~ cunem', data=ok)
results_main = reg_main.fit()
print(f'results.summary(): \n{results_main.summary()}\n')

# residuals of the main equation and their first lag. the lag is built from this cell's
# own residuals; the earlier version shifted a `u_hat` left over from the PHILLIPS cell.
ok['u_hat'] = results_main.resid
ok['u_hatlag'] = ok['u_hat'].shift(1)
reg = smf.ols(formula='u_hat ~ u_hatlag', data=ok)
results = reg.fit()
print(f'results.summary(): \n{results.summary()}\n')

# (i) the resulting regression equation is pcrgdp = 3.34 - 1.89*cunem + u.
# NOTE(review): the earlier "fail to reject, no serial correlation" conclusion was read
# off a regression on the wrong lag series; confirm it against the table printed above.

# squared residuals of the MAIN equation (previously `results.resid**2` squared the
# residuals of the AR(1) auxiliary regression, because `results` had been reassigned).
ok['u_hat2'] = ok['u_hat']**2
reg = smf.ols(formula='u_hat2 ~ cunem', data=ok)
results = reg.fit()
print(f'results.summary(): \n{results.summary()}\n')

# (ii) reject homoskedasticity if cunem is statistically significant in the table above.
# NOTE(review): the previously quoted p-value of .02 was computed from the wrong
# residuals; re-read it from the new output.

# robust standard errors for the main equation (previously `reg.fit(cov_type='HC1')`
# refitted the squared-residual regression, because `reg` had been reassigned).
results = reg_main.fit(cov_type='HC1')
print(f'results.summary(): \n{results.summary()}\n')

# (iii) compare the HC1 standard error of cunem printed here with the usual OLS standard
# error in the first table. the coefficient itself (-1.89) is unchanged by the choice of
# covariance estimator; only the standard errors, t statistics and intervals differ.

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:                 pcrgdp   R-squared:                       0.710
Model:                            OLS   Adj. R-squared:                  0.704
Method:                 Least Squares   F-statistic:                     107.9
Date:                Fri, 01 Apr 2022   Prob (F-statistic):           2.04e-13
Time:                        15:29:24   Log-Likelihood:                -68.769
No. Observations:                  46   AIC:                             141.5
Df Residuals:                      44   BIC:                             145.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.3444      0.163

In [82]:
# ch13 c3
# incinerator study on KIELMC: log(price) on a 1981 dummy, log(dist) and their
# interaction, first alone and then with house characteristics as controls.

# (i) given that a house with a larger distance to the incinerator is considered nicer,
# the sign of delta1 is probably negative.
# NOTE(review): if being farther away is worth more once the incinerator is there, the
# coefficient on y81*log(dist) should be positive -- re-check the sign stated above.

k = woo.dataWoo('KIELMC')
# `y81:np.log(dist)` is the interaction term only; both main effects are listed explicitly.
reg = smf.ols(formula='np.log(price) ~ y81 + np.log(dist) + y81:np.log(dist)', data=k)
results = reg.fit()
print(f'results.summary(): \n{results.summary()}\n')

# (ii) log(price) = 8.0585 + -0.0113*y81 + 0.3167*log(dist) + 0.0482*y81*log(dist)
# both sides are in logs, so the coefficients are elasticities: in 1981 the price
# elasticity with respect to distance is 0.0482 higher than before, i.e. about
# 0.3167 + 0.0482 = 0.365, so a 1% increase in distance goes with roughly a 0.36%
# higher price in 1981 (not 4.82%).

# the quadratic in age must be wrapped in I(): inside a formula, a bare `age**2` is
# the formula language's term-power operator and collapses to just `age`, so the
# squared term was silently missing before.
reg = smf.ols(
    formula=(
        'np.log(price) ~ y81 + np.log(dist) + y81:np.log(dist)'
        ' + age + I(age**2) + rooms + baths'
        ' + np.log(intst) + np.log(land) + np.log(area)'
    ),
    data=k,
)
results = reg.fit()
print(f'results.summary(): \n{results.summary()}\n')

# (iii) unlike the regression before it, this demostrates that the coefficient on distance 
# to a house's incinerator in feet is not statistically different from zero, so therefore has
# no effect on the value of a house. 
# NOTE(review): this was written before age squared actually entered the model; confirm
# it against the table printed above.

# (iv) the coefficient on log(dist) could be positive and statistically significant in part (ii)
# but not part (iii) because the other variables included in the latter are more significant as 
# a group when it comes to affecting the price of homes than distance to an incinerator. 

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:          np.log(price)   R-squared:                       0.396
Model:                            OLS   Adj. R-squared:                  0.390
Method:                 Least Squares   F-statistic:                     69.22
Date:                Fri, 01 Apr 2022   Prob (F-statistic):           1.87e-34
Time:                        20:19:13   Log-Likelihood:                -109.24
No. Observations:                 321   AIC:                             226.5
Df Residuals:                     317   BIC:                             241.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           

In [87]:
# ch13 c14
# job-training data (JTRAIN3): earnings in 1978 on the training dummy, then the change in
# earnings since 1975 on the same dummy, reported with both the usual and the
# heteroskedasticity-robust (HC1) covariance.
j = woo.dataWoo('JTRAIN3')
reg = smf.ols(formula='re78 ~ train', data=j)
results = reg.fit()
print(f'results.summary(): \n{results.summary()}\n')

# (i) jtrain3's regression equation is re78 = 21.5539 + -15.2048*train + u. it appears
# the job training had a negative effect on real labor earnings in 1978 given beta 1's
# value of -15.204.

# change in real earnings between 1975 and 1978
j['cre'] = j['re78'].sub(j['re75'])
reg = smf.ols(formula='cre ~ train', data=j)

# first pass is the default (nonrobust) fit, second pass is HC1; each prints its summary
# followed by its 95% confidence intervals.
for cov_type in ('nonrobust', 'HC1'):
    results = reg.fit(cov_type=cov_type)
    print(f'results.summary(): \n{results.summary()}\n')
    CI95 = results.conf_int(0.05)
    print(f'CI95: \n{CI95}\n')

# (ii) now the estimate explains that receiving the job training increased the
# change in real labor earnings by roughly $2,491. this is wildly different from the
# estimate in part (i), which claimed the job training decreased earnings by $15,205.

# (iii) the 95% confidence interval for the training effect using the OLS standard error
# is (0.730648, 3.922363) and is (1.062932, 3.590079) for the heteroskedasticity robust
# standard error. my findings conclude that the job training had a positive effect on
# labor earnings by roughly $1,000-3,000.

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:                   re78   R-squared:                       0.061
Model:                            OLS   Adj. R-squared:                  0.061
Method:                 Least Squares   F-statistic:                     173.4
Date:                Fri, 01 Apr 2022   Prob (F-statistic):           2.03e-38
Time:                        20:51:45   Log-Likelihood:                -11066.
No. Observations:                2675   AIC:                         2.214e+04
Df Residuals:                    2673   BIC:                         2.215e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     21.5539      0.304

In [16]:
# ch14 c3
# fixed-effects regression of training hours per employee on year dummies, the grant
# dummy, its lag and log employment, using the firm/year panel in JTRAIN.
# `plm` (linearmodels) is imported once in the notebook's import cell at the top.
jtrain = woo.dataWoo("JTRAIN")
jtrain['entity'] = jtrain['fcode']  # column copy of the firm code; not used by the model below
jtrain = jtrain.set_index(['fcode', 'year'])  # (entity, time) MultiIndex for the panel model

# EntityEffects adds firm fixed effects; drop_absorbed=True drops any regressor that the
# fixed effects absorb instead of raising.
reg_fe = plm.PanelOLS.from_formula(
    formula = 'hrsemp ~ d88 + d89 + grant + grant_1 + lemploy + EntityEffects',
    data=jtrain, drop_absorbed=True)
results_fe = reg_fe.fit()
print(f'results_fe.summary: \n{results_fe.summary}\n')

# (i) there are  135 firms used in this regression. there are 470 observations 
# in the full data set, so if there was full information on each firm, ~156 firms 
# would be included.

# (ii) the coefficient on grant, 34.228, is statistically significant and implies
# that in the current year, receiving a job training grant increases the number
# of hours of job training per employee by 34.228 hours.

# (iii) it is not surprising that the grant lag coefficient is not significant,
# because presumably, firms would like to train their workers sooner
# rather than later, so they would be incentivized to use the whole
# grant in just one year. Using the grant now increases the number of
# hours spent in training now, but not in the future.

# (iv) there are small differences between firms of different sizes and their
# training hours per employee. in fact, because this coefficient is
# not statistically significant, the differences are equivalent to 0.
# however, if we did want to interperate the coefficient, we would
# say that for a 10% increase in the number of employees, job training hours per 
# employee were reduced by 1.7%.
# NOTE(review): hrsemp is in levels; assuming lemploy is log(employ), a 10% increase in
# employment changes hours per employee by about coef*0.10 HOURS, not percent -- confirm
# against the lemploy coefficient in the printed table.

results_fe.summary: 
                          PanelOLS Estimation Summary                           
Dep. Variable:                 hrsemp   R-squared:                        0.4909
Estimator:                   PanelOLS   R-squared (Between):              0.2473
No. Observations:                 390   R-squared (Within):               0.4909
Date:                Sun, Apr 03 2022   R-squared (Overall):              0.3195
Time:                        22:50:03   Log-likelihood                   -1503.7
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      48.206
Entities:                         135   P-value                           0.0000
Avg Obs:                       2.8889   Distribution:                   F(5,250)
Min Obs:                       1.0000                                           
Max Obs:                       3.0000   F-statistic (robust):             48.206
       

Inputs contain missing values. Dropping rows with missing observations.


In [17]:
# ch14 c10
# airfare panel: the same log-fare equation estimated by pooled OLS, random effects and
# fixed effects so the concen coefficient can be compared across estimators.
airfare = woo.dataWoo("AIRFARE")
# drop=False keeps `year` as a column too, so C(year) in the formulas can still see it.
airfare = airfare.set_index(['id', 'year'], drop=False)
print('Comparison of Pooled OLS, Random Effects & Fixed Effects')
print('Pooled OLS: Results')

reg_ols = plm.PooledOLS.from_formula(
    formula='lfare ~ concen + ldist + ldistsq  + C(year)', data=airfare)
results_ols = reg_ols.fit()
print(f'results_ols.summary: \n{results_ols.summary}\n')
print('Random Effects: Results')

reg_re = plm.RandomEffects.from_formula(
    formula='lfare ~ concen + ldist + ldistsq  + C(year)', data=airfare)
results_re = reg_re.fit()
print(f'results_re.summary: \n{results_re.summary}\n')
print('Fixed Effects: Results')

# PanelOLS only becomes a fixed-effects estimator when EntityEffects is in the formula;
# without it (as before) this was a third copy of pooled OLS. drop_absorbed=True lets
# the fit drop regressors that the route effects absorb (e.g. anything constant within
# a route) instead of raising.
reg_fe = plm.PanelOLS.from_formula(
    formula='lfare ~ concen + ldist + ldistsq + C(year) + EntityEffects',
    data=airfare, drop_absorbed=True)
results_fe = reg_fe.fit()
print(f'results_fe.summary: \n{results_fe.summary}\n')

# (i) if concentration increases by .10, fares will increase by 3.6%.

# (ii) the unobservable effects in this model make the standard errors unreliable.
# in this model, the CI is [.30, .42].

# (iii) the turning point is at log(dist) = (.9/.2) = 4.5, i.e. dist = exp(4.5) = 90, which
# is probably outside of the data range, as most flights are more than 200 miles long at
# minimum.

# (iv) the random effects model has B1 coefficient that is smaller than the
# pooled ols model. 0.36 to 0.20 is a fairly large decrease, where the effect
# is almost half of what we previously estimated it to be.

# (v) NOTE(review): the earlier answer ("the fixed effects estimate is also 0.36") came
# from a model without EntityEffects, i.e. pooled OLS run a second time. read the concen
# coefficient from the fixed-effects table printed above; if the RE estimator's theta
# (a ratio between variance measures) is close to 1, RE should be close to FE, not to
# pooled OLS.

# (vi) some unobservable effects caputred by a about routes could be the
# general desirability of the destination outside of concentration,
# e.g. superstar cities like Paris and NYC. if the route is over water,
# that could cause an increase in price as well. which airlines serve
# destinations would also affect price, such as locations only served
# by luxury airlines.

# (vii) i suspect that some of the unobserved effects i listed in vi are
# correlated with the regressors, so i think fixed effects is the
# best model here. whether concentration has an effect on fares should be judged from
# the fixed-effects concen coefficient and its standard error printed above.

Comparison of Pooled OLS, Random Effects & Fixed Effects
Pooled OLS: Results
results_ols.summary: 
                          PooledOLS Estimation Summary                          
Dep. Variable:                  lfare   R-squared:                        0.4062
Estimator:                  PooledOLS   R-squared (Between):              0.4215
No. Observations:                4596   R-squared (Within):               0.1246
Date:                Sun, Apr 03 2022   R-squared (Overall):              0.4062
Time:                        22:50:08   Log-likelihood                   -1512.3
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      523.18
Entities:                        1149   P-value                           0.0000
Avg Obs:                       4.0000   Distribution:                  F(6,4589)
Min Obs:                       4.0000                                           
Max Obs:  