## Econ 430
### UCLA, Master of Quantitative Economics
#### Dr. Randall R. Rojas
Note: For more details on the code, please see our textbook: *Using Python for Introductory Econometrics* by F. Heiss & D. Brunner.

# 1) Pooled Cross-Sections
## Example: Changes to the Return to Education and the  Gender Wage Gap

In [1]:
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf

# Load the pooled CPS cross-sections shipped with the wooldridge package.
cps78_85 = woo.dataWoo('cps78_85')

# Pooled OLS for log wages. `y85*(educ+female)` contributes the main effects
# plus their interactions with the y85 dummy (rows `y85:educ` and
# `y85:female` in the printed table).
wage_formula = ('lwage ~ y85*(educ+female) + exper +'
                'I((exper**2)/100) + union')
reg = smf.ols(formula=wage_formula, data=cps78_85)
results = reg.fit()

# Coefficient table: estimate, standard error, t statistic and p-value,
# all rounded to four decimals in one step.
table = pd.DataFrame({'b': results.params,
                      'se': results.bse,
                      't': results.tvalues,
                      'pval': results.pvalues}).round(4)
print(f'table: \n{table}\n')


table: 
                            b      se        t    pval
Intercept              0.4589  0.0934   4.9111  0.0000
y85                    0.1178  0.1238   0.9517  0.3415
educ                   0.0747  0.0067  11.1917  0.0000
female                -0.3167  0.0366  -8.6482  0.0000
y85:educ               0.0185  0.0094   1.9735  0.0487
y85:female             0.0851  0.0513   1.6576  0.0977
exper                  0.0296  0.0036   8.2932  0.0000
I((exper ** 2) / 100) -0.0399  0.0078  -5.1513  0.0000
union                  0.2021  0.0303   6.6722  0.0000



# 2) Difference-in-Differences
## Example: Effect of Garbage Incinerator's Location on Housing Prices

In [2]:
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf

kielmc = woo.dataWoo('kielmc')

# Boolean row masks picking out each of the two sample years.
y78 = kielmc['year'].eq(1978)
y81 = kielmc['year'].eq(1981)

# Year-by-year regressions of the real house price on the `nearinc` dummy.
reg78 = smf.ols(formula='rprice ~ nearinc', data=kielmc, subset=y78)
reg81 = smf.ols(formula='rprice ~ nearinc', data=kielmc, subset=y81)
results78 = reg78.fit()
results81 = reg81.fit()

# Joint regression on the full sample: `nearinc * C(year)` adds both main
# effects and their interaction, whose coefficient is the
# difference-in-differences estimate.
reg_joint = smf.ols(formula='rprice ~ nearinc * C(year)', data=kielmc)
results_joint = reg_joint.fit()

# Coefficient tables (estimate, standard error, t statistic, p-value),
# each rounded to four decimals before printing.
table_78 = pd.DataFrame({'b': results78.params,
                         'se': results78.bse,
                         't': results78.tvalues,
                         'pval': results78.pvalues}).round(4)
print(f'table_78: \n{table_78}\n')

table_81 = pd.DataFrame({'b': results81.params,
                         'se': results81.bse,
                         't': results81.tvalues,
                         'pval': results81.pvalues}).round(4)
print(f'table_81: \n{table_81}\n')

table_joint = pd.DataFrame({'b': results_joint.params,
                            'se': results_joint.bse,
                            't': results_joint.tvalues,
                            'pval': results_joint.pvalues}).round(4)
print(f'table_joint: \n{table_joint}\n')


table_78: 
                    b        se        t    pval
Intercept  82517.2276  2653.790  31.0941  0.0000
nearinc   -18824.3705  4744.594  -3.9675  0.0001

table_81: 
                     b         se        t  pval
Intercept  101307.5136  3093.0267  32.7535   0.0
nearinc    -30688.2738  5827.7088  -5.2659   0.0

table_joint: 
                                  b         se        t    pval
Intercept                82517.2276  2726.9101  30.2603  0.0000
C(year)[T.1981]          18790.2860  4050.0650   4.6395  0.0000
nearinc                 -18824.3705  4875.3221  -3.8612  0.0001
nearinc:C(year)[T.1981] -11863.9033  7456.6462  -1.5911  0.1126



## Example: Effect of Garbage Incinerator's Location on Housing Prices
Improved model: Using $\log(y)$ and additional regressors.  
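Result: The estimated price decrease for houses near the incinerator (the coefficient on the `nearinc` × 1981 interaction) is now around 13.2%.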

In [3]:
import wooldridge as woo
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

kielmc = woo.dataWoo('kielmc')

# Difference-in-differences (DiD) on the log of the real house price; the
# `nearinc:C(year)[T.1981]` coefficient is the DiD estimate.
did_formula = 'np.log(rprice) ~ nearinc*C(year)'
reg_did = smf.ols(formula=did_formula, data=kielmc)
results_did = reg_did.fit()

# Coefficient table for the plain DiD model, rounded to four decimals.
table_did = pd.DataFrame({'b': results_did.params,
                          'se': results_did.bse,
                          't': results_did.tvalues,
                          'pval': results_did.pvalues}).round(4)
print(f'table_did: \n{table_did}\n')

# Same DiD specification with additional house characteristics as controls.
didC_formula = ('np.log(rprice) ~ nearinc*C(year) + age +'
                'I(age**2) + np.log(intst) + np.log(land) +'
                'np.log(area) + rooms + baths')
reg_didC = smf.ols(formula=didC_formula, data=kielmc)
results_didC = reg_didC.fit()

# Coefficient table for the DiD model with controls.
table_didC = pd.DataFrame({'b': results_didC.params,
                           'se': results_didC.bse,
                           't': results_didC.tvalues,
                           'pval': results_didC.pvalues}).round(4)
print(f'table_didC: \n{table_didC}\n')
# Note: with controls, the interaction coefficient implies a price decrease
# of around 13.2%.

table_did: 
                               b      se         t    pval
Intercept                11.2854  0.0305  369.8386  0.0000
C(year)[T.1981]           0.1931  0.0453    4.2606  0.0000
nearinc                  -0.3399  0.0546   -6.2308  0.0000
nearinc:C(year)[T.1981]  -0.0626  0.0834   -0.7508  0.4533

table_didC: 
                              b      se        t    pval
Intercept                7.6517  0.4159  18.3986  0.0000
C(year)[T.1981]          0.1621  0.0285   5.6868  0.0000
nearinc                  0.0322  0.0475   0.6789  0.4977
nearinc:C(year)[T.1981] -0.1315  0.0520  -2.5305  0.0119
age                     -0.0084  0.0014  -5.9236  0.0000
I(age ** 2)              0.0000  0.0000   4.3415  0.0000
np.log(intst)           -0.0614  0.0315  -1.9500  0.0521
np.log(land)             0.0998  0.0245   4.0766  0.0001
np.log(area)             0.3508  0.0515   6.8129  0.0000
rooms                    0.0473  0.0173   2.7317  0.0067
baths                    0.0943  0.0277   3.4003  0.

# 3) First Difference Estimator
## Example: County Crime Rates in North Carolina

In [4]:
import wooldridge as woo
import numpy as np
import linearmodels as plm

# Load the county crime panel and index it by (county, year); drop=False
# keeps both columns available as regular variables as well (`year` is used
# as a regressor below).
crime4 = woo.dataWoo('crime4').set_index(['county', 'year'], drop=False)

# First-difference (FD) model of the log crime rate on `year`, the year
# dummies d83-d87 and the l-prefixed regressors.
fd_formula = ('np.log(crmrte) ~ year + d83 + d84 + d85 + d86 + d87 +'
              'lprbarr + lprbconv + lprbpris + lavgsen + lpolpc')
reg = plm.FirstDifferenceOLS.from_formula(formula=fd_formula, data=crime4)
results = reg.fit()

# Print the full linearmodels estimation summary.
print(f'results: \n{results}\n')


results: 
                     FirstDifferenceOLS Estimation Summary                      
Dep. Variable:         np.log(crmrte)   R-squared:                        0.4326
Estimator:         FirstDifferenceOLS   R-squared (Between):              0.6003
No. Observations:                 540   R-squared (Within):               0.4281
Date:                Sun, Nov 07 2021   R-squared (Overall):              0.6000
Time:                        16:22:21   Log-likelihood                    248.48
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      36.661
Entities:                          90   P-value                           0.0000
Avg Obs:                       7.0000   Distribution:                  F(11,529)
Min Obs:                       7.0000                                           
Max Obs:                       7.0000   F-statistic (robust):             36.661
                  

# 4) Fixed Effects Estimation
## Example: Has the Return to Education Changed over Time?

In [4]:
import wooldridge as woo
import pandas as pd
import linearmodels as plm

# Load the wage panel and index it by (person id, year); drop=False keeps
# `nr` and `year` as ordinary columns so `C(year)` can be used in the formula.
wagepan = woo.dataWoo('wagepan').set_index(['nr', 'year'], drop=False)

# Fixed-effects (FE) model: `EntityEffects` adds the person-specific effects
# and `C(year)*educ` interacts education with the year dummies.
# drop_absorbed=True lets the estimator drop regressors absorbed by the
# entity effects instead of failing (the output reports `educ` as dropped).
fe_formula = 'lwage ~ married + union + C(year)*educ + EntityEffects'
reg = plm.PanelOLS.from_formula(formula=fe_formula, data=wagepan,
                                drop_absorbed=True)
results = reg.fit()

# Coefficient table: estimate, standard error, t statistic and p-value,
# rounded to four decimals.
table = pd.DataFrame({'b': results.params,
                      'se': results.std_errors,
                      't': results.tstats,
                      'pval': results.pvalues}).round(4)
print(f'table: \n{table}\n')

table: 
                           b      se        t    pval
C(year)[1980]         1.3625  0.0162  83.9031  0.0000
C(year)[1981]         1.3400  0.1452   9.2307  0.0000
C(year)[1982]         1.3567  0.1451   9.3481  0.0000
C(year)[1983]         1.3729  0.1452   9.4561  0.0000
C(year)[1984]         1.4468  0.1452   9.9617  0.0000
C(year)[1985]         1.4122  0.1451   9.7315  0.0000
C(year)[1986]         1.4281  0.1451   9.8404  0.0000
C(year)[1987]         1.4529  0.1452  10.0061  0.0000
married               0.0548  0.0184   2.9773  0.0029
union                 0.0830  0.0194   4.2671  0.0000
C(year)[T.1981]:educ  0.0116  0.0123   0.9448  0.3448
C(year)[T.1982]:educ  0.0148  0.0123   1.2061  0.2279
C(year)[T.1983]:educ  0.0171  0.0123   1.3959  0.1628
C(year)[T.1984]:educ  0.0166  0.0123   1.3521  0.1764
C(year)[T.1985]:educ  0.0237  0.0123   1.9316  0.0535
C(year)[T.1986]:educ  0.0274  0.0123   2.2334  0.0256
C(year)[T.1987]:educ  0.0304  0.0123   2.4798  0.0132



Variables have been fully absorbed and have removed from the regression:

educ



#  5) Panel Data Inspection
## Example: Wage Equation

In [6]:
import wooldridge as woo

wagepan = woo.dataWoo('wagepan')

# Panel dimensions: N = number of rows, T = number of distinct years,
# n = number of distinct individuals (`nr`).
N = len(wagepan)
T = len(wagepan['year'].unique())
n = len(wagepan['nr'].unique())
print(f'N: {N}\n')
print(f'T: {T}\n')
print(f'n: {n}\n')

# Look for variables that do not vary along one panel dimension.

# (I) Within individuals over time: compute each variable's variance per
# individual and flag the zero ones.
isv_nr = wagepan.groupby('nr').var().eq(0)  # True where the variance is zero
# Keep the variables whose variance is zero for every individual:
noVar_nr = isv_nr.all()  # column-wise: True only if all groups are True
print(f'isv_nr.columns[noVar_nr]: \n{isv_nr.columns[noVar_nr]}\n')

# (II) Across individuals within each year: same check, grouped by year.
isv_t = wagepan.groupby('year').var().eq(0)
noVar_t = isv_t.all()
print(f'isv_t.columns[noVar_t]: \n{isv_t.columns[noVar_t]}\n')


N: 4360

T: 8

n: 545

isv_nr.columns[noVar_nr]: 
Index(['black', 'hisp', 'educ'], dtype='object')

isv_t.columns[noVar_t]: 
Index(['d81', 'd82', 'd83', 'd84', 'd85', 'd86', 'd87'], dtype='object')



# 6) Pooled, Fixed and Random Effects Comparison
## Example: Wage Equation

In [7]:
import wooldridge as woo
import pandas as pd
import linearmodels as plm

# Load the wage panel and index it by (person id, year), keeping both as
# regular columns so `C(year)` remains usable in the formulas.
wagepan = woo.dataWoo('wagepan').set_index(['nr', 'year'], drop=False)

# Pooled OLS and random effects (RE) share the same specification.
full_formula = ('lwage ~ educ + black + hisp + exper + I(exper**2) +'
                'married + union + C(year)')
# The fixed-effects (FE) specification adds EntityEffects and omits
# educ, black, hisp and exper.
fe_formula = ('lwage ~ I(exper**2) + married + union +'
              'C(year) + EntityEffects')

reg_ols = plm.PooledOLS.from_formula(formula=full_formula, data=wagepan)
results_ols = reg_ols.fit()

reg_re = plm.RandomEffects.from_formula(formula=full_formula, data=wagepan)
results_re = reg_re.fit()

reg_fe = plm.PanelOLS.from_formula(formula=fe_formula, data=wagepan)
results_fe = reg_fe.fit()

# Report the first entry of the RE results' theta table.
theta_hat = results_re.theta.iloc[0, 0]
print(f'theta_hat: {theta_hat}\n')

# Coefficient tables (estimate, standard error, t statistic, p-value) for
# the three estimators, each rounded to four decimals.
table_ols = pd.DataFrame({'b': results_ols.params,
                          'se': results_ols.std_errors,
                          't': results_ols.tstats,
                          'pval': results_ols.pvalues}).round(4)
print(f'table_ols: \n{table_ols}\n')

table_re = pd.DataFrame({'b': results_re.params,
                         'se': results_re.std_errors,
                         't': results_re.tstats,
                         'pval': results_re.pvalues}).round(4)
print(f'table_re: \n{table_re}\n')

table_fe = pd.DataFrame({'b': results_fe.params,
                         'se': results_fe.std_errors,
                         't': results_fe.tstats,
                         'pval': results_fe.pvalues}).round(4)
print(f'table_fe: \n{table_fe}\n')


theta_hat: 0.6450593029243452

table_ols: 
                    b      se        t    pval
C(year)[1980]  0.0921  0.0783   1.1761  0.2396
C(year)[1981]  0.1504  0.0838   1.7935  0.0730
C(year)[1982]  0.1548  0.0893   1.7335  0.0831
C(year)[1983]  0.1541  0.0944   1.6323  0.1027
C(year)[1984]  0.1825  0.0990   1.8437  0.0653
C(year)[1985]  0.2013  0.1031   1.9523  0.0510
C(year)[1986]  0.2340  0.1068   2.1920  0.0284
C(year)[1987]  0.2659  0.1100   2.4166  0.0157
educ           0.0913  0.0052  17.4419  0.0000
black         -0.1392  0.0236  -5.9049  0.0000
hisp           0.0160  0.0208   0.7703  0.4412
exper          0.0672  0.0137   4.9095  0.0000
I(exper ** 2) -0.0024  0.0008  -2.9413  0.0033
married        0.1083  0.0157   6.8997  0.0000
union          0.1825  0.0172  10.6349  0.0000

table_re: 
                    b      se       t    pval
C(year)[1980]  0.0234  0.1514  0.1546  0.8771
C(year)[1981]  0.0638  0.1601  0.3988  0.6901
C(year)[1982]  0.0543  0.1690  0.3211  0.7481
C(year)[1

# 7) Correlated Random Effects
## Example: Wage Equation

In [8]:
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf
import linearmodels as plm

wagepan = woo.dataWoo('wagepan')
# Keep copies of the panel identifiers under separate names so they remain
# usable as formula variables (`C(t)`, `C(entity)`) once `nr` and `year`
# have been moved into the index.
wagepan['t'] = wagepan['year']
wagepan['entity'] = wagepan['nr']
wagepan = wagepan.set_index(['nr'])

# include group specific means:
# Select the two columns *before* aggregating, so the per-person means are
# computed once and only for the variables that are needed (instead of
# averaging every column of the frame twice). The assignment aligns on the
# `nr` index, which broadcasts each person's mean to all of that person's rows.
entity_means = wagepan.groupby('nr')[['married', 'union']].mean()
wagepan['married_b'] = entity_means['married']
wagepan['union_b'] = entity_means['union']
wagepan = wagepan.set_index(['year'], append=True)

# estimate FE parameters in 3 different ways:
# (1) within estimator; drop_absorbed=True lets the estimator drop regressors
#     absorbed by the entity effects (the output reports `educ` as dropped).
reg_we = plm.PanelOLS.from_formula(
    formula='lwage ~ married + union + C(t)*educ + EntityEffects',
    drop_absorbed=True, data=wagepan)
results_we = reg_we.fit()

# (2) OLS with one dummy variable per individual.
reg_dum = smf.ols(
    formula='lwage ~ married + union + C(t)*educ + C(entity)',
    data=wagepan)
results_dum = reg_dum.fit()

# (3) estimate CRE: random effects augmented with the person-level means.
reg_cre = plm.RandomEffects.from_formula(
    formula='lwage ~ married + union + C(t)*educ + married_b + union_b',
    data=wagepan)
results_cre = reg_cre.fit()

# compare to RE estimates (same model without the person-level means):
reg_re = plm.RandomEffects.from_formula(
    formula='lwage ~ married + union + C(t)*educ',
    data=wagepan)
results_re = reg_re.fit()

# Coefficients shown side by side in the comparison table.
var_selection = ['married', 'union', 'C(t)[T.1982]:educ']

# print results:
table = pd.DataFrame({'b_we': round(results_we.params[var_selection], 4),
                      'b_dum': round(results_dum.params[var_selection], 4),
                      'b_cre': round(results_cre.params[var_selection], 4),
                      'b_re': round(results_re.params[var_selection], 4)})
print(f'table: \n{table}\n')

# CRE Test:

# RE test as a Wald test on the CRE-specific coefficients
# (H0: the coefficients on both person-level means are zero):
wtest = results_cre.wald_test(formula='married_b = union_b = 0')
print(f'wtest: \n{wtest}\n')

Variables have been fully absorbed and have removed from the regression:

educ



table: 
                     b_we   b_dum   b_cre    b_re
married            0.0548  0.0548  0.0548  0.0773
union              0.0830  0.0830  0.0830  0.1075
C(t)[T.1982]:educ  0.0148  0.0148  0.0148  0.0143

wtest: 
Linear Equality Hypothesis Test
H0: Linear equality constraint is valid
Statistic: 19.4058
P-value: 0.0001
Distributed: chi2(2)



In [9]:
# 8) Robust (Clustered) Standard Errors
## Example: County Crime Rates in North Carolina

In [10]:
import wooldridge as woo
import numpy as np
import pandas as pd
import linearmodels as plm

# Load the county crime panel and index it by (county, year), keeping both
# as regular columns (`year` is also a regressor).
crime4 = woo.dataWoo('crime4').set_index(['county', 'year'], drop=False)

# First-difference (FD) model; the same model object is fitted three times
# below, changing only the covariance estimator.
fd_formula = ('np.log(crmrte) ~ year + d83 + d84 + d85 + d86 + d87 +'
              'lprbarr + lprbconv + lprbpris + lavgsen + lpolpc')
reg = plm.FirstDifferenceOLS.from_formula(formula=fd_formula, data=crime4)

# (1) default standard errors:
results_default = reg.fit()

# (2) standard errors clustered by entity, without the small-sample
#     (debiasing) correction:
results_cluster = reg.fit(cov_type='clustered', cluster_entity=True,
                          debiased=False)

# (3) standard errors clustered by entity with the small-sample correction;
#     this relies on fit()'s default for `debiased`, which produces the
#     slightly larger `se_css` values in the printed table.
results_css = reg.fit(cov_type='clustered', cluster_entity=True)

# Point estimates next to the three sets of standard errors, rounded to
# four decimals.
table = pd.DataFrame({'b': results_default.params,
                      'se_default': results_default.std_errors,
                      'se_cluster': results_cluster.std_errors,
                      'se_css': results_css.std_errors}).round(4)
print(f'table: \n{table}\n')

table: 
               b  se_default  se_cluster  se_css
year      0.0077      0.0171      0.0136  0.0137
d83      -0.0999      0.0239      0.0219  0.0222
d84      -0.1478      0.0413      0.0356  0.0359
d85      -0.1524      0.0584      0.0505  0.0511
d86      -0.1249      0.0760      0.0624  0.0630
d87      -0.0841      0.0940      0.0773  0.0781
lprbarr  -0.3275      0.0300      0.0556  0.0562
lprbconv -0.2381      0.0182      0.0390  0.0394
lprbpris -0.1650      0.0260      0.0451  0.0456
lavgsen  -0.0218      0.0221      0.0254  0.0257
lpolpc    0.3984      0.0269      0.1014  0.1025



# 9) Hausman Test
## Example: Wage Equation

In [11]:
import wooldridge as woo
import numpy as np
import linearmodels as plm
import scipy.stats as stats

wagepan = woo.dataWoo('wagepan')
wagepan = wagepan.set_index(['nr', 'year'], drop=False)

# estimation of FE and RE:
reg_fe = plm.PanelOLS.from_formula(formula='lwage ~ I(exper**2) + married +'
                                           'union + C(year) + EntityEffects',
                                   data=wagepan)
results_fe = reg_fe.fit()
b_fe = results_fe.params
b_fe_cov = results_fe.cov

reg_re = plm.RandomEffects.from_formula(
    formula='lwage ~ educ + black + hisp + exper + I(exper**2)'
            '+ married + union + C(year)', data=wagepan)
results_re = reg_re.fit()
b_re = results_re.params
b_re_cov = results_re.cov

# Hausman test of FE vs. RE
# (I) find overlapping coefficients:
# Use an ordered list rather than a set: pandas >= 2.0 raises a TypeError
# when a set is passed as an indexer, and a list also fixes the coefficient
# order (here: the order of the FE parameters).
common_coef = [name for name in b_fe.index if name in b_re.index]

# (II) calculate differences between FE and RE (coefficients and covariances):
b_diff = np.array(b_fe[common_coef] - b_re[common_coef])
df = len(b_diff)  # degrees of freedom = number of compared coefficients
b_cov_diff = np.array(b_fe_cov.loc[common_coef, common_coef] -
                      b_re_cov.loc[common_coef, common_coef])

# (III) calculate test statistic:
# Quadratic form b_diff' * inv(b_cov_diff) * b_diff; b_diff is 1-D, so no
# explicit transpose or reshape is needed. abs() is kept from the original
# so the statistic is reported as a non-negative number.
stat = abs(b_diff @ np.linalg.inv(b_cov_diff) @ b_diff)
# The survival function gives the upper-tail probability directly and avoids
# the precision loss of 1 - cdf for very small p-values.
pval = stats.chi2.sf(stat, df)

print(f'stat: {stat}\n')
print(f'pval: {pval}\n')

stat: 43.42707117638294

pval: 9.1506138486519e-06

