# Panel Data Models
---

In [1]:
import pandas as pd
import numpy as np

import statsmodels.api as sm

from linearmodels import PooledOLS
from linearmodels import RandomEffects
from linearmodels import PanelOLS
from linearmodels import FirstDifferenceOLS

from stargazer.stargazer import Stargazer

In [2]:
# Import crime data from North Carolina between 1981 to 1987
data = pd.read_csv("data/crime.csv", index_col=0)
data

Unnamed: 0,county,year,crmrte,prbarr,prbconv,prbpris,avgsen,polpc,density,taxpc,...,lwfir,lwser,lwmfg,lwfed,lwsta,lwloc,lpctymle,lpctmin,ltaxpc,lmix
1,1,81,0.039885,0.289696,0.402062,0.472222,5.61,0.001787,2.307159,25.69763,...,5.607452,5.374044,5.434246,6.014619,5.464848,5.444450,-2.433870,3.006608,3.246399,-2.303407
2,1,82,0.038345,0.338111,0.433005,0.506993,5.59,0.001767,2.330254,24.87425,...,5.706707,5.444911,5.482013,6.039540,5.536862,5.467174,-2.449038,3.006608,3.213833,-2.272549
3,1,83,0.030305,0.330449,0.525703,0.479705,5.80,0.001836,2.341801,26.45144,...,5.736475,5.481292,5.597310,6.084157,5.522900,5.515765,-2.464036,3.006608,3.275311,-2.517281
4,1,84,0.034726,0.362525,0.604706,0.520104,6.89,0.001886,2.346420,26.84235,...,5.858180,5.531204,5.640985,6.129421,5.568077,5.577387,-2.478925,3.006608,3.289981,-2.544612
5,1,85,0.036573,0.325395,0.578723,0.497059,6.55,0.001924,2.364896,28.14034,...,5.948220,5.564850,5.700042,6.195282,5.639919,5.664972,-2.497306,3.006608,3.337204,-2.372487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
626,197,83,0.015575,0.226667,0.480392,0.428571,7.77,0.001073,0.869048,18.90585,...,5.540736,5.280478,5.545919,5.865476,5.844326,5.463408,-2.538060,1.697597,2.939471,-2.172773
627,197,84,0.013662,0.204188,1.410260,0.372727,10.11,0.001109,0.872024,22.70475,...,5.569252,5.261022,5.593186,5.846150,5.869890,5.508943,-2.548068,1.697597,3.122574,-2.145931
628,197,85,0.013086,0.180556,0.830769,0.333333,5.96,0.001054,0.875000,24.12361,...,5.604843,5.333961,5.631821,5.880086,5.871498,5.616807,-2.561072,1.697597,3.183191,-2.512306
629,197,86,0.012874,0.112676,2.250000,0.244444,7.68,0.001088,0.880952,24.98198,...,5.743947,5.371892,5.723879,5.931024,5.873919,5.685245,-2.580968,1.697597,3.218155,-2.580217


In [3]:
# Best practice: check dtypes
# There are two columns with object dtypes that should be further investigated
# BUT since we will not be using those columns in this notebook, 
# we can safely ignore them 

data.dtypes

county        int64
year          int64
crmrte      float64
prbarr      float64
prbconv     float64
prbpris     float64
avgsen      float64
polpc       float64
density     float64
taxpc       float64
region       object
smsa         object
pctmin      float64
wcon        float64
wtuc        float64
wtrd        float64
wfir        float64
wser        float64
wmfg        float64
wfed        float64
wsta        float64
wloc        float64
mix         float64
pctymle     float64
lcrmrte     float64
lprbarr     float64
lprbconv    float64
lprbpris    float64
lavgsen     float64
lpolpc      float64
ldensity    float64
lwcon       float64
lwtuc       float64
lwtrd       float64
lwfir       float64
lwser       float64
lwmfg       float64
lwfed       float64
lwsta       float64
lwloc       float64
lpctymle    float64
lpctmin     float64
ltaxpc      float64
lmix        float64
dtype: object

In [4]:
data.columns

Index(['county', 'year', 'crmrte', 'prbarr', 'prbconv', 'prbpris', 'avgsen',
       'polpc', 'density', 'taxpc', 'region', 'smsa', 'pctmin', 'wcon', 'wtuc',
       'wtrd', 'wfir', 'wser', 'wmfg', 'wfed', 'wsta', 'wloc', 'mix',
       'pctymle', 'lcrmrte', 'lprbarr', 'lprbconv', 'lprbpris', 'lavgsen',
       'lpolpc', 'ldensity', 'lwcon', 'lwtuc', 'lwtrd', 'lwfir', 'lwser',
       'lwmfg', 'lwfed', 'lwsta', 'lwloc', 'lpctymle', 'lpctmin', 'ltaxpc',
       'lmix'],
      dtype='object')

# 1. Random Effects, Fixed Effects, and First Differences
---

## Comments:
- In notebook 1_introduction, I wrote the code for both non-formula and formula APIs to running regressions in statsmodels.
- However, for simplicity and comparability with R code, I will only use the formula API from this notebook onwards.
- The main library from this notebook onwards is linearmodels.
- Linearmodels is a library that extends statsmodels with econometric models.

In [5]:
# Important! You must set a multi-index (entity, time) for panel data methods in linearmodels to work

data = data.set_index(['county','year'])
data

Unnamed: 0_level_0,Unnamed: 1_level_0,crmrte,prbarr,prbconv,prbpris,avgsen,polpc,density,taxpc,region,smsa,...,lwfir,lwser,lwmfg,lwfed,lwsta,lwloc,lpctymle,lpctmin,ltaxpc,lmix
county,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,81,0.039885,0.289696,0.402062,0.472222,5.61,0.001787,2.307159,25.69763,central,no,...,5.607452,5.374044,5.434246,6.014619,5.464848,5.444450,-2.433870,3.006608,3.246399,-2.303407
1,82,0.038345,0.338111,0.433005,0.506993,5.59,0.001767,2.330254,24.87425,central,no,...,5.706707,5.444911,5.482013,6.039540,5.536862,5.467174,-2.449038,3.006608,3.213833,-2.272549
1,83,0.030305,0.330449,0.525703,0.479705,5.80,0.001836,2.341801,26.45144,central,no,...,5.736475,5.481292,5.597310,6.084157,5.522900,5.515765,-2.464036,3.006608,3.275311,-2.517281
1,84,0.034726,0.362525,0.604706,0.520104,6.89,0.001886,2.346420,26.84235,central,no,...,5.858180,5.531204,5.640985,6.129421,5.568077,5.577387,-2.478925,3.006608,3.289981,-2.544612
1,85,0.036573,0.325395,0.578723,0.497059,6.55,0.001924,2.364896,28.14034,central,no,...,5.948220,5.564850,5.700042,6.195282,5.639919,5.664972,-2.497306,3.006608,3.337204,-2.372487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,83,0.015575,0.226667,0.480392,0.428571,7.77,0.001073,0.869048,18.90585,west,no,...,5.540736,5.280478,5.545919,5.865476,5.844326,5.463408,-2.538060,1.697597,2.939471,-2.172773
197,84,0.013662,0.204188,1.410260,0.372727,10.11,0.001109,0.872024,22.70475,west,no,...,5.569252,5.261022,5.593186,5.846150,5.869890,5.508943,-2.548068,1.697597,3.122574,-2.145931
197,85,0.013086,0.180556,0.830769,0.333333,5.96,0.001054,0.875000,24.12361,west,no,...,5.604843,5.333961,5.631821,5.880086,5.871498,5.616807,-2.561072,1.697597,3.183191,-2.512306
197,86,0.012874,0.112676,2.250000,0.244444,7.68,0.001088,0.880952,24.98198,west,no,...,5.743947,5.371892,5.723879,5.931024,5.873919,5.685245,-2.580968,1.697597,3.218155,-2.580217


## 1.1. Pooled OLS

In [6]:
# Naive OLS assuming i.i.d. data

mod_OLS = PooledOLS.from_formula("crmrte ~ 1 + density + taxpc + wcon + pctmin", data=data)
res_OLS = mod_OLS.fit()
res_OLS.summary

0,1,2,3
Dep. Variable:,crmrte,R-squared:,0.5369
Estimator:,PooledOLS,R-squared (Between):,0.6178
No. Observations:,630,R-squared (Within):,-0.0076
Date:,"Thu, Mar 11 2021",R-squared (Overall):,0.5369
Time:,10:57:52,Log-likelihood,1875.8
Cov. Estimator:,Unadjusted,,
,,F-statistic:,181.12
Entities:,90,P-value,0.0000
Avg Obs:,7.0000,Distribution:,"F(4,625)"
Min Obs:,7.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Intercept,0.0102,0.0018,5.6226,0.0000,0.0066,0.0137
density,0.0088,0.0004,24.807,0.0000,0.0081,0.0095
taxpc,0.0001,4.433e-05,2.8486,0.0045,3.922e-05,0.0002
wcon,-1.866e-06,4.12e-06,-0.4528,0.6508,-9.957e-06,6.225e-06
pctmin,0.0002,2.95e-05,7.8095,0.0000,0.0002,0.0003


## 1.2. Random Effects

In [7]:
mod_randeff = RandomEffects.from_formula("crmrte ~ 1 + density + taxpc + wcon + pctmin", data=data)
res_randeff = mod_randeff.fit()
res_randeff.summary

0,1,2,3
Dep. Variable:,crmrte,R-squared:,0.1804
Estimator:,RandomEffects,R-squared (Between):,0.6137
No. Observations:,630,R-squared (Within):,0.0015
Date:,"Thu, Mar 11 2021",R-squared (Overall):,0.5345
Time:,10:57:52,Log-likelihood,2231.3
Cov. Estimator:,Unadjusted,,
,,F-statistic:,34.396
Entities:,90,P-value,0.0000
Avg Obs:,7.0000,Distribution:,"F(4,625)"
Min Obs:,7.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Intercept,0.0126,0.0026,4.8509,0.0000,0.0075,0.0177
density,0.0086,0.0008,10.972,0.0000,0.0071,0.0102
taxpc,5.558e-05,3.776e-05,1.4719,0.1415,-1.857e-05,0.0001
wcon,-2.495e-06,2.602e-06,-0.9591,0.3379,-7.604e-06,2.614e-06
pctmin,0.0002,6.726e-05,3.4515,0.0006,0.0001,0.0004


## 1.3. Fixed Effects

In [8]:
# Entity fixed effects

entity_fe_formula = "crmrte ~ density + taxpc + wcon + pctmin + EntityEffects"
mod_entity_fe = PanelOLS.from_formula(entity_fe_formula, data=data, drop_absorbed=True)
res_entity_fe = mod_entity_fe.fit()
res_entity_fe.summary

Variables have been fully absorbed and have removed from the regression:

pctmin



0,1,2,3
Dep. Variable:,crmrte,R-squared:,0.0086
Estimator:,PanelOLS,R-squared (Between):,-0.0345
No. Observations:,630,R-squared (Within):,0.0086
Date:,"Thu, Mar 11 2021",R-squared (Overall):,-0.0331
Time:,10:57:52,Log-likelihood,2280.2
Cov. Estimator:,Unadjusted,,
,,F-statistic:,1.5605
Entities:,90,P-value,0.1980
Avg Obs:,7.0000,Distribution:,"F(3,537)"
Min Obs:,7.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
density,-0.0015,0.0052,-0.2836,0.7768,-0.0118,0.0088
taxpc,8.822e-05,4.522e-05,1.9510,0.0516,-6.034e-07,0.0002
wcon,-2.06e-06,2.627e-06,-0.7841,0.4334,-7.22e-06,3.101e-06


In [9]:
# Entity and time period fixed effects

twoways_fe_formula = "crmrte ~ density + taxpc + wcon + pctmin + TimeEffects + EntityEffects"
mod_twoways_fe = PanelOLS.from_formula(twoways_fe_formula, data=data, drop_absorbed=True)
res_twoways_fe = mod_twoways_fe.fit()
res_twoways_fe.summary

0,1,2,3
Dep. Variable:,crmrte,R-squared:,0.0083
Estimator:,PanelOLS,R-squared (Between):,0.2240
No. Observations:,630,R-squared (Within):,0.0066
Date:,"Thu, Mar 11 2021",R-squared (Overall):,0.2170
Time:,10:57:52,Log-likelihood,2296.8
Cov. Estimator:,Unadjusted,,
,,F-statistic:,1.4778
Entities:,90,P-value,0.2196
Avg Obs:,7.0000,Distribution:,"F(3,531)"
Min Obs:,7.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
density,0.0009,0.0053,0.1702,0.8649,-0.0096,0.0114
taxpc,0.0001,5.281e-05,1.9647,0.0500,1.499e-08,0.0002
wcon,-7.157e-07,2.627e-06,-0.2724,0.7854,-5.876e-06,4.445e-06


## 1.4. First Differences

In [10]:
mod_fd = FirstDifferenceOLS.from_formula("crmrte ~ density + taxpc + wcon", data=data)
res_fd = mod_fd.fit()
res_fd.summary

0,1,2,3
Dep. Variable:,crmrte,R-squared:,0.0019
Estimator:,FirstDifferenceOLS,R-squared (Between):,0.5954
No. Observations:,540,R-squared (Within):,0.0015
Date:,"Thu, Mar 11 2021",R-squared (Overall):,0.5764
Time:,10:57:52,Log-likelihood,1779.7
Cov. Estimator:,Unadjusted,,
,,F-statistic:,0.3358
Entities:,90,P-value,0.7995
Avg Obs:,7.0000,Distribution:,"F(3,537)"
Min Obs:,7.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
density,0.0081,0.0122,0.6650,0.5064,-0.0159,0.0321
taxpc,2.537e-05,5.485e-05,0.4625,0.6439,-8.237e-05,0.0001
wcon,-9.953e-07,2.245e-06,-0.4432,0.6578,-5.406e-06,3.416e-06


# 2. Standard Errors
---

### Note:
- Three covariance estiminators are supported in linearmodels:
    1. White's (1980) robust standard errors
    2. Clustered standard errors (entity, time, two-way)
    3. Driscoll-Kraay HAC standard errors
    
Reference: https://bashtage.github.io/linearmodels/doc/panel/models.html

In [11]:
# warning: this is probably bad
# White's (1980) robust standard errors

res_entity_fe_robust = mod_entity_fe.fit(cov_type="robust")
res_entity_fe_robust.summary

0,1,2,3
Dep. Variable:,crmrte,R-squared:,0.0086
Estimator:,PanelOLS,R-squared (Between):,-0.0345
No. Observations:,630,R-squared (Within):,0.0086
Date:,"Thu, Mar 11 2021",R-squared (Overall):,-0.0331
Time:,10:57:52,Log-likelihood,2280.2
Cov. Estimator:,Robust,,
,,F-statistic:,1.5605
Entities:,90,P-value,0.1980
Avg Obs:,7.0000,Distribution:,"F(3,537)"
Min Obs:,7.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
density,-0.0015,0.0050,-0.2987,0.7652,-0.0112,0.0083
taxpc,8.822e-05,5.714e-05,1.5438,0.1232,-2.403e-05,0.0002
wcon,-2.06e-06,6.017e-07,-3.4230,0.0007,-3.242e-06,-8.777e-07


In [12]:
# Clustered by entity (fixed effects)

res_fe_clustered_entity = mod_entity_fe.fit(cov_type="clustered", cluster_entity=True)
res_fe_clustered_entity.summary

0,1,2,3
Dep. Variable:,crmrte,R-squared:,0.0086
Estimator:,PanelOLS,R-squared (Between):,-0.0345
No. Observations:,630,R-squared (Within):,0.0086
Date:,"Thu, Mar 11 2021",R-squared (Overall):,-0.0331
Time:,10:57:52,Log-likelihood,2280.2
Cov. Estimator:,Clustered,,
,,F-statistic:,1.5605
Entities:,90,P-value,0.1980
Avg Obs:,7.0000,Distribution:,"F(3,537)"
Min Obs:,7.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
density,-0.0015,0.0064,-0.2315,0.8170,-0.0141,0.0111
taxpc,8.822e-05,8.68e-05,1.0164,0.3099,-8.228e-05,0.0003
wcon,-2.06e-06,5.288e-07,-3.8952,0.0001,-3.098e-06,-1.021e-06


In [13]:
# Clustered by entity (first difference)

res_fd_clustered_entity = mod_fd.fit(cov_type="clustered", cluster_entity=True)
res_fd_clustered_entity.summary

0,1,2,3
Dep. Variable:,crmrte,R-squared:,0.0019
Estimator:,FirstDifferenceOLS,R-squared (Between):,0.5954
No. Observations:,540,R-squared (Within):,0.0015
Date:,"Thu, Mar 11 2021",R-squared (Overall):,0.5764
Time:,10:57:53,Log-likelihood,1779.7
Cov. Estimator:,Clustered,,
,,F-statistic:,0.3358
Entities:,90,P-value,0.7995
Avg Obs:,7.0000,Distribution:,"F(3,537)"
Min Obs:,7.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
density,0.0081,0.0060,1.3505,0.1774,-0.0037,0.0200
taxpc,2.537e-05,3.798e-05,0.6680,0.5044,-4.923e-05,9.997e-05
wcon,-9.953e-07,3.078e-07,-3.2335,0.0013,-1.6e-06,-3.906e-07


In [14]:
# Clustered by time (fixed effects)

res_fe_clustered_time = mod_entity_fe.fit(cov_type="clustered", cluster_time=True)
res_fe_clustered_time.summary

0,1,2,3
Dep. Variable:,crmrte,R-squared:,0.0086
Estimator:,PanelOLS,R-squared (Between):,-0.0345
No. Observations:,630,R-squared (Within):,0.0086
Date:,"Thu, Mar 11 2021",R-squared (Overall):,-0.0331
Time:,10:57:53,Log-likelihood,2280.2
Cov. Estimator:,Clustered,,
,,F-statistic:,1.5605
Entities:,90,P-value,0.1980
Avg Obs:,7.0000,Distribution:,"F(3,537)"
Min Obs:,7.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
density,-0.0015,0.0073,-0.2031,0.8391,-0.0158,0.0129
taxpc,8.822e-05,2.532e-05,3.4843,0.0005,3.848e-05,0.0001
wcon,-2.06e-06,7.017e-07,-2.9352,0.0035,-3.438e-06,-6.812e-07


In [15]:
# Clustered by time (first difference)

res_fd_clustered_time = mod_fd.fit(cov_type="clustered", cluster_time=True)
res_fd_clustered_time.summary

0,1,2,3
Dep. Variable:,crmrte,R-squared:,0.0019
Estimator:,FirstDifferenceOLS,R-squared (Between):,0.5954
No. Observations:,540,R-squared (Within):,0.0015
Date:,"Thu, Mar 11 2021",R-squared (Overall):,0.5764
Time:,10:57:53,Log-likelihood,1779.7
Cov. Estimator:,Clustered,,
,,F-statistic:,0.3358
Entities:,90,P-value,0.7995
Avg Obs:,7.0000,Distribution:,"F(3,537)"
Min Obs:,7.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
density,0.0081,0.0068,1.2001,0.2306,-0.0052,0.0214
taxpc,2.537e-05,3.552e-05,0.7142,0.4754,-4.441e-05,9.515e-05
wcon,-9.953e-07,3.835e-07,-2.5953,0.0097,-1.749e-06,-2.419e-07


In [16]:
# Newey-West
# WARNING: linearmodels currently does not implement Newey-West SEs
# But statsmodels does! 
# The following code implements a entity-dummies fixed effects regression
# Note: you must include "-1" into the formula to remove the intercept

data_noindex = data.reset_index()
mod_fe_NW = sm.formula.ols("crmrte ~ density + taxpc + wcon + C(county) -1", data=data_noindex)
res_fe_NW = mod_fe_NW.fit(cov_type="HAC", cov_kwds={"maxlags":1})
res_fe_NW.summary()

0,1,2,3
Dep. Variable:,crmrte,R-squared:,0.872
Model:,OLS,Adj. R-squared:,0.85
Method:,Least Squares,F-statistic:,
Date:,"Thu, 11 Mar 2021",Prob (F-statistic):,
Time:,10:57:53,Log-Likelihood:,2280.2
No. Observations:,630,AIC:,-4374.0
Df Residuals:,537,BIC:,-3961.0
Df Model:,92,,
Covariance Type:,HAC,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
C(county)[1],0.0373,0.011,3.441,0.001,0.016,0.059
C(county)[3],0.0154,0.004,3.495,0.000,0.007,0.024
C(county)[5],0.0112,0.002,6.328,0.000,0.008,0.015
C(county)[7],0.0206,0.003,8.062,0.000,0.016,0.026
C(county)[9],0.0108,0.002,4.462,0.000,0.006,0.016
C(county)[11],0.0160,0.003,6.014,0.000,0.011,0.021
C(county)[13],0.0313,0.002,14.239,0.000,0.027,0.036
C(county)[15],0.0155,0.002,8.457,0.000,0.012,0.019
C(county)[17],0.0193,0.002,9.367,0.000,0.015,0.023

0,1,2,3
Omnibus:,623.8,Durbin-Watson:,1.9
Prob(Omnibus):,0.0,Jarque-Bera (JB):,150008.192
Skew:,3.773,Prob(JB):,0.0
Kurtosis:,78.217,Cond. No.,93000.0


In [17]:
# Driscoll-Kraay HAC

res_fe_HAC = mod_entity_fe.fit(cov_type="kernel", kernel="andrews")
res_fe_HAC.summary

0,1,2,3
Dep. Variable:,crmrte,R-squared:,0.0086
Estimator:,PanelOLS,R-squared (Between):,-0.0345
No. Observations:,630,R-squared (Within):,0.0086
Date:,"Thu, Mar 11 2021",R-squared (Overall):,-0.0331
Time:,10:57:53,Log-likelihood,2280.2
Cov. Estimator:,Driscoll-Kraay,,
,,F-statistic:,1.5605
Entities:,90,P-value,0.1980
Avg Obs:,7.0000,Distribution:,"F(3,537)"
Min Obs:,7.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
density,-0.0015,0.0075,-0.1978,0.8433,-0.0162,0.0132
taxpc,8.822e-05,1.61e-05,5.4808,0.0000,5.66e-05,0.0001
wcon,-2.06e-06,8.857e-07,-2.3255,0.0204,-3.8e-06,-3.198e-07
