# Panel models 


### Some bus ridership data

In [1]:
%matplotlib inline

import pandas as pd
import seaborn as sns
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

from linearmodels.panel import PanelOLS

In [6]:
# Read the panel estimation file and index it by ID and YEAR, so each row is
# addressed by the (ID, YEAR) pair that the PanelOLS models below rely on.
data = pd.read_csv('data/PanelEstimationFile.csv').set_index(['ID', 'YEAR'])

# peek at the first few rows
data.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,SFTAZ,MODE,TOD2,DIST,DISTNAME,AVG_RIDE,ROUTES,STOPS,TRIP_STOPS,...,CALTRAIN_AVG_RIDE,AVG_TNC,HHLDS_SMOOTH,POP_SMOOTH,TOTALEMP_SMOOTH,EMPRES_SMOOTH,HHLDS_0_VEH_SHARE_SMOOTH,BART_AVG_RIDE_SMOOTH,CALTRAIN_AVG_RIDE_SMOOTH,AVG_TNC_SMOOTH
ID,YEAR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1_BUS_0300-0859,2010,0,1,BUS,0300-0859,9,'Bayshore',11.503687,2.0,3.0,36.0,...,0.0,0.0,71.422712,297.2428,12.22362,115.311125,0.241549,0.0,0.0,0.0
1_BUS_0300-0859,2015,1,1,BUS,0300-0859,9,'Bayshore',33.240093,3.0,3.0,50.0,...,0.0,3.395,74.882905,311.916768,17.408276,125.629692,0.197806,0.0,0.0,0.659106
1_BUS_0900-1559,2010,2,1,BUS,0900-1559,9,'Bayshore',50.245435,2.0,3.0,84.0,...,0.0,0.0,71.422712,297.2428,12.22362,115.311125,0.241549,0.0,0.0,0.0
1_BUS_0900-1559,2015,3,1,BUS,0900-1559,9,'Bayshore',56.483018,2.0,3.0,119.0,...,0.0,3.725,74.882905,311.916768,17.408276,125.629692,0.197806,0.0,0.0,0.855811
1_BUS_1600-1859,2010,4,1,BUS,1600-1859,9,'Bayshore',31.448123,2.0,3.0,36.0,...,0.0,0.0,71.422712,297.2428,12.22362,115.311125,0.241549,0.0,0.0,0.0


In [7]:
# Add a log(1 + x) copy of every numeric column, stored as '<column>_log'.
from pandas.api.types import is_numeric_dtype

# Pick the numeric source columns. Columns that already end in '_log' are
# skipped so that re-running this cell does not stack '<column>_log_log'
# columns on top of the ones created by an earlier run.
# NOTE(review): a raw input column whose own name ends in '_log' would be
# skipped too -- none is visible in the preview above, but confirm.
numeric_cols = [
    col for col in data.columns
    if is_numeric_dtype(data[col]) and not str(col).endswith('_log')
]

# One vectorised transform for all columns; the +1 keeps zeros at log(1) = 0.
logged = np.log(1 + data[numeric_cols]).add_suffix('_log')

# Attach every new column in a single concat instead of inserting them one by
# one. Any '_log' columns left from a previous run are dropped first, so the
# result is the same no matter how many times the cell is executed.
data = pd.concat(
    [data.drop(columns=logged.columns, errors='ignore'), logged],
    axis=1,
)

In [11]:
# Pooled log-log OLS baseline: the specification that did ok last time,
# now estimated on the 2010 and 2015 observations together.

# the trailing '+ 1' spells out the intercept explicitly
pooled_formula = 'AVG_RIDE_log \
              ~ POP_SMOOTH_log \
              + TOTALEMP_SMOOTH_log \
              + ROUTES_log \
              + TRIP_STOPS_log \
              + AVG_TNC_log \
              + 1'

mod = smf.ols(data=data, formula=pooled_formula)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:           AVG_RIDE_log   R-squared:                       0.604
Model:                            OLS   Adj. R-squared:                  0.604
Method:                 Least Squares   F-statistic:                     2245.
Date:                Thu, 09 Apr 2020   Prob (F-statistic):               0.00
Time:                        12:17:55   Log-Likelihood:                -10004.
No. Observations:                7358   AIC:                         2.002e+04
Df Residuals:                    7352   BIC:                         2.006e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              -2.0470    

In [13]:
# Re-estimate the same specification with PanelOLS.
# No entity or time effects are requested here, so by default this is the
# same model as the plain OLS fit.

panel_formula = 'AVG_RIDE_log \
              ~ POP_SMOOTH_log \
              + TOTALEMP_SMOOTH_log \
              + ROUTES_log \
              + TRIP_STOPS_log \
              + AVG_TNC_log \
              + 1'

mod = PanelOLS.from_formula(panel_formula, data=data)
res = mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:           AVG_RIDE_log   R-squared:                        0.6042
Estimator:                   PanelOLS   R-squared (Between):              0.6135
No. Observations:                7358   R-squared (Within):               0.2126
Date:                Thu, Apr 09 2020   R-squared (Overall):              0.6042
Time:                        12:18:35   Log-likelihood                    -1e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      2245.0
Entities:                        3679   P-value                           0.0000
Avg Obs:                       2.0000   Distribution:                  F(5,7352)
Min Obs:                       2.0000                                           
Max Obs:                       2.0000   F-statistic (robust):             2245.0
                            

In [15]:
# Entity fixed effects: EntityEffects replaces the single intercept with a
# constant for every entity, so the coefficients are estimated only from the
# change within each entity.

entity_fe_formula = 'AVG_RIDE_log \
              ~ POP_SMOOTH_log \
              + TOTALEMP_SMOOTH_log \
              + ROUTES_log \
              + TRIP_STOPS_log \
              + AVG_TNC_log \
              + EntityEffects'

mod = PanelOLS.from_formula(entity_fe_formula, data=data)
res = mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:           AVG_RIDE_log   R-squared:                        0.2836
Estimator:                   PanelOLS   R-squared (Between):              0.9365
No. Observations:                7358   R-squared (Within):               0.2836
Date:                Thu, Apr 09 2020   R-squared (Overall):              0.9348
Time:                        12:19:34   Log-likelihood                    1686.5
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      290.85
Entities:                        3679   P-value                           0.0000
Avg Obs:                       2.0000   Distribution:                  F(5,3674)
Min Obs:                       2.0000                                           
Max Obs:                       2.0000   F-statistic (robust):             290.85
                            

In [16]:
# Two-way fixed effects: keep the per-entity constants and add a constant for
# every time period via TimeEffects, which controls for other background
# changes between periods.

two_way_fe_formula = 'AVG_RIDE_log \
              ~ POP_SMOOTH_log \
              + TOTALEMP_SMOOTH_log \
              + ROUTES_log \
              + TRIP_STOPS_log \
              + AVG_TNC_log \
              + EntityEffects \
              + TimeEffects'

mod = PanelOLS.from_formula(two_way_fe_formula, data=data)
res = mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:           AVG_RIDE_log   R-squared:                        0.2803
Estimator:                   PanelOLS   R-squared (Between):              0.9296
No. Observations:                7358   R-squared (Within):               0.2474
Date:                Thu, Apr 09 2020   R-squared (Overall):              0.9278
Time:                        12:20:05   Log-likelihood                    1704.4
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      286.13
Entities:                        3679   P-value                           0.0000
Avg Obs:                       2.0000   Distribution:                  F(5,3673)
Min Obs:                       2.0000                                           
Max Obs:                       2.0000   F-statistic (robust):             286.13
                            