## 12.13 面板模型的python命令及实例

### 1.面板数据的设定

In [1]:
import pandas as pd
import statsmodels.api as sm
from linearmodels.panel import PanelOLS, PooledOLS, RandomEffects, FirstDifferenceOLS

`statsmodels`和`linearmodels`的数据类型不一样
- `linearmodels`需要设置多重索引形成真正意义上的面板数据
- `statsmodels`是按照截面数据的形式指定个体和时间的字段即可

In [2]:
lin = pd.read_stata('../2_Data/Data-2e/lin_1992.dta')
lin.dropna(inplace=True)
panel_lin = lin.set_index(['province', 'year']) # 第1个索引是个体，第2个索引是时间

原始数据是以截面数据形式展现的面板数据。
- `province` 是面板变量（个体变量）= `prov`
- `t` 是时间变量（时间序列） = `year`

### 2.混合回归

#### （1）以`'province'`作为聚类变量的聚类稳健的标准误
##### a. 使用`linearmodels.panel.PooledOLS()`

在`fit()`中设置`cov_type='Clustered'`
- `cluster_entity` - Boolean flag indicating to use entity clusters
-  `cluster_time` - Boolean indicating to use time clusters

In [3]:
# 使用聚类稳健的标准误，聚类变量为entity
exog = panel_lin[['ltlan', 'ltwlab', 'ltpow', 
                  'ltfer','hrs','mipric1',
                  'giprice','mci','ngca']]
dep = panel_lin['ltvfo']
exog = sm.add_constant(exog)
model = PooledOLS(dependent=dep, exog=exog)
result = model.fit(cov_type='clustered', # 采用聚类标准误
                   cluster_entity=True, # 按个体进行聚类
                #  debiased=True, 
                   group_debias=True)  # 是否按照个体数n进行聚类
print(result.summary)

                          PooledOLS Estimation Summary                          
Dep. Variable:                  ltvfo   R-squared:                        0.8685
Estimator:                  PooledOLS   R-squared (Between):              0.8901
No. Observations:                 476   R-squared (Within):               0.8146
Date:                Fri, May 03 2024   R-squared (Overall):              0.8685
Time:                        03:14:41   Log-likelihood                    103.19
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      342.09
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                   F(9,466)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             81.390
                            

  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  out = self._frame.groupby(level=level).count()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")


##### b. 使用`sm.OLS()`估计

在`.fit()`函数中：
- `cov_type='cluster'`：使用聚类标准误
- `cov_kwds={'groups':lin['province]}`：指定按个体变量进行聚类

In [4]:
exog = lin[['ltlan', 'ltwlab', 'ltpow', 
                  'ltfer','hrs','mipric1',
                  'giprice','mci','ngca']]
dep = lin['ltvfo']
exog = sm.add_constant(exog)
model = sm.OLS(endog=dep, exog=exog)
results = model.fit(cov_type='cluster', 
                    cov_kwds={'groups': lin['province']})
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  ltvfo   R-squared:                       0.869
Model:                            OLS   Adj. R-squared:                  0.866
Method:                 Least Squares   F-statistic:                     81.39
Date:                Fri, 03 May 2024   Prob (F-statistic):           3.21e-17
Time:                        03:14:41   Log-Likelihood:                 103.19
No. Observations:                 476   AIC:                            -186.4
Df Residuals:                     466   BIC:                            -144.7
Df Model:                           9                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0806      0.827      1.307      0.1

#### （2）使用普通标准误进行对比

##### a. 使用`linearmodels.panel.PooledOLS`

In [5]:
exog = panel_lin[['ltlan', 'ltwlab', 'ltpow', 
                  'ltfer','hrs','mipric1',
                  'giprice','mci','ngca']]
dep = panel_lin['ltvfo']
exog = sm.add_constant(exog)
model = PooledOLS(dependent=dep, exog=exog)
result = model.fit()
print(result.summary)

                          PooledOLS Estimation Summary                          
Dep. Variable:                  ltvfo   R-squared:                        0.8685
Estimator:                  PooledOLS   R-squared (Between):              0.8901
No. Observations:                 476   R-squared (Within):               0.8146
Date:                Fri, May 03 2024   R-squared (Overall):              0.8685
Time:                        03:14:41   Log-likelihood                    103.19
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      342.09
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                   F(9,466)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             342.09
                            

  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  out = self._frame.groupby(level=level).count()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")


##### b. 使用`sm.OLS()`

In [6]:
exog = lin[['ltlan', 'ltwlab', 'ltpow', 
                  'ltfer','hrs','mipric1',
                  'giprice','mci','ngca']]
dep = lin['ltvfo']
exog = sm.add_constant(exog)
model = sm.OLS(endog=dep, exog=exog)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  ltvfo   R-squared:                       0.869
Model:                            OLS   Adj. R-squared:                  0.866
Method:                 Least Squares   F-statistic:                     342.1
Date:                Fri, 03 May 2024   Prob (F-statistic):          5.01e-199
Time:                        03:14:41   Log-Likelihood:                 103.19
No. Observations:                 476   AIC:                            -186.4
Df Residuals:                     466   BIC:                            -144.7
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0806      0.283      3.815      0.0

### 3.固定效应

#### （1）使用组内估计量

In [7]:
mod = PanelOLS(dependent=dep, exog=exog, entity_effects=True)
fe_res = mod.fit(cov_type='clustered', 
                 cluster_entity=True,
                 group_debias=True)
print(fe_res)

ValueError: Series can only be used with a 2-level MultiIndex

#### （2）LSDV法

##### a. 使用`linearmodels.panel.PanelOLS`进行LSDV法

In [None]:

mod = PanelOLS(dependent=dep, exog=exog, entity_effects=True)
fe_res = mod.fit(use_lsdv=True, 
                 cov_type='clustered', 
                 cluster_entity=True,
                 group_debias=True
                 )
print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  ltvfo   R-squared:                        0.8732
Estimator:                   PanelOLS   R-squared (Between):              0.5992
No. Observations:                 476   R-squared (Within):               0.8732
Date:                Fri, May 03 2024   R-squared (Overall):              0.6775
Time:                        02:48:09   Log-likelihood                    409.94
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      433.77
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                   F(7,441)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             322.57
                            

  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  out = self._frame.groupby(level=level).count()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")


In [None]:
lin = pd.read_stata('../2_Data/Data-2e/lin_1992.dta')
lin.dropna(inplace=True)
prov = pd.Categorical(lin.prov)
lin = lin.set_index(['prov', 'year']) 
lin['prov'] = prov

exog = lin[['ltlan', 'ltwlab', 'ltpow', 
                  'ltfer','hrs','mipric1',
                  'giprice','mci','ngca','prov']]
dep = lin['ltvfo']
exog = sm.add_constant(exog)


mod = PanelOLS(dependent=dep, exog=exog)
fe_res = mod.fit(cov_type='clustered', cluster_entity=True,group_debias=True)
print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  ltvfo   R-squared:                        0.9642
Estimator:                   PanelOLS   R-squared (Between):              1.0000
No. Observations:                 476   R-squared (Within):               0.8746
Date:                Fri, May 03 2024   R-squared (Overall):              0.9642
Time:                        03:10:17   Log-likelihood                    412.62
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      328.23
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                  F(36,439)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):         -9.952e+15
                            

- 计算出来的结果与组内估计量一模一样
- 将个体虚拟变量显示出来的方法

##### b. 使用statamodels回归，需手动增加虚拟变量

两种方式，但效果一样
- 直接手动增加个体特征的虚拟变量
- 用`pd.get_dummies()`


In [None]:
lin = pd.read_stata('../2_Data/Data-2e/lin_1992.dta')

entity_ids = lin['province'].unique()
entity_vars = [f'entity_{entity_id}' for entity_id in entity_ids]

for var in entity_vars:
    lin[var] = ( lin['province']== var[7:] ).astype(int)

lin.pop(item=entity_vars[0])
entity_vars.pop(0)

exog_vars = ['ltlan', 'ltwlab', 'ltpow', 
                  'ltfer','hrs','mipric1',
                  'giprice','mci','ngca']+entity_vars

lin = lin.dropna()
exog = lin[exog_vars]
dep = lin['ltvfo']
exog = sm.add_constant(exog)
mod = sm.OLS(dep, exog)
res = mod.fit(cov_type='cluster', 
              cov_kwds={'groups': lin['province']})
print(res.summary())

In [None]:
lin = pd.read_stata('../2_Data/Data-2e/lin_1992.dta')

entity_ids = lin['province'].unique()
entity_vars = [f'entity_{entity_id}' for entity_id in entity_ids]
entity_vars.pop(0)

lin = pd.get_dummies(lin, 
                     columns=['province'],
                     drop_first=True, 
                     prefix='entity',
                     dtype=int,
                     ) 
# 使用后回删除'province'字段，后续使用聚类标准误会出现错误
## 原数据中有'prov'字段所以无需增加，如果没有，需提前复制新增'prov'字段

exog_vars = ['ltlan', 'ltwlab', 'ltpow', 
                  'ltfer','hrs','mipric1',
                  'giprice','mci','ngca']+entity_vars

lin = lin.dropna()
exog = lin[exog_vars]
dep = lin['ltvfo']
exog = sm.add_constant(exog)
mod = sm.OLS(dep, exog)
res = mod.fit(cov_type='cluster', cov_kwds={'groups': lin['prov']})
print(res.summary())

#### （3）一阶差分法

不常用的方法
- 在教材中使用的xtserial命令损失了56个样本，2个时间周期的变量
- 而在python命令下，仅损失了1个时间周期的变量，效果应该更好。

In [None]:
exog = panel_lin[['ltlan', 'ltwlab', 'ltpow', 
                  'ltfer','hrs','mipric1',
                  'giprice','mci','ngca']]
dep = panel_lin['ltvfo']
mod = FirstDifferenceOLS(dependent=dep, exog=exog)
fe_res = mod.fit(cov_type='robust', 
                 )
print(fe_res)

#### （4）双向固定效应

##### a. 加入时间趋势项

`'t'`为时间趋势项

In [None]:
exog = panel_lin[['ltlan', 'ltwlab', 'ltpow', 
                  'ltfer','hrs','mipric1',
                  'giprice','mci','ngca',
                  't'
                  ]]
dep = panel_lin['ltvfo']
exog = sm.add_constant(exog)
mod = PanelOLS(dependent=dep, exog=exog, entity_effects=True)
fe_res = mod.fit(cov_type='clustered', 
                 cluster_entity=True,
                 group_debias=True)
print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  ltvfo   R-squared:                        0.8749
Estimator:                   PanelOLS   R-squared (Between):              0.5699
No. Observations:                 476   R-squared (Within):               0.8749
Date:                Fri, May 03 2024   R-squared (Overall):              0.6570
Time:                        02:14:21   Log-likelihood                    413.14
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      306.24
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                  F(10,438)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             247.93
                            

  group_mu = self._frame.groupby(level=level).transform("mean")
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  out = self._frame.groupby(level=level).count()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")


##### b. 加入时间虚拟变量

In [8]:
lin = pd.read_stata('../2_Data/Data-2e/lin_1992.dta')
lin.dropna(inplace=True)
year = pd.Categorical(lin.year)
lin = lin.set_index(['province', 'year']) 
lin['year'] = year

exog = lin[['ltlan', 'ltwlab', 'ltpow', 
                  'ltfer','hrs',
                  'mci','ngca','year']]
dep = lin['ltvfo']
exog = sm.add_constant(exog)

mod = PanelOLS(dependent=dep, exog=exog, entity_effects=True)
fe_res = mod.fit(cov_type='clustered', cluster_entity=True,group_debias=True)
print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  ltvfo   R-squared:                        0.8932
Estimator:                   PanelOLS   R-squared (Between):              0.5974
No. Observations:                 476   R-squared (Within):               0.8932
Date:                Fri, May 03 2024   R-squared (Overall):              0.6819
Time:                        03:14:50   Log-likelihood                    450.91
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      154.59
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                  F(23,425)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             949.82
                            

  group_mu = self._frame.groupby(level=level).transform("mean")
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  out = self._frame.groupby(level=level).count()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")


### 4.随机效应

### 5.豪斯曼检验

### 6.组间统计量