## 12.13 面板模型的python命令及实例

### 1.面板数据的设定

In [224]:
import pandas as pd
import statsmodels.api as sm
from linearmodels.panel import PanelOLS, PooledOLS, RandomEffects, FirstDifferenceOLS


In [225]:
# Load the Stata data set and discard every row that contains a missing value.
lin = pd.read_stata('../2_Data/Data-2e/lin_1992.dta').dropna()
# Panel-indexed view of the same rows: index level 0 is the entity (province),
# index level 1 is the time period (year).
panel_lin = lin.set_index(keys=['province', 'year'])
lin.head()

Unnamed: 0,prov,year,no,sownarea,land,mci,casharea,gsl,ngca,vfo,...,iprice,giprice,mipric1,ltvfo,ltlan,ltwlab,ltpow,ltfer,province,t
0,anhui,70,16,110860,68770,1.61,9110,93470,0.16,418350.40625,...,111.900002,2.39,1.76,7.05,5.24,3.0,0.85,0.44,anhui,1.0
1,anhui,71,16,115429,68390,1.69,9470,94460,0.18,447111.5,...,110.199997,2.45,1.77,7.11,5.24,3.03,0.98,0.41,anhui,2.0
2,anhui,72,16,119875,68150,1.76,10550,95080,0.21,452006.125,...,109.599998,2.47,1.95,7.12,5.23,3.06,1.1,0.68,anhui,3.0
3,anhui,73,16,122181,67880,1.8,11540,94770,0.22,532227.25,...,109.599998,2.46,2.12,7.29,5.23,3.14,1.22,0.9,anhui,4.0
4,anhui,74,16,121830,67730,1.8,11530,94550,0.22,529670.9375,...,109.599998,2.47,2.24,7.28,5.23,3.18,1.34,0.88,anhui,5.0


原始数据是以截面数据形式展现的面板数据。
- `province` 是面板变量（个体变量）= `prov`
- `t` 是时间变量（时间序列） = `year`

### 2.混合回归

In [226]:
# Dependent variable and regressors, both taken from the panel-indexed frame.
regressor_names = [
    'ltlan', 'ltwlab', 'ltpow',
    'ltfer', 'hrs', 'mipric1',
    'giprice', 'mci', 'ngca',
]
dep = panel_lin['ltvfo']
# add_constant is applied to the selected regressor columns to supply the intercept.
exog = sm.add_constant(panel_lin[regressor_names])

#### （1）以`province`作为聚类变量的聚类稳健的标准误

在`fit()`中设置`cov_type='clustered'`
- `cluster_entity` - Boolean flag indicating to use entity clusters
- `cluster_time` - Boolean flag indicating to use time clusters

In [227]:
# Pooled OLS on the panel-indexed data; the covariance is clustered by entity.
model = PooledOLS(dep, exog)
result = model.fit(cluster_entity=True, cov_type='clustered')
print(result.summary)

                          PooledOLS Estimation Summary                          
Dep. Variable:                  ltvfo   R-squared:                        0.8685
Estimator:                  PooledOLS   R-squared (Between):              0.8901
No. Observations:                 476   R-squared (Within):               0.8146
Date:                Thu, May 02 2024   R-squared (Overall):              0.8685
Time:                        01:09:15   Log-likelihood                    103.19
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      342.09
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                   F(9,466)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             84.227
                            

  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  out = self._frame.groupby(level=level).count()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")


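计算的t统计量与Stata不完全一样，但相差不大，此前也碰到过这种问题
- 一个可能的原因是小样本自由度调整不同：Stata 的聚类稳健标准误默认乘以 $\frac{G}{G-1}\cdot\frac{N-1}{N-K}$（$G$ 为聚类个数），而 linearmodels 默认不做该调整；可在 `fit()` 中加上 `debiased=True, group_debias=True` 后再与 Stata 对比
- 可能是因数据缺失造成
- 可能是数据的类型问题，如int8的数据长度不够，需要使用int32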

In [228]:
# The same pooled regression estimated with statsmodels on the flat (non-panel)
# frame, with the covariance clustered on the province column.
regressor_names = [
    'ltlan', 'ltwlab', 'ltpow',
    'ltfer', 'hrs', 'mipric1',
    'giprice', 'mci', 'ngca',
]
dep = lin['ltvfo']
exog = sm.add_constant(lin[regressor_names])
model = sm.OLS(dep, exog)
results = model.fit(cov_type='cluster', cov_kwds=dict(groups=lin['province']))
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  ltvfo   R-squared:                       0.869
Model:                            OLS   Adj. R-squared:                  0.866
Method:                 Least Squares   F-statistic:                     81.39
Date:                Thu, 02 May 2024   Prob (F-statistic):           3.21e-17
Time:                        01:09:15   Log-Likelihood:                 103.19
No. Observations:                 476   AIC:                            -186.4
Df Residuals:                     466   BIC:                            -144.7
Df Model:                           9                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0806      0.827      1.307      0.1

#### （2）使用普通标准误进行对比

In [229]:
# Baseline for comparison: the same statsmodels regression with the default
# (non-robust) covariance, i.e. fit() is called without any cov_type argument.
regressor_names = [
    'ltlan', 'ltwlab', 'ltpow',
    'ltfer', 'hrs', 'mipric1',
    'giprice', 'mci', 'ngca',
]
dep = lin['ltvfo']
exog = sm.add_constant(lin[regressor_names])
model = sm.OLS(dep, exog)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  ltvfo   R-squared:                       0.869
Model:                            OLS   Adj. R-squared:                  0.866
Method:                 Least Squares   F-statistic:                     342.1
Date:                Thu, 02 May 2024   Prob (F-statistic):          5.01e-199
Time:                        01:09:15   Log-Likelihood:                 103.19
No. Observations:                 476   AIC:                            -186.4
Df Residuals:                     466   BIC:                            -144.7
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0806      0.283      3.815      0.0

In [231]:
# Rebuild the variables from the panel-indexed frame (the statsmodels cells
# above rebound `dep` and `exog` to the flat frame) and fit pooled OLS with
# the default covariance estimator.
regressor_names = [
    'ltlan', 'ltwlab', 'ltpow',
    'ltfer', 'hrs', 'mipric1',
    'giprice', 'mci', 'ngca',
]
dep = panel_lin['ltvfo']
exog = sm.add_constant(panel_lin[regressor_names])
model = PooledOLS(dep, exog)
result = model.fit()
print(result.summary)

                          PooledOLS Estimation Summary                          
Dep. Variable:                  ltvfo   R-squared:                        0.8685
Estimator:                  PooledOLS   R-squared (Between):              0.8901
No. Observations:                 476   R-squared (Within):               0.8146
Date:                Thu, May 02 2024   R-squared (Overall):              0.8685
Time:                        01:09:40   Log-likelihood                    103.19
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      342.09
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                   F(9,466)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             342.09
                            

  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  out = self._frame.groupby(level=level).count()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")


### 3.固定效应

#### （1）使用组内估计量

In [None]:
# Fixed effects via entity effects, reusing `dep` and `exog` from the cell above.
# Standard errors are clustered by entity, with group_debias switched on.
mod = PanelOLS(dep, exog, entity_effects=True)
fe_res = mod.fit(group_debias=True, cluster_entity=True, cov_type='clustered')
print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  ltvfo   R-squared:                        0.8746
Estimator:                   PanelOLS   R-squared (Between):              0.5636
No. Observations:                 476   R-squared (Within):               0.8746
Date:                Thu, May 02 2024   R-squared (Overall):              0.6525
Time:                        00:04:24   Log-likelihood                    412.62
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      340.20
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                   F(9,439)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             274.25
                            

  group_mu = self._frame.groupby(level=level).transform("mean")
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  out = self._frame.groupby(level=level).count()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")


#### （2）LSDV法

In [None]:
# Same fixed-effects specification as the previous cell, but fit() is asked to
# use the LSDV route (use_lsdv=True) so the two outputs can be compared.
mod = PanelOLS(dep, exog, entity_effects=True)
fe_res = mod.fit(
    cov_type='clustered',
    cluster_entity=True,
    group_debias=True,
    use_lsdv=True,
)
print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  ltvfo   R-squared:                        0.8746
Estimator:                   PanelOLS   R-squared (Between):              0.5636
No. Observations:                 476   R-squared (Within):               0.8746
Date:                Thu, May 02 2024   R-squared (Overall):              0.6525
Time:                        00:25:36   Log-likelihood                    412.62
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      340.20
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                   F(9,439)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             274.25
                            

  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  out = self._frame.groupby(level=level).count()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")



- 计算出来的结果与组内估计量一模一样
- 没找到和Stata一样可以将个体虚拟变量显示出来的方法


##### 手动增加虚拟变量，再用statsmodels回归

In [None]:
# LSDV by hand: add one 0/1 indicator column per province, leave one province
# out as the baseline, and run OLS with province-clustered errors in statsmodels.
# NOTE: this cell rebinds `lin`, `exog` and `dep` to flat (non-panel) objects.

# Drop incomplete rows first so indicators are only created for provinces that
# are still present in the estimation sample.
lin = pd.read_stata('../2_Data/Data-2e/lin_1992.dta').dropna()

entity_ids = lin['province'].unique()
# Each indicator compares against the province value itself instead of
# slicing the name back out of the column label.
entity_dummies = pd.DataFrame(
    {
        f'entity_{entity_id}': (lin['province'] == entity_id).astype(int)
        for entity_id in entity_ids
    },
    index=lin.index,
)

# The first province (in order of appearance) is the omitted baseline category,
# so the remaining indicators are not collinear with the constant.
entity_vars = list(entity_dummies.columns[1:])
lin = pd.concat([lin, entity_dummies[entity_vars]], axis=1)

exog_vars = ['ltlan', 'ltwlab', 'ltpow',
             'ltfer', 'hrs', 'mipric1',
             'giprice', 'mci', 'ngca'] + entity_vars

exog = sm.add_constant(lin[exog_vars])
dep = lin['ltvfo']
mod = sm.OLS(dep, exog)
res = mod.fit(cov_type='cluster', cov_kwds={'groups': lin['province']})
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  ltvfo   R-squared:                       0.964
Model:                            OLS   Adj. R-squared:                  0.961
Method:                 Least Squares   F-statistic:                     30.55
Date:                Thu, 02 May 2024   Prob (F-statistic):           6.83e-12
Time:                        00:56:03   Log-Likelihood:                 412.62
No. Observations:                 476   AIC:                            -751.2
Df Residuals:                     439   BIC:                            -597.1
Df Model:                          36                                         
Covariance Type:              cluster                                         
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   2.5682    



#### （3）一阶差分法

In [234]:
# First-difference estimator.
# Both variables are rebuilt from the panel-indexed frame: the preceding
# manual-dummy cell rebinds `dep` to a flat Series without the
# (province, year) index, so reusing it here breaks on a top-to-bottom run.
exog = panel_lin[['ltlan', 'ltwlab', 'ltpow',
                  'ltfer', 'hrs', 'mipric1',
                  'giprice', 'mci', 'ngca']]
dep = panel_lin['ltvfo']

# No constant is added here, unlike the pooled and fixed-effects cells
# (linearmodels documents that first-difference models do not accept one).
mod = FirstDifferenceOLS(dependent=dep, exog=exog)
fe_res = mod.fit()
print(fe_res)

                     FirstDifferenceOLS Estimation Summary                      
Dep. Variable:                  ltvfo   R-squared:                        0.5692
Estimator:         FirstDifferenceOLS   R-squared (Between):              0.9955
No. Observations:                 448   R-squared (Within):               0.7931
Date:                Thu, May 02 2024   R-squared (Overall):              0.9952
Time:                        01:12:49   Log-likelihood                    350.78
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      64.438
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                   F(9,439)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             64.438
                            

  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  out = self._frame.groupby(level=level).count()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")


### 4.随机效应

```plaintext


```

### 5.豪斯曼检验

### 6.组间统计量