## 12.13 面板模型的python命令及实例

### 1.面板数据的设定

In [3]:
import pandas as pd
import statsmodels.api as sm
from linearmodels.panel import PanelOLS, PooledOLS, RandomEffects, FirstDifferenceOLS,BetweenOLS

设置以下数据，避免后续使用时代码结构太混乱

In [5]:
class Dataset:
    
    def __init__(self, data):
        self.data_original = data
        self.data = self._int_data()
        self.exog = self._init_exog(lm=False)
        self.endog = self.data['ltvfo']
        self.data_lm = self.data.set_index(['prov', 'year'])
        self.exog_lm = self._init_exog(lm=True)
        self.dept_lm = self.data_lm[['ltvfo']]
        self.exog_lm_dv_n = self._init_exog(lm=True, dv='entity')
        self.exog_lm_dv_t = self._init_exog(lm=True, dv='time')
        
    def _int_data(self):
        
        self.data_original['year_d'] = self.data_original['year'].astype(str)
        self.data_original['prov_d'] = self.data_original['prov'].astype(str)
        data = self.data_original.dropna()
        return data
    
    def _init_exog(self,lm=True, dv = None):
        if lm:
            exog =  self.data_lm[['ltlan', 'ltwlab', 'ltpow', 'ltfer','hrs','mipric1', 'giprice','mci','ngca']]
            if dv == 'entity':
                exog =  self.data_lm[['ltlan', 'ltwlab', 'ltpow', 'ltfer','hrs','mipric1', 'giprice','mci','ngca','prov_d']]
            if dv == 'time':
                exog =  self.data_lm[['ltlan', 'ltwlab', 'ltpow', 'ltfer','hrs','mipric1', 'giprice','mci','ngca','year_d']]
        else:
            exog = self.data[['ltlan', 'ltwlab', 'ltpow', 'ltfer','hrs','mipric1', 'giprice','mci','ngca']]
            
        exog = sm.add_constant(exog)
        return exog
    

data = pd.read_stata('../2_Data/Data-2e/lin_1992.dta')
lin = Dataset(data)

`statsmodels`和`linearmodels`的数据类型不一样
- `linearmodels`需要设置多重索引形成真正意义上的面板数据
- `statsmodels`是按照截面数据的形式指定个体和时间的字段即可

原始数据是以截面数据形式展现的面板数据。
- `province` 是面板变量（个体变量）= `prov`
- `t` 是时间变量（时间序列） = `year`

### 2.混合回归

#### （1）以`'province'`作为聚类变量的聚类稳健的标准误

##### a. 使用`linearmodels.panel.PooledOLS()`

在`fit()`中设置`cov_type='Clustered'`
- `cluster_entity` - Boolean flag indicating to use entity clusters
-  `cluster_time` - Boolean indicating to use time clusters

In [3]:

model = PooledOLS(dependent=lin.dept_lm, exog=lin.exog_lm)
result = model.fit(cov_type='clustered', # 采用聚类标准误
                   cluster_entity=True, # 按个体进行聚类
                   group_debias=True)  # 是否按照个体数n进行聚类
print(result.summary)

                          PooledOLS Estimation Summary                          
Dep. Variable:                  ltvfo   R-squared:                        0.8685
Estimator:                  PooledOLS   R-squared (Between):              0.8901
No. Observations:                 476   R-squared (Within):               0.8146
Date:                Fri, May 03 2024   R-squared (Overall):              0.8685
Time:                        19:14:33   Log-likelihood                    103.19
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      342.09
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                   F(9,466)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             81.390
                            

##### b. 使用`sm.OLS()`估计

在`.fit()`函数中：
- `cov_type='cluster'`：使用聚类标准误
- `cov_kwds={'groups':lin['province]}`：指定按个体变量进行聚类

In [4]:
model = sm.OLS(endog=lin.endog, exog=lin.exog)
results = model.fit(cov_type='cluster', 
                    cov_kwds={'groups': lin.data['province']})
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  ltvfo   R-squared:                       0.869
Model:                            OLS   Adj. R-squared:                  0.866
Method:                 Least Squares   F-statistic:                     81.39
Date:                Fri, 03 May 2024   Prob (F-statistic):           3.21e-17
Time:                        19:14:33   Log-Likelihood:                 103.19
No. Observations:                 476   AIC:                            -186.4
Df Residuals:                     466   BIC:                            -144.7
Df Model:                           9                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0806      0.827      1.307      0.1

#### （2）使用普通标准误进行对比

##### a. 使用`linearmodels.panel.PooledOLS`

In [5]:
model = PooledOLS(dependent=lin.dept_lm, exog=lin.exog_lm)
result = model.fit()
print(result.summary)

                          PooledOLS Estimation Summary                          
Dep. Variable:                  ltvfo   R-squared:                        0.8685
Estimator:                  PooledOLS   R-squared (Between):              0.8901
No. Observations:                 476   R-squared (Within):               0.8146
Date:                Fri, May 03 2024   R-squared (Overall):              0.8685
Time:                        19:14:34   Log-likelihood                    103.19
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      342.09
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                   F(9,466)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             342.09
                            

##### b. 使用`sm.OLS()`

In [6]:
model = sm.OLS(endog=lin.endog, exog=lin.exog)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  ltvfo   R-squared:                       0.869
Model:                            OLS   Adj. R-squared:                  0.866
Method:                 Least Squares   F-statistic:                     342.1
Date:                Fri, 03 May 2024   Prob (F-statistic):          5.01e-199
Time:                        19:14:34   Log-Likelihood:                 103.19
No. Observations:                 476   AIC:                            -186.4
Df Residuals:                     466   BIC:                            -144.7
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0806      0.283      3.815      0.0

### 3.固定效应

#### （1）使用组内估计量

In [7]:
mod = PanelOLS(dependent=lin.dept_lm, exog=lin.exog_lm, entity_effects=True)
fe_res = mod.fit(use_lsdv=False,
                 cov_type='clustered', 
                 cluster_entity=True,
                 group_debias=True)
print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  ltvfo   R-squared:                        0.8746
Estimator:                   PanelOLS   R-squared (Between):              0.5636
No. Observations:                 476   R-squared (Within):               0.8746
Date:                Fri, May 03 2024   R-squared (Overall):              0.6525
Time:                        19:14:34   Log-likelihood                    412.62
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      340.20
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                   F(9,439)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             274.25
                            

#### （2）LSDV法

##### a. 使用`linearmodels.panel.PanelOLS`进行LSDV法

In [8]:
mod = PanelOLS(dependent=lin.dept_lm, exog= lin.exog_lm_dv_n)
fe_res = mod.fit(use_lsdv=True,
                 cov_type='clustered', 
                 cluster_entity=True,
                 group_debias=True
                 )
print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  ltvfo   R-squared:                        0.9642
Estimator:                   PanelOLS   R-squared (Between):              1.0000
No. Observations:                 476   R-squared (Within):               0.8746
Date:                Fri, May 03 2024   R-squared (Overall):              0.9642
Time:                        19:14:34   Log-likelihood                    412.62
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      328.23
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                  F(36,439)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):         -9.952e+15
                            

- 计算出来的结果与组内估计量一模一样

##### b. 使用statamodels回归，需手动增加虚拟变量

两种方式，但效果一样
- 直接手动增加个体特征的虚拟变量
- 用`pd.get_dummies()`


In [16]:
l = lin.data
entity_ids = l['prov'].unique()
entity_vars = [f'entity_{entity_id}' for entity_id in entity_ids]

### -----------------------------------------------------------
for var in entity_vars:
    l.loc[:,var] = ( l['prov']== var[7:] ).astype(int)

l.pop(item=entity_vars[0])
entity_vars.pop(0)
####----------------------------------------------------------                                                                       

# 使用get_dummies函数 效果一样
# li = pd.get_dummies(l, columns=['province'],drop_first=True, 
#     prefix='entity', dtype=int,) 

exog_vars = ['ltlan', 'ltwlab', 'ltpow', 
                  'ltfer','hrs','mipric1',
                  'giprice','mci','ngca']+entity_vars

l = l.dropna()
exog = l[exog_vars]
dep = l['ltvfo']
exog = sm.add_constant(exog)
mod = sm.OLS(dep, exog)
res = mod.fit(cov_type='cluster', 
              cov_kwds={'groups': l['prov']})
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  ltvfo   R-squared:                       0.964
Model:                            OLS   Adj. R-squared:                  0.961
Method:                 Least Squares   F-statistic:                     30.55
Date:                Fri, 03 May 2024   Prob (F-statistic):           6.83e-12
Time:                        19:16:14   Log-Likelihood:                 412.62
No. Observations:                 476   AIC:                            -751.2
Df Residuals:                     439   BIC:                            -597.1
Df Model:                          36                                         
Covariance Type:              cluster                                         
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   2.5682    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  l.loc[:,var] = ( l['prov']== var[7:] ).astype(int)


#### （3）一阶差分法

不常用的方法
- 在教材中使用的xtserial命令损失了56个样本，2个时间周期的变量
- 而在python命令下，仅损失了1个时间周期的变量，效果应该更好。

In [10]:
mod = FirstDifferenceOLS(dependent=lin.dept_lm, exog=lin.exog_lm.drop(columns=['const']))
fe_res = mod.fit(cov_type='robust', 
                 )
print(fe_res)

                     FirstDifferenceOLS Estimation Summary                      
Dep. Variable:                  ltvfo   R-squared:                        0.5692
Estimator:         FirstDifferenceOLS   R-squared (Between):              0.9955
No. Observations:                 448   R-squared (Within):               0.7931
Date:                Fri, May 03 2024   R-squared (Overall):              0.9952
Time:                        19:14:34   Log-likelihood                    350.78
Cov. Estimator:                Robust                                           
                                        F-statistic:                      64.438
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                   F(9,439)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             191.12
                            

#### （4）双向固定效应

##### a. 加入时间趋势项

`'t'`为时间趋势项

In [11]:
panel_lin = lin.data_lm

exog = panel_lin[[
    'ltlan', 'ltwlab', 'ltpow','ltfer','hrs','mipric1','giprice','mci','ngca','t' 
    ]] # 增加了时间趋势变量't'
exog = sm.add_constant(exog)

mod = PanelOLS(dependent=lin.dept_lm, exog=exog, entity_effects=True)
fe_res = mod.fit(cov_type='clustered', 
                 cluster_entity=True,  
                 group_debias=True
                 )
print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  ltvfo   R-squared:                        0.8749
Estimator:                   PanelOLS   R-squared (Between):              0.5699
No. Observations:                 476   R-squared (Within):               0.8749
Date:                Fri, May 03 2024   R-squared (Overall):              0.6570
Time:                        19:14:34   Log-likelihood                    413.14
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      306.24
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                  F(10,438)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             247.93
                            

##### b. 加入时间虚拟变量

In [6]:
mod = PanelOLS(
    dependent=lin.dept_lm, 
    exog= lin.exog_lm_dv_t.drop(['mipric1','giprice'], axis=1), 
    entity_effects=True
    )
res= mod.fit(
    cov_type='clustered', 
    cluster_entity=True,
    group_debias=True
    )
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  ltvfo   R-squared:                        0.8932
Estimator:                   PanelOLS   R-squared (Between):              0.5974
No. Observations:                 476   R-squared (Within):               0.8932
Date:                Fri, May 03 2024   R-squared (Overall):              0.6819
Time:                        21:18:15   Log-likelihood                    450.91
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      154.59
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                  F(23,425)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             949.82
                            

In [13]:
mod = PanelOLS(dependent=lin.dept_lm, exog= lin.exog_lm, entity_effects=True, time_effects=True, drop_absorbed=True)
fe_res = mod.fit(use_lsdv=True,
                 cov_type='clustered', 
                 cluster_entity=True,
                 group_debias=True
                 )
print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  ltvfo   R-squared:                        0.6751
Estimator:                   PanelOLS   R-squared (Between):              0.5974
No. Observations:                 476   R-squared (Within):               0.8686
Date:                Fri, May 03 2024   R-squared (Overall):              0.6749
Time:                        19:14:34   Log-likelihood                    450.91
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      126.18
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                   F(7,425)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             215.85
                            

Variables have been fully absorbed and have removed from the regression:

mipric1, giprice

  fe_res = mod.fit(use_lsdv=True,


### 4.随机效应

In [51]:
mod = RandomEffects(dependent=lin.dept_lm, exog= lin.exog_lm.drop(['mipric1','giprice'], axis=1))
re_res = mod.fit(
          #   cov_type='clustered', 
          #   cluster_entity=True,
          #   group_debias=True
                 )
print(re_res)

                        RandomEffects Estimation Summary                        
Dep. Variable:                  ltvfo   R-squared:                        0.8643
Estimator:              RandomEffects   R-squared (Between):              0.8021
No. Observations:                 476   R-squared (Within):               0.8700
Date:                Fri, May 03 2024   R-squared (Overall):              0.8215
Time:                        20:33:49   Log-likelihood                    373.36
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      425.96
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                   F(7,468)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             425.96
                            

### 5.豪斯曼检验

In [56]:
import numpy as np
import scipy.stats as stats

mod = PanelOLS(dependent=lin.dept_lm, exog=lin.exog_lm.drop(['mipric1','giprice'], axis=1), entity_effects=True)
fe_res = mod.fit()

b_diff = fe_res.params - re_res.params
v_diff = fe_res.cov - re_res.cov

df = len(fe_res.params)

table = pd.DataFrame({'FE': fe_res.params, 
                      'RE':re_res.params,
                      'dif.':b_diff, 
                      'SE(dif.)':np.sqrt(np.diag(v_diff))
                      }, index=fe_res.params.index)
chi2 =  np.dot(b_diff.T,np.linalg.inv(v_diff).dot(b_diff))
p = 1- stats.chi2.cdf(chi2, df)

print(table)
print('=================================================')
print('Chi-squared:', chi2)
print('degrees of freedom:', df)
print(f'p-value:{p:.4f}') 
print('=================================================')


              FE        RE      dif.  SE(dif.)
const   2.310114  2.387878 -0.077764  0.178212
ltlan   0.639966  0.565591  0.074374  0.043153
ltwlab  0.123993  0.144184 -0.020192  0.009747
ltpow   0.077160  0.060477  0.016683  0.001574
ltfer   0.176277  0.188274 -0.011997  0.004112
hrs     0.207582  0.218610 -0.011028       NaN
mci     0.258036  0.470237 -0.212201  0.052192
ngca    0.772279  0.674517  0.097762  0.046883
Chi-squared: 48.67790870308027
degrees of freedom: 8
p-value:0.0000


  'SE(dif.)':np.sqrt(np.diag(v_diff))


### 6.组间统计量

In [34]:
mod = BetweenOLS(dependent=lin.dept_lm, exog= lin.exog_lm.drop(['mipric1','giprice'], axis=1))
bt_res = mod.fit()
print(bt_res)

                         BetweenOLS Estimation Summary                          
Dep. Variable:                  ltvfo   R-squared:                        0.9362
Estimator:                 BetweenOLS   R-squared (Between):              0.9362
No. Observations:                  28   R-squared (Within):              -44.291
Date:                Fri, May 03 2024   R-squared (Overall):             -11.983
Time:                        19:38:27   Log-likelihood                    20.901
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      41.929
Entities:                          28   P-value                           0.0000
Avg Obs:                       17.000   Distribution:                    F(7,20)
Min Obs:                       17.000                                           
Max Obs:                       17.000   F-statistic (robust):             41.929
                            