In [22]:
import pandas as pd
import numpy as np
from pydynpd import regression
import pathlib
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
import sys

# Questions
1. Should we add stationary controls?
We do not introduce stationary controls under most circumstance. 
Usually they do not change the results a lot, but that should be verified in the robustness check.

# Assumption tests
## AR
We expect to reject AR(1) but not AR(2).
## Overidentification
A rule of thumb is that any p-value between 0.1 and 0.25 should be fine as it is reasonably larger than a significance level of 0.05 and it is reasonably small enough to not suspect severe p-value inflation. 

In [3]:
unstationary = 'ecgrowth_demeaned logpop_M_diff logpopdens_diff logoutreg_diff democracy_diff'
stationary = 'logmountain ethnic_fractionalization religion_fractionalization language_fractionalization leg_british opec'

columns_to_check = ['onset2COWCS','decade','logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']

path = pathlib.Path().resolve()
# iv = pd.read_csv(str(path).replace("code/Internal_IV", "data") + "/transportIV_file.csv", na_values= '#DIV/0!')

In [21]:
path

PosixPath('/Users/zwanran/Desktop/ECMA31320/final/ECMA31320-project/code/Internal_IV')

In [16]:
def load_data(predictor = "gvcomix"):
    df = pd.read_csv(str(path).replace("code/Internal_IV", "data") + "/" + predictor + "_data.csv", na_values= '#DIV/0!')
    # df = pd.merge(df, iv, on=["country", "t"])
    
    df[columns_to_check] = df[columns_to_check].replace([np.inf, -np.inf], np.nan)
    df = df.dropna(subset=columns_to_check)
    
    label = le.fit_transform(df['country'])
    df.drop('country', axis=1, inplace=True)
    df['country'] = label
    
    # df = df.set_index(['country', 't'])
    
    # df.fillna(0, inplace=True)
    return df

In [17]:
def reg(method=" ", additional_cntrl = False, time = False, iv = 'trans_outp_p'):
    # System GMM by default (method = "nolevel" produces difference GMM)
    # No stationary controls by default
    # IV = transport by default
    command_str = 'onset2COWCS L(1:?).onset2COWCS s' + sector + ' ' + iv + ' ' + unstationary + ' '
    if additional_cntrl:
        command_str = command_str + stationary
    command_str = command_str + ' | gmm(onset2COWCS, 2:3) gmm(s' + sector + ', 2:3) iv(' + iv + ') | ' + ' ' + method
    if time:
         command_str = command_str + ' timedumm'
    mydpd = regression.abond(command_str, df, ['country', 't']);
    return mydpd #.models[0] #.regression_table

In [18]:
def all_reg():
    # world average gvc participation IV.
    # # No time FE if we are using world-level average gvc participation as external IV
    ## System GMM
    m1 = reg(iv = 'avgs' + sector);
    ## Difference GMM
    m2 = reg(iv = 'avgs' + sector, method="nolevel");

    # Transport IV
    ## System GMM
    m3 = reg();
    m4 = reg(time = True);
    ## Difference GMM
    m5 = reg(method = "nolevel");
    m6 = reg(time = True, method="nolevel");

    results = [m1, m2, m3, m4, m5, m6]
    return results

In [19]:
def print_results(results):
    for s in range(len(results)): # traverse through four sectors
        if s == 0:
            print("******sector 1********")
        if s == 1:
            print("******sector 2********")
        if s == 2:
            print("******sector 6********")
        if s == 4:
            print("******sector 10*******")
        for idx in range(len(results[s])):
            if idx == 0:
                print("IV = World Average; Sys.")
            if idx == 1:
                print("IV = World Average; Dif.")
            if idx == 3:
                print("IV = Transport; Sys.")
            if idx == 4:
                print("IV = World Average; Sys; t-FE.")
            if idx == 5:
                print("IV = Transport; Dif.")
            if idx == 6:
                print("IV = World Average; Dif; t-FE.")
            m = results[idx].models[0]
            print(m.regression_table)
            print("AR(1) test P value: ", end=" ")
            print(m.AR_list[0].P_value)
            print("AR(2) test P value: ", end=" ")
            print(m.AR_list[1].P_value)
            print("Hansen test P value: ", end=" ")
            print(m.hansen.p_value)
            print("\n")

In [23]:
predictor_list = ["gvcomix", "gvcobp", "gvcofp"]
sector_list = ['1', '2', '6', '10']
results_list = []
for predictor in predictor_list:
    df = load_data(predictor);
    predictor_result = [] ## results for every predictor
                            ## in the order of s1, s2, s6, s10
    for sector in sector_list:
        predictor_result.append(all_reg());
    results_list.append(predictor_result) 
        

 m1
 Dynamic panel-data estimation, two-step system GMM
 Group variable: country                          Number of obs = 1646     
 Time variable: t                                 Min obs per group: 0     
 Number of instruments = 83                       Max obs per group: 13    
 Number of groups = 142                           Avg obs per group: 11.59 
+-------------------+-------------+---------------------+------------+-----------+---+
|    onset2COWCS    |    coef.    | Corrected Std. Err. |     z      |   P>|z|   |   |
+-------------------+-------------+---------------------+------------+-----------+---+
|   L1.onset2COWCS  |  0.0203546  |      0.0317503      | 0.6410837  | 0.5214683 |   |
|         s1        |  -0.0597897 |      0.1852820      | -0.3226957 | 0.7469257 |   |
|       avgs1       |  -0.3871658 |      1.4162941      | -0.2733654 | 0.7845724 |   |
| ecgrowth_demeaned |  -0.2211971 |      0.1016678      | -2.1756853 | 0.0295788 | * |
|   logpop_M_diff   |  5.563555

In [8]:
for idx in range(len(results_list)):
    print("========Predictor = ", predictor_list[idx], "========")
    sys.stdout = open(str(path).replace("code/Internal_IV", "output/Internal_IV/") +  predictor_list[idx] + ".txt", 'wt')
    print_results(results_list[idx])

IV = World Average; Sys.
         variable  coefficient   std_err   z_value   p_value sig
0  L1.onset2COWCS     0.048222  0.036888  1.307263  0.191123    
1              s1     2.554468  3.656845  0.698544  0.484837    
2           avgs1    -2.629226  3.554176 -0.739757  0.459448    
3        ecgrowth    -0.095577  0.152230 -0.627848  0.530103    
4        logpop_M     0.120985  0.061098  1.980185  0.047683   *
5      logpopdens    -0.053437  0.147177 -0.363080  0.716545    
6       logoutreg    -0.450222  0.500298 -0.899907  0.368170    
7            _con     0.078306  0.086990  0.900171  0.368029    
AR(1) test P value:  0.04959487197025249
AR(2) test P value:  0.9298475207023643
Hansen test P value:  0.6569328923149437


IV = World Average; Dif.
         variable  coefficient    std_err   z_value   p_value sig
0  L1.onset2COWCS     0.054148   0.037958  1.426539  0.153713    
1              s1     2.893515   3.944553  0.733547  0.463225    
2           avgs1   -11.560000  20.943313 -