In [1]:
import pandas as pd
import numpy as np
from linearmodels import IV2SLS

No fixed affects, No stationary contrls, Transport IV

In [26]:
# Load the data
data = pd.read_csv('gvcofp_transformed1.csv')

# Drop rows with missing values in any of the columns used in the regression
columns_to_check = ['onset2COWCS','decade','logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']

# Replace infinite values with NaN
data[columns_to_check] = data[columns_to_check].replace([np.inf, -np.inf], np.nan)

# Drop rows with NaN in these columns
data = data.dropna(subset=columns_to_check)


# Set the MultiIndex
data = data.set_index(['country', 't'])

# Define the dependent variable
dependent = data['onset2COWCS']

# Create the regression formula
formula = 'onset2COWCS ~ 1 + [s10 ~ trans_outp_p] + decade + logpop_M_diff + logpopdens_diff + logoutreg_diff + ecgrowth_demeaned + democracy_diff'

# Run the 2SLS regression
model_iv = IV2SLS.from_formula(formula, data)
results_iv = model_iv.fit(cov_type='clustered', clusters=data.index.get_level_values('country'))

# Print the results
print(results_iv)

                          IV-2SLS Estimation Summary                          
Dep. Variable:            onset2COWCS   R-squared:                     -0.5568
Estimator:                    IV-2SLS   Adj. R-squared:                -0.5622
No. Observations:                2033   F-statistic:                    9.5079
Date:                Wed, May 17 2023   P-value (F-stat)                0.2182
Time:                        14:31:58   Distribution:                  chi2(7)
Cov. Estimator:             clustered                                         
                                                                              
                                 Parameter Estimates                                 
                   Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------------
Intercept             0.0309     0.0498     0.6197     0.5354     -0.0668      0.1285
decade               -0.

In [7]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

df = pd.read_csv('gvcofp_transformed1.csv')
columns_to_check = ['onset2COWCS','decade','logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']
df = df.dropna(subset=columns_to_check)
df = df.dropna()  # drop rows with NaN values
df = df.replace([np.inf, -np.inf], np.nan).dropna()  # replace inf with NaN and then drop

X = df[['decade', 'logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']]
X = sm.add_constant(X)  # adding a constant

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

print(vif_data)

             feature           VIF
0              const  2.351541e+02
1             decade  1.030383e+00
2      logpop_M_diff  9.027617e+08
3    logpopdens_diff  9.027621e+08
4     logoutreg_diff  1.010468e+00
5  ecgrowth_demeaned  1.052527e+00
6     democracy_diff  1.005451e+00


No fixed affects, No stationary controls, Avg GVC IV

In [27]:
# Load the data
data = pd.read_csv('gvcofp_transformed1.csv')

# Drop rows with missing values in any of the columns used in the regression
columns_to_check = ['onset2COWCS','decade','logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']
# Replace infinite values with NaN
data[columns_to_check] = data[columns_to_check].replace([np.inf, -np.inf], np.nan)

# Drop rows with NaN in these columns
data = data.dropna(subset=columns_to_check)

# Set the MultiIndex
data = data.set_index(['country', 't'])

# Define the dependent variable
dependent = data['onset2COWCS']

# Create the regression formula
formula = 'onset2COWCS ~ 1 + [s10 ~ avgs10] + decade + logpop_M_diff + logpopdens_diff + logoutreg_diff + ecgrowth_demeaned + democracy_diff'

# Run the 2SLS regression
model_iv = IV2SLS.from_formula(formula, data)
results_iv = model_iv.fit(cov_type='clustered', clusters=data.index.get_level_values('country'))

# Print the results
print(results_iv)

                          IV-2SLS Estimation Summary                          
Dep. Variable:            onset2COWCS   R-squared:                      0.0106
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0072
No. Observations:                2033   F-statistic:                    11.429
Date:                Wed, May 17 2023   P-value (F-stat)                0.1210
Time:                        14:33:21   Distribution:                  chi2(7)
Cov. Estimator:             clustered                                         
                                                                              
                                 Parameter Estimates                                 
                   Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------------
Intercept             0.0775     0.0336     2.3090     0.0209      0.0117      0.1432
decade               -0.

NO Fixed effects, YES stationary controls, Transport IV

In [28]:
# Load the data
data = pd.read_csv('gvcofp_transformed1.csv')

# Drop rows with missing values in any of the columns used in the regression
columns_to_check = ['onset2COWCS','decade',
                         'logmountain', 'ethnic_fractionalization',
                         'religion_fractionalization', 'language_fractionalization',
                         'leg_british', 'opec',
                         'logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']
data[columns_to_check] = data[columns_to_check].replace([np.inf, -np.inf], np.nan)

# Drop rows with NaN in these columns
data = data.dropna(subset=columns_to_check)

# Set the MultiIndex
data = data.set_index(['country', 't'])

# Define the dependent variable
dependent = data['onset2COWCS']

# Create the regression formula
formula = 'onset2COWCS ~ 1 + [s10 ~ trans_outp_p] + decade + logpop_M_diff + logpopdens_diff + logoutreg_diff + ecgrowth_demeaned + democracy_diff + logmountain + ethnic_fractionalization + religion_fractionalization + language_fractionalization + leg_british + opec'

# Run the 2SLS regression
model_iv = IV2SLS.from_formula(formula, data)
results_iv = model_iv.fit(cov_type='clustered', clusters=data.index.get_level_values('country'))

# Print the results
print(results_iv)

                          IV-2SLS Estimation Summary                          
Dep. Variable:            onset2COWCS   R-squared:                     -0.5719
Estimator:                    IV-2SLS   Adj. R-squared:                -0.5845
No. Observations:                1646   F-statistic:                    19.573
Date:                Wed, May 17 2023   P-value (F-stat)                0.1064
Time:                        14:33:52   Distribution:                 chi2(13)
Cov. Estimator:             clustered                                         
                                                                              
                                     Parameter Estimates                                      
                            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
----------------------------------------------------------------------------------------------
Intercept                      0.0544     0.0393     1.3840     0.1664     -0.0227 

NO Fixed effect, YES stationary controls, avg gvc IV

In [29]:
# Load the data
data = pd.read_csv('gvcofp_transformed1.csv')

# Drop rows with missing values in any of the columns used in the regression
columns_to_check = ['onset2COWCS','decade',
                         'logmountain', 'ethnic_fractionalization',
                         'religion_fractionalization', 'language_fractionalization',
                         'leg_british', 'opec',
                         'logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']
# Replace infinite values with NaN
data[columns_to_check] = data[columns_to_check].replace([np.inf, -np.inf], np.nan)

# Drop rows with NaN in these columns
data = data.dropna(subset=columns_to_check)

# Set the MultiIndex
data = data.set_index(['country', 't'])

# Define the dependent variable
dependent = data['onset2COWCS']

# Create the regression formula
formula = 'onset2COWCS ~ 1 + [s10 ~ avgs10] + decade + logpop_M_diff + logpopdens_diff + logoutreg_diff + ecgrowth_demeaned + democracy_diff + logmountain + ethnic_fractionalization + religion_fractionalization + language_fractionalization + leg_british + opec'

# Run the 2SLS regression
model_iv = IV2SLS.from_formula(formula, data)
results_iv = model_iv.fit(cov_type='clustered', clusters=data.index.get_level_values('country'))

# Print the results
print(results_iv)


                          IV-2SLS Estimation Summary                          
Dep. Variable:            onset2COWCS   R-squared:                      0.0302
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0224
No. Observations:                1646   F-statistic:                    19.466
Date:                Wed, May 17 2023   P-value (F-stat)                0.1093
Time:                        14:34:26   Distribution:                 chi2(13)
Cov. Estimator:             clustered                                         
                                                                              
                                     Parameter Estimates                                      
                            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
----------------------------------------------------------------------------------------------
Intercept                      0.0858     0.0394     2.1786     0.0294      0.0086 

YES Fixed effects, NO stationary controls, Tranport IV

In [30]:
# Load the data
data = pd.read_csv('gvcofp_transformed1.csv')

# Set the MultiIndex
data = data.set_index(['country', 't'])

# Drop rows with missing values in any of the columns used in the regression
columns_to_check = ['onset2COWCS', 's6', 's19', 'logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']
# Replace infinite values with NaN
data[columns_to_check] = data[columns_to_check].replace([np.inf, -np.inf], np.nan)

# Drop rows with NaN in these columns
data = data.dropna(subset=columns_to_check)

# Create country and year dummies for fixed effects
country_dummies = pd.get_dummies(data.index.get_level_values('country'), drop_first=True, prefix='C').astype(str)
year_dummies = pd.get_dummies(data.index.get_level_values('t'), drop_first=True, prefix='Y').astype(str)
dummies = pd.concat([country_dummies, year_dummies], axis=1)

# Reset index for data and concatenate with dummies
data = data.reset_index()
data_with_dummies = pd.concat([data, dummies], axis=1)

# Store the original index for later use
original_index = data_with_dummies.set_index(['country', 't']).index

# Create the regression formula including the dummies
fixed_effects = ' + '.join(dummies.columns)
formula = f'onset2COWCS ~ 1 + [s10 ~ trans_outp_p] + logpop_M_diff + logpopdens_diff + logoutreg_diff + ecgrowth_demeaned + democracy_diff + {fixed_effects}'

# Run the 2SLS regression with country and time fixed effects
model_iv = IV2SLS.from_formula(formula, data_with_dummies)
results_iv = model_iv.fit(cov_type='clustered', clusters=original_index.get_level_values('country'))

# Print the results
print(results_iv)

                          IV-2SLS Estimation Summary                          
Dep. Variable:            onset2COWCS   R-squared:                      0.1637
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0884
No. Observations:                2033   F-statistic:                  -4.1e+14
Date:                Wed, May 17 2023   P-value (F-stat)                1.0000
Time:                        14:35:04   Distribution:                chi2(168)
Cov. Estimator:             clustered                                         
                                                                              
                                 Parameter Estimates                                 
                   Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------------
Intercept             0.3100     0.0481     6.4500     0.0000      0.2158      0.4042
C_AGO[T.True]        -0.

In [31]:
# Load the data
data = pd.read_csv('gvcofp_transformed1.csv')

# Set the MultiIndex
data = data.set_index(['country', 't'])

# Drop rows with missing values in any of the columns used in the regression
columns_to_check = ['onset2COWCS', 's6', 's19', 'logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']
# Replace infinite values with NaN
data[columns_to_check] = data[columns_to_check].replace([np.inf, -np.inf], np.nan)

# Drop rows with NaN in these columns
data = data.dropna(subset=columns_to_check)

# Create country and year dummies for fixed effects
country_dummies = pd.get_dummies(data.index.get_level_values('country'), drop_first=True, prefix='C').astype(str)
dummies = pd.concat([country_dummies], axis=1)

# Reset index for data and concatenate with dummies
data = data.reset_index()
data_with_dummies = pd.concat([data, dummies], axis=1)

# Store the original index for later use
original_index = data_with_dummies.set_index(['country', 't']).index

# Create the regression formula including the dummies
fixed_effects = ' + '.join(dummies.columns)
formula = f'onset2COWCS ~ 1 + [s10 ~ trans_outp_p] + logpop_M_diff + logpopdens_diff + logoutreg_diff + ecgrowth_demeaned + democracy_diff + {fixed_effects}'

# Run the 2SLS regression with country and time fixed effects
model_iv = IV2SLS.from_formula(formula, data_with_dummies)
results_iv = model_iv.fit(cov_type='clustered', clusters=original_index.get_level_values('country'))

# Print the results
print(results_iv)

                          IV-2SLS Estimation Summary                          
Dep. Variable:            onset2COWCS   R-squared:                     -0.0695
Estimator:                    IV-2SLS   Adj. R-squared:                -0.1572
No. Observations:                2033   F-statistic:                -1.527e+14
Date:                Wed, May 17 2023   P-value (F-stat)                1.0000
Time:                        14:35:40   Distribution:                chi2(154)
Cov. Estimator:             clustered                                         
                                                                              
                                 Parameter Estimates                                 
                   Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------------
Intercept             0.2931     0.0239     12.282     0.0000      0.2463      0.3399
C_AGO[T.True]        -0.

YES fixed effects(no time), NO stationary controls, FUEL IV

In [32]:
# Load the data
data = pd.read_csv('gvcofp_transformed1.csv')

# Set the MultiIndex
data = data.set_index(['country', 't'])

# Drop rows with missing values in any of the columns used in the regression
columns_to_check = ['onset2COWCS', 's6', 'logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']
# Replace infinite values with NaN
data[columns_to_check] = data[columns_to_check].replace([np.inf, -np.inf], np.nan)

# Drop rows with NaN in these columns
data = data.dropna(subset=columns_to_check)

# Create country and year dummies for fixed effects
country_dummies = pd.get_dummies(data.index.get_level_values('country'), drop_first=True, prefix='C').astype(str)
dummies = pd.concat([country_dummies], axis=1)

# Reset index for data and concatenate with dummies
data = data.reset_index()
data_with_dummies = pd.concat([data, dummies], axis=1)

# Store the original index for later use
original_index = data_with_dummies.set_index(['country', 't']).index

# Create the regression formula including the dummies
fixed_effects = ' + '.join(dummies.columns)
formula = f'onset2COWCS ~ 1 + [s10 ~ avgs10] + logpop_M_diff + logpopdens_diff + logoutreg_diff + ecgrowth_demeaned + democracy_diff + {fixed_effects}'

# Run the 2SLS regression with country and time fixed effects
model_iv = IV2SLS.from_formula(formula, data_with_dummies)
results_iv = model_iv.fit(cov_type='clustered', clusters=original_index.get_level_values('country'))

# Print the results
print(results_iv)

                          IV-2SLS Estimation Summary                          
Dep. Variable:            onset2COWCS   R-squared:                      0.1737
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1060
No. Observations:                2033   F-statistic:                -1.691e+16
Date:                Wed, May 17 2023   P-value (F-stat)                1.0000
Time:                        14:36:23   Distribution:                chi2(154)
Cov. Estimator:             clustered                                         
                                                                              
                                 Parameter Estimates                                 
                   Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------------
Intercept             0.2833     0.0128     22.180     0.0000      0.2583      0.3084
C_AGO[T.True]        -0.