In [2]:
import pandas as pd
import numpy as np
from linearmodels import IV2SLS
from linearmodels.panel import PanelOLS, PooledOLS
from linearmodels.panel import compare

No fixed affects, No stationary contrls, Transport IV

In [4]:
# Load the data
data = pd.read_csv('gvcomix_transformed1.csv')

# Drop rows with missing values in any of the columns used in the regression
columns_to_check = ['onset2COWCS','decade','logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']
data = data.dropna(subset=columns_to_check)

# Set the MultiIndex
data = data.set_index(['country', 't'])

# Define the dependent variable
dependent = data['onset2COWCS']

# Define the endogenous variable, exogenous controls, and instrument variable
endog = data['s6']
exog = data[['decade', 'logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']]
instr = data['outp_p']

# Create the regression formula
formula = 'onset2COWCS ~ 1 + [s6 ~ outp_p] + logpop_M_diff + logpopdens_diff + logoutreg_diff + ecgrowth_demeaned + democracy_diff'

# Run the 2SLS regression
model_iv = IV2SLS.from_formula(formula, data)
results_iv = model_iv.fit(cov_type='clustered', clusters=data.index.get_level_values('country'))

# Print the results
print(results_iv)

                          IV-2SLS Estimation Summary                          
Dep. Variable:            onset2COWCS   R-squared:                     -4.0289
Estimator:                    IV-2SLS   Adj. R-squared:                -4.0446
No. Observations:                1929   F-statistic:                    4.3294
Date:                Mon, May 15 2023   P-value (F-stat)                0.6322
Time:                        13:57:29   Distribution:                  chi2(6)
Cov. Estimator:             clustered                                         
                                                                              
                                 Parameter Estimates                                 
                   Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------------
Intercept             0.0524     0.0294     1.7840     0.0744     -0.0052      0.1099
democracy_diff       -0.

No fixed affects, No stationary controls, Fuel IV

In [26]:
# Load the data
data = pd.read_csv('gvcomix_transformed1.csv')

# Drop rows with missing values in any of the columns used in the regression
columns_to_check = ['onset2COWCS','decade','logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']
data = data.dropna(subset=columns_to_check)

# Set the MultiIndex
data = data.set_index(['country', 't'])

# Define the dependent variable
dependent = data['onset2COWCS']

# Define the endogenous variable, exogenous controls, and instrument variable
endog = data['s6']
exog = data[['decade','logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']]
instr = data['avgs6']

# Create the regression formula
formula = 'onset2COWCS ~ 1 + [s6 ~ avgs6] + decade + logpop_M_diff + logpopdens_diff + logoutreg_diff + ecgrowth_demeaned + democracy_diff'

# Run the 2SLS regression
model_iv = IV2SLS.from_formula(formula, data)
results_iv = model_iv.fit(cov_type='clustered', clusters=data.index.get_level_values('country'))

# Print the results
print(results_iv)

                          IV-2SLS Estimation Summary                          
Dep. Variable:            onset2COWCS   R-squared:                     -0.0691
Estimator:                    IV-2SLS   Adj. R-squared:                -0.0729
No. Observations:                1929   F-statistic:                    12.231
Date:                Tue, May 09 2023   P-value (F-stat)                0.0932
Time:                        19:11:26   Distribution:                  chi2(7)
Cov. Estimator:             clustered                                         
                                                                              
                                 Parameter Estimates                                 
                   Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------------
Intercept             0.0764     0.0367     2.0790     0.0376      0.0044      0.1484
decade               -0.

NO Fixed effects, YES stationary controls, Transport IV

In [5]:
# Load the data
data = pd.read_csv('gvcomix_transformed1.csv')

# Drop rows with missing values in any of the columns used in the regression
columns_to_check = ['onset2COWCS','decade',
                         'logmountain', 'ethnic_fractionalization',
                         'religion_fractionalization', 'language_fractionalization',
                         'leg_british', 'opec',
                         'logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']
data = data.dropna(subset=columns_to_check)

# Set the MultiIndex
data = data.set_index(['country', 't'])

# Define the dependent variable
dependent = data['onset2COWCS']

# Define the endogenous variable, exogenous controls, and instrument variable
endog = data['s6']
exog = data[['decade',
                'logmountain', 'ethnic_fractionalization',
                'religion_fractionalization', 'language_fractionalization',
                'leg_british', 'opec',
                'logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']]
instr = data['outp_p']

# Create the regression formula
formula = 'onset2COWCS ~ 1 + [s6 ~ outp_p] + decade + logpop_M_diff + logpopdens_diff + logoutreg_diff + ecgrowth_demeaned + democracy_diff + logmountain + ethnic_fractionalization + religion_fractionalization + language_fractionalization + leg_british + opec'

# Run the 2SLS regression
model_iv = IV2SLS.from_formula(formula, data)
results_iv = model_iv.fit(cov_type='clustered', clusters=data.index.get_level_values('country'))

# Print the results
print(results_iv)

                          IV-2SLS Estimation Summary                          
Dep. Variable:            onset2COWCS   R-squared:                     -2.4868
Estimator:                    IV-2SLS   Adj. R-squared:                -2.5156
No. Observations:                1585   F-statistic:                    11.524
Date:                Mon, May 15 2023   P-value (F-stat)                0.5670
Time:                        13:58:05   Distribution:                 chi2(13)
Cov. Estimator:             clustered                                         
                                                                              
                                     Parameter Estimates                                      
                            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
----------------------------------------------------------------------------------------------
Intercept                      0.0205     0.1027     0.1992     0.8421     -0.1808 

NO Fixed effect, YES stationary controls, FUEL IV

In [6]:
# Load the data
data = pd.read_csv('gvcomix_transformed1.csv')

# Drop rows with missing values in any of the columns used in the regression
columns_to_check = ['onset2COWCS','decade',
                         'logmountain', 'ethnic_fractionalization',
                         'religion_fractionalization', 'language_fractionalization',
                         'leg_british', 'opec',
                         'logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']
data = data.dropna(subset=columns_to_check)

# Set the MultiIndex
data = data.set_index(['country', 't'])

# Define the dependent variable
dependent = data['onset2COWCS']

# Define the endogenous variable, exogenous controls, and instrument variable
endog = data['s6']
exog = data[['decade',
                'logmountain', 'ethnic_fractionalization',
                'religion_fractionalization', 'language_fractionalization',
                'leg_british', 'opec',
                'logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']]
instr = data['avgs6']

# Create the regression formula
formula = 'onset2COWCS ~ 1 + [s6 ~ avgs6] + decade + logpop_M_diff + logpopdens_diff + logoutreg_diff + ecgrowth_demeaned + democracy_diff + logmountain + ethnic_fractionalization + religion_fractionalization + language_fractionalization + leg_british + opec'

# Run the 2SLS regression
model_iv = IV2SLS.from_formula(formula, data)
results_iv = model_iv.fit(cov_type='clustered', clusters=data.index.get_level_values('country'))

# Print the results
print(results_iv)


                          IV-2SLS Estimation Summary                          
Dep. Variable:            onset2COWCS   R-squared:                     -0.0857
Estimator:                    IV-2SLS   Adj. R-squared:                -0.0947
No. Observations:                1585   F-statistic:                    17.368
Date:                Mon, May 15 2023   P-value (F-stat)                0.1830
Time:                        13:58:24   Distribution:                 chi2(13)
Cov. Estimator:             clustered                                         
                                                                              
                                     Parameter Estimates                                      
                            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
----------------------------------------------------------------------------------------------
Intercept                      0.0945     0.0425     2.2257     0.0260      0.0113 

YES Fixed effects, NO stationary controls, Tranport IV

In [7]:
# Load the data
data = pd.read_csv('gvcomix_transformed1.csv')

# Set the MultiIndex
data = data.set_index(['country', 't'])

# Drop rows with missing values in any of the columns used in the regression
columns_to_check = ['onset2COWCS', 's6', 's19', 'logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']
data = data.dropna(subset=columns_to_check)

# Create country and year dummies for fixed effects
country_dummies = pd.get_dummies(data.index.get_level_values('country'), drop_first=True, prefix='C').astype(str)
year_dummies = pd.get_dummies(data.index.get_level_values('t'), drop_first=True, prefix='Y').astype(str)
dummies = pd.concat([country_dummies, year_dummies], axis=1)

# Reset index for data and concatenate with dummies
data = data.reset_index()
data_with_dummies = pd.concat([data, dummies], axis=1)

# Store the original index for later use
original_index = data_with_dummies.set_index(['country', 't']).index

# Create the regression formula including the dummies
fixed_effects = ' + '.join(dummies.columns)
formula = f'onset2COWCS ~ 1 + [s6 ~ outp_p] + logpop_M_diff + logpopdens_diff + logoutreg_diff + ecgrowth_demeaned + democracy_diff + {fixed_effects}'

# Run the 2SLS regression with country and time fixed effects
model_iv = IV2SLS.from_formula(formula, data_with_dummies)
results_iv = model_iv.fit(cov_type='clustered', clusters=original_index.get_level_values('country'))

# Print the results
print(results_iv)

                          IV-2SLS Estimation Summary                          
Dep. Variable:            onset2COWCS   R-squared:                     -2.3783
Estimator:                    IV-2SLS   Adj. R-squared:                -2.6945
No. Observations:                1929   F-statistic:                -9.629e+14
Date:                Mon, May 15 2023   P-value (F-stat)                1.0000
Time:                        13:58:38   Distribution:                chi2(165)
Cov. Estimator:             clustered                                         
                                                                              
                                 Parameter Estimates                                 
                   Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------------
Intercept             0.1847     0.0337     5.4773     0.0000      0.1186      0.2508
C_ALB[T.True]        -0.

YES fixed effects(no time), NO stationary controls, FUEL IV

In [8]:
# Load the data
data = pd.read_csv('gvcomix_transformed1.csv')

# Set the MultiIndex
data = data.set_index(['country', 't'])

# Drop rows with missing values in any of the columns used in the regression
columns_to_check = ['onset2COWCS', 's6', 'logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']
data = data.dropna(subset=columns_to_check)

# Create country and year dummies for fixed effects
country_dummies = pd.get_dummies(data.index.get_level_values('country'), drop_first=True, prefix='C').astype(str)
dummies = pd.concat([country_dummies], axis=1)

# Reset index for data and concatenate with dummies
data = data.reset_index()
data_with_dummies = pd.concat([data, dummies], axis=1)

# Store the original index for later use
original_index = data_with_dummies.set_index(['country', 't']).index

# Create the regression formula including the dummies
fixed_effects = ' + '.join(dummies.columns)
formula = f'onset2COWCS ~ 1 + [s6 ~ avgs6] + logpop_M_diff + logpopdens_diff + logoutreg_diff + ecgrowth_demeaned + democracy_diff + {fixed_effects}'

# Run the 2SLS regression with country and time fixed effects
model_iv = IV2SLS.from_formula(formula, data_with_dummies)
results_iv = model_iv.fit(cov_type='clustered', clusters=original_index.get_level_values('country'))

# Print the results
print(results_iv)

                          IV-2SLS Estimation Summary                          
Dep. Variable:            onset2COWCS   R-squared:                      0.0830
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0051
No. Observations:                1929   F-statistic:                 6.344e+16
Date:                Mon, May 15 2023   P-value (F-stat)                0.0000
Time:                        14:01:00   Distribution:                chi2(151)
Cov. Estimator:             clustered                                         
                                                                              
                                 Parameter Estimates                                 
                   Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------------
Intercept             0.1545     0.0103     15.018     0.0000      0.1343      0.1746
C_ALB[T.True]        -0.

no IV as comparision 

In [53]:
# Load the data
data = pd.read_csv('gvcomix_transformed1.csv')

# Set the MultiIndex
data = data.set_index(['country', 't'])

# Drop rows with missing values in any of the columns used in the regression
columns_to_check = ['onset2COWCS',
                         'ethnic_fractionalization',
                         'religion_fractionalization', 'language_fractionalization',
                         'opec',
                         'logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']
data = data.dropna(subset=columns_to_check)

# Create country and year dummies for fixed effects
country_dummies = pd.get_dummies(data.index.get_level_values('country'), drop_first=True, prefix='C').astype(str)
year_dummies = pd.get_dummies(data.index.get_level_values('t'), drop_first=True, prefix='Y').astype(str)
dummies = pd.concat([country_dummies, year_dummies], axis=1)

# Reset index for data and concatenate with dummies
data = data.reset_index()
data_with_dummies = pd.concat([data, dummies], axis=1)

# Store the original index for later use
original_index = data_with_dummies.set_index(['country', 't']).index

# Create the regression formula including the dummies
fixed_effects = ' + '.join(dummies.columns)
formula = f'onset2COWCS ~ 1 + [s6] + logpop_M_diff + ecgrowth_demeaned + democracy_diff + opec + {fixed_effects}'

# Run the 2SLS regression with country and time fixed effects
model_iv = IV2SLS.from_formula(formula, data_with_dummies)
results_iv = model_iv.fit(cov_type='clustered', clusters=original_index.get_level_values('country'))

# Print the results
print(results_iv)

                            OLS Estimation Summary                            
Dep. Variable:            onset2COWCS   R-squared:                      0.1900
Estimator:                        OLS   Adj. R-squared:                 0.1152
No. Observations:                1859   F-statistic:                 6.816e+18
Date:                Tue, May 09 2023   P-value (F-stat)                0.0000
Time:                        19:40:56   Distribution:                chi2(157)
Cov. Estimator:             clustered                                         
                                                                              
                                 Parameter Estimates                                 
                   Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------------
Intercept             0.1752     0.0219     7.9969     0.0000      0.1323      0.2182
C_ALB[T.True]        -0.

In [9]:
# create an empty dataframe with the desired columns
df = pd.DataFrame(columns=['outcome', 'independent', 'instrument', 'IV Estimator', 'std_error', 'p_val', 'fixed effects', 'stationary controls'])

# add some data to the dataframe
df.loc[0] = ['onset2COWCS', 's6_mix', 'transport_IV', -4.2521, 1.8918, 0.0246, 'no', 'no']
df.loc[1] = ["onset2COWCS", 's6_mix', 'Fuel_IV', -1.2507, 1.0796, 0.2467, 'no', 'no']
df.loc[2] = ['onset2COWCS', 's6_mix', 'transport_IV', -3.3937, 1.8639, 0.0686, 'no', 'yes']
df.loc[3] = ["onset2COWCS", 's6_mix', 'Fuel_IV', -1.8195, 1.3629, 0.1819, 'no', 'yes']
df.loc[4] = ["onset2COWCS", 's6_mix', 'transport_IV', 0.1454, 1.1678, 0.9009, 'yes', 'no']
df.loc[5] = ["onset2COWCS", 's6_mix', 'Fuel_IV', 0.2775, 0.2319, 0.2315, 'yes', 'no']

# print the dataframe
print(df)

       outcome independent    instrument  IV Estimator  std_error   p_val   
0  onset2COWCS      s6_mix  transport_IV       -4.2521     1.8918  0.0246  \
1  onset2COWCS      s6_mix       Fuel_IV       -1.2507     1.0796  0.2467   
2  onset2COWCS      s6_mix  transport_IV       -3.3937     1.8639  0.0686   
3  onset2COWCS      s6_mix       Fuel_IV       -1.8195     1.3629  0.1819   
4  onset2COWCS      s6_mix  transport_IV        0.1454     1.1678  0.9009   
5  onset2COWCS      s6_mix       Fuel_IV        0.2775     0.2319  0.2315   

  fixed effects stationary controls  
0            no                  no  
1            no                  no  
2            no                 yes  
3            no                 yes  
4           yes                  no  
5           yes                  no  


In [11]:
# Load the data
data = pd.read_csv('gvcomix_transformed1.csv')

# Set the MultiIndex
data = data.set_index(['country', 't'])

# Drop rows with missing values in any of the columns used in the regression
columns_to_check = ['onset2COWCS', 's6', 's19', 'logpop_M_diff', 'logpopdens_diff', 'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff']
data = data.dropna(subset=columns_to_check)

# Create country and year dummies for fixed effects
year_dummies = pd.get_dummies(data.index.get_level_values('t'), drop_first=True, prefix='Y').astype(str)
dummies = pd.concat([year_dummies], axis=1)

# Reset index for data and concatenate with dummies
data = data.reset_index()
data_with_dummies = pd.concat([data, dummies], axis=1)

# Store the original index for later use
original_index = data_with_dummies.set_index(['country', 't']).index

# Create the regression formula including the dummies
fixed_effects = ' + '.join(dummies.columns)
formula = f'onset2COWCS ~ 1 + [s6 ~ avgs6] + logpop_M_diff + logpopdens_diff + logoutreg_diff + ecgrowth_demeaned + democracy_diff + {fixed_effects}'

# Run the 2SLS regression with country and time fixed effects
model_iv = IV2SLS.from_formula(formula, data_with_dummies)
results_iv = model_iv.fit(cov_type='clustered', clusters=original_index.get_level_values('country'))

# Print the results
print(results_iv)

ValueError: instruments [exog instruments]  do not have full column rank