## Coding Discussion 7
#### Tianwei Liu
Nov 15th 2019

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read in all datasets
conflict = pd.read_csv ('acled_conflict_data.csv')
GDP = pd.read_csv('API_NY.GDP.MKTP.KD.ZG_DS2_en_csv_v2_422196.csv',skiprows = 4)
labor_force = pd.read_csv('API_SL.TLF.TOTL.IN_DS2_en_csv_v2_423580.csv',skiprows = 4)
population = pd.read_csv('API_SP.POP.TOTL_DS2_en_csv_v2_422125.csv',skiprows = 4)
# GDP.head()
# labor_force.head()
# population.head()

In [642]:
GDP = GDP.drop(['Unnamed: 64'], axis = 1)
labor_force = labor_force.drop(['Unnamed: 64'], axis = 1)
population = population.drop(['Unnamed: 64'], axis = 1)

In [643]:
## First we notice in the conflict dataset, there are some unwanted strings in the country column
conflict['country'] = conflict['country'].replace('Country Name = ','',regex=True).str.strip('*').str.capitalize()
#conflict['country'] = conflict['country'].str.capitalize()
conflict['country'].unique()

array(['Zimbabwe', 'Zambia', 'Uganda', 'Tunisia', 'Togo', 'Tanzania',
       'Sudan', 'South sudan', 'South africa', 'Somalia', 'Sierra leone',
       'Senegal', 'Rwanda', 'Republic of congo', 'Nigeria', 'Niger',
       'Namibia', 'Mozambique', 'Morocco', 'Mauritania', 'Mali', 'Malawi',
       'Madagascar', 'Libya', 'Liberia', 'Lesotho', 'Kenya',
       'Ivory coast', 'Guinea-bissau', 'Guinea', 'Ghana', 'Gambia',
       'Gabon', 'Ethiopia', 'Eswatini', 'Eritrea', 'Equatorial guinea',
       'Egypt', 'Djibouti', 'Democratic republic of congo', 'Chad',
       'Central african republic', 'Cameroon', 'Burundi', 'Burkina faso',
       'Botswana', 'Benin', 'Angola', 'Algeria'], dtype=object)

In [644]:
## Then we extract year from the date column, because the unit of analysis is country_year

conflict['date'] = pd.to_datetime(conflict['date'])
conflict["year"] = conflict["date"].dt.year
## Remove observations for 2019
conflict = conflict[conflict.year != 2019]

In [645]:
## As we need to perform join later, we need to make sure that the datatypes of the indices are consistent across dataframes
conflict.year.dtype

dtype('int64')

In [646]:
## And since they are not, we need to add necessary steps to correct for datatypes before we merge.
## I will do this later in the notebook.

In [647]:
## Count based on unit of analysis

africa_violence = conflict.groupby(['country','year']).sum()
africa_violence

Unnamed: 0_level_0,Unnamed: 1_level_0,non_violence,violence
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Algeria,1997,0,130
Algeria,1998,1,57
Algeria,1999,0,56
Algeria,2000,2,165
Algeria,2001,109,142
Algeria,2002,54,207
Algeria,2003,52,161
Algeria,2004,53,87
Algeria,2005,25,81
Algeria,2006,17,176


In [648]:
## check if there are NA's
conflict.isna().values.any()

False

In [6]:
GDP97_18 = pd.concat([GDP['Country Name'],GDP[np.arange(1997,2019).astype('str')]], axis = 1).set_index(['Country Name']).stack().reset_index().rename(columns = {'level_1' : 'year', 0:"GDP", 'Country Name':'country'})
GDP97_18.year = GDP97_18.year.astype(int) ## Convert year index to int type
GDP97_18 = GDP97_18.set_index(['country','year'])
display(GDP97_18)

Unnamed: 0_level_0,Unnamed: 1_level_0,GDP
country,year,Unnamed: 2_level_1
Aruba,1997,7.046872
Aruba,1998,1.991988
Aruba,1999,1.238039
Aruba,2000,7.616590
Aruba,2001,-2.971257
Aruba,2002,-3.273647
Aruba,2003,1.975547
Aruba,2004,7.911565
Aruba,2005,1.214349
Aruba,2006,1.050607


In [650]:
labor_force97 = pd.concat([labor_force['Country Name'],labor_force[np.arange(1997,2019).astype('str')]], axis = 1).set_index(['Country Name']).stack().reset_index()
labor_force97.level_1 = labor_force97.level_1.astype(int) ## Convert year index to int type
labor_force97 = labor_force97.rename(columns = {'level_1' : 'year', 0:"labor_force", 'Country Name':'country'}).set_index(['country',"year"])
#display(labor_force97)

In [651]:
population97 = pd.concat([population['Country Name'],population[np.arange(1997,2019).astype('str')]], axis = 1).set_index(['Country Name']).stack().reset_index()
population97.level_1 = population97.level_1.astype(int) ## Convert year index to int type
population97 = population97.rename(columns = {'level_1' : 'year', 0:"population", 'Country Name':'country'}).set_index(['country',"year"])
#display(population97)

In [652]:
df = africa_violence.join(GDP97_18).join(labor_force97).join(population97)

In [653]:
#df.head(20)
df.shape

(1045, 5)

In [654]:
## Fill the NA's
df = df.reset_index()
df["population"] = df.groupby("country")["population"].transform(lambda x: x.fillna(x.mean()))
df["GDP"] = df.groupby("country")["GDP"].transform(lambda x: x.fillna(x.mean()))
df["labor_force"] = df.groupby("country")["labor_force"].transform(lambda x: x.fillna(x.mean()))
df = df.set_index(['country','year'])

In [655]:
df.skew()

non_violence    6.492594
violence        5.229086
GDP             5.924089
labor_force     2.932530
population      3.556566
dtype: float64

In [656]:
print (0 in list(df.non_violence),0 in list(df.violence),0 in list(df.GDP), 0 in list(df.labor_force), 0 in list(df.population))

True True True False False


In [657]:
## for variables containing zero, add 1
## Since all variables are right-skews, we log all five variables

df['non_violence'] = np.log(df['non_violence'] + 1)
df['violence'] = np.log(df['violence'] + 1)
df['GDP'] = np.log(df['GDP'] + 1)
df['labor_force'] = np.log(df['labor_force'])
df['population'] = np.log(df['population'])

  


In [658]:
import statsmodels.api as sm

In [659]:
X = df[['non_violence', 'violence', 'population', 'labor_force']]
Y = df['GDP']
X = sm.add_constant(X)

In [660]:
reg = sm.OLS(Y,X, missing = "drop").fit()
reg.summary()

0,1,2,3
Dep. Variable:,GDP,R-squared:,0.03
Model:,OLS,Adj. R-squared:,0.024
Method:,Least Squares,F-statistic:,5.504
Date:,"Fri, 15 Nov 2019",Prob (F-statistic):,0.000228
Time:,09:16:55,Log-Likelihood:,-813.45
No. Observations:,729,AIC:,1637.0
Df Residuals:,724,BIC:,1660.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0267,0.505,-0.053,0.958,-1.019,0.965
non_violence,-0.0308,0.022,-1.391,0.165,-0.074,0.013
violence,-0.0122,0.021,-0.576,0.565,-0.054,0.029
population,-0.0489,0.170,-0.287,0.774,-0.383,0.285
labor_force,0.1679,0.161,1.043,0.297,-0.148,0.484

0,1,2,3
Omnibus:,402.742,Durbin-Watson:,1.493
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4343.527
Skew:,-2.268,Prob(JB):,0.0
Kurtosis:,14.064,Cond. No.,433.0


## Intepretation

It appears that instability does not have a significant effect on GDP growth, as coefficients on both **non_violence** and **violence** are close to 0 and their p-value greater than conventional 0.05 level of significance.

Since the t-stat for non_violence is bigger (absolute value), therefore it has a bigger impact.

As population, labor force and GDP growth rates are time-series data, they are very likely to be auto-correlated. I suggest these variables be adjusted. Otherwise, autocorrelation could lead to higher variance of our coefficient thereby undermine the precision.