In [1]:
import os

import numpy as np
import pandas as pd
import networkx as nx
import pickle
from tqdm import tqdm
import geopandas as gpd

from scipy import stats

import re
import unidecode

In [2]:
with open(os.path.join("pickle", 'conflict.pickle'), 'rb') as data_source:
    conflict_df = pickle.load(data_source)

In [3]:
with open(os.path.join("pickle", 'refugee.pickle'), 'rb') as data_source:
    refugee_df = pickle.load(data_source)

# Combined displacement and event dataframe

The goal of this section is to create a dataframe that links the displacement information given by the UNHCR with the events and deaths given by the GED dataset. This will be grouped by year and will keep track of the number of events and of the event identifiers.

The columns will be:

|country_id|year|number_of_events|events_id|displacement_extern|displacement_intern|total_displacement|deaths|

where number_of_events is the number of events in a country in a year
where events_id is an array containing the id of each event accounted for
where displacement_extern is the sum of [refugee, asylum, stateless, others]
where displacement_intern is the value of internally_displaced

In [4]:
COUNTRY_HUMAN_COST_DF_COLUMNS = ["country_id", "year", "number_of_events", "events_id", "displacement_extern", "displacement_intern", "total_displacement", "deaths"]

## Manage index and data types

In [5]:
# Sort conflict dataframe so we can easily iter through it
sorted_conflict_df = conflict_df.sort_values(["year", "country_id"], axis=0)
sorted_conflict_df.head(2)

Unnamed: 0,id,year,type_of_violence,conflict_new_id,conflict_name,side_a_new_id,gwnoa,side_a,gwnob,side_b_new_id,...,longitude,geom_wkt,country,country_id,date_start,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best
107883,186087,1989,1,381,Haiti:Government,6,41.0,Government of Haiti,,763,...,-72.335,POINT (-72.335000 18.539170),Haiti,41,1989,0,4,0,0,4
107890,186122,1989,1,381,Haiti:Government,6,41.0,Government of Haiti,,763,...,-72.335,POINT (-72.335000 18.539170),Haiti,41,1989,0,0,0,1,1


In [6]:
display(conflict_df.head(1))

Unnamed: 0,id,year,type_of_violence,conflict_new_id,conflict_name,side_a_new_id,gwnoa,side_a,gwnob,side_b_new_id,...,longitude,geom_wkt,country,country_id,date_start,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best
0,4,2010,1,230,Yemen (North Yemen):Government,123,678.0,Government of Yemen (North Yemen),,881,...,44.206667,POINT (44.206667 15.354722),Yemen (North Yemen),678,2010,2,0,0,0,2


In [7]:
refugee_df["year"] = pd.to_numeric(refugee_df["year"])
refugee_df["country_id"] = pd.to_numeric(refugee_df["country_id"])

In [8]:
# Index refugee_df for easier search
try:
    refugee_df = refugee_df.set_index(["year", "country_id"])
except: # if we run this two times
    pass
# Get the displacement_extern value
refugee_df["displacement_extern"] = refugee_df.apply(\
                                    lambda row: row.refugee + row.asylum + row.stateless + row.others, axis=1)
refugee_df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,refugee,asylum,internally_displaced,stateless,others,total,displacement_extern
year,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1989.0,490.0,100786.0,0.0,0.0,0.0,0.0,100786.0,100786.0


In [9]:
refugee_df = refugee_df.sort_index()
display(refugee_df.head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,refugee,asylum,internally_displaced,stateless,others,total,displacement_extern
year,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1989.0,40.0,7682.0,0.0,0.0,0.0,0.0,7682.0,7682.0
1989.0,41.0,3344.0,0.0,0.0,0.0,0.0,3344.0,3344.0
1989.0,55.0,5.0,0.0,0.0,0.0,0.0,5.0,5.0
1989.0,90.0,45413.0,0.0,0.0,0.0,0.0,45413.0,45413.0
1989.0,91.0,5.0,0.0,0.0,0.0,0.0,5.0,5.0


In [10]:
print(COUNTRY_HUMAN_COST_DF_COLUMNS)

['country_id', 'year', 'number_of_events', 'events_id', 'displacement_extern', 'displacement_intern', 'total_displacement', 'deaths']


## Do actual merge

In [11]:
def get_total_deaths_from_event_ids(conflict_df, event_ids):
    """This function will return the total deaths associated with a list of event id"""
    deaths = 0
    for event in event_ids:
        index = conflict_df.index[conflict_df.id == event].tolist()[0]
        deaths += conflict_df.loc[index, "best"]
    return deaths

In [12]:
def get_append_series(conflict_df, refugee_df, year, country_id, event_ids=[]):
    """This function will create a serie to be appended to the country_human_cost_df with the proper values
       at each column
    """
    try:
        displacement_extern = refugee_df.displacement_extern[year, country_id]
        displacement_intern = refugee_df.internally_displaced[year, country_id]
        total_displacement  = refugee_df.total[year, country_id]
    except KeyError: # No value for this key
        displacement_extern = 0
        displacement_intern = 0
        total_displacement  = 0
        
    deaths = get_total_deaths_from_event_ids(conflict_df, event_ids)
    
    data = [ country_id, year, len(event_ids), event_ids,
             displacement_extern, displacement_intern,
             total_displacement, deaths
           ] 
    series_to_append = pd.Series(data = data, index=COUNTRY_HUMAN_COST_DF_COLUMNS)
    return series_to_append

In [13]:
# Create the country_human_cost_df and fill it for each country in which there was a conflict, by year
# Init variables
current_year = None
current_country_id = None
event_ids = []

country_human_cost_df = pd.DataFrame(columns=COUNTRY_HUMAN_COST_DF_COLUMNS) 
for row in tqdm(sorted_conflict_df.itertuples()):
    # Check if we are still in the same year and country
    if (current_year != row.year) or (current_country_id != row.country_id):
        # If we registered event, add a row to the country_human_cost_df
        if event_ids:
            series_to_append = get_append_series(conflict_df, refugee_df, current_year, current_country_id, event_ids)
            country_human_cost_df = country_human_cost_df.append(series_to_append, ignore_index=True)
        
        # Set the variables to the desired values
        event_ids = []
        current_year = row.year
        current_country_id = row.country_id
        
    # Add the event to the list
    event_ids.append(row.id)

135181it [00:39, 3385.06it/s]


In [14]:
display(country_human_cost_df.head(5))
country_human_cost_df["country_id"] = pd.to_numeric(country_human_cost_df["country_id"])
country_human_cost_df["year"] = pd.to_numeric(country_human_cost_df["year"])

Unnamed: 0,country_id,year,number_of_events,events_id,displacement_extern,displacement_intern,total_displacement,deaths
0,41,1989,4,"[186087, 186122, 186123, 186124]",3344.0,0.0,3344.0,48
1,51,1989,1,[107752],0.0,0.0,0.0,1
2,70,1989,2,"[182047, 182048]",0.0,0.0,0.0,3
3,90,1989,78,"[193728, 193732, 193764, 193765, 193766, 19376...",45413.0,0.0,45413.0,491
4,92,1989,58,"[120534, 120536, 120537, 120538, 120539, 12054...",39582.0,0.0,39582.0,4924


In [15]:
# Index country_human_cost_df by year and country
try:
    country_human_cost_df = country_human_cost_df.set_index(["year", "country_id"])
except:  # In case cell is re-run
    pass
country_human_cost_df = country_human_cost_df.sort_index()
country_human_cost_df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_events,events_id,displacement_extern,displacement_intern,total_displacement,deaths
year,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1989,41,4,"[186087, 186122, 186123, 186124]",3344.0,0.0,3344.0,48
1989,51,1,[107752],0.0,0.0,0.0,1
1989,70,2,"[182047, 182048]",0.0,0.0,0.0,3
1989,90,78,"[193728, 193732, 193764, 193765, 193766, 19376...",45413.0,0.0,45413.0,491
1989,92,58,"[120534, 120536, 120537, 120538, 120539, 12054...",39582.0,0.0,39582.0,4924


In [16]:
# Add the displacement informations for countries where there was no conflict
new_country_human_cost_df = country_human_cost_df
for row in tqdm(refugee_df.itertuples()):
    try:
        country_human_cost_df.loc[row.Index]
    except KeyError: # The row is not in the country_human_cost_df
        series_to_append = get_append_series(conflict_df, refugee_df, row.Index[0], row.Index[1]) 
        df_to_append = pd.DataFrame([series_to_append.transpose()])
        df_to_append = df_to_append.set_index(["year", "country_id"])
        new_country_human_cost_df = new_country_human_cost_df.append(df_to_append)
    except TypeError:
        continue


4749it [00:20, 231.72it/s]


In [17]:
country_human_cost_df = new_country_human_cost_df

In [18]:
country_human_cost_df = country_human_cost_df.sort_index()
display(country_human_cost_df.head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_events,events_id,displacement_extern,displacement_intern,total_displacement,deaths
year,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1989.0,40.0,0,[],7682.0,0.0,7682.0,0
1989.0,41.0,4,"[186087, 186122, 186123, 186124]",3344.0,0.0,3344.0,48
1989.0,51.0,1,[107752],0.0,0.0,0.0,1
1989.0,55.0,0,[],5.0,0.0,5.0,0
1989.0,70.0,2,"[182047, 182048]",0.0,0.0,0.0,3


## Add the GDP to the dataframe

In [19]:
with open(os.path.join("pickle", 'gdp.pickle'), 'rb') as data_source:
    gdp_df = pickle.load(data_source)

In [20]:
display(gdp_df.head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,gdp
year,country_id,Unnamed: 2_level_1
1989,2.0,22599.992143
1989,20.0,20700.799311
1989,31.0,14213.543372
1989,40.0,2577.207019
1989,41.0,393.347528


In [21]:
try: # Set index if not already done
    gdp_df = gdp_df.set_index(["year", "country_id"])
except:
    pass

In [22]:
gdp_df.index.duplicated().sum()

0

In [23]:
human_cost_gdp_df = pd.merge(country_human_cost_df, gdp_df, how='outer',
                             left_index=True, right_index=True, sort=True)
display(human_cost_gdp_df.head(5))      

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_events,events_id,displacement_extern,displacement_intern,total_displacement,deaths,gdp
year,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1989.0,2.0,,,,,,,22599.992143
1989.0,20.0,,,,,,,20700.799311
1989.0,31.0,,,,,,,14213.543372
1989.0,40.0,0.0,[],7682.0,0.0,7682.0,0.0,2577.207019
1989.0,41.0,4.0,"[186087, 186122, 186123, 186124]",3344.0,0.0,3344.0,48.0,393.347528


In [24]:
human_cost_gdp_df.fillna(value=0, inplace=True)
    
display(human_cost_gdp_df.head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_events,events_id,displacement_extern,displacement_intern,total_displacement,deaths,gdp
year,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1989.0,2.0,0,0,0.0,0.0,0.0,0,22599.992143
1989.0,20.0,0,0,0.0,0.0,0.0,0,20700.799311
1989.0,31.0,0,0,0.0,0.0,0.0,0,14213.543372
1989.0,40.0,0,[],7682.0,0.0,7682.0,0,2577.207019
1989.0,41.0,4,"[186087, 186122, 186123, 186124]",3344.0,0.0,3344.0,48,393.347528


In [25]:
country_human_cost_df = human_cost_gdp_df

## Add HDI to the DF

In [26]:
with open(os.path.join("pickle", 'hdi.pickle'), 'rb') as data_source:
    hdi_df = pickle.load(data_source)

In [27]:
display(hdi_df.head(2))

Unnamed: 0,year,hdi,country_id
0,1990,0.295,700
1,1991,0.3,700


In [28]:
hdi_df = hdi_df.set_index(["year", "country_id"])
print(len(hdi_df))

4267


In [29]:
hdi_duplicated_index = hdi_df.index.duplicated()
hdi_df = hdi_df.loc[~hdi_duplicated_index]
print(len(hdi_df))
display(hdi_df.head(1))


4162


Unnamed: 0_level_0,Unnamed: 1_level_0,hdi
year,country_id,Unnamed: 2_level_1
1990,700,0.295


In [30]:
human_cost_hdi_df = pd.merge(country_human_cost_df, hdi_df, how='outer',
                             left_index=True, right_index=True, sort=True)

In [31]:
human_cost_hdi_df.fillna(value=0, inplace=True)
    
display(human_cost_hdi_df.head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_events,events_id,displacement_extern,displacement_intern,total_displacement,deaths,gdp,hdi
year,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1989.0,2.0,0.0,0,0.0,0.0,0.0,0.0,22599.992143,0.0
1989.0,20.0,0.0,0,0.0,0.0,0.0,0.0,20700.799311,0.0
1989.0,31.0,0.0,0,0.0,0.0,0.0,0.0,14213.543372,0.0
1989.0,40.0,0.0,[],7682.0,0.0,7682.0,0.0,2577.207019,0.0
1989.0,41.0,4.0,"[186087, 186122, 186123, 186124]",3344.0,0.0,3344.0,48.0,393.347528,0.0


In [32]:
country_human_cost_df = human_cost_hdi_df

## Add country codes to the dataframe

In [33]:
country_codes_df = pd.read_csv(os.path.join("data", "countrycode.csv"))
display(country_codes_df.head(5))

Unnamed: 0.1,Unnamed: 0,ar5,continent,cowc,cown,eu28,eurocontrol_pru,eurocontrol_statfor,fao,fips105,...,country.name.es,country.name.fr,country.name.ru,country.name.zh,eurostat,wb_api2c,wb_api3c,p4_scode,p4_ccode,wvs
0,1,ASIA,Asia,AFG,700.0,,Asia,Asia/Pacific,2.0,AF,...,Afganistán,Afghanistan,Афганистан,阿富汗,AF,AF,AFG,AFG,700.0,4.0
1,2,OECD1990,Europe,,,,Eurocontrol,ESRA North-West,,,...,,,,,,,,,,
2,3,EIT,Europe,ALB,339.0,,Eurocontrol,ESRA East,3.0,AL,...,Albania,Albanie,Албания,阿尔巴尼亚,AL,AL,ALB,ALB,339.0,8.0
3,4,MAF,Africa,ALG,615.0,,Africa,North-Africa,4.0,AG,...,Argelia,Algérie,Алжир,阿尔及利亚,DZ,DZ,DZA,ALG,615.0,12.0
4,5,ASIA,Oceania,,,,Asia,Asia/Pacific,,AQ,...,,,,,AS,AS,ASM,,,16.0


In [34]:
def extract_iso3_from_gnow(gnwo):
    """Get the iso3 country code with the gnwo value, return None if no match was found"""
    try:
        iso3 = country_codes_df.loc[country_codes_df.cown == gnwo]["wb_api3c"].values[0]
    except IndexError as e:
        # 99 = Great Columbia
        # 711 = Tibet
        # 972 = Tonga
        # 973 = Tuvalu
        if not (gnwo == 99 or gnwo == 711 or gnwo == 971 or gnwo == 972 or gnwo == 973 or pd.isnull(gnwo) or gnwo == 340):
            print(e, "with gnwo: {}".format(gnwo))
        return None
    return iso3
print(extract_iso3_from_gnow(2))

USA


In [35]:
def extract_gnwo_countries_to_df():
    """Extract the countries from the gnwo and their id from the gnwo files"""
    countries_list = []
    # First gnwo file, contains id, code, name for all gnow numbers
    with open(os.path.join("data", "gnwo.txt"), "r") as gnow:
        for line in gnow:
            split_line = re.split(r'\t+', line)
            countries_list.append(split_line[0:3])
    countries_df = pd.DataFrame(countries_list, columns=["id", "code", "name"])
    countries_df["id"] = pd.to_numeric(countries_df["id"])
    countries_df = countries_df.drop_duplicates()
    return countries_df.set_index("id")
countries_list = extract_gnwo_countries_to_df()
display(countries_list.head(5))

Unnamed: 0_level_0,code,name
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,USA,United States of America
20,CAN,Canada
31,BHM,Bahamas
40,CUB,Cuba
41,HAI,Haiti


In [36]:
# Add code column to the country_human_cost_df
countries_codes = []
for row in country_human_cost_df.itertuples():
    countries_codes.append([row.Index[0], row.Index[1], extract_iso3_from_gnow(row.Index[1])])
countries_codes = pd.DataFrame(countries_codes, columns=["year", "country_id","country_code"]).set_index(["year", "country_id"])

In [37]:
if not "country_code" in country_human_cost_df.columns:
    country_human_cost_df = pd.concat([country_human_cost_df, countries_codes], axis=1, join_axes=[country_human_cost_df.index])
display(country_human_cost_df.head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_events,events_id,displacement_extern,displacement_intern,total_displacement,deaths,gdp,hdi,country_code
year,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1989.0,2.0,0.0,0,0.0,0.0,0.0,0.0,22599.992143,0.0,USA
1989.0,20.0,0.0,0,0.0,0.0,0.0,0.0,20700.799311,0.0,CAN
1989.0,31.0,0.0,0,0.0,0.0,0.0,0.0,14213.543372,0.0,BHS
1989.0,40.0,0.0,[],7682.0,0.0,7682.0,0.0,2577.207019,0.0,CUB
1989.0,41.0,4.0,"[186087, 186122, 186123, 186124]",3344.0,0.0,3344.0,48.0,393.347528,0.0,HTI


In [38]:
# For now drop bad rows... could do something different later on
country_human_cost_df = country_human_cost_df[~country_human_cost_df.country_code.isnull()]
display(country_human_cost_df.head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_events,events_id,displacement_extern,displacement_intern,total_displacement,deaths,gdp,hdi,country_code
year,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1989.0,2.0,0.0,0,0.0,0.0,0.0,0.0,22599.992143,0.0,USA
1989.0,20.0,0.0,0,0.0,0.0,0.0,0.0,20700.799311,0.0,CAN
1989.0,31.0,0.0,0,0.0,0.0,0.0,0.0,14213.543372,0.0,BHS
1989.0,40.0,0.0,[],7682.0,0.0,7682.0,0.0,2577.207019,0.0,CUB
1989.0,41.0,4.0,"[186087, 186122, 186123, 186124]",3344.0,0.0,3344.0,48.0,393.347528,0.0,HTI


In [39]:
display(country_human_cost_df.country_code.isnull().any())

False

In [40]:
# Compare the obtained country code with the one in the geopanda world dataset
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
display(world.head(5))

Unnamed: 0,pop_est,continent,name,iso_a3,gdp_md_est,geometry
0,28400000.0,Asia,Afghanistan,AFG,22270.0,"POLYGON ((61.21081709172574 35.65007233330923,..."
1,12799293.0,Africa,Angola,AGO,110300.0,(POLYGON ((16.32652835456705 -5.87747039146621...
2,3639453.0,Europe,Albania,ALB,21810.0,"POLYGON ((20.59024743010491 41.85540416113361,..."
3,4798491.0,Asia,United Arab Emirates,ARE,184300.0,"POLYGON ((51.57951867046327 24.24549713795111,..."
4,40913584.0,South America,Argentina,ARG,573900.0,(POLYGON ((-65.50000000000003 -55.199999999999...


In [41]:
# Drop country for which we don't have a corresponding country code
country_code_to_drop = []
for country_code in country_human_cost_df.country_code.unique():
    if not world.iso_a3.str.contains(country_code).any():
        country_code_to_drop.append(country_code)

# Remove unrecognized countries from the dataframe
country_human_cost_df = country_human_cost_df[~country_human_cost_df.country_code.isin(country_code_to_drop)]
display(country_human_cost_df.head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_events,events_id,displacement_extern,displacement_intern,total_displacement,deaths,gdp,hdi,country_code
year,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1989.0,2.0,0.0,0,0.0,0.0,0.0,0.0,22599.992143,0.0,USA
1989.0,20.0,0.0,0,0.0,0.0,0.0,0.0,20700.799311,0.0,CAN
1989.0,31.0,0.0,0,0.0,0.0,0.0,0.0,14213.543372,0.0,BHS
1989.0,40.0,0.0,[],7682.0,0.0,7682.0,0.0,2577.207019,0.0,CUB
1989.0,41.0,4.0,"[186087, 186122, 186123, 186124]",3344.0,0.0,3344.0,48.0,393.347528,0.0,HTI


## Pickle the result

In [42]:
with open(os.path.join("pickle", 'country_human_cost.pickle'), 'wb') as out:
    pickle.dump(country_human_cost_df, out)

In [43]:
with open(os.path.join("pickle", 'country_human_cost.pickle'), 'rb') as data_source:
    country_human_cost_df = pickle.load(data_source)