# Distance Network 

In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import pickle
from tqdm import tqdm

from scipy import stats

import re
import unidecode

In [2]:
with open('conflict.pickle', 'rb') as data_source:
    conflict_df = pickle.load(data_source)

In [3]:
with open('refugee.pickle', 'rb') as data_source:
    refugee_df = pickle.load(data_source)

## Combined displacement and event dataframe

The goal of this section is to create a dataframe that links the displacement information given by the UNHCR with the events and deaths given by the GED dataset. This will be grouped by year and will keep track of the number of events and of the event identifiers.

The columns will be:

|country_id|year|number_of_events|events_id|displacement_extern|displacement_intern|total_displacement|deaths|

where number_of_events is the number of events in a country in a year
where events_id is an array containing the id of each event accounted for
where displacement_extern is the sum of [refugee, asylum, stateless, others]
where displacement_intern is the value of internally_displaced

In [4]:
COUNTRY_HUMAN_COST_DF_COLUMNS = ["country_id", "year", "number_of_events", "events_id", "displacement_extern", "displacement_intern", "total_displacement", "deaths"]

In [5]:
# Sort conflict dataframe so we can easily iter through it
sorted_conflict_df = conflict_df.sort_values(["year", "country_id"], axis=0)
sorted_conflict_df.head(2)

Unnamed: 0,id,year,type_of_violence,conflict_new_id,conflict_name,side_a_new_id,gwnoa,side_a,gwnob,side_b_new_id,...,longitude,geom_wkt,country,country_id,date_start,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best
107883,186087,1989,1,381,Haiti:Government,6,41.0,Government of Haiti,,763,...,-72.335,POINT (-72.335000 18.539170),Haiti,41,1989,0,4,0,0,4
107890,186122,1989,1,381,Haiti:Government,6,41.0,Government of Haiti,,763,...,-72.335,POINT (-72.335000 18.539170),Haiti,41,1989,0,0,0,1,1


In [6]:
display(conflict_df.head(1))

Unnamed: 0,id,year,type_of_violence,conflict_new_id,conflict_name,side_a_new_id,gwnoa,side_a,gwnob,side_b_new_id,...,longitude,geom_wkt,country,country_id,date_start,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best
0,4,2010,1,230,Yemen (North Yemen):Government,123,678.0,Government of Yemen (North Yemen),,881,...,44.206667,POINT (44.206667 15.354722),Yemen (North Yemen),678,2010,2,0,0,0,2


In [7]:
refugee_df["country_id"] = pd.to_numeric(refugee_df["country_id"])

In [8]:
# Index refugee_df for easier search
try:
    refugee_df = refugee_df.set_index(["year", "country_id"])
except: # if we run this two times
    pass
# Get the displacement_extern value
refugee_df["displacement_extern"] = refugee_df.apply(\
                                    lambda row: row.refugee + row.asylum + row.stateless + row.others, axis=1)
refugee_df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,refugee,asylum,internally_displaced,stateless,others,total,displacement_extern
year,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1989.0,490.0,100786.0,0.0,0.0,0.0,0.0,100786.0,100786.0


In [9]:
refugee_df = refugee_df.sort_index()
display(refugee_df.head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,refugee,asylum,internally_displaced,stateless,others,total,displacement_extern
year,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1989.0,40.0,7682.0,0.0,0.0,0.0,0.0,7682.0,7682.0
1989.0,41.0,3344.0,0.0,0.0,0.0,0.0,3344.0,3344.0
1989.0,55.0,5.0,0.0,0.0,0.0,0.0,5.0,5.0
1989.0,90.0,45413.0,0.0,0.0,0.0,0.0,45413.0,45413.0
1989.0,91.0,5.0,0.0,0.0,0.0,0.0,5.0,5.0


In [10]:
print(COUNTRY_HUMAN_COST_DF_COLUMNS)

['country_id', 'year', 'number_of_events', 'events_id', 'displacement_extern', 'displacement_intern', 'total_displacement', 'deaths']


In [11]:
def get_total_deaths_from_event_ids(conflict_df, event_ids):
    """This function will return the total deaths associated with a list of event id"""
    deaths = 0
    for event in event_ids:
        index = conflict_df.index[conflict_df.id == event].tolist()[0]
        deaths += conflict_df.loc[index, "best"]
    return deaths

In [12]:
def get_append_series(conflict_df, refugee_df, year, country_id, event_ids=[]):
    """This function will create a serie to be appended to the country_human_cost_df with the proper values
       at each column
    """
    try:
        displacement_extern = refugee_df.displacement_extern[year, country_id]
        displacement_intern = refugee_df.internally_displaced[year, country_id]
        total_displacement  = refugee_df.total[year, country_id]
    except KeyError: # No value for this key
        displacement_extern = 0
        displacement_intern = 0
        total_displacement  = 0
        
        
    deaths = get_total_deaths_from_event_ids(conflict_df, event_ids)
    data = [ country_id, year, len(event_ids), event_ids,
             displacement_extern, displacement_intern,
             total_displacement, deaths
           ] 
    series_to_append = pd.Series(data = data, index=COUNTRY_HUMAN_COST_DF_COLUMNS)
    return series_to_append

In [13]:
# Create the country_human_cost_df and fill it for each country in which there was a conflict, by year
# Init variables
current_year = None
current_country_id = None
event_ids = []

country_human_cost_df = pd.DataFrame(columns=COUNTRY_HUMAN_COST_DF_COLUMNS) 
for row in tqdm(sorted_conflict_df.itertuples()):
    # Check if we are still in the same year and country
    if (current_year != row.year) or (current_country_id != row.country_id):
        # If we registered event, add a row to the country_human_cost_df
        if event_ids:
            series_to_append = get_append_series(conflict_df, refugee_df, current_year, current_country_id, event_ids)
            country_human_cost_df = country_human_cost_df.append(series_to_append, ignore_index=True)
        
        # Set the variables to the desired values
        event_ids = []
        current_year = row.year
        current_country_id = row.country_id
        
    # Add the event to the list
    event_ids.append(row.id)

  return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  if self.run_code(code, result):
135181it [00:58, 2314.99it/s]


In [14]:
display(country_human_cost_df.head(5))

Unnamed: 0,country_id,year,number_of_events,events_id,displacement_extern,displacement_intern,total_displacement,deaths
0,41,1989,4,"[186087, 186122, 186123, 186124]",3344,0,3344,48
1,51,1989,1,[107752],0,0,0,1
2,70,1989,2,"[182047, 182048]",0,0,0,3
3,90,1989,78,"[193728, 193732, 193764, 193765, 193766, 19376...",45413,0,45413,491
4,92,1989,58,"[120534, 120536, 120537, 120538, 120539, 12054...",39582,0,39582,4924


In [15]:
# Index country_human_cost_df by year and country
try:
    country_human_cost_df = country_human_cost_df.set_index(["year", "country_id"])
except:  # In case cell is re-run
    pass
country_human_cost_df = country_human_cost_df.sort_index()
country_human_cost_df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_events,events_id,displacement_extern,displacement_intern,total_displacement,deaths
year,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1989,41,4,"[186087, 186122, 186123, 186124]",3344,0,3344,48
1989,51,1,[107752],0,0,0,1
1989,70,2,"[182047, 182048]",0,0,0,3
1989,90,78,"[193728, 193732, 193764, 193765, 193766, 19376...",45413,0,45413,491
1989,92,58,"[120534, 120536, 120537, 120538, 120539, 12054...",39582,0,39582,4924


In [16]:
# Add the displacement informations for countries where there was no conflict

for row in tqdm(refugee_df.itertuples()):
    try:
        country_human_cost_df[row.Index]
    except KeyError: # The row is not in the country_human_cost_df
        series_to_append = get_append_series(conflict_df, refugee_df, row.Index[0], row.Index[1]) 
        df_to_append = pd.DataFrame([series_to_append.transpose()])
        df_to_append = df_to_append.set_index(["year", "country_id"])
        country_human_cost_df = country_human_cost_df.append(df_to_append)

5043it [00:25, 200.91it/s]


In [17]:
country_human_cost_df = country_human_cost_df.sort_index()
display(country_human_cost_df.head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_events,events_id,displacement_extern,displacement_intern,total_displacement,deaths
year,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1989.0,40.0,0,[],7682,0,7682,0
1989.0,41.0,4,"[186087, 186122, 186123, 186124]",3344,0,3344,48
1989.0,41.0,0,[],3344,0,3344,0
1989.0,51.0,1,[107752],0,0,0,1
1989.0,55.0,0,[],5,0,5,0


In [18]:
with open('country_human_cost.pickle', 'wb') as out:
    pickle.dump(country_human_cost_df, out)

In [19]:
with open('country_human_cost.pickle', 'rb') as data_source:
    country_human_cost_df = pickle.load(data_source)