# Distance Network 

In [None]:
import numpy as np
import pandas as pd
import networkx as nx
import pickle
from tqdm import tqdm

from geopy.distance import vincenty, great_circle

from scipy import stats

import pycountry

import re
import unidecode

In [None]:
# Load the pickled conflict dataframe from disk
with open('conflict.pickle', 'rb') as pickle_file:
    conflict_df = pickle.load(pickle_file)

In [None]:
# Load the pickled refugee dataframe from disk
with open('refugee.pickle', 'rb') as pickle_file:
    refugee_df = pickle.load(pickle_file)

## Create a network of distances

We will split this network into two types of nodes: events and conflicts.
Each conflict location will be the centroid of all its events (mean longitude and latitude).
Each event will be linked to its respective conflict, with the edge weight representing the distance to the center of the conflict.
All conflicts will be linked to one another by edges whose weight is the distance between them.

For each conflict node, we will also extract the statistics of its event distances with stats.describe.

This network will be used to find spatial relationships between the various conflicts.
Also, because each node carries longitude and latitude information, it will be possible to draw it on a map and
selectively draw the links between events and conflicts or between conflicts.

In [None]:
# Preview the first row of each dataframe
for frame in (conflict_df, refugee_df):
    display(frame.head(1))

In [None]:
# graph indexed
# Re-index the events by their 'id' column; the index values are used as
# the event node identifiers when the graph is built.
conflict_df = conflict_df.set_index('id')
# Preview the first two re-indexed rows
display(conflict_df.head(2))

In [None]:
def distance_between_nodes(graph, node_1_id, node_2_id):
    """Return the distance in kilometres between two nodes of ``graph``.

    Both nodes must carry ``latitude`` and ``longitude`` attributes. The
    distance is computed with ``vincenty``; if that raises (it sometimes
    fails to converge), ``great_circle`` is used instead.

    Returns the ``.km`` value of the computed distance, or ``None`` when both
    computations raise, in which case the node ids and positions are printed.
    """
    node_1 = graph.node[node_1_id]
    node_2 = graph.node[node_2_id]
    pos_1 = (node_1["latitude"], node_1["longitude"])
    pos_2 = (node_2["latitude"], node_2["longitude"])

    # Catch Exception rather than using a bare except, so that
    # KeyboardInterrupt / SystemExit still stop a long-running loop.
    try:
        return vincenty(pos_1, pos_2).km
    except Exception:
        # Sometimes vincenty doesn't converge: try the great circle distance instead
        try:
            return great_circle(pos_1, pos_2).km
        except Exception:
            print("Error: failed to get distance between node {} and node {}".format(node_1_id, node_2_id))
            print("Node 1 positions: {}".format(pos_1))
            print("Node 2 positions: {}".format(pos_2))
            # Callers store this value as the edge weight
            return None
    

In [None]:
def get_conflict_lat_long(conflict_id):
    """Return the centroid of a conflict as a ``(latitude, longitude)`` tuple.

    The centroid is the mean latitude and the mean longitude of every row of
    the module-level ``conflict_df`` whose ``conflict_new_id`` equals
    ``conflict_id``.
    """
    conflict_events = conflict_df[conflict_df.conflict_new_id == conflict_id]
    return (conflict_events.latitude.mean(), conflict_events.longitude.mean())
    

In [None]:
distance_graph = nx.Graph()

# Create a node for each conflict event, carrying its own year and position.
# The attributes are set per node: passing a whole dataframe column as a
# keyword to add_nodes_from would attach the full column to every node.
event_rows = zip(conflict_df.index.values,
                 conflict_df.year.values,
                 conflict_df.longitude.values,
                 conflict_df.latitude.values)
for event_id, year, longitude, latitude in event_rows:
    distance_graph.add_node(event_id, nature="event", year=year,
                            longitude=longitude, latitude=latitude)

# Create a node for each unique conflict, use a special node id to avoid conflict with events
for conflict_id in conflict_df.conflict_new_id.unique():
    conflict_node_name = "conflict_{}".format(conflict_id)

    # The conflict is positioned at the average position of its events
    latitude, longitude = get_conflict_lat_long(conflict_id)
    distance_graph.add_node(conflict_node_name, nature="conflict",
                            longitude=longitude, latitude=latitude)

In [None]:
# Link every event to the node of the conflict it belongs to; the edge
# weight is the distance between the event and the conflict node
for conflict_id in tqdm(conflict_df.conflict_new_id.unique()):
    centroid_node = "conflict_{}".format(conflict_id)
    belongs_to_conflict = conflict_df.conflict_new_id == conflict_id
    member_events = conflict_df[belongs_to_conflict].index.values

    for event_id in member_events:
        distance_graph.add_edge(
            centroid_node,
            event_id,
            weight=distance_between_nodes(distance_graph, centroid_node, event_id),
            nature="event_to_conflict",
        )


In [None]:
# Extract the distance between all conflicts
conflict_node_names = ["conflict_{}".format(conflict_id)
                       for conflict_id in conflict_df.conflict_new_id.unique()]

# Pair each conflict only with the conflicts that come after it in the list:
# this yields every unordered pair exactly once, so no self loop is created
# and no edge is computed twice.
for position, conflict_node_name_1 in enumerate(tqdm(conflict_node_names)):
    for conflict_node_name_2 in conflict_node_names[position + 1:]:
        distance_graph.add_edge(
            conflict_node_name_1,
            conflict_node_name_2,
            weight=distance_between_nodes(distance_graph, conflict_node_name_1, conflict_node_name_2),
            nature="conflict_to_conflict",
        )
        

In [None]:
def get_event_weights(graph, node, nature="event_to_conflict"):
    """Return the weights of the edges of ``node`` that have the given nature.

    Parameters:
        graph: graph whose edges carry ``weight`` and ``nature`` attributes.
        node: node whose incident edges are inspected.
        nature: a single nature string, or an iterable of nature strings.

    Returns:
        A list of weights when ``nature`` is a string; otherwise a list with
        one list of weights per requested nature, in the order given.
    """
    # Read the edges from the graph that was passed in, not from a global
    edge_attributes = [attributes for _, _, attributes in graph.edges(nbunch=node, data=True)]

    def weights_for(wanted_nature):
        return [attributes["weight"] for attributes in edge_attributes
                if attributes["nature"] == wanted_nature]

    if isinstance(nature, str):
        return weights_for(nature)
    return [weights_for(item) for item in nature]
    

In [None]:
# Extract conflict statistics: every node that is not an event gets the
# summary statistics of its event-to-conflict distances. The
# conflict-to-conflict distances are fetched as well but not stored.
for node_id in tqdm(distance_graph.nodes()):
    attributes = distance_graph.node[node_id]
    if attributes["nature"] != "event":
        event_distances, _conflict_distances = get_event_weights(
            distance_graph, node_id, nature=["event_to_conflict", "conflict_to_conflict"])
        attributes["events_stats"] = stats.describe(event_distances)
        

In [None]:
# Persist the distance graph to disk
with open('distance_nx.pickle', 'wb') as pickle_file:
    pickle.dump(distance_graph, pickle_file)

In [None]:
# Read the distance graph back from disk
with open('distance_nx.pickle', 'rb') as pickle_file:
    distance_graph = pickle.load(pickle_file)

## Combined displacement and event dataframe

The goal of this section is to create a dataframe that links the displacement information given by the UNHCR with the events and deaths given by the GED dataset. This will be grouped by year and will keep track of the number of events and of the event identifiers.

In [None]:
# Load a fresh copy of the pickled conflict dataframe
with open('conflict.pickle', 'rb') as pickle_file:
    conflict_df = pickle.load(pickle_file)

In [None]:
# Load a fresh copy of the pickled refugee dataframe
with open('refugee.pickle', 'rb') as pickle_file:
    refugee_df = pickle.load(pickle_file)

In [None]:
# Preview the first two rows of each dataframe
for frame in (conflict_df, refugee_df):
    display(frame.head(2))

In [None]:
def extract_gnwo_countries_to_df():
    """Read ``gnwo.txt`` and return its countries as a dataframe.

    Each non-blank line is split on runs of tab characters and its first
    three fields are kept.

    Returns:
        A DataFrame with the columns ``id``, ``code`` and ``name``, one row
        per non-blank line of the file; all values are strings.
    """
    countries_list = []
    with open("gnwo.txt", "r") as gnwo_file:
        for line in gnwo_file:
            # Drop the line terminator so it cannot end up inside the last
            # kept field when a line has three fields or fewer
            line = line.rstrip("\n")
            # Blank lines carry no country and would only create junk rows
            if not line.strip():
                continue
            countries_list.append(re.split(r'\t+', line)[0:3])
    return pd.DataFrame(countries_list, columns=["id", "code", "name"])
# Parse the reference country list once and keep it as a module-level dataframe
countries = extract_gnwo_countries_to_df()
# Preview the first two parsed rows
display(countries.head(2))

In [None]:
def is_standard_country_name(country_name, country_df):
    """Return True when ``country_name`` exactly equals a name in ``country_df``.

    Parameters:
        country_name: the name to look up.
        country_df: dataframe with a ``name`` column holding the standard names.
    """
    # Search the dataframe that was passed in rather than a module-level one
    return any(country_name == known_name for known_name in country_df.name)

In [None]:
# Ordered (pattern, replacement) rewrites applied to a country name before
# matching. Order matters: the longer "Dem. Rep. of" / "Rep. of" forms are
# removed before the bare abbreviations are expanded.
_COUNTRY_NAME_REWRITES = (
    (r"Dem\. Rep\. of", ""),
    (r"Rep\. of", ""),
    (r"Dem\.", "Democratic"),
    (r"Rep\.", "Republic"),
    (r".*China.*", "China"),
    (r"Russian Federation", "Russia"),
)


def _normalize_country_name(country_name):
    """Force the name to ASCII and apply the ordered rewrites above."""
    normalized = unidecode.unidecode(country_name)
    for pattern, replacement in _COUNTRY_NAME_REWRITES:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized


def exist_in_substring(country_name, country_df):
    """Return the first row of ``country_df`` whose name contains the country.

    The name is normalized first. A row matches when its ``name`` contains
    either the whole normalized name or its first word.

    Returns:
        The matching row as produced by ``country_df.itertuples()``, or
        ``None`` when no row matches or the normalized name is blank.
    """
    # The normalization does not depend on the row, so do it once up front
    normalized = _normalize_country_name(country_name)
    words = normalized.split()
    if not words:
        # A blank name is a substring of every name; treat it as "no match"
        return None
    first_word = words[0]

    for data in country_df.itertuples():
        # Rows without a textual name cannot be substring-tested
        if not isinstance(data.name, str):
            continue
        if normalized in data.name or first_word in data.name:
            return data
    return None

In [None]:
# Print every refugee origin that matches the reference country list
# neither exactly nor through the substring heuristic.
# NOTE(review): the earlier version of this cell mentioned a manually built
# renaming dictionary (change_country_names_refugee_dict); it is not defined
# in the visible part of the file.
for origin_name in refugee_df.origin.unique():
    if is_standard_country_name(origin_name, countries):
        continue
    if not exist_in_substring(origin_name, countries):
        print(origin_name)
        
        