# Distance Network 

In [26]:
import numpy as np
import pandas as pd
import networkx as nx
import pickle
from tqdm import tqdm

from geopy.distance import vincenty, great_circle

from scipy import stats

In [2]:
# Load the conflict events table from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only open pickles you produced yourself.
with open('conflict.pickle', 'rb') as conflict_file:
    conflict_df = pickle.load(conflict_file)

In [3]:
# Load the refugee table from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only open pickles you produced yourself.
with open('refugee.pickle', 'rb') as refugee_file:
    refugee_df = pickle.load(refugee_file)

## Create a network of distances

We will split this network into two types of nodes: events and conflicts.
Each conflict location will be the centroid of all its events (mean longitude and latitude).
Each event will be linked to its respective conflict, with the edge weight representing the distance to the center of the conflict.
Every pair of conflicts will be linked by an edge whose weight is the distance between their centers.

For each conflict node, we will also extract the statistics of its event distances with stats.describe.

This network will be used to find spatial relationships between various conflicts.
Also, because each node carries longitude and latitude information, it will be possible to plot it on a map and
selectively draw the links between events and conflicts or between conflicts.

In [4]:
# Show the first row of each table to get a feel for the available columns
display(conflict_df.head(1), refugee_df.head(1))

Unnamed: 0,id,year,type_of_violence,conflict_new_id,conflict_name,side_a_new_id,gwnoa,side_a,gwnob,side_b_new_id,...,longitude,geom_wkt,country,country_id,date_start,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best
0,4,2010,1,230,Yemen (North Yemen):Government,123,678.0,Government of Yemen (North Yemen),,881,...,44.206667,POINT (44.206667 15.354722),Yemen (North Yemen),678,2010,2,0,0,0,2


Unnamed: 0,year,origin,refugee,asylum,internally_displaced,stateless,others,total
0,1989.0,Dem. Rep. of the Congo,100786.0,0.0,0.0,0.0,0.0,100786.0


In [5]:
# graph indexed
conflict_df = conflict_df.set_index('id')
display(conflict_df.head(2))

Unnamed: 0_level_0,year,type_of_violence,conflict_new_id,conflict_name,side_a_new_id,gwnoa,side_a,gwnob,side_b_new_id,side_b,...,longitude,geom_wkt,country,country_id,date_start,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,2010,1,230,Yemen (North Yemen):Government,123,678.0,Government of Yemen (North Yemen),,881,AQAP,...,44.206667,POINT (44.206667 15.354722),Yemen (North Yemen),678,2010,2,0,0,0,2
5,2011,3,715,Government of Yemen (North Yemen) - Civilians,123,678.0,Government of Yemen (North Yemen),,1,Civilians,...,45.036667,POINT (45.036667 12.779444),Yemen (North Yemen),678,2011,0,0,0,0,0


In [6]:
def distance_between_nodes(graph, node_1_id, node_2_id):
    """Get distance between two nodes by using their latitude and longitude property.

    The Vincenty distance is tried first. It sometimes fails to converge, in which
    case the great-circle distance is used instead.

    Parameters
    ----------
    graph : graph whose nodes carry "latitude" and "longitude" attributes
        (accessed through ``graph.node``).
    node_1_id, node_2_id : ids of the two nodes to measure between.

    Returns
    -------
    The distance in kilometres (``.km`` of the geopy distance), or ``None`` when
    both computations fail; in that case a diagnostic is printed.
    """
    def position(node_id):
        # geopy expects (latitude, longitude) ordering
        attributes = graph.node[node_id]
        return (attributes["latitude"], attributes["longitude"])

    pos_1 = position(node_1_id)
    pos_2 = position(node_2_id)

    # `except Exception` (rather than a bare `except`) keeps the best-effort
    # fallback but no longer swallows KeyboardInterrupt / SystemExit, so a long
    # running loop calling this function can still be interrupted.
    try:
        distance = vincenty(pos_1, pos_2).km
    except Exception:
        # Try to get great circle distance instead
        try:
            distance = great_circle(pos_1, pos_2).km
        except Exception:
            print("Error: failed to get distance between node {} and node {}".format(node_1_id, node_2_id))
            print("Node 1 positions: {}".format(pos_1))
            print("Node 2 positions: {}".format(pos_2))
            distance = None
    return distance
    

In [7]:
def get_conflict_lat_long(conflict_id):
    """Return the (latitude, longitude) centroid of a conflict.

    The centroid is the mean latitude and mean longitude over all rows of
    ``conflict_df`` whose ``conflict_new_id`` equals ``conflict_id``.
    """
    conflict_events = conflict_df[conflict_df.conflict_new_id == conflict_id]
    mean_longitude = conflict_events.longitude.mean()
    mean_latitude = conflict_events.latitude.mean()
    return (mean_latitude, mean_longitude)
    

In [8]:
distance_graph = nx.Graph()

# Create a node for each conflict event, carrying that event's own year and position.
# Attributes are set per node on purpose: keyword attributes given to add_nodes_from
# are applied unchanged to every node, so passing year=conflict_df.year there would
# attach the whole year column to each event instead of the event's year.
event_rows = zip(conflict_df.index.values,
                 conflict_df.year.values,
                 conflict_df.longitude.values,
                 conflict_df.latitude.values)
for event_id, year, longitude, latitude in event_rows:
    distance_graph.add_node(event_id, nature="event", year=year,
                            longitude=longitude, latitude=latitude)

# Create a node for each unique conflict, use a special node id to avoid conflict with events
for conflict_id in conflict_df.conflict_new_id.unique():
    conflict_node_name = "conflict_{}".format(conflict_id)

    # The conflict is located at the average position of its events
    latitude, longitude = get_conflict_lat_long(conflict_id)
    distance_graph.add_node(conflict_node_name, nature="conflict",
                            longitude=longitude, latitude=latitude)

In [9]:
# Create edges from event to their conflict
for conflict_id in tqdm(conflict_df.conflict_new_id.unique()):
    conflict_node_name = "conflict_{}".format(conflict_id)
    
    for event_id in conflict_df[conflict_df.conflict_new_id == conflict_id].index.values:
        distance = distance_between_nodes(distance_graph, conflict_node_name, event_id)
        distance_graph.add_edge(conflict_node_name, event_id, weight=distance, nature="event_to_conflict")


100%|██████████| 997/997 [00:04<00:00, 234.37it/s]


In [10]:
# Extract the distance between all conflicts 
for conflict_id_1 in tqdm(conflict_df.conflict_new_id.unique()):
    conflict_node_name_1 = "conflict_{}".format(conflict_id_1)
    
    for conflict_id_2 in conflict_df.conflict_new_id.unique():   
        conflict_node_name_2 = "conflict_{}".format(conflict_id_2)   
        
        # No self loop
        if conflict_id_1 == conflict_id_2:
            pass
        
        # Don't do two times the same edge
        if distance_graph.has_edge(conflict_node_name_1, conflict_node_name_2):
            pass
        
        distance_graph.add_edge(conflict_node_name_1, conflict_node_name_2, 
                  weight = distance_between_nodes(distance_graph, conflict_node_name_1, conflict_node_name_2),
                  nature = "conflict_to_conflict")
        

100%|██████████| 997/997 [00:30<00:00, 32.82it/s]


In [11]:
def get_event_weights(graph, node, nature="event_to_conflict"):
    """Return the weights of the edges attached to ``node``, filtered by edge nature.

    Parameters
    ----------
    graph : graph to read the edges from. Every edge touching ``node`` must carry
        "weight" and "nature" attributes.
    node : id of the node whose edges are inspected.
    nature : either a single nature (str) or a sequence of natures.

    Returns
    -------
    A list of weights when ``nature`` is a string; otherwise a list of lists,
    one list of weights per requested nature, in the same order as ``nature``.
    """
    # Read from the `graph` argument (not a module-level graph) so the function
    # works on whichever graph the caller passes in.
    # graph.edges(nbunch=..., data=True) yields (u, v, attributes) triples.
    edge_attributes = [attributes for _, _, attributes in graph.edges(nbunch=node, data=True)]

    def weights_for(wanted_nature):
        return [attributes["weight"] for attributes in edge_attributes
                if attributes["nature"] == wanted_nature]

    if isinstance(nature, str):
        return weights_for(nature)
    return [weights_for(item) for item in nature]
    

In [12]:
# Extract conflict statistics
for node in tqdm(distance_graph.nodes()):
    if distance_graph.node[node]["nature"] == "event":
        continue
    
    weights = get_event_weights(distance_graph, node, nature=["event_to_conflict", "conflict_to_conflict"])
    
    distance_graph.node[node]["events_stats"] = stats.describe(weights[0])
    #distance_graph.node[node]["conflicts_stats"] = stats.describe(weights[1])
        

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 136178/136178 [00:00<00:00, 147750.61it/s] 


In [13]:
# Persist the distance graph to disk as a pickle
with open('distance_nx.pickle', 'wb') as graph_file:
    pickle.dump(distance_graph, graph_file)

In [14]:
# Read the distance graph back from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only open pickles you produced yourself.
with open('distance_nx.pickle', 'rb') as graph_file:
    distance_graph = pickle.load(graph_file)