# Distance Network 

In [66]:
import os

import numpy as np
import pandas as pd
import networkx as nx
import pickle
from tqdm import tqdm

from geopy.distance import vincenty, great_circle

from scipy import stats

import pygraphviz as pgv
import matplotlib.pyplot as plt

In [67]:
with open(os.path.join("pickle", 'conflict.pickle'), 'rb') as data_source:
    conflict_df = pickle.load(data_source)

In [68]:
with open(os.path.join("pickle", 'refugee.pickle'), 'rb') as data_source:
    refugee_df = pickle.load(data_source)

## Create a network of distances

We split this network in two types of node: events and conflicts.
Each conflict location is the centroid of all its events (mean longitude and latitude).
Each event is linked to the conflict it belongs to by an edge whose weight represents the distance to the center of the conflict.
All the conflicts will be linked together by an edge describing the distance.

For each conflict node, we will also extract the statistics of its event distances using `stats.describe`.

This network will be used to find spatial relationship between various conflicts.
Also, because each node has a longitude and latitude information, it will be possible to print it on a map and
selectively print links between event and conflicts or between conflicts.

In [69]:
display(conflict_df.head(1))
display(refugee_df.head(1))

Unnamed: 0,id,year,type_of_violence,conflict_new_id,conflict_name,side_a_new_id,gwnoa,side_a,gwnob,side_b_new_id,...,country_id,region,date_start,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,deaths_sides
0,4,2010,1,230,Yemen (North Yemen):Government,123,678.0,Government of Yemen (North Yemen),,881,...,678,Middle East,2010,2010,2,0,0,0,2,2


Unnamed: 0,year,refugee,asylum,internally_displaced,stateless,others,total,country_id
0,1989.0,100786.0,0.0,0.0,0.0,0.0,100786.0,490.0


In [70]:
# graph indexed
conflict_df = conflict_df.set_index('id')
display(conflict_df.head(2))

Unnamed: 0_level_0,year,type_of_violence,conflict_new_id,conflict_name,side_a_new_id,gwnoa,side_a,gwnob,side_b_new_id,side_b,...,country_id,region,date_start,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,deaths_sides
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,2010,1,230,Yemen (North Yemen):Government,123,678.0,Government of Yemen (North Yemen),,881,AQAP,...,678,Middle East,2010,2010,2,0,0,0,2,2
5,2011,3,715,Government of Yemen (North Yemen) - Civilians,123,678.0,Government of Yemen (North Yemen),,1,Civilians,...,678,Middle East,2011,2011,0,0,0,0,0,0


In [71]:
def distance_between_nodes(graph, node_1_id, node_2_id):
    """Get distance between two nodes by using their latitude and longitude property"""
    pos_1 = (graph.node[node_1_id]["latitude"], graph.node[node_1_id]["longitude"])
    pos_2 = (graph.node[node_2_id]["latitude"], graph.node[node_2_id]["longitude"])
    # Sometime vincenty doesn't converge, just put None as weight
    try:
        distance = vincenty(pos_1, pos_2).km
    except:
        # Try to get great circle distance instead
        try:
            distance = great_circle(pos_1, pos_2).km
        except:
            print("Error: failed to get distance between node {} and node {}".format(node_1_id, node_2_id))
            print("Node 1 positions: {}".format(pos_1))
            print("Node 2 positions: {}".format(pos_2))
            distance = None
    return distance
    

In [72]:
def get_conflict_lat_long(conflict_id):
    """Get the average latitude and longitude for a particular conflict id"""
    longitude = conflict_df[conflict_df.conflict_new_id == conflict_id].longitude.mean()
    latitude = conflict_df[conflict_df.conflict_new_id == conflict_id].latitude.mean()
    return (latitude, longitude)
    

In [73]:
def get_conflict_stats_by_year(conflict_id):
    """Get the conflict statistics by year"""
    local_conflict_df = conflict_df[conflict_df.conflict_new_id == conflict_id]
    local_conflict_dict = {}
    for year in local_conflict_df.year.unique():
        year_local_conflict_df = local_conflict_df[local_conflict_df.year == year]
        yearly_dict = {}
        yearly_dict["year"] = year
        yearly_dict["conflict_id"] = conflict_id 
        yearly_dict["type_of_violence"] = year_local_conflict_df.iloc[0].type_of_violence
        yearly_dict["side_a"] = year_local_conflict_df.iloc[0].side_a
        yearly_dict["side_b"] = year_local_conflict_df.iloc[0].side_b
        yearly_dict["gwnoa"] = year_local_conflict_df.iloc[0].gwnoa
        yearly_dict["gwnob"] = year_local_conflict_df.iloc[0].gwnob
        yearly_dict["deaths_a"] = year_local_conflict_df.deaths_a.sum()
        yearly_dict["deaths_b"] = year_local_conflict_df.deaths_b.sum()
        yearly_dict["deaths_civilians"] = year_local_conflict_df.deaths_civilians.sum()
        yearly_dict["best"] = year_local_conflict_df.best.sum() 
        yearly_dict["latitude"] = year_local_conflict_df.latitude.mean()
        yearly_dict["longitude"] = year_local_conflict_df.longitude.mean()
        local_conflict_dict[year] = yearly_dict
    return local_conflict_dict

After having defined some auxiliary functions, we can finally create the graph:

In [75]:
distance_graph = nx.Graph()

# Create a node for each conflict event
distance_graph.add_nodes_from(conflict_df.index.values, nature="event", year=conflict_df.year)

# Set longitude and latitude for each node
for index in conflict_df.index.values:
    distance_graph.node[index]["longitude"] = float(conflict_df.loc[index, "longitude"])
    distance_graph.node[index]["latitude"] = float(conflict_df.loc[index, "latitude"])

# Create a node for each unique conflict, use a special node id to avoid conflict with events
for conflict_id in tqdm(conflict_df.conflict_new_id.unique()):
    conflict_node_name = "conflict_{}".format(conflict_id)
    distance_graph.add_node(conflict_node_name, nature="conflict")
    
    # Get the average position for the conflict
    latitude, longitude = get_conflict_lat_long(conflict_id)
    
    sub_conflict_df = conflict_df[conflict_df.conflict_new_id == conflict_id]
    # Get parameters
    distance_graph.node[conflict_node_name]["longitude"] = float(longitude)
    distance_graph.node[conflict_node_name]["latitude"] = float(latitude)
    distance_graph.node[conflict_node_name]["conflict_id"] = int(conflict_id)
    distance_graph.node[conflict_node_name]["type_of_violence"] = int(sub_conflict_df["type_of_violence"].unique()[0])
    distance_graph.node[conflict_node_name]["country"] = sub_conflict_df["country"].unique()[0]
    distance_graph.node[conflict_node_name]["region"] = sub_conflict_df["region"].unique()[0]
    distance_graph.node[conflict_node_name]["side_a"] = sub_conflict_df["side_a"].unique()[0]
    distance_graph.node[conflict_node_name]["side_b"] = sub_conflict_df["side_b"].unique()[0]
    distance_graph.node[conflict_node_name]["deaths_a"] = int(sub_conflict_df["deaths_a"].sum())
    distance_graph.node[conflict_node_name]["deaths_b"] = int(sub_conflict_df["deaths_b"].sum())
    distance_graph.node[conflict_node_name]["deaths_civilians"] = int(sub_conflict_df["deaths_civilians"].sum())
    distance_graph.node[conflict_node_name]["deaths_total"] = int(sub_conflict_df["best"].sum())
    
    # Set informations about the conflict
    distance_graph.node[conflict_node_name]["yearly_dict"] = get_conflict_stats_by_year(conflict_id)

100%|██████████| 997/997 [00:11<00:00, 84.06it/s]


In [76]:
# Create edges from event to their conflict
for conflict_id in tqdm(conflict_df.conflict_new_id.unique()):
    conflict_node_name = "conflict_{}".format(conflict_id)
    
    for event_id in conflict_df[conflict_df.conflict_new_id == conflict_id].index.values:
        distance = distance_between_nodes(distance_graph, conflict_node_name, event_id)
        distance_graph.add_edge(conflict_node_name, event_id, weight=distance, nature="event_to_conflict")


100%|██████████| 997/997 [00:04<00:00, 234.03it/s]


In [77]:
# Extract the distance between all conflicts 
for conflict_id_1 in tqdm(conflict_df.conflict_new_id.unique()):
    conflict_node_name_1 = "conflict_{}".format(conflict_id_1)
    
    for conflict_id_2 in conflict_df.conflict_new_id.unique():   
        conflict_node_name_2 = "conflict_{}".format(conflict_id_2)   
        
        # No self loop
        if conflict_id_1 == conflict_id_2:
            pass
        
        # Don't do two times the same edge
        if distance_graph.has_edge(conflict_node_name_1, conflict_node_name_2):
            pass
        
        distance_graph.add_edge(conflict_node_name_1, conflict_node_name_2, 
                  weight = distance_between_nodes(distance_graph, conflict_node_name_1, conflict_node_name_2),
                  nature = "conflict_to_conflict")
        

100%|██████████| 997/997 [00:31<00:00, 31.22it/s]


In [78]:
def get_event_weights(graph, node, nature="event_to_conflict"):
    """Return the event weights for the given nature. Can be an array of natures"""
    edges = [e[2] for e in distance_graph.edges(nbunch=node, data=True)] 
    if type(nature) == str:
        return [d["weight"] for d in edges if d["nature"] == nature]
    else:
        weights = [[] for x in range(len(nature))]
        for index, item in enumerate(nature):
            weights[index] = [d["weight"] for d in edges if d["nature"] == item]
        return weights
    

In [79]:
# Extract conflict statistics
for node in tqdm(distance_graph.nodes()):
    if distance_graph.node[node]["nature"] == "event":
        continue
    
    weights = get_event_weights(distance_graph, node, nature=["event_to_conflict", "conflict_to_conflict"])
    
    distance_graph.node[node]["events_stats"] = stats.describe(weights[0])
    #distance_graph.node[node]["conflicts_stats"] = stats.describe(weights[1])
        

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 136178/136178 [00:01<00:00, 116799.68it/s]


In [80]:
with open(os.path.join("pickle", 'distance_nx.pickle'), 'wb') as out:
    pickle.dump(distance_graph, out)

In [116]:
with open(os.path.join("pickle", 'distance_nx.pickle'), 'rb') as data_source:
    distance_graph = pickle.load(data_source) 

In [117]:
# Get a subgraph with only the conflicts
conflict_nodes = [node for node, data in distance_graph.nodes(data=True) if data["nature"] == "conflict"]
distance_graph_conflicts = distance_graph.subgraph(conflict_nodes)

print(distance_graph.number_of_nodes(), distance_graph.number_of_edges())
print(distance_graph_conflicts.number_of_nodes(), distance_graph_conflicts.number_of_edges())

136178 632684
997 497503


In [118]:
# Rename nodes
rename_dict = {node:node.replace("conflict_", "") for node in distance_graph_conflicts.nodes()}
distance_graph_conflicts = nx.relabel_nodes(distance_graph_conflicts, rename_dict)    

In [119]:
# Remove yearly dict
for node, data in distance_graph_conflicts.nodes(data=True):
    try:
        del data['events_stats']
        del data['yearly_dict']
        del data['nature']
    except:
        pass

In [120]:
# Only keep the edges where the weight is smaller than the maximal distance
def get_filtered_graph_conflicts(graph, maximum_distance):
    """Function to get a copy graph with the specific distance"""
    edges_to_remove = [edge for edge in graph.edges(data=True) \
                       if edge[2]["weight"] > maximum_distance]

    filtered_graph = graph.copy()
    filtered_graph.remove_edges_from(edges_to_remove)
    return filtered_graph

In [121]:
for node in distance_graph_conflicts.nodes(data=True):
    print(node[1])
    break

{'longitude': 6.980659750000001, 'latitude': 4.737405, 'conflict_id': 4786, 'type_of_violence': 2, 'country': 'Nigeria', 'region': 'Africa', 'side_a': 'Adoni', 'side_b': 'Ogoni', 'deaths_a': 2, 'deaths_b': 95, 'deaths_civilians': 0, 'deaths_total': 1085}


In [123]:
maximum_distance = 200
filtered_graph = get_filtered_graph_conflicts(distance_graph_conflicts,
                                              maximum_distance)
print(filtered_graph.number_of_edges())

6509


In [124]:
def get_color(node_data):
    type_of_violence = node_data["type_of_violence"]
    type_of_violence_color_dict = {
        1:'blue',
        2:'orange',
        3:'pink'
    }
    return type_of_violence_color_dict[type_of_violence]
color_array = [get_color(data) for node,data in filtered_graph.nodes(data=True)]
# Size 
#size_array = [data["events_stats"].mean for node,data in filtered_graph.nodes(data=True)]


In [125]:
# Get nodes position
pos_dict = {node: [data['longitude'], data['latitude']]for node,data in filtered_graph.nodes(data=True)}
pos_dict = nx.spring_layout(filtered_graph,
                            k =5,
                            pos=pos_dict,
                            iterations=100)

In [None]:
fig = plt.figure(1,figsize=(100,100))
plt.clf()

nx.draw_networkx(filtered_graph,
                 pos_dict,
                 with_labels=False,
                 node_size=10,
                 node_color=color_array,
                 width=0.1
                ) 
plt.savefig(os.path.join("results","distance_graph.png"))

In [126]:
nx.write_graphml(filtered_graph, os.path.join("results", "filtered_graph.graphml"))