In [1]:
import os

import numpy as np
import pandas as pd
import networkx as nx

import pickle

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import Counter
from IPython.display import HTML

# Import the data

In [2]:
with open(os.path.join('pickle', 'conflict.pickle'), 'rb') as src:
    events = pickle.load(src)
    
events.type_of_violence = events.type_of_violence.astype('category')
events.where_prec       = events.where_prec.astype('category')
events.type_of_violence.cat.rename_categories(['state-based', 'non-state', 'one-sided'], inplace=True)

In [3]:
events.head(2)

Unnamed: 0,id,year,type_of_violence,conflict_new_id,conflict_name,side_a_new_id,gwnoa,side_a,gwnob,side_b_new_id,...,longitude,geom_wkt,country,country_id,date_start,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best
0,4,2010,state-based,230,Yemen (North Yemen):Government,123,678.0,Government of Yemen (North Yemen),,881,...,44.206667,POINT (44.206667 15.354722),Yemen (North Yemen),678,2010,2,0,0,0,2
1,5,2011,one-sided,715,Government of Yemen (North Yemen) - Civilians,123,678.0,Government of Yemen (North Yemen),,1,...,45.036667,POINT (45.036667 12.779444),Yemen (North Yemen),678,2011,0,0,0,0,0


# Combining events into conflicts

The fundental unit of the dataset is the event. However we are interested in conflicts. Therefore we group events into conflicts and aggregate some of the features:

- `date_start` and `date_end` are combined to obtain a length for each event that will be summed across all events.
- `date_start` becomes the oldest date in the list of start dates of the events in a conflict.
- `date_end` similarly becomes the most recent
- all the deaths counts are summed.
- A list of all sides, countries and coordinates is kept for each conflict

In [4]:
format = '%Y-%m-%d'
events['date_start'] = pd.to_datetime(events.date_start, format=format)
events['date_end'] = pd.to_datetime(events.date_end, format=format)
events['duration'] = events.date_end - events.date_start

AttributeError: 'DataFrame' object has no attribute 'date_end'

In [None]:
conflicts = events.groupby('conflict_new_id').agg({
    'duration': 'sum',
    'date_start': 'min',
    'date_end': 'max',
    'deaths_civilians': 'sum',
    'deaths_unknown': 'sum',
    'best': 'sum',
    'latitude': lambda x: list(x),
    'longitude': lambda x: list(x),
    'country': lambda x: set(x),
    'side_a': lambda x: list(x),
    'side_b': lambda x: list(x)
})

In [None]:
names = events[['conflict_new_id', 'conflict_name']].drop_duplicates()\
                                                    .set_index('conflict_new_id').conflict_name

In [None]:
conflicts['name'] = names

In [None]:
conflicts = conflicts.rename(columns={
    'country': 'countries',
    'side_a': 'sides_a',
    'side_b': 'sides_b',
    'latitude': 'latitudes',
    'longitude': 'longitudes'
})

In [None]:
conflicts.head()

In [None]:
conflicts[['name', 'best']].sort_values(by='best')[-50:].set_index('name')\
                           .plot(kind='barh', figsize=(10,10), title='Deadliest conflicts')

# Analyzing factions (sides)

## Create the dataframe with one record per (side_a, side_b)

In [None]:
sides_a = events[['side_a_new_id', 'side_a']]
sides_a.columns = ['id', 'name']
sides_a = sides_a.set_index('id')

sides_b = events[['side_b_new_id', 'side_b']]
sides_b.columns = ['id', 'name']
sides_b = sides_b.set_index('id')

sides = pd.concat([sides_a, sides_b]).drop_duplicates()

print('The total number of sides is {}'.format(len(sides)))

In [None]:
# Aggregate data
sides_victims = events.groupby(['side_a_new_id', 'side_b_new_id']).best.sum()
sum_victims_a = events.groupby(['side_a_new_id', 'side_b_new_id']).deaths_a.sum()
sum_victims_b = events.groupby(['side_a_new_id', 'side_b_new_id']).deaths_b.sum()

# Build new dataframe
sides_victims = pd.DataFrame({
    'dead': sides_victims,
    'dead_a': sum_victims_a,
    'dead_b': sum_victims_b
}).reset_index()

# Merge to have side names
sides_victims = sides_victims.merge(sides, left_on='side_a_new_id', right_index=True)
sides_victims = sides_victims.merge(sides, left_on='side_b_new_id', right_index=True)

# Rename columns and sort
sides_victims.columns = ['id_a', 'id_b', 'dead', 'dead_a', 'dead_b', 'name_a', 'name_b']
sides_victims.sort_values(by='dead', inplace=True, ascending=False)

In [None]:
sides_victims.to_json('sides.json', orient='records') # Export to json to build visualizations

In [None]:
sides_victims.head()

In [None]:
sides_victims[(sides_victims.dead_a + sides_victims.dead_b) == (sides_victims.dead)].head()

## Create a network of sides

In [None]:
sides_net = nx.from_pandas_dataframe(sides_victims[:100], source='name_a', target='name_b', edge_attr='dead')

In [None]:
fig, ax = plt.subplots(figsize=(15, 15))
nx.draw_networkx(sides_net, ax=ax)

In [None]:
faction, degree = zip(*map(tuple, sides_net.degree().items()))

In [None]:
def degree_distribution(degrees):
    c = Counter()
    for deg in degrees:
        c[deg] += 1
    return c

In [None]:
degree, count = zip(*sorted(degree_distribution(degree).items(), key=lambda k: k[0]))

In [None]:
ax = sns.barplot(degree, count)
ax.set_title('Degree distribution of the network of sides')
ax.set_xlabel('Number of rival factions')
ax.set_ylabel('Factions with that number of rivals')

We see that most of the faction only fought against one opponent. There is no "big villain" who tries to conquer the world.
As most of the nodes have degree 1, it is not really a graph.

As we characterized each edge with the number of dead people for that pair of sides, we can visualize this as a flow using a sankey diagram. The color and width of the edges varies from blue to red proportionally to the number of victims.
Here is the graph for the top 20 most deadly rivalities

![sankey](resusankey.svg)

In addition that, when we have enough precision in the data such that the $victims_{a} + victims_{b} = total$ we can use a chord diagram to visualize, for the nodes with degree bigger than one, the proportion of victims caused by each enemy:

![chord](chord.png)