In [4]:
import pickle

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

# Import the data

In [5]:
# Read the pickled events table from the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open('conflict.pickle', mode='rb') as pickle_file:
    events = pickle.load(pickle_file)

In [7]:
# Preview the first two rows to check which columns were loaded.
events.head(n=2)

Unnamed: 0,id,year,type_of_violence,conflict_new_id,conflict_name,side_a_new_id,gwnoa,side_a,gwnob,side_b_new_id,...,geom_wkt,country,country_id,date_start,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best
0,4,2010,1,230,Yemen (North Yemen):Government,123,678.0,Government of Yemen (North Yemen),,881,...,POINT (44.206667 15.354722),Yemen (North Yemen),678,2010,2010,2,0,0,0,2
1,5,2011,3,715,Government of Yemen (North Yemen) - Civilians,123,678.0,Government of Yemen (North Yemen),,1,...,POINT (45.036667 12.779444),Yemen (North Yemen),678,2011,2011,0,0,0,0,0


# Combining events into conflicts

The fundamental unit of the dataset is the event. However, we are interested in conflicts. Therefore we group events into conflicts and aggregate some of the features:

- `date_start` and `date_end` are combined to obtain a duration for each event; the durations are then summed across all events of a conflict.
- `date_start` becomes the earliest start date among the events of a conflict.
- `date_end` similarly becomes the most recent end date.
- All the death counts are summed.
- A list of all sides, countries and coordinates is kept for each conflict.

In [9]:
# Layout handed to pd.to_datetime for both date columns. It is named DATE_FORMAT
# (not `format`) so the builtin format() is not shadowed for the rest of the
# kernel session.
DATE_FORMAT = '%Y-%m-%d'

# Parse the start/end columns, then derive each event's duration as their difference.
events['date_start'] = pd.to_datetime(events['date_start'], format=DATE_FORMAT)
events['date_end'] = pd.to_datetime(events['date_end'], format=DATE_FORMAT)
events['duration'] = events['date_end'] - events['date_start']

In [10]:
# Collapse the event-level rows into one row per conflict (indexed by
# conflict_new_id):
#   * duration and every death count (including the per-side deaths_a / deaths_b,
#     which were previously dropped) are summed over the conflict's events;
#   * date_start / date_end become the earliest start and the latest end;
#   * coordinates and sides are kept as lists (one entry per event), countries
#     as a set (distinct values only).
conflicts = events.groupby('conflict_new_id').agg({
    'duration': 'sum',
    'date_start': 'min',
    'date_end': 'max',
    'deaths_a': 'sum',
    'deaths_b': 'sum',
    'deaths_civilians': 'sum',
    'deaths_unknown': 'sum',
    'best': 'sum',
    'latitude': lambda values: list(values),
    'longitude': lambda values: list(values),
    'country': lambda values: set(values),
    'side_a': lambda values: list(values),
    'side_b': lambda values: list(values)
})

In [12]:
# Series mapping conflict_new_id -> conflict_name.
# Duplicates are dropped on the id alone (first name seen wins) so the index is
# guaranteed unique; de-duplicating on the (id, name) pair would leave repeated
# ids whenever a conflict appears under more than one name, and assigning such a
# Series as a column of the per-conflict frame would fail.
names = (
    events[['conflict_new_id', 'conflict_name']]
    .drop_duplicates(subset='conflict_new_id')
    .set_index('conflict_new_id')['conflict_name']
)

In [13]:
# Attach the conflict names as a new `name` column; pandas aligns the Series
# with the frame on their shared index.
conflicts = conflicts.assign(name=names)

In [14]:
# The collection-valued columns hold several values per conflict, so give
# them plural names.
plural_column_names = {
    'latitude': 'latitudes',
    'longitude': 'longitudes',
    'country': 'countries',
    'side_a': 'sides_a',
    'side_b': 'sides_b',
}
conflicts = conflicts.rename(columns=plural_column_names)

# Analyzing factions (sides)

## Create the dataframe with one record per (side_a, side_b)

In [None]:
# Build a lookup table of every faction: index = side id, column = side name.
# NOTE(review): this cell used to read `conflict_df`, which is not defined anywhere
# in the notebook; the columns it needs are those of the event-level `events`
# frame, so that is used here -- confirm this was the intended source.
sides_a = (
    events[['side_a_new_id', 'side_a']]
    .rename(columns={'side_a_new_id': 'id', 'side_a': 'name'})
    .set_index('id')
)

sides_b = (
    events[['side_b_new_id', 'side_b']]
    .rename(columns={'side_b_new_id': 'id', 'side_b': 'name'})
    .set_index('id')
)

# Keep exactly one row per side id (first name seen wins). drop_duplicates()
# only compares column values and ignores the index, so it would de-duplicate
# by *name* and could leave repeated ids (or drop a distinct id that happens to
# share a name with another side).
sides = pd.concat([sides_a, sides_b])
sides = sides[~sides.index.duplicated(keep='first')]

display('The total number of sides is {}'.format(len(sides)))

In [None]:
# One record per (side_a, side_b) pair with the summed death counts and the
# names of both sides, sorted from the highest to the lowest total.
# NOTE(review): this cell used to read `conflict_df`, which is not defined anywhere
# in the notebook; the event-level `events` frame carries the columns used here
# -- confirm this was the intended source.
pair_death_totals = (
    events
    .groupby(['side_a_new_id', 'side_b_new_id'])[['best', 'deaths_a', 'deaths_b']]
    .sum()  # a single pass instead of three separate groupbys over the same keys
    .rename(columns={'best': 'dead', 'deaths_a': 'dead_a', 'deaths_b': 'dead_b'})
    .reset_index()
)

# Look up each side's name in `sides` (indexed by side id). The name column is
# renamed before each merge so the result never depends on pandas' automatic
# _x/_y suffixes or on column position. Both merges are inner joins, so pairs
# whose id is missing from `sides` are dropped.
sides_victims = (
    pair_death_totals
    .merge(sides.rename(columns={'name': 'name_a'}),
           left_on='side_a_new_id', right_index=True)
    .merge(sides.rename(columns={'name': 'name_b'}),
           left_on='side_b_new_id', right_index=True)
    .rename(columns={'side_a_new_id': 'id_a', 'side_b_new_id': 'id_b'})
    .sort_values(by='dead', ascending=False)
)

In [None]:
# Write the pair table to disk as a JSON array of row objects; the file feeds
# the visualizations built outside this notebook.
sides_victims.to_json(path_or_buf='sides.json', orient='records')

In [None]:
# Show the pairs whose two per-side counts (dead_a + dead_b) add up exactly
# to the overall count in `dead`.
per_side_sum_matches_total = (
    sides_victims['dead_a'] + sides_victims['dead_b']
).eq(sides_victims['dead'])
sides_victims[per_side_sum_matches_total]

## Create a network of sides

In [None]:
# Number of leading rows of `sides_victims` turned into edges; the full table
# would be too dense to draw legibly.
TOP_N_PAIRS = 100

# networkx 2.0 introduced from_pandas_edgelist and 2.1 removed the older
# from_pandas_dataframe; prefer the new name and fall back for old installs.
_edgelist_to_graph = getattr(nx, 'from_pandas_edgelist', None) or nx.from_pandas_dataframe

# Each row becomes an edge name_a -- name_b carrying its `dead` count as an
# edge attribute.
sides_net = _edgelist_to_graph(
    sides_victims.head(TOP_N_PAIRS),
    source='name_a',
    target='name_b',
    edge_attr='dead',
)

In [None]:
# Draw the sides network on a large canvas so the node labels have room.
# Requires `import matplotlib.pyplot as plt` in the notebook's import cell.
fig, ax = plt.subplots(figsize=(20, 20))
nx.draw_networkx(sides_net, ax=ax)
ax.set_title('Network of opposing sides');  # trailing ';' hides the Text repr