# Explore ACLED data

A notebook for sharing exploration code for the ACLED data

In [None]:
import pandas as pd
from datetime import date
import matplotlib.pyplot as plt
import seaborn as sns
from functools import partial, lru_cache
import sys
import os 
module_path = os.path.abspath(os.path.join('../src'))
os.chdir(module_path)
sys.path.insert(0, module_path)
from data.utils import *

## Load conflict data and cache

In [None]:
### Load conflict data from relevant directory

@lru_cache()
def load_conflicts_data(countries=None):
    ''' Load the ACLED conflicts CSV from `BUCKET`, optionally keeping only
    the rows whose `adm0_name` is in `countries`.

    The result is memoized with `lru_cache`, which has two consequences:
    `countries` must be hashable (pass a tuple or frozenset, not a list), and
    repeated calls with the same argument return the very same DataFrame
    object, so callers should copy it before mutating.

    NOTE(review): the rest of the notebook groups by a `country` column;
    confirm that `adm0_name` is the intended column for this filter.
    '''
    conflicts = load_data(BUCKET, "CONFLICT/ACLED_data_Africa_Ucodes.csv").reset_index()
    if countries is not None:
        # Build the mask from the frame that was just loaded; there is no
        # other DataFrame in scope here.
        conflicts = conflicts.loc[conflicts["adm0_name"].isin(countries)].copy()
    return conflicts

In [None]:
# Show the column labels of the (cached) conflicts DataFrame.
load_conflicts_data().columns

### Just to understand what the data looks like ...

In [None]:
# Select the events that report more than 1000 fatalities and show them.
high_fatalities = load_conflicts_data().query('fatalities >1000')
display(high_fatalities)

# Print one "<event_date> <country> <notes>" line per selected event.
# Iterating the three columns in lockstep avoids a row-wise `apply`, which
# hands back the (empty) DataFrame itself when no row matches and would make
# the join print the column names; f-strings also tolerate missing notes.
print('\n'.join(
    f'{event_date} {country} {notes}'
    for event_date, country, notes in zip(
        high_fatalities['event_date'],
        high_fatalities['country'],
        high_fatalities['notes'],
    )
))

## Visualize the trend of conflicts by various groupings

In [None]:
def visualize_conflict_trend(*, resample='D', weight=None, group_keys=(), begin=None):
    ''' Plot the trend of conflicts by the `group_keys`. By default the trend
    is shown for the `count` of conflicts. If `weight` is specified, then the
    trend is shown for the sum of that column instead. If more than one group
    key is specified, subplots are created for the last key. Return the
    DataFrame plotted.

    Parameters (all keyword-only):
        resample: pandas offset alias handed to `.resample` (e.g. 'D', 'Y').
        weight: name of a conflicts column to sum per period, or None to
            count events.
        group_keys: sequence of column names to group by in addition to the
            date; may be empty, in which case a single overall trend is built.
        begin: optional lower bound (inclusive) on the event date; anything
            `pd.Timestamp` accepts, e.g. a `datetime.date`.

    Raises AssertionError if `weight` is given but is not a column.
    '''
    conflicts = load_conflicts_data()
    assert weight is None or weight in conflicts.columns, "`weight` should be one of the columns"

    # Work on a private list so any sequence (tuple, list, ...) is accepted
    # and the caller's object is never shared with the default.
    group_keys = list(group_keys)

    # Copy first: the loaded frame is shared through the loader's cache.
    df = conflicts.copy()
    df['date'] = pd.to_datetime(df['event_date'], format='%d %B %Y')
    if begin is not None:
        # Compare as timestamps; parsed event dates carry no time component,
        # so this matches a calendar-date comparison without building Python
        # date objects for every row.
        df = df.loc[df['date'] >= pd.Timestamp(begin)]

    grouped = df.groupby(['date'] + group_keys)
    if weight is not None:
        y_column = weight
        metric = grouped[weight].sum()
    else:
        y_column = 'count'
        # Number of events per group.
        metric = grouped.size().rename(y_column)

    if group_keys:
        metric = metric.groupby(group_keys).resample(resample, level='date').sum()
    else:
        # pandas refuses to group by an empty key list, so with no extra keys
        # resample the date index directly.
        metric = metric.resample(resample).sum()
    metric = metric.reset_index()

    plot = sns.relplot
    if len(group_keys) > 1:
        plot = partial(plot, col=group_keys[-1], col_wrap=8)

    plot(data=metric,
        kind='line',
        x='date',
        y=y_column,
        # TODO: I would like to show the full range of the conflict count or fatalities
        # instead of the default confidence interval 95%. This should be achievable with
        # the `errorbar` param, but our version of seaborn doesn't seem to have it yet.
    )

    return metric

### Conflicts and fatalities by country

In [None]:
# Yearly number of conflict events, grouped by country.
count_trend_by_country = visualize_conflict_trend(
    group_keys=['country'],
    resample='Y',
)

In [None]:
# Yearly sum of fatalities, grouped by country.
fatalities_trend_by_country = visualize_conflict_trend(
    group_keys=['country'],
    weight='fatalities',
    resample='Y',
)

#### There was an extraordinary event just before 2000. Let's leave that to another study and focus on the trend from 2000 onward.

In [None]:
# Same yearly fatalities trend by country, restricted to events from 2000-01-01 on.
fatalities_trend_by_country = visualize_conflict_trend(
    group_keys=['country'],
    weight='fatalities',
    begin=date(2000, 1, 1),
    resample='Y',
)

### Conflicts and fatalities by country and event_type

In [None]:
# Yearly event counts by country and event type (one subplot per event type),
# from 2000-01-01 on.
count_trend_by_country_and_event_type = visualize_conflict_trend(
    group_keys=['country', 'event_type'],
    begin=date(2000, 1, 1),
    resample='Y',
)

In [None]:
# Yearly fatalities by country and event type (one subplot per event type),
# from 2000-01-01 on.
fatalities_trend_by_country_and_event_type = visualize_conflict_trend(
    group_keys=['country', 'event_type'],
    weight='fatalities',
    begin=date(2000, 1, 1),
    resample='Y',
)

#### Which event types account for the most fatalities?

In [None]:
# Index the yearly fatalities by (event_type, country, date).
fatalities_index_by_event_type = fatalities_trend_by_country_and_event_type.set_index(
    ['event_type', 'country', 'date']
)

# Fatalities per (event_type, country, date) and per (country, date).
_fatalities_per_event_type = fatalities_index_by_event_type.groupby(
    level=['event_type', 'country', 'date']
).sum()
_fatalities_per_country_year = fatalities_index_by_event_type.groupby(
    level=['country', 'date']
).sum()

# Share of each event type in the fatalities of its country and year.
# NOTE(review): this relies on pandas aligning the two frames on their shared
# `country`/`date` index levels - confirm with the pandas version in use.
fatalities_fraction_by_event_type = _fatalities_per_event_type / _fatalities_per_country_year

_proportion_by_event_type = fatalities_fraction_by_event_type.rename(
    columns={'fatalities': 'proportion of fatalities'}
)
sns.relplot(
    data=_proportion_by_event_type,
    kind='line',
    x='date',
    y='proportion of fatalities',
    hue='event_type',
    style='event_type',
)

### Conflicts and fatalities by country and reporting source

In [None]:
# Yearly event counts by country and source scale (one subplot per source
# scale), from 2000-01-01 on.
count_trend_by_country_and_source = visualize_conflict_trend(
    group_keys=['country', 'source_scale'],
    begin=date(2000, 1, 1),
    resample='Y',
)

In [None]:
# Yearly fatalities by country and source scale (one subplot per source
# scale), from 2000-01-01 on.
fatalities_trend_by_country_and_source = visualize_conflict_trend(
    group_keys=['country', 'source_scale'],
    weight='fatalities',
    begin=date(2000, 1, 1),
    resample='Y',
)

#### Fatalities per event of different sources

In [None]:
# Fatalities per event for every (country, source_scale, date) combination:
# put both yearly trends on the same index and divide them element-wise.
_rate_index = ['country', 'source_scale', 'date']
_yearly_fatalities = fatalities_trend_by_country_and_source.set_index(_rate_index)['fatalities']
_yearly_event_count = count_trend_by_country_and_source.set_index(_rate_index)['count']

# The two series have different names, so the quotient is unnamed; name it
# before turning the index back into columns.
fatalities_rate = (_yearly_fatalities / _yearly_event_count).rename('fatality_rate').reset_index()

In [None]:
# Median fatality rate per source scale, largest first.
_median_by_source = fatalities_rate.groupby('source_scale')[['fatality_rate']].median()
fatalities_rate_median = _median_by_source.sort_values('fatality_rate', ascending=False)

# Horizontal bars: one per source scale, bar length is the median rate.
sns.barplot(
    data=fatalities_rate_median.reset_index(),
    x='fatality_rate',
    y='source_scale',
)