# Part 3: Election Violence Prediction

In [None]:
# import libraries

import os
import pandas as pd
pd.set_option('display.max_columns', 200)
import re
import numpy as np
from numpy import arange, argmax
import matplotlib.pyplot as plt
import os
import seaborn as sns
import random
from multiprocessing import Pool
from tqdm import tqdm
import math

import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_recall_curve, roc_curve, auc, f1_score
from sklearn.model_selection import ParameterGrid

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from absl import logging


In [None]:
# mount google drive so the dataset files under /content/drive can be read

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 1. Load Data

In [None]:
# load data

data_path = '/content/drive/MyDrive/data606-capstone/William Simpson - Capstone 606/3. Datasets'

# resolve both input files up front so the reads below stay short
deco_csv = os.path.join(data_path, 'DECO_v.1.0.csv')
news_csv = os.path.join(data_path, 'relevant_kenya_news_AUG2016-NOV2017_FINAL_BEST.csv')

# data of election related fatalities
deco = pd.read_csv(deco_csv)

# curated news dataset from Part 2 (first column is the index, DATE parsed as datetime)
news = pd.read_csv(news_csv,
                   index_col=0,
                   parse_dates=['DATE'],
                   lineterminator='\n')


# 2. Data Preparation

### 2.1 Prepare DECO election violence data

In [None]:
# get election violence by selected country

country_selected = 'Kenya'

# boolean mask of the DECO rows recorded for the selected country
is_selected_country = deco['country'] == country_selected
# copy so later column assignments do not touch the full DECO frame
country_deco = deco[is_selected_country].copy()

In [None]:
# convert dates to datetime object

# both event boundary columns are parsed the same way
for date_column in ('date_start', 'date_end'):
    country_deco[date_column] = pd.to_datetime(country_deco[date_column])

In [None]:
# view DECO data for the selected country (date_start / date_end now parsed as datetimes)

country_deco

Unnamed: 0,id,type_of_violence,conflict_new_id,conflict_name,dyad_new_id,side_a_new_id,side_a,side_b_new_id,side_b,country_id,country,region,source_article,year,date_start,date_end,deaths_a,deaths_b,civilian_deaths,unknown,best,high,low,latitude,longitude,electoral_vio,electoral_vio_uncertainty,electoral_vio_source,electoral_purpose,electoral_side_a,electoral_side_a_2,electoral_side_a_inc,electoral_side_b,electoral_side_b_2,electoral_side_b_inc,electoral_perpetrator,electoral_targets,electoral_type,electoral_timing,relid,ucdp_ged
1217,40800,2,4953,Degodia - Garre,5563,918,Degodia,1079,Garre,501,Kenya,Africa,"Capital FM/All Africa 2012-08-21 ""Five People ...",2012,2012-08-20,2012-08-20,5,0,0,0,5,5,5,3.500000,40.75,1,1,,preemptive violence,Other,,0,Other,,0,2,3,1;2,1,KEN-2012-2-435-2,1
1218,40804,2,4953,Degodia - Garre,5563,918,Degodia,1079,Garre,501,Kenya,Africa,"NTV 2012-08-25 through BBC ""Situation in volat...",2012,2012-08-23,2012-08-23,0,1,0,0,1,1,1,1.747220,40.0689,1,2,,preemptive violence,Other,,0,Other,,0,2,3,1;2,1,KEN-2012-2-435-5,1
1219,40805,2,4953,Degodia - Garre,5563,918,Degodia,1079,Garre,501,Kenya,Africa,"The Standard 2012-08-27 through BBC ""Four more...",2012,2012-08-26,2012-08-26,0,0,0,4,4,4,4,3.860000,40.5,1,2,,preemptive violence,Other,,0,Other,,0,2,3,1;2,1,KEN-2012-2-435-6,1
1220,40808,2,9601,Kalenjin - Luo,10211,1024,Kalenjin,901,Luo,501,Kenya,Africa,"The Star 2012-03-02 through BBC 2012-03-03 ""Ke...",2012,2012-02-24,2012-02-26,0,0,0,6,6,6,6,0.500000,36,1,1,,disrupt,Other,,0,Other,,0,3,3,1;2,1,KEN-2012-2-X983-1,0
1221,40816,2,11216,Meru - Turkana,11826,3546,Meru,693,Turkana,501,Kenya,Africa,"IRIN 2012-02-15 ""KENYA: Hundreds displaced by ...",2012,2012-02-13,2012-02-13,0,0,0,3,3,3,3,0.401147,37.685247,1,1,,disrupt,Other,,0,Other,,0,3,3,1;2,1,KEN-2012-2-X5097-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4228,253925,3,469,Government of Kenya - Civilians,936,91,Government of Kenya,1,Civilians,501,Kenya,Africa,"""Human Rights Watch,2018-02-25,Kenya: Fresh Ev...",2017,2017-10-31,2017-10-31,0,0,1,0,1,1,1,-1.283333,36.816667,1,1,,crack down on protest and/or riot,Security forces,,1,Civilians,,0,1,5,2,3,KEN-2017-3-936-70,1
4229,253926,4,13998,Government of Kenya - Opponents of Kenyatta,15173,91,Government of Kenya,6777,Opponents of Kenyatta,501,Kenya,Africa,"""Human Rights Watch,2018-02-25,Kenya: Fresh Ev...",2017,2017-11-17,2017-11-17,0,0,0,1,1,1,1,-1.283333,36.816667,1,2,,crack down on protest and/or riot,Security forces,,1,Civilians,,0,1,5,2,3,KEN-2017-4-15173-57,0
4230,253927,3,469,Government of Kenya - Civilians,936,91,Government of Kenya,1,Civilians,501,Kenya,Africa,"""Human Rights Watch,2018-02-25,Kenya: Fresh Ev...",2017,2017-11-20,2017-11-20,0,0,1,0,1,1,1,-1.283333,36.816667,1,2,,crack down on protest and/or riot,Security forces,,1,Civilians,,0,1,5,2,3,KEN-2017-3-936-73,1
4231,253928,3,469,Government of Kenya - Civilians,936,91,Government of Kenya,1,Civilians,501,Kenya,Africa,"""Human Rights Watch,2018-02-25,Kenya: Fresh Ev...",2017,2017-11-28,2017-11-28,0,0,1,0,1,1,1,-1.283333,36.816667,1,2,,crack down on protest and/or riot,Security forces,,1,Civilians,,0,1,5,2,3,KEN-2017-3-936-74,1


In [None]:
# filter for events of election violence in selected election cycle timeframe

# keep events that start after 2016-08-07 and end before 2017-11-29 (both bounds exclusive)
started_in_cycle = country_deco.date_start > '2016-08-07'
ended_in_cycle = country_deco.date_end < '2017-11-29'
country_deco_time = country_deco.loc[started_in_cycle & ended_in_cycle].copy()

print('min date:', country_deco_time.date_start.min())
print('max date:', country_deco_time.date_end.max())

min date: 2017-02-02 00:00:00
max date: 2017-11-28 00:00:00


In [None]:
# (rows, columns) of the DECO events kept for the Kenya 2017 election cycle

country_deco_time.shape

(98, 41)

In [None]:
# aggregate deaths by day

def agg_deco_one_day(deco_data):
    '''
    sums the `best` fatality estimate over all events sharing a start date

    Parameters:
    ———————————
    deco_data: pandas dataframe
        deco events; only the `date_start` and `best` columns are used
    Outputs:
    ————————
    daily_totals: pandas dataframe
        one row per distinct `date_start` (ascending) with the summed `best`,
        on a fresh 0..n-1 index
    '''
    # as_index=False keeps date_start as a regular column, so no reset_index is needed
    daily_totals = deco_data.groupby('date_start', as_index=False)['best'].sum()
    return daily_totals

In [None]:
# apply aggregation function: one row per event start date with the summed `best` count

country_deco_day = agg_deco_one_day(country_deco_time)
country_deco_day

Unnamed: 0,date_start,best
0,2017-02-02,1
1,2017-03-19,10
2,2017-05-13,1
3,2017-05-20,1
4,2017-05-26,7
5,2017-06-01,21
6,2017-06-02,1
7,2017-07-28,2
8,2017-08-08,1
9,2017-08-09,17


In [None]:
# fill missing dates with zero fatalities

# every calendar day from 2016-08-08 up to the last day with a recorded event
date_range = pd.date_range(start='2016-08-08',
                           end=country_deco_day.date_start.max())

# Index the daily totals by date and reindex onto the full calendar; days
# without an event get 0 directly. This replaces the earlier
# reindex -> merge -> fillna round trip, which built an all-NaN frame only to
# obtain the dates and then zero-filled every column, datetimes included.
country_date_fill = (
    country_deco_day
    .set_index('date_start')['best']
    .reindex(date_range, fill_value=0)
    .astype('float64')        # same float dtype the merge-based version produced
    .rename('fatalities')
    .rename_axis('index')     # downstream code expects the date column to be called 'index'
    .reset_index()
)

In [None]:
# view the daily series: one row per calendar day, zero where no fatalities were recorded
country_date_fill

Unnamed: 0,index,fatalities
0,2016-08-08,0.0
1,2016-08-09,0.0
2,2016-08-10,0.0
3,2016-08-11,0.0
4,2016-08-12,0.0
...,...,...
473,2017-11-24,0.0
474,2017-11-25,0.0
475,2017-11-26,0.0
476,2017-11-27,0.0


In [None]:
# aggregate fatalities by time unit

def agg_deco(deco_data, unit=1):
    '''
    total the election related fatalities over consecutive windows of `unit` days

    Parameters:
    ———————————
    deco_data: pandas dataframe
        daily fatalities with the dates held in a column named `index`
    unit: int
        window length in days (default 1, i.e. keep daily totals)

    Outputs:
    ————————
    windowed: pandas dataframe
        summed values per window, indexed by each window's start date
    '''
    window = f'{unit}D'
    # the date column becomes the index so the frame can be binned by time
    daily = deco_data.set_index('index')
    windowed = daily.resample(window).sum()
    return windowed

### 2.2 Prepare News Data

In [None]:
# prepare embedding model

# only surface absl log messages at ERROR severity
logging.set_verbosity(logging.ERROR)

# TF-Hub handle of the Universal Sentence Encoder (version 4) module
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" 
model = hub.load(module_url)

def embed(input):
    '''
    run the loaded TF-Hub module on the given input

    Parameters:
    ———————————
    input: list
        texts to embed; the model search in this notebook passes a list
        holding a single article text

    Outputs:
    ————————
    model(input):
        whatever the module returns for `input`; presumably one embedding
        per text — verify against the module documentation
    '''
    return model(input)

# 3. Model Building

In [None]:
# define functions to convert election fatalities target to categorical

# define binary class labels
pos_class = 1  # fatal class
neg_class = 0  # non-fatal class

def _label_by_cutoff(val, cutoff):
    '''
    shared rule behind the threshold functions below

    returns neg_class when `val <= cutoff`, otherwise pos_class; a value that
    fails the comparison (e.g. NaN) therefore lands in the positive class
    '''
    return neg_class if val <= cutoff else pos_class

# binary classes
# Each function stays a separately named def (rather than a partial/lambda)
# because the model search prints and stores the function object itself.
def to_categorial_binary_0(val):
    '''non-fatal only when the count is exactly 0; anything else is fatal'''
    # equality (not <=) is kept on purpose so behaviour matches the original
    # for every input, including negative values
    return neg_class if val == 0 else pos_class
def to_categorial_binary_1(val):
    '''non-fatal when val <= 1, fatal otherwise'''
    return _label_by_cutoff(val, 1)
def to_categorial_binary_2(val):
    '''non-fatal when val <= 2, fatal otherwise'''
    return _label_by_cutoff(val, 2)
def to_categorial_binary_3(val):
    '''non-fatal when val <= 3, fatal otherwise'''
    return _label_by_cutoff(val, 3)
def to_categorial_binary_4(val):
    '''non-fatal when val <= 4, fatal otherwise'''
    return _label_by_cutoff(val, 4)
def to_categorial_binary_6(val):
    '''non-fatal when val <= 6, fatal otherwise'''
    return _label_by_cutoff(val, 6)

In [None]:
# create dataframe to track data structure, model parameters, and performance

perf_columns = [
    # how the data was structured
    'day_aggregation', 'time_shift', 'categorical_function',
    # which model was fitted, and on what
    'classifier_alg', 'params', 'train_index_size', 'X', 'y',
    # overall scores
    'train_acc', 'test_acc', 'AUC', 'threshold', 'threshold_test_F1',
    # per-class scores
    'fatal_precision', 'fatal_recall', 'non_fatal_precision', 'non_fatal_recall',
]
model_perf = pd.DataFrame(columns=perf_columns)
model_perf.columns

Index(['day_aggregation', 'time_shift', 'categorical_function',
       'classifier_alg', 'params', 'train_index_size', 'X', 'y', 'train_acc',
       'test_acc', 'AUC', 'threshold', 'threshold_test_F1', 'fatal_precision',
       'fatal_recall', 'non_fatal_precision', 'non_fatal_recall'],
      dtype='object')

### 3.1 Search for optimal data structure and model 

In [None]:
def to_labels(pos_probs, threshold):
    '''
    function to convert predicted class probabilities to a class label from custom threshold

    Parameters:
    ———————————
    pos_probs: numpy array, list
        positive class predicted probabilities
    threshold: float
        custom decision threshold; probabilities greater than or equal to it
        are labelled 1

    Outputs:
    ————————
    labels: numpy array
        class labels (0, 1), one per probability
    '''
    # a plain list/tuple supports neither `>=` against a number nor `.astype`,
    # so coerce those to an array; array-like inputs are passed through untouched
    if isinstance(pos_probs, (list, tuple)):
        pos_probs = np.asarray(pos_probs)
    return (pos_probs >= threshold).astype('int')

In [None]:
# search for best model
#
# For every aggregation window, target shift, target binarisation, classifier
# and hyperparameter combination: fit on the first 80% of the time-ordered
# samples (oversampled to balance classes), score on the last 20%, and log one
# row per fit into `model_perf`.

aggregations = [1,2,3,4,5,6]
shifts = [-1]
categories = [to_categorial_binary_0,
              to_categorial_binary_1,
              to_categorial_binary_2,
              to_categorial_binary_3,
              to_categorial_binary_4,
              to_categorial_binary_6]
classifiers = [SVC, RandomForestClassifier]

# Embed every article exactly once. An article's embedding does not depend on
# the aggregation window, so embedding inside the aggregation loop only
# repeated the same model calls for each window size.
article_embeddings = np.asarray(
    [np.asarray(embed([txt])[0]).astype('float32') for txt in news['news_text']]
)

# distinct news dates, in order of first appearance in `news`
# NOTE(review): the grouping below walks these by position, so it assumes
# `news` is sorted by DATE — confirm against the Part 2 export.
news_days = news.DATE.unique()

for unit_of_aggregation in aggregations:
    print('Aggregation:', unit_of_aggregation)

    ### DATA PREPARATION ###
    # aggregate election fatalities by unit
    country_deco_grouped = agg_deco(country_date_fill, unit=unit_of_aggregation)

    # get average embeddings from date group
    avg_embeddings_by_unit_time = []
    time_unit = unit_of_aggregation

    # walk the news days in blocks of `time_unit`; the last block may be shorter
    for group_start in range(0, len(news_days), time_unit):
        group_end = min(group_start + time_unit, len(news_days)) - 1
        date_sub_range = pd.date_range(start=news_days[group_start],
                                       end=news_days[group_end])

        # rows of `news` (and so of `article_embeddings`) dated inside this block
        in_sub_range = news.DATE.isin(date_sub_range).to_numpy()
        # average embeddings in time range
        avg_embeddings = np.mean(article_embeddings[in_sub_range], axis=0)
        avg_embeddings_by_unit_time.append(avg_embeddings)

    # get final df for prediction
    # NOTE(review): this assignment needs exactly one news block per fatality
    # window, i.e. news for every calendar day covered by `country_date_fill`;
    # a length mismatch makes pandas raise here.
    deco_news_grouped = country_deco_grouped.copy()
    deco_news_grouped['avg_embedding'] = avg_embeddings_by_unit_time

    ### MODELING ###
    # rows for this aggregation are collected here and added to `model_perf`
    # in one go (DataFrame.append was removed in pandas 2.0 and was quadratic)
    perf_records = []

    for shift in shifts:
        print('Shift:', shift)
        # prepare data for model: a negative shift pairs each window's news
        # with the fatalities of a later window
        y = deco_news_grouped.fatalities.shift(shift)
        y = y.dropna(axis=0).copy()
        X = np.array(avg_embeddings_by_unit_time)
        # drop the trailing windows whose shifted target is missing
        X = X[:shift].copy()

        for to_categorical in categories:
            print('Categorical Function:', to_categorical)
            # transform data structure
            y_cat = [to_categorical(row) for row in y]

            # show the class balance produced by this binarisation
            vl_ct = pd.Series(y_cat).value_counts()
            sns.barplot(x=list(vl_ct.keys()),
                        y=vl_ct.values)
            plt.show()

            # define train-test: chronological split, no shuffling
            train_index = math.floor(X.shape[0]*0.8)
            X_train = X[:train_index]
            y_train = y_cat[:train_index]
            X_test = X[train_index:]
            y_test = y_cat[train_index:]

            # over sample training data to correct class imbalance
            oversample = RandomOverSampler(sampling_strategy='all', random_state=42)
            X_train, y_train = oversample.fit_resample(X_train, y_train)

            # define classification algorithm
            for clf_model in classifiers:
                print('Model:', clf_model)
                if clf_model == SVC:
                    # probability=True is required for predict_proba below
                    clf = clf_model(random_state=0, probability=True)
                    grid = {'C': [0.1,0.6,0.7,0.8,0.9,1.0,1.3],
                            'gamma': ['scale', 'auto', 1,0.1,0.01,0.001],
                            'kernel': ['rbf', 'poly', 'sigmoid'],
                            'class_weight':['balanced',None]}
                elif clf_model == RandomForestClassifier:
                    clf = clf_model(random_state=0)
                    grid = {'n_estimators': [100,1000,5000],
                            'criterion': ['gini', 'entropy'],
                            'max_depth': [5,7,9],
                            'class_weight':['balanced',None]}
                else:
                    # without this, an unlisted classifier would silently reuse
                    # the previous iteration's `clf` and `grid`
                    raise ValueError(f'no hyperparameter grid defined for {clf_model}')

                # train the model
                for g in ParameterGrid(grid):
                    print(g)
                    # set configuration of hyperparameters
                    clf.set_params(**g)
                    # train model
                    clf.fit(X_train, y_train)
                    # get prediction probabilities of the positive class
                    y_pred_proba = clf.predict_proba(X_test)[:,1]
                    # scoring
                    train_acc = clf.score(X_train, y_train)
                    test_acc = clf.score(X_test, y_test)
                    fpr, tpr = roc_curve(y_test, y_pred_proba)[:2]
                    roc_auc = auc(fpr, tpr)
                    # threshold tuning
                    # NOTE(review): the threshold is chosen on the same test
                    # split the F1 / precision / recall below are computed on,
                    # so those figures are optimistic.
                    thresholds = arange(0, 1, 0.001)
                    scores = [f1_score(y_test, to_labels(y_pred_proba, t)) for t in thresholds]
                    # best threshold
                    ix = argmax(scores)
                    best_threshold = thresholds[ix]
                    threshold_score = scores[ix]
                    # get predictions with optimized decision threshold
                    y_pred = to_labels(y_pred_proba, best_threshold)

                    # granular evaluation metrics
                    clf_rpt = classification_report(y_test, y_pred, target_names=clf.classes_, output_dict=True)
                    fatal_precision = clf_rpt[1]['precision']
                    fatal_recall = clf_rpt[1]['recall']
                    non_fatal_precision = clf_rpt[0]['precision']
                    non_fatal_recall = clf_rpt[0]['recall']

                    # save performance
                    perf_records.append({'day_aggregation':unit_of_aggregation,
                                         'time_shift':shift,
                                         'categorical_function':to_categorical,
                                         'classifier_alg':clf_model,
                                         'params': g,
                                         'train_index_size':train_index,
                                         'X':X,
                                         'y':y,
                                         'train_acc':train_acc,
                                         'test_acc':test_acc,
                                         'AUC':roc_auc,
                                         'threshold':best_threshold,
                                         'threshold_test_F1':threshold_score,
                                         'fatal_precision':fatal_precision,
                                         'fatal_recall':fatal_recall,
                                         'non_fatal_precision':non_fatal_precision,
                                         'non_fatal_recall':non_fatal_recall})

    # Flush once per aggregation window: a single concat instead of one per
    # fit, while still keeping finished windows if a later one is interrupted.
    unit_perf = pd.DataFrame(perf_records, columns=model_perf.columns)
    if model_perf.empty:
        model_perf = unit_perf
    else:
        model_perf = pd.concat([model_perf, unit_perf], ignore_index=True)

In [None]:
# save model parameter search results

# one row per fitted model/parameter combination, written next to the input datasets
results_csv = '/content/drive/MyDrive/data606-capstone/William Simpson - Capstone 606/3. Datasets/Part_3_FINAL_BEST_performance_GridSearch.csv'
model_perf.to_csv(results_csv)

In [None]:
# number of model-parameter combinations evaluated (rows) and fields tracked per fit (columns)

model_perf.shape

(10368, 17)

In [None]:
# view best models

# Keep fits that: catch and correctly flag at least 70% of fatal windows,
# overfit by less than 0.2 accuracy without scoring better on test than on
# train, and have AUC above 0.75.
# Every comparison needs its own parentheses: `&` binds tighter than `>`, so
# an unparenthesised `... & model_perf.AUC > 0.75` is read as
# `(... & model_perf.AUC) > 0.75` and never actually thresholds AUC.
model_perf.loc[(model_perf.fatal_recall >= 0.7) &
               (model_perf.fatal_precision >= 0.7) &
               (model_perf.train_acc - model_perf.test_acc < 0.2) &
               (model_perf.train_acc >= model_perf.test_acc) &
               (model_perf.AUC > 0.75)]


Unnamed: 0,day_aggregation,time_shift,categorical_function,classifier_alg,params,train_index_size,X,y,train_acc,test_acc,AUC,threshold,threshold_test_F1,fatal_precision,fatal_recall,non_fatal_precision,non_fatal_recall
7885,5,-1,<function to_categorial_binary_3 at 0x7f704d35...,<class 'sklearn.svm._classes.SVC'>,"{'C': 0.8, 'class_weight': 'balanced', 'gamma'...",76,"[[-0.007872172, -0.04352264, -0.023229796, -0....",index 2016-08-08 0.0 2016-08-13 0.0 20...,0.868056,0.842105,0.75,0.56,0.75,0.75,0.75,0.933333,0.933333
7903,5,-1,<function to_categorial_binary_3 at 0x7f704d35...,<class 'sklearn.svm._classes.SVC'>,"{'C': 0.8, 'class_weight': None, 'gamma': 'sca...",76,"[[-0.007872172, -0.04352264, -0.023229796, -0....",index 2016-08-08 0.0 2016-08-13 0.0 20...,0.868056,0.842105,0.75,0.56,0.75,0.75,0.75,0.933333,0.933333
7921,5,-1,<function to_categorial_binary_3 at 0x7f704d35...,<class 'sklearn.svm._classes.SVC'>,"{'C': 0.9, 'class_weight': 'balanced', 'gamma'...",76,"[[-0.007872172, -0.04352264, -0.023229796, -0....",index 2016-08-08 0.0 2016-08-13 0.0 20...,0.861111,0.842105,0.766667,0.501,0.75,0.75,0.75,0.933333,0.933333
7939,5,-1,<function to_categorial_binary_3 at 0x7f704d35...,<class 'sklearn.svm._classes.SVC'>,"{'C': 0.9, 'class_weight': None, 'gamma': 'sca...",76,"[[-0.007872172, -0.04352264, -0.023229796, -0....",index 2016-08-08 0.0 2016-08-13 0.0 20...,0.861111,0.842105,0.766667,0.501,0.75,0.75,0.75,0.933333,0.933333
7957,5,-1,<function to_categorial_binary_3 at 0x7f704d35...,<class 'sklearn.svm._classes.SVC'>,"{'C': 1.0, 'class_weight': 'balanced', 'gamma'...",76,"[[-0.007872172, -0.04352264, -0.023229796, -0....",index 2016-08-08 0.0 2016-08-13 0.0 20...,0.861111,0.842105,0.766667,0.426,0.75,0.75,0.75,0.933333,0.933333
7975,5,-1,<function to_categorial_binary_3 at 0x7f704d35...,<class 'sklearn.svm._classes.SVC'>,"{'C': 1.0, 'class_weight': None, 'gamma': 'sca...",76,"[[-0.007872172, -0.04352264, -0.023229796, -0....",index 2016-08-08 0.0 2016-08-13 0.0 20...,0.861111,0.842105,0.766667,0.426,0.75,0.75,0.75,0.933333,0.933333
7993,5,-1,<function to_categorial_binary_3 at 0x7f704d35...,<class 'sklearn.svm._classes.SVC'>,"{'C': 1.3, 'class_weight': 'balanced', 'gamma'...",76,"[[-0.007872172, -0.04352264, -0.023229796, -0....",index 2016-08-08 0.0 2016-08-13 0.0 20...,0.861111,0.842105,0.8,0.285,0.75,0.75,0.75,0.933333,0.933333
8011,5,-1,<function to_categorial_binary_3 at 0x7f704d35...,<class 'sklearn.svm._classes.SVC'>,"{'C': 1.3, 'class_weight': None, 'gamma': 'sca...",76,"[[-0.007872172, -0.04352264, -0.023229796, -0....",index 2016-08-08 0.0 2016-08-13 0.0 20...,0.861111,0.842105,0.8,0.285,0.75,0.75,0.75,0.933333,0.933333
