# Terror5

# Step 1. Load data
Load the CSV file into a DataFrame. Specify the encoding explicitly and pass `low_memory=False`, because some columns contain mixed types.
Columns (4, 61, 62, 66, 116, 117, 123) have mixed types.

In [1]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format='retina'

# %pdb

import pandas as pd

# Parse the whole file in a single pass (low_memory=False) so column dtypes are
# inferred from all rows rather than chunk by chunk.
# NOTE(review): the file name suggests UTF-8 but it is decoded as latin1 - confirm.
gtd = pd.read_csv(
    'gtd_utf.csv',
    encoding='latin1',
    low_memory=False,
)

# Last expression of the cell: display the column index.
gtd.columns

Index(['eventid', 'iyear', 'imonth', 'iday', 'approxdate', 'extended',
       'resolution', 'country', 'country_txt', 'region',
       ...
       'addnotes', 'scite1', 'scite2', 'scite3', 'dbsource', 'INT_LOG',
       'INT_IDEO', 'INT_MISC', 'INT_ANY', 'related'],
      dtype='object', length=137)

**Check groups bias**

Which group is responsible for the attacks?

In [2]:
from collections import Counter

# The group name is the label the classifier will have to predict.
target = gtd['gname']
gcount = Counter(target)

# Most frequent label and its number of attacks, used to quantify class imbalance.
g1 = gcount.most_common(1)[0]
top_name, top_hits = g1
top_share = 100 * top_hits / target.size

print('Number of groups = {}'.format(len(gcount)))
print('Be careful with bias, {} = {:.4f}%'.format(top_name, top_share))

# Last expression of the cell: the five most frequent labels with their counts.
gcount.most_common(5)

Number of groups = 3290
Be careful with bias, Unknown = 45.8768%


[('Unknown', 71922),
 ('Taliban', 5502),
 ('Shining Path (SL)', 4548),
 ('Farabundo Marti National Liberation Front (FMLN)', 3351),
 ('Islamic State of Iraq and the Levant (ISIL)', 2833)]

# Step 2. Preprocessing

In [3]:
from collections import Counter

# Minimum number of recorded attacks a group needs in order to be kept.
threshold = 10

# Tally attacks per group name, then keep the names that reach the threshold.
group_count = Counter(gtd['gname'])
groups = [name for name in group_count if group_count[name] >= threshold]

# Last expression of the cell: how many groups survive the cut.
len(groups)

527

In [4]:
# Keep only the attacks attributed to a group that passed the frequency threshold.
is_frequent_group = gtd['gname'].isin(groups)
gtd_clean = gtd.loc[is_frequent_group]

# Last expression of the cell: number of remaining attacks.
len(gtd_clean)

150816

### Step 2.1 Remove unknowns

In [5]:
# Restrict the data to region code 2 and drop attacks labelled 'Unknown'.
in_region_2 = gtd_clean['region'].eq(2)
has_known_group = gtd_clean['gname'].ne('Unknown')

region2 = gtd_clean[in_region_2 & has_known_group]
total_region2 = len(region2)

### Step 2.2 Remove a particular biased group

In [6]:
# Remove a particular group that skews predictions

#region2 = region2[(region2['gname']!='Farabundo Marti National Liberation Front (FMLN)')]
#len(region2)

# Step 3. Naive Bayes Classifier

### Step 3.1 Splitting into Training and Testing sets

In [7]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split (and every number derived from it) is reproduced on
# "Restart Kernel -> Run All".
RANDOM_SEED = 42

features_used = ['iyear', 'country']

data_target = region2['gname']
data_features = region2[features_used]

# 70 % / 30 % train/test split of the attacks.
X_train, X_test, Y_train, Y_test = train_test_split(
    data_features, data_target, test_size=0.30, random_state=RANDOM_SEED
)

# Label and features side by side: columns 'gname', 'iyear', 'country'.
train = pd.concat([Y_train, X_train], axis=1)
test = pd.concat([Y_test, X_test], axis=1)

# Empirical marginal distribution of each feature in the training set.
# The denominator is the number of observations (sum of the counts), not the
# number of distinct values, so each distribution sums to 1.
year_counts = X_train['iyear'].value_counts()
total_years = len(year_counts)  # number of distinct years seen in training
year_probabilities = year_counts / year_counts.sum()

country_counts = X_train['country'].value_counts()
total_country = len(country_counts)  # number of distinct countries seen in training
country_probabilities = country_counts / country_counts.sum()

# Distinct labels / feature values, in order of first appearance in the training set.
train_groups = pd.unique(Y_train)
train_years = pd.unique(X_train['iyear'])
train_countries = pd.unique(X_train['country'])

train_groups_len = len(train_groups)
train_years_len = len(train_years)
train_countries_len = len(train_countries)

# Skeleton likelihood tables: one row per (group, feature value) pair with the
# likelihood initialised to 0; the real values are filled in by a later cell.
train_groups_year_likelihoods = pd.DataFrame(
    [(group, year, 0) for group in train_groups for year in train_years],
    columns=['gname', 'year', 'likelihood'],
)
train_groups_country_likelihoods = pd.DataFrame(
    [(group, country, 0) for group in train_groups for country in train_countries],
    columns=['gname', 'country', 'likelihood'],
)

### Step 3.2 Calculating priors

In [8]:
# Prior P(group): each group's share of the attacks in the training set.
total_train = len(Y_train)
group_counts = Y_train.value_counts()
prior_probabilities = group_counts.div(total_train)

# Number of distinct groups present in the training set.
total_groups_train = len(group_counts)

### Step 3.3 Calculating the class conditional likelihoods

In [9]:
# Class-conditional likelihoods
#   P(year | group)    = attacks by the group in that year    / attacks by the group
#   P(country | group) = attacks by the group in that country / attacks by the group
#
# The counts are computed with one groupby pass per feature instead of
# re-filtering the training frame for every (group, value) pair. Each table row
# is then filled by looking up its own (group, value) key, so the result does
# not depend on the iteration order used when the tables were built.
attacks_per_group = train['gname'].value_counts().to_dict()
attacks_per_group_year = train.groupby(['gname', 'iyear']).size().to_dict()
attacks_per_group_country = train.groupby(['gname', 'country']).size().to_dict()

# A (group, value) pair that never occurs in training has count 0, hence likelihood 0.
year_likelihoods = [
    attacks_per_group_year.get((group, year), 0) / attacks_per_group[group]
    for group, year in zip(train_groups_year_likelihoods['gname'],
                           train_groups_year_likelihoods['year'])
]
country_likelihoods = [
    attacks_per_group_country.get((group, country), 0) / attacks_per_group[group]
    for group, country in zip(train_groups_country_likelihoods['gname'],
                              train_groups_country_likelihoods['country'])
]

train_groups_year_likelihoods['likelihood'] = year_likelihoods
train_groups_country_likelihoods['likelihood'] = country_likelihoods

### Step 3.4 Testing predictions

In [None]:
# Size of the held-out set; reused by the prediction loop for its progress line.
test_len = len(test)

# Summary of what is about to be predicted and what the model was fitted on.
print(f"Predicting {test_len} attack in test set for which we will predict the group")
print()
print(f"Total groups in train {total_groups_train}")
print(f"Total attacks in train {total_train}")
print(f"Total number of years in train {len(train_years)}")
print(f"Total number of countries in train {len(train_countries)}")

Predicting 1908 attack in test set for which we will predict the group

Total groups in train 63
Total attacks in train 4450
Total number of years in train 30
Total number of countries in train 12


In [None]:
# Naive Bayes prediction over the test set.
#
# For every test attack the (unnormalised) posterior of each candidate group is
#   P(year | group) * P(country | group) * P(group)
# and the group with the largest posterior is the prediction.

# Candidate labels, evaluated in the same fixed order for every test row.
candidate_groups = pd.unique(Y_train)

# Index the likelihood tables by (group, value) once, so each lookup inside the
# loop is a dictionary access instead of a boolean-mask scan of the whole table.
year_likelihood_of = dict(zip(
    zip(train_groups_year_likelihoods['gname'], train_groups_year_likelihoods['year']),
    train_groups_year_likelihoods['likelihood'],
))
country_likelihood_of = dict(zip(
    zip(train_groups_country_likelihoods['gname'], train_groups_country_likelihoods['country']),
    train_groups_country_likelihoods['likelihood'],
))

correct = 0
predictions = 0

for index, row in test.iterrows():
    real_gname = row['gname']
    real_iyear = row['iyear']
    real_country = row['country']

    predictions = predictions + 1

    # A year or country that is absent from the likelihood tables gets a
    # likelihood of 0 for every group.
    posteriors = [
        year_likelihood_of.get((gname, real_iyear), 0)
        * country_likelihood_of.get((gname, real_country), 0)
        * prior_probabilities[gname]
        for gname in candidate_groups
    ]

    # max() returns the first maximal element, so ties (including the case where
    # every posterior is 0) are always resolved in favour of the earliest
    # candidate rather than depending on an unstable sort.
    predicted_gname, _ = max(zip(candidate_groups, posteriors), key=lambda pair: pair[1])

    if predicted_gname == real_gname:
        correct = correct + 1

    # Running accuracy, rewritten in place on a single output line.
    print(str(predictions) + "/" + str(test_len) + "; accuracy on test set until now is: " + str(correct/predictions*100) + "%", end="\r")

# After the loop `predictions` equals the number of test rows; guard against an
# empty test set instead of dividing by zero.
if predictions:
    print("Accuracy on all of the test set is: " + str(correct/predictions*100) + "%")
else:
    print("Test set is empty; accuracy is undefined")

1803/1908; accuracy on test set until now is: 84.02662229617304%

In [None]:
# run all above from here

# Proof that priors add up to 1

In [None]:
# The priors are counts divided by the training-set size, so they must sum to 1
# up to floating-point rounding. Assert it so a violation fails loudly instead
# of relying on someone reading the printed number.
prior_total = prior_probabilities.sum()
assert np.isclose(prior_total, 1.0), "priors sum to {}, expected 1".format(prior_total)
print(prior_total)

# Proof that class conditional likelihoods add up to 1

In [None]:
# Show the likelihood rows of the first training group as a worked example.
for gname in pd.unique(Y_train):
    print(gname)
    year_l_filtered_by_group = train_groups_year_likelihoods[train_groups_year_likelihoods['gname'] == gname]
    country_l_filtered_by_group = train_groups_country_likelihoods[train_groups_country_likelihoods['gname'] == gname]

    # Sum the 'likelihood' column only: summing the whole frame would also add
    # up the group-name and key columns, and reading the result by integer
    # position is deprecated on a label-indexed Series.
    the_sum = year_l_filtered_by_group['likelihood'].sum()
    the_sum2 = country_l_filtered_by_group['likelihood'].sum()

    print("Above group has " + str(len(year_l_filtered_by_group)) + " likelihoods for year")
    print("Above group has " + str(len(country_l_filtered_by_group)) + " likelihoods for country")
    print()
    print("Year likelihoods below")
    print(year_l_filtered_by_group)
    print()
    print("Sum should be 1 (all year likelihoods for a particular group summed up): " + str(the_sum))
    print("Sum should be 1 (all country likelihoods for a particular group summed up): " + str(the_sum2))

    # Only the first group is printed, to keep the output short.
    break

# Check every group, not just the one printed above: for each feature the
# likelihoods of a group must sum to 1 up to floating-point rounding.
for likelihood_table in (train_groups_year_likelihoods, train_groups_country_likelihoods):
    per_group_totals = likelihood_table.groupby('gname')['likelihood'].sum()
    assert np.allclose(per_group_totals, 1.0), "class-conditional likelihoods do not sum to 1 for every group"