# Terror5
### Step 1. Load data
Load the CSV file into a DataFrame. Specify the encoding and set `low_memory=False` because some columns have mixed types.
Columns (4,61,62,66,116,117,123) have mixed types.

In [1]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format='retina'
%pdb

import pandas as pd

# Load the GTD export into a DataFrame. low_memory=False is passed because
# several columns hold mixed types (listed in the markdown above).
# NOTE(review): the file name suggests UTF-8 but it is decoded as latin1 — confirm.
gtd = pd.read_csv(
    'gtd_utf.csv',
    encoding='latin1',
    low_memory=False,
)
gtd.columns

Automatic pdb calling has been turned ON


Index(['eventid', 'iyear', 'imonth', 'iday', 'approxdate', 'extended',
       'resolution', 'country', 'country_txt', 'region',
       ...
       'addnotes', 'scite1', 'scite2', 'scite3', 'dbsource', 'INT_LOG',
       'INT_IDEO', 'INT_MISC', 'INT_ANY', 'related'],
      dtype='object', length=137)

**Check groups bias**

Which group is responsible for the attacks?

In [2]:
from collections import Counter

# The terrorist group name is the label we want to predict.
target = gtd['gname']

# Tally the attacks attributed to each group name.
gcount = Counter(target)
print('Number of groups = {}'.format(len(gcount)))

# Report how dominant the single most frequent label is.
g1 = gcount.most_common(1)[0]
top_name, top_attacks = g1
top_share = 100 * top_attacks / target.size
print('Be careful with bias, {} = {:.4f}%'.format(top_name, top_share))
gcount.most_common(5)

Number of groups = 3290
Be careful with bias, Unknown = 45.8768%


[('Unknown', 71922),
 ('Taliban', 5502),
 ('Shining Path (SL)', 4548),
 ('Farabundo Marti National Liberation Front (FMLN)', 3351),
 ('Islamic State of Iraq and the Levant (ISIL)', 2833)]

### Step 2. Preprocessing

Keep only the groups with at least `threshold` attacks (the target classes are converted from text to int further below).

In [3]:
from collections import Counter

# Minimum number of attacks a group needs in order to be kept.
threshold = 10

# Attacks per group over the whole dataset.
group_count = Counter(gtd['gname'])

# Names of the groups that reach the threshold.
groups = [name for name, n_attacks in group_count.items() if n_attacks >= threshold]
len(groups)

527

In [4]:
# Keep only the attacks attributed to one of the retained groups.
is_retained_group = gtd['gname'].isin(groups)
gtd_clean = gtd[is_retained_group]
len(gtd_clean)

150816

In [5]:
# Restrict to region code 2 and drop the attacks whose group is 'Unknown'.
# NOTE(review): the meaning of region code 2 is not shown here — confirm against the GTD codebook.
in_region2 = gtd_clean['region'] == 2
has_known_group = gtd_clean['gname'] != 'Unknown'
region2 = gtd_clean[in_region2 & has_known_group]

total_region2 = len(region2)

In [6]:
# Remove a particular group that skews predictions

#region2 = region2[(region2['gname']!='Farabundo Marti National Liberation Front (FMLN)')]
#len(region2)

Naive Bayes Classifier

In [7]:
# Number of attacks per group within region2.
group_counts = region2['gname'].value_counts()

# How many distinct groups operate in region2.
total_groups_region2 = len(group_counts)

# Share of region2 attacks attributed to each group (class priors).
prior_probabilities = group_counts.div(total_region2)

In [8]:
from sklearn.model_selection import train_test_split

# Features fed to the Naive Bayes classifier and the label to predict.
features_used = ['iyear', 'country']

data_target = region2['gname']
data_features = region2[features_used]

# Hold out 30% of the region2 attacks for evaluation. The seed is fixed so that
# the split — and every count, prior and likelihood derived from it below —
# is identical on each Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    data_features, data_target, test_size=0.30, random_state=42)

# Label and features side by side, with 'gname' as the first column.
train = pd.concat([Y_train, X_train], axis=1)
test = pd.concat([Y_test, X_test], axis=1)

# Class priors estimated from the training split only.
# NOTE: this overwrites the region-wide group_counts / prior_probabilities
# computed in the previous cell.
group_counts = Y_train.value_counts()
total_groups_train = len(group_counts)
total_train = len(Y_train)
prior_probabilities = group_counts / total_train

In [None]:


print();
print("Total groups in train " + str(total_groups_train))
print("Total attacks in train " + str(total_train))
print("Attack count for Nicaraguan " + str(group_counts['Nicaraguan Resistance']))
print("Prior for Nicaraguan " + str(prior_probabilities['Nicaraguan Resistance']))


def _likelihood_table(frame, feature, groups, values, value_name):
    """Return P(feature value | group) in long format.

    Parameters
    ----------
    frame : DataFrame with a 'gname' column and a `feature` column.
    feature : name of the feature column in `frame`.
    groups : array of group names; one block of rows is emitted per group.
    values : array of feature values; one row per value inside each block.
    value_name : name given to the feature column in the result.

    Returns
    -------
    DataFrame with columns ['gname', value_name, 'likelihood'] holding one row
    for every (group, value) pair, ordered group-major. The likelihood is the
    fraction of the group's attacks that have that feature value; pairs that
    never occur get 0.
    """
    # Row-normalised contingency table: each group's row sums to 1.
    wide = (pd.crosstab(frame['gname'], frame[feature], normalize='index')
              .reindex(index=groups, columns=values)
              .fillna(0.0))
    return pd.DataFrame({
        'gname': np.repeat(groups, len(values)),
        value_name: np.tile(values, len(groups)),
        # Row-major flattening matches the repeat/tile layout above.
        'likelihood': wide.values.ravel(),
    }, columns=['gname', value_name, 'likelihood'])


# Marginal distribution of each feature in the training split.
# total_years / total_country are the numbers of DISTINCT values; the
# probabilities are normalised by the number of training samples so that they
# sum to 1 (dividing by the number of distinct values does not give a probability).
year_counts = X_train['iyear'].value_counts()
total_years = len(year_counts)
year_probabilities = year_counts / len(X_train)

country_counts = X_train['country'].value_counts()
total_country = len(country_counts)
country_probabilities = country_counts / len(X_train)

# Distinct labels and feature values seen in training, in order of first appearance.
train_groups = pd.unique(Y_train)
train_years = pd.unique(X_train['iyear'])
train_countries = pd.unique(X_train['country'])

train_groups_len = len(train_groups)
train_years_len = len(train_years)
train_countries_len = len(train_countries)

# Conditional likelihoods P(year | group) and P(country | group), one row per
# (group, value) pair. Computed from a single contingency table per feature
# instead of re-filtering the training frame for every pair.
train_groups_year_likelihoods = _likelihood_table(
    train, 'iyear', train_groups, train_years, 'year')
train_groups_country_likelihoods = _likelihood_table(
    train, 'country', train_groups, train_countries, 'country')

# Flat lists of the likelihood values, in the same order as the table rows.
year_likelihoods = train_groups_year_likelihoods['likelihood'].tolist()
country_likelihoods = train_groups_country_likelihoods['likelihood'].tolist()


Total groups in train 62
Total attacks in train 4450
Attack count for Nicaraguan 150
Prior for Nicaraguan 0.0337078651685


In [None]:
# Score the hand-rolled Naive Bayes model on the held-out test split.
#
# For every test attack the unnormalised posterior of each group is
#     P(group) * P(year | group) * P(country | group)
# and the group with the largest posterior is the prediction.
#
# Fixes relative to the previous version of this cell:
#   * the computed posteriors are actually used to pick the prediction
#     (they used to be discarded, leaving every posterior at 0);
#   * the prediction is compared with the row's real group, not with the
#     last value of the inner loop variable;
#   * a year/country that never appears in training gets likelihood 0
#     instead of raising IndexError;
#   * the progress print that concatenated a str with an int is gone.

test_len = len(test)

# Evaluate groups in the order of prior_probabilities. argmax returns the first
# maximum, so ties (including the all-zero case) resolve to the earliest group
# in that index — presumably the most frequent one, since the priors come from
# value_counts(); confirm if tie behaviour matters.
group_order = prior_probabilities.index

# Wide lookup tables: one row per group, one column per feature value.
year_table = (train_groups_year_likelihoods
              .pivot(index='gname', columns='year', values='likelihood')
              .reindex(index=group_order))
country_table = (train_groups_country_likelihoods
                 .pivot(index='gname', columns='country', values='likelihood')
                 .reindex(index=group_order))

# One column per test row; feature values unseen in training become NaN on
# reindex and are treated as likelihood 0.
year_given_group = year_table.reindex(columns=test['iyear'].values).fillna(0.0).values
country_given_group = country_table.reindex(columns=test['country'].values).fillna(0.0).values

# Shape (n_groups, n_test): unnormalised posterior of every group for every test row.
posteriors = prior_probabilities.values[:, None] * year_given_group * country_given_group
best_group_position = posteriors.argmax(axis=0)

# Real vs. predicted group for each test attack, aligned on the test index.
predictions = pd.DataFrame({
    'real': test['gname'].values,
    'predicted': np.asarray(group_order)[best_group_position],
}, index=test.index, columns=['real', 'predicted'])

correct = int((predictions['real'] == predictions['predicted']).sum())

# Accuracy in percent.
print(correct/len(test)*100)

In [58]:
#gname = 'Separatists'
#year_likelihood = train_groups_year_likelihoods[(train_groups_year_likelihoods['gname'] == gname) & (train_groups_year_likelihoods['year'] == real_iyear)]['likelihood']
#year_likelihood = year_likelihood.iloc[0]
#country_likelihood = train_groups_country_likelihoods[(train_groups_country_likelihoods['gname'] == gname) & (train_groups_country_likelihoods['country'] == real_country)]['likelihood']
#country_likelihood = country_likelihood.iloc[0]

#print(year_likelihood)
#print(country_likelihood)

#print(gname_posterior)

#test = gname_posterior.sort_values(['posterior'], ascending=[False]).iloc[0]
#test['gname']
#test = gname_posterior.sort('posterior', ascending=False).iloc[0]
#test.iloc[0]

0.0
0.0


'Farabundo Marti National Liberation Front (FMLN)'

In [None]:
# Save the region2 subset (rows with a known group only) to a CSV file.
region2.to_csv('gtd_region2_2.csv')

In [None]:
import numpy as np

# Choose most frequent groups as a lower index
group_map = sorted(gcount, key=gcount.__getitem__, reverse=True)
# Name -> rank lookup, so each row costs one dict access instead of a linear
# search through the list of group names.
group_rank = {name: rank for rank, name in enumerate(group_map)}
itarget = target.map(group_rank)
# Just to test, check how to differentiate unknown from known:
# 0 = the most frequent label (rank 0), 1 = any other group.
# This relies on 'Unknown' being the most frequent label, as the counts
# printed earlier in the notebook show.
itarget = (itarget != 0).astype('int64')

In [None]:
%matplotlib notebook

import matplotlib.pyplot as plt

# Feature columns used for the unknown-vs-known experiment.
features = gtd[['country', 'attacktype1', 'iyear']]
unk_features = features[itarget==0]
known_features = features[itarget==1]

# Year vs. country code, coloured by whether the attack's group is known.
# Labels, title and legend are set so that the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(unk_features['iyear'], unk_features['country'], c='b', label='Unknown group')
ax.scatter(known_features['iyear'], known_features['country'], c='r', alpha=0.5, label='Known group')
ax.set_xlabel('Year (iyear)')
ax.set_ylabel('Country code (country)')
ax.set_title('Attacks by year and country: unknown vs. known group')
ax.legend();


### Step 3. Neural Network classifier
Use an MLP (multilayer perceptron) to classify the group

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# Hold out 20% of the attacks for evaluation. Both the split and the network
# initialisation are seeded so that the reported score is reproducible.
# NOTE(review): this overwrites the X_train / X_test created for the Naive
# Bayes section above; re-run that section before reusing its cells.
X_train, X_test, y_train, y_test = train_test_split(
    features, itarget, test_size=0.20, random_state=42)

# Single hidden layer with one unit.
# NOTE(review): the features are fed unscaled — consider standardising them.
nnet = MLPClassifier(hidden_layer_sizes=(1,), random_state=42)
nnet.fit(X_train, y_train)

# Mean accuracy on the held-out split.
nnet.score(X_test, y_test)