In [61]:
import pandas as pd
import tensorflow as tf
from collections import Counter
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split


In [6]:
gtd = pd.read_csv('gtd_utf.csv', low_memory=False)
gtd.columns


Index(['eventid', 'iyear', 'imonth', 'iday', 'approxdate', 'extended',
       'resolution', 'country', 'country_txt', 'region',
       ...
       'addnotes', 'scite1', 'scite2', 'scite3', 'dbsource', 'INT_LOG',
       'INT_IDEO', 'INT_MISC', 'INT_ANY', 'related'],
      dtype='object', length=137)

### Preprocessing

In [18]:
# Keep terrorist groups with at least a threashold number of attacks and remove 'Unknown'
threshold = 10
gcount = Counter(gtd['gname'])
groups = [group for group, counter in gcount.items() if (counter >= threshold) and group != 'Unknown']
print('Groups with more than {} attacks = {}'.format(threshold, len(groups)))


Groups with more than 10 attacks = 526


In [17]:
gtd_groups = gtd[gtd['gname'].isin(groups)]
print('GTD length = {} to just important = {}'.format(len(gtd), len(gtd_groups)))


GTD length = 156772 to just important = 78894


In [94]:
# Define features
feature_data = gtd_groups[['iyear', 'country', 'attacktype1', 'weaptype1']].as_matrix()
# Binarize labels
lb = LabelBinarizer()
# label_data = lb.fit_transform(gtd_groups['gname'])
label_data = gtd_groups['gname']
print('Features defined and label binarized')


Features defined and label binarized


In [95]:
# Test data
X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.20, random_state=1)
# Validation data
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.10, random_state=1)
print('Data splitted in train, validation and test')


Data splitted in train, validation and test


### Prepare neural network model

In [90]:
features_count = 4
labels_count = len(groups)

features = tf.placeholder(tf.float32, [None, features_count])
labels = tf.placeholder(tf.float32, [None, labels_count])

feed_train = {features: X_train, labels: y_train}
feed_valid = {features: X_valid, labels: y_valid}
feed_test = {features: X_test, labels: y_test}


**One layer NN:** Equivalent to logistic regression

In [91]:
# One layer
weights = tf.Variable(tf.truncated_normal([features_count, labels_count]))
biases = tf.Variable(tf.zeros([labels_count,]))

logits = tf.matmul(features, weights) + biases
prediction = tf.nn.softmax(logits)
# Loss function
cross_entropy = -tf.reduce_sum(labels * tf.log(prediction), reduction_indices=1)
loss = tf.reduce_mean(cross_entropy)


In [92]:
# Run after model preparation
learning_rate = 0.2

optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
init = tf.global_variables_initializer()
with tf.Session() as session:
    session.run(init)
    print(session.run([optimizer, loss], feed_dict=feed_train))

[None, nan]


In [96]:
from sklearn.neural_network import MLPClassifier

nnet = MLPClassifier(hidden_layer_sizes=(4,4,4,4))
nnet.fit(X_train, y_train)
nnet.score(X_valid, y_valid)


0.35630544993662866