# Import required libraries

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.cm as cm
from matplotlib import pyplot as plt 
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense, Conv1D
from keras.layers.recurrent import GRU
from keras.layers.wrappers import TimeDistributed
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from sklearn.metrics import confusion_matrix, accuracy_score
from conlleval import conlleval

# packages for learning from crowds
from crowd_layer.crowd_layers import CrowdsClassification, MaskedMultiSequenceCrossEntropy
from crowd_layer.crowd_aggregators import CrowdsCategoricalAggregator

# prevent tensorflow from allocating the entire GPU memory at once
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

# Configuration parameters

In [2]:
NUM_RUNS = 30
DATA_PATH = "/home/fmpr/datasets/deep-crowds-datasets/ner-mturk/"
EMBEDDING_DIM = 300
BATCH_SIZE = 64

# Load indexing word vectors

In [3]:
embeddings_index = {}
f = open("/home/fmpr/datasets/glove.6B/glove.6B.%dd.txt" % (EMBEDDING_DIM,))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors' % len(embeddings_index))

Found 400000 word vectors


# Load data

In [4]:
def read_conll(filename):
    raw = open(filename, 'r').readlines()
    all_x = []
    point = []
    for line in raw:
        stripped_line = line.strip().split(' ')
        point.append(stripped_line)
        if line == '\n':
            if len(point[:-1]) > 0:
                all_x.append(point[:-1])
            point = []
    all_x = all_x
    return all_x

In [5]:
all_answers = read_conll(DATA_PATH+'answers.txt')
all_mv = read_conll(DATA_PATH+'mv.txt')
all_ground_truth = read_conll(DATA_PATH+'ground_truth.txt')
all_test = read_conll(DATA_PATH+'testset.txt')
all_docs = all_ground_truth + all_test
print "Answers data size:", len(all_answers)
print "Majority voting data size:", len(all_mv)
print "Ground truth data size:", len(all_ground_truth)
print "Test data size:", len(all_test)
print "Total sequences:", len(all_docs)

Answers data size: 5985
Majority voting data size: 5985
Ground truth data size: 5985
Test data size: 3250
Total sequences: 9235


# Process documents

In [6]:
X_train = [[c[0] for c in x] for x in all_answers]
y_answers = [[c[1:] for c in y] for y in all_answers]
y_mv = [[c[1] for c in y] for y in all_mv]
y_ground_truth = [[c[1] for c in y] for y in all_ground_truth]
X_test = [[c[0] for c in x] for x in all_test]
y_test = [[c[1] for c in y] for y in all_test]
X_all = [[c[0] for c in x] for x in all_docs]
y_all = [[c[1] for c in y] for y in all_docs]

N_ANNOT = len(y_answers[0][0])
print "Num annnotators:", N_ANNOT

lengths = [len(x) for x in all_docs]
all_text = [c for x in X_all for c in x]
words = list(set(all_text))
word2ind = {word: index for index, word in enumerate(words)}
ind2word = {index: word for index, word in enumerate(words)}
labels = list(set([c for x in y_all for c in x]))
print "Labels:", labels
label2ind = {label: (index + 1) for index, label in enumerate(labels)}
ind2label = {(index + 1): label for index, label in enumerate(labels)}
ind2label[0] = "O" # padding index
print 'Input sequence length range: ', max(lengths), min(lengths)

max_label = max(label2ind.values()) + 1
print "Max label:", max_label

maxlen = max([len(x) for x in X_all])
print 'Maximum sequence length:', maxlen

Num annnotators: 47
Labels: ['I-LOC', 'B-ORG', 'I-PER', 'O', 'I-MISC', 'B-MISC', 'I-ORG', 'B-LOC', 'B-PER']
Input sequence length range:  109 1
Max label: 10
Maximum sequence length: 109


# Prepare embedding matrix

In [7]:
num_words = len(word2ind)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2ind.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# Convert data to one-hot encoding

In [8]:
def encode(x, n):
    result = np.zeros(n)
    result[x] = 1
    return result

In [9]:
X_train_enc = [[word2ind[c] for c in x] for x in X_train]
y_ground_truth_enc = [[0] * (maxlen - len(ey)) + [label2ind[c] for c in ey] for ey in y_ground_truth]
y_ground_truth_enc = [[encode(c, max_label) for c in ey] for ey in y_ground_truth_enc]
y_mv_enc = [[0] * (maxlen - len(ey)) + [label2ind[c] for c in ey] for ey in y_mv]
y_mv_enc = [[encode(c, max_label) for c in ey] for ey in y_mv_enc]

y_answers_enc = []
for r in xrange(N_ANNOT):
    annot_answers = []
    for i in xrange(len(y_answers)):
        seq = []
        for j in xrange(len(y_answers[i])):
            #enc = -1*np.ones(max_label)
            enc = -1
            if y_answers[i][j][r] != "?":
                enc = label2ind[y_answers[i][j][r]]
            seq.append(enc)
        annot_answers.append(seq)
    y_answers_enc.append(annot_answers)

X_test_enc = [[word2ind[c] for c in x] for x in X_test]
y_test_enc = [[0] * (maxlen - len(ey)) + [label2ind[c] for c in ey] for ey in y_test]
y_test_enc = [[encode(c, max_label) for c in ey] for ey in y_test_enc]

# Pad sequences

In [10]:
# pad sequences
X_train_enc = pad_sequences(X_train_enc, maxlen=maxlen)
y_ground_truth_enc = pad_sequences(y_ground_truth_enc, maxlen=maxlen)
X_test_enc = pad_sequences(X_test_enc, maxlen=maxlen)
y_test_enc = pad_sequences(y_test_enc, maxlen=maxlen)

y_answers_enc_padded = []
for r in xrange(N_ANNOT):
    padded_answers = pad_sequences(y_answers_enc[r], maxlen=maxlen)
    y_answers_enc_padded.append(padded_answers)

y_answers_enc_padded = np.array(y_answers_enc_padded)
y_answers_enc = np.transpose(np.array(y_answers_enc_padded), [1, 2, 0])

n_train = len(X_train_enc)
n_test = len(X_test_enc)

print 'Training and testing tensor shapes:'
print X_train_enc.shape, X_test_enc.shape, y_ground_truth_enc.shape, y_test_enc.shape

print "Answers shape:", y_answers_enc.shape

N_CLASSES = len(label2ind) + 1
print "Num classes:", N_CLASSES

Training and testing tensor shapes:
(5985, 109) (3250, 109) (5985, 109, 10) (3250, 109, 10)
Answers shape: (5985, 109, 47)
Num classes: 10


# Define the base deep learning model

Here we shall use features representation produced by the VGG16 network as the input. Our base model is then simply composed by one densely-connected layer with 128 hidden units and an output dense layer. We use 50% dropout between the two dense layers.

In [11]:
def build_base_model():
    base_model = Sequential()
    base_model.add(Embedding(num_words,
                        300,
                        weights=[embedding_matrix],
                        input_length=maxlen,
                        trainable=True))
    base_model.add(Conv1D(512, 5, padding="same", activation="relu"))
    base_model.add(Dropout(0.5))
    base_model.add(GRU(50, return_sequences=True))
    base_model.add(TimeDistributed(Dense(N_CLASSES, activation='softmax')))
    base_model.compile(loss='categorical_crossentropy', optimizer='adam')

    return base_model

# Auxiliary functions for evaluating the models

In [12]:
def score(yh, pr):
    coords = [np.where(yhh > 0)[0][0] for yhh in yh]
    yh = [yhh[co:] for yhh, co in zip(yh, coords)]
    ypr = [prr[co:] for prr, co in zip(pr, coords)]
    fyh = [c for row in yh for c in row]
    fpr = [c for row in ypr for c in row]
    return fyh, fpr

def eval_model(model):
    pr_test = model.predict(X_test_enc, verbose=2)
    pr_test = np.argmax(pr_test, axis=2)

    yh = y_test_enc.argmax(2)
    fyh, fpr = score(yh, pr_test)
    print 'Testing accuracy:', accuracy_score(fyh, fpr)
    print 'Testing confusion matrix:'
    print confusion_matrix(fyh, fpr)

    preds_test = []
    for i in xrange(len(pr_test)):
        row = pr_test[i][-len(y_test[i]):]
        row[np.where(row == 0)] = 1
        preds_test.append(row)
    preds_test = [ list(map(lambda x: ind2label[x], y)) for y in preds_test]

    results_test = conlleval(preds_test, y_test, X_test, 'r_test.txt')
    print "Results for testset:", results_test

    return results_test

# Train the model on the true labels (ground truth) and evaluate on testset

In [13]:
model = build_base_model()
model.fit(X_train_enc, y_ground_truth_enc, batch_size=BATCH_SIZE, epochs=20, verbose=2)

Epoch 1/20
14s - loss: 0.1156
Epoch 2/20
13s - loss: 0.0358
Epoch 3/20
13s - loss: 0.0200
Epoch 4/20
13s - loss: 0.0107
Epoch 5/20
13s - loss: 0.0059
Epoch 6/20
13s - loss: 0.0034
Epoch 7/20
13s - loss: 0.0023
Epoch 8/20
13s - loss: 0.0015
Epoch 9/20
13s - loss: 0.0012
Epoch 10/20
13s - loss: 8.8251e-04
Epoch 11/20
13s - loss: 7.1133e-04
Epoch 12/20
13s - loss: 5.9820e-04
Epoch 13/20
13s - loss: 4.8791e-04
Epoch 14/20
13s - loss: 4.0111e-04
Epoch 15/20
12s - loss: 3.5680e-04
Epoch 16/20
13s - loss: 2.7868e-04
Epoch 17/20
13s - loss: 2.3333e-04
Epoch 18/20
13s - loss: 2.0176e-04
Epoch 19/20
13s - loss: 1.8314e-04
Epoch 20/20
13s - loss: 1.8913e-04


<keras.callbacks.History at 0x7f4cb47b5e50>

In [14]:
results_test = eval_model(model)

Testing accuracy: 0.947860285814
Testing confusion matrix:
[[    0     0     0     0     0     0     0     0     0     0]
 [    0   150     3    15    20    11     0    53     4     1]
 [    0     0  1100     7   126     2    44    12    23    27]
 [    0     2     5  1138   111     0     2    37     0    12]
 [    1   123   217    97 42010    33    97    94    33    58]
 [    0     6    12     5    58   216    19    19     4     6]
 [    0     1    74     0   118     7   680     2    26    11]
 [    0     8    32    15    82    20     3   580    10     1]
 [    0     2   204     4   114     0    67     7  1424    15]
 [    0     1   211    26   159     2    30     9    18  1386]]
Results for testset: {'p': 70.28, 'r': 72.45, 'f1': 71.35}


# Train the model on the output of majority voting and evaluate on testset

In [15]:
model = build_base_model()
model.fit(X_train_enc, y_mv_enc, batch_size=BATCH_SIZE, epochs=10, verbose=2)

Epoch 1/10
13s - loss: 0.1046
Epoch 2/10
13s - loss: 0.0382
Epoch 3/10
13s - loss: 0.0267
Epoch 4/10
13s - loss: 0.0204
Epoch 5/10
13s - loss: 0.0161
Epoch 6/10
13s - loss: 0.0127
Epoch 7/10
13s - loss: 0.0101
Epoch 8/10
13s - loss: 0.0082
Epoch 9/10
13s - loss: 0.0070
Epoch 10/10
13s - loss: 0.0060


<keras.callbacks.History at 0x7f4c884298d0>

In [16]:
results_test = eval_model(model)

Testing accuracy: 0.910108640629
Testing confusion matrix:
[[    0     0     0     0     0     0     0     0     0     0]
 [    0   122     0     6    67     1     1    40    16     4]
 [    0     0   506     6   594     1    14    16   177    27]
 [    0     0     2   942   332     3     0    14     1    13]
 [    2    39    49    33 42381    40    89    48    46    36]
 [    0     6     4     4   200    93     9    21     2     6]
 [    0     1    57     0   534     1   266     4    46    10]
 [    0    44     7    27   371     6     1   264    26     5]
 [    0     7    98     1   581     0    14     4  1118    14]
 [    0     1    46    12   693     1     2     7    27  1053]]
Results for testset: {'p': 64.48, 'r': 44.54, 'f1': 52.68}


# Train the model using proposed DL-MW approach and evaluate on testset

We start by pre-training the base model for a few iteration using the output of majority voting, as this improves the stability of the crowds layers. We then add a new layer (CrowdsClassification) on top of our neural network. Finally, we make use of a special loss (MaskedMultiSequenceCrossEntropy) to handle the missing labels from some of the annotators (encoded as "-1").

In [17]:
model = build_base_model()

# pre-train base model for a few iterations using the output of majority voting
model.fit(X_train_enc, y_mv_enc, batch_size=BATCH_SIZE, epochs=5, verbose=2)

# add crowds layer on top of the base model
model.add(CrowdsClassification(N_CLASSES, N_ANNOT, conn_type="MW"))

# instantiate specialized masked loss to handle missing answers
loss = MaskedMultiSequenceCrossEntropy(N_CLASSES).loss

# compile model with masked loss and train
model.compile(optimizer='adam', loss=loss)
model.fit(X_train_enc, y_answers_enc, batch_size=BATCH_SIZE, epochs=30, verbose=2)

Epoch 1/5
13s - loss: 0.1023
Epoch 2/5
13s - loss: 0.0364
Epoch 3/5
13s - loss: 0.0258
Epoch 4/5
13s - loss: 0.0194
Epoch 5/5
13s - loss: 0.0148
Epoch 1/30
14s - loss: 0.0177
Epoch 2/30
13s - loss: 0.0163
Epoch 3/30
13s - loss: 0.0150
Epoch 4/30
13s - loss: 0.0138
Epoch 5/30
13s - loss: 0.0128
Epoch 6/30
13s - loss: 0.0118
Epoch 7/30
13s - loss: 0.0109
Epoch 8/30
13s - loss: 0.0101
Epoch 9/30
13s - loss: 0.0093
Epoch 10/30
13s - loss: 0.0087
Epoch 11/30
13s - loss: 0.0081
Epoch 12/30
13s - loss: 0.0076
Epoch 13/30
13s - loss: 0.0071
Epoch 14/30
13s - loss: 0.0066
Epoch 15/30
13s - loss: 0.0063
Epoch 16/30
13s - loss: 0.0059
Epoch 17/30
13s - loss: 0.0056
Epoch 18/30
13s - loss: 0.0053
Epoch 19/30
13s - loss: 0.0050
Epoch 20/30
13s - loss: 0.0048
Epoch 21/30
13s - loss: 0.0046
Epoch 22/30
13s - loss: 0.0044
Epoch 23/30
13s - loss: 0.0042
Epoch 24/30
13s - loss: 0.0040
Epoch 25/30
13s - loss: 0.0039
Epoch 26/30
13s - loss: 0.0037
Epoch 27/30
13s - loss: 0.0036
Epoch 28/30
13s - loss: 0.0

<keras.callbacks.History at 0x7f4d325cdf10>

Before evaluating our model, we need to remove the crowds layer used during training in order to expose the aggregation (bottleneck) layer

In [18]:
# save weights from crowds layer for later
weights = model.layers[5].get_weights()

# remove crowds layer before making predictions
model.pop() 
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

results_test = eval_model(model)

Testing accuracy: 0.926891476189
Testing confusion matrix:
[[  143     2     8    41     8     0    37    17     1]
 [    1   689     5   333     2    32    19   226    34]
 [    1     4  1156   109     2     1    17     1    16]
 [  138    84    56 41883   123   215   132    85    47]
 [    7     6     8    96   174    12    27     5    10]
 [    1    63     0   327     9   433     4    58    24]
 [   56    17    29   227    14     4   371    26     7]
 [    1    80     2   269     0    37     6  1426    16]
 [    1    40    16   402     3     8     7    33  1332]]
Results for testset: {'p': 65.21, 'r': 59.27, 'f1': 62.1}
