In [None]:
# Fetch the helper modules imported further down (conlleval.py, utils.py) together with an
# `install` script, then install keras-contrib directly from its Git repository.
# NOTE(review): none of these downloads is pinned to a commit or version, so a later re-run
# may silently pull different code.
!wget -qq https://raw.githubusercontent.com/svinkapeppa/trash/master/conlleval.py
!wget -qq https://raw.githubusercontent.com/svinkapeppa/trash/master/utils.py
!wget -qq https://raw.githubusercontent.com/svinkapeppa/trash/master/install
!pip install git+https://www.github.com/keras-team/keras-contrib.git

In [None]:
!chmod 777 install
!./install

In [None]:
from keras.models import Model
from keras.layers import TimeDistributed, Conv1D, Dense, Embedding, Input, Dropout, LSTM, Bidirectional, MaxPooling1D, Flatten, concatenate
from keras.initializers import RandomUniform
from keras.optimizers import Nadam
from keras.losses import sparse_categorical_crossentropy
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss

import utils
import random
import time
import math
import pickle

In [None]:
# --- Training schedule ---
BATCH_SIZE = 128          # batch size handed to utils.create_batches
NUM_EPOCHS = 200          # number of passes of the training loop

# --- Character-level CNN ---
CHAR_EMB_DIM = 30         # width of each character embedding
CHAR_FILTERS = 30         # number of Conv1D filters over the characters of a word
KERNEL_SIZE = 4           # Conv1D kernel size

# --- Recurrent encoder ---
LSTM_DIM = 300            # units per LSTM direction and of the dense layer feeding the CRF

# --- Regularisation ---
DROPOUT = 0.5             # dropout on char embeddings, concatenated inputs and LSTM inputs
RECURRENT_DROPOUT = 0.3   # dropout on the LSTM recurrent connections

In [None]:
# Load the pre-computed 'all_onehot' feature pickles; the train / testa / testb files are
# bound to the train / dev / test names respectively.
# pickle can execute arbitrary code while loading, so these files must come from a trusted source.
_feature_sets = []
for _feature_path in ('features/all_onehot.train',
                      'features/all_onehot.testa',
                      'features/all_onehot.testb'):
    with open(_feature_path, 'rb') as f:
        _feature_sets.append(pickle.load(f))
features_train, features_dev, features_test = _feature_sets

In [None]:
# Load the pre-computed 'gazetteer_PERLOC' pickles; the train / testa / testb files are
# bound to the train / dev / test names respectively.
# pickle can execute arbitrary code while loading, so these files must come from a trusted source.
_gazetteer_sets = []
for _gazetteer_path in ('features/gazetteer_PERLOC.train',
                        'features/gazetteer_PERLOC.testa',
                        'features/gazetteer_PERLOC.testb'):
    with open(_gazetteer_path, 'rb') as f:
        _gazetteer_sets.append(pickle.load(f))
gaze_train, gaze_dev, gaze_test = _gazetteer_sets

In [None]:
# Read the three corpus splits from disk.
train_sentences = utils.read_sentences('data/train')
valid_sentences = utils.read_sentences('data/valid')
test_sentences = utils.read_sentences('data/test')

# Report how many sentences each split holds.
for _template, _sentences in (
    ('Number of TRAIN sentences: {}', train_sentences),
    ('Number of VALID sentences: {}', valid_sentences),
    ('Number of TEST sentences: {}', test_sentences),
):
    print(_template.format(len(_sentences)))

In [None]:
# Apply utils.convert_tags to every split. The return values are discarded, so the helper
# presumably rewrites the tags in place — TODO confirm against utils.py.
# NOTE(review): if it does mutate in place, re-running this cell may not be idempotent.
utils.convert_tags(train_sentences)
utils.convert_tags(valid_sentences)
utils.convert_tags(test_sentences)

In [None]:
# Tag lookup tables built over all three splits. len(tag_idx) is used as the tag-set size of
# the model, and idx_tag turns predicted/gold indices back into tag strings during evaluation.
tag_idx, idx_tag = utils.create_tag_mapping([train_sentences, valid_sentences, test_sentences])

In [None]:
# Word lookup tables and the embedding matrix, built from the GloVe vectors file.
# word_embeddings initialises the word Embedding layer; its row length is used as the embedding width.
# idx_word is not referenced by any other cell visible in this notebook.
word_idx, idx_word, word_embeddings = utils.create_word_mapping('glove/glove.6B.100d.txt')

In [None]:
# Character lookup tables built over all three splits. len(char_idx) is the character
# vocabulary size of the model, and char_idx is passed to utils.create_batches.
char_idx, idx_char = utils.create_char_mapping([train_sentences, valid_sentences, test_sentences])

In [None]:
# Casing lookup table and its embedding matrix. len(case_idx) is the case vocabulary size
# of the model, and case_embeddings initialises the case Embedding layer.
case_idx, case_embeddings = utils.create_case_mapping()

In [None]:
# Computed over all three splits; this value fixes the width of the character input and the
# pooling window of the char-CNN, and is also passed to utils.create_batches.
max_word_length = utils.get_max_word_length([train_sentences, valid_sentences, test_sentences])

In [None]:
# Apply utils.add_auxiliary_information to every split. The return values are discarded, so
# the helper presumably annotates the sentences in place — TODO confirm against utils.py.
# NOTE(review): if it does mutate in place, re-running this cell may not be idempotent.
utils.add_auxiliary_information(train_sentences)
utils.add_auxiliary_information(valid_sentences)
utils.add_auxiliary_information(test_sentences)

In [None]:
def _make_batches(sentences, features, gazetteers):
    """Batch one split using the shared batch size, word-length limit and lookup tables."""
    return utils.create_batches(sentences, BATCH_SIZE, max_word_length, word_idx,
                                char_idx, tag_idx, features, gazetteers)


# Each split is paired with its own feature and gazetteer pickles.
train_batches = _make_batches(train_sentences, features_train, gaze_train)
valid_batches = _make_batches(valid_sentences, features_dev, gaze_dev)
test_batches = _make_batches(test_sentences, features_test, gaze_test)

In [None]:
def CharCNNBiLSTM(word_vocab_size, case_vocab_size, char_vocab_size,
                  word_embeddings_dim, case_embeddings_dim, max_word_length,
                  word_embeddings, case_embeddings, tag_set_size,
                  feature_dim=196, gaze_dim=3, gaze_output_dim=3,
                  shape_output_dim=151, position_output_dim=45,
                  aux_activation='elu'):
    """Build the multi-input, multi-output char-CNN + BiLSTM + CRF tagging model.

    Word, case and character-CNN embeddings are concatenated with two externally supplied
    per-token inputs, passed through a bidirectional LSTM, and fed to four heads: a CRF
    tag head and three time-distributed dense auxiliary heads.

    Relies on the module-level constants CHAR_EMB_DIM, KERNEL_SIZE, CHAR_FILTERS, DROPOUT,
    RECURRENT_DROPOUT and LSTM_DIM.

    Args:
        word_vocab_size: Number of rows of the word embedding table.
        case_vocab_size: Number of rows of the case embedding table.
        char_vocab_size: Number of rows of the character embedding table.
        word_embeddings_dim: Width of the word embeddings.
        case_embeddings_dim: Width of the case embeddings.
        max_word_length: Number of character ids per token; also the pooling window, so
            each token is reduced to a single CHAR_FILTERS-wide vector.
        word_embeddings: Initial weights for the word embedding table (trainable).
        case_embeddings: Initial weights for the case embedding table (trainable).
        tag_set_size: Number of CRF output units.
        feature_dim: Last-axis size of the extra feature input.
        gaze_dim: Last-axis size of the gazetteer input.
        gaze_output_dim: Units of the gazetteer auxiliary head.
        shape_output_dim: Units of the shape auxiliary head.
        position_output_dim: Units of the position auxiliary head.
        aux_activation: Activation of the three auxiliary heads.

    Returns:
        An uncompiled keras ``Model`` with inputs
        ``[word, case, char, feature, gaze]`` and outputs
        ``[crf_tags, gaze, shape, position]``.
    """
    # Word-level embeddings, initialised from the supplied matrix and fine-tuned.
    word_input = Input(shape=(None,))
    word_vectors = Embedding(
        word_vocab_size, word_embeddings_dim, weights=[word_embeddings], trainable=True
    )(word_input)

    # Casing embeddings, initialised from the supplied matrix and fine-tuned.
    case_input = Input(shape=(None,))
    case_vectors = Embedding(
        case_vocab_size, case_embeddings_dim, weights=[case_embeddings], trainable=True
    )(case_input)

    # Character CNN: embed each character, convolve along the word, then max-pool over the
    # whole word so every token yields one fixed-size vector.
    char_input = Input(shape=(None, max_word_length))
    char_init_bound = math.sqrt(0.1)
    char_vectors = TimeDistributed(Embedding(
        char_vocab_size, CHAR_EMB_DIM,
        embeddings_initializer=RandomUniform(minval=-char_init_bound, maxval=char_init_bound)
    ))(char_input)
    char_vectors = Dropout(DROPOUT)(char_vectors)
    char_vectors = TimeDistributed(Conv1D(
        kernel_size=KERNEL_SIZE, filters=CHAR_FILTERS, padding='same', activation='tanh', strides=1
    ))(char_vectors)
    char_vectors = TimeDistributed(MaxPooling1D(max_word_length))(char_vectors)
    char_vectors = TimeDistributed(Flatten())(char_vectors)

    # Externally computed per-token inputs, concatenated as-is with the learned embeddings.
    feature_input = Input(shape=(None, feature_dim))
    gaze_input = Input(shape=(None, gaze_dim))

    token_repr = concatenate([word_vectors, case_vectors, char_vectors, feature_input, gaze_input])
    token_repr = Dropout(DROPOUT)(token_repr)

    encoded = Bidirectional(LSTM(
        LSTM_DIM, return_sequences=True, dropout=DROPOUT, recurrent_dropout=RECURRENT_DROPOUT
    ))(token_repr)

    # Main head: dense projection followed by a CRF over the tag set.
    tag_hidden = TimeDistributed(Dense(LSTM_DIM, activation='elu'))(encoded)
    tag_output = CRF(tag_set_size, sparse_target=True)(tag_hidden)

    # Auxiliary heads, all reading the shared BiLSTM output.
    # NOTE(review): the default 'elu' does not produce a probability distribution, yet the
    # compile cell pairs these heads with sparse categorical cross-entropy losses. Confirm
    # whether 'softmax' was intended; pass aux_activation='softmax' to switch.
    gaze_output = TimeDistributed(Dense(gaze_output_dim, activation=aux_activation))(encoded)
    shape_output = TimeDistributed(Dense(shape_output_dim, activation=aux_activation))(encoded)
    position_output = TimeDistributed(Dense(position_output_dim, activation=aux_activation))(encoded)

    return Model(
        inputs=[word_input, case_input, char_input, feature_input, gaze_input],
        outputs=[tag_output, gaze_output, shape_output, position_output],
    )

In [None]:
# Build the tagger. Vocabulary sizes come from the lookup tables; both embedding widths are
# read from the embedding matrices themselves so they always match the supplied weights
# (the case width was previously taken from the case vocabulary size).
model = CharCNNBiLSTM(len(word_idx), len(case_idx), len(char_idx),
                      len(word_embeddings[0]), len(case_embeddings[0]), max_word_length,
                      word_embeddings, case_embeddings, len(tag_idx))

# One loss per model output, in output order: CRF tags, gazetteer, shape, position.
model.compile(
    loss=[
        crf_loss,
        utils.weighted_sparse_categorical_crossentropy,
        sparse_categorical_crossentropy,
        sparse_categorical_crossentropy,
    ],
    optimizer=Nadam(clipnorm=5),
)

In [None]:
from conlleval import evaluate


def evaluate_model(model, batches, idx_tag):
    """Score the model's tag predictions on ``batches`` with conlleval's ``evaluate``.

    For every batch the first model output is reduced with ``argmax`` over its last axis
    to obtain tag indices. Gold and predicted indices are mapped through ``idx_tag`` and
    joined into one space-separated string per sequence.

    Args:
        model: Model whose ``predict`` accepts the five batch inputs
            (word, case, char, features, gaze) and whose first output holds tag scores.
        batches: Iterable of batch dicts with keys 'word', 'case', 'char', 'features',
            'gaze', 'tag' and 'lengths'.
        idx_tag: Mapping from tag index to tag string.

    Returns:
        Whatever ``evaluate`` returns. Callers in this notebook unpack it as
        ``(f, precision, recall)`` — TODO confirm that order against conlleval.py.
    """
    gold_lines = []
    predicted_lines = []

    for batch in batches:
        # Keep only the first two axes of the gold tag array.
        gold = batch['tag']
        gold = gold.reshape((gold.shape[0], gold.shape[1]))

        model_inputs = [batch['word'], batch['case'], batch['char'], batch['features'], batch['gaze']]
        predicted = model.predict(model_inputs, verbose=False)[0].argmax(axis=-1)

        for row, length in enumerate(batch['lengths']):
            # Position 0 is skipped and `length` tokens are read from index 1 onwards
            # (presumably position 0 holds a start/padding token — verify in utils.create_batches).
            token_span = slice(1, length + 1)
            gold_lines.append(' '.join(idx_tag[index.item()] for index in gold[row, token_span]))
            predicted_lines.append(' '.join(idx_tag[index.item()] for index in predicted[row, token_span]))

    return evaluate(gold_lines, predicted_lines, verbose=False)

In [None]:
# Train for NUM_EPOCHS epochs; after each epoch, score the model on all three splits.
for epoch in range(1, NUM_EPOCHS + 1):
    # Batches are reordered in place so each epoch visits them in a different order.
    random.shuffle(train_batches)

    print('----------------------------------- EPOCH: {} -----------------------------------'.format(epoch))
    print('----------------------------------- Training -----------------------------------')

    start_time = time.time()

    for batch in train_batches:
        batch_inputs = [batch[key] for key in ('word', 'case', 'char', 'features', 'gaze')]
        # Targets follow the model's output order: tags, gazetteers, shape, position.
        batch_targets = [batch[key] for key in ('tag', 'gazetteers', 'shape', 'position')]
        model.train_on_batch(batch_inputs, batch_targets)

    finish_time = time.time()

    print('Time: {:.2f}s'.format(finish_time - start_time))

    print('---------------------------------- Evaluating ----------------------------------')

    start_time = time.time()

    for split_header, split_batches in (
        ('================================== Train Data ==================================', train_batches),
        ('================================== Valid Data ==================================', valid_batches),
        ('================================== Test  Data ==================================', test_batches),
    ):
        f, precision, recall = evaluate_model(model, split_batches, idx_tag)

        print(split_header)
        print('F1 = {:.2f}%, Precision = {:.2f}%, Recall = {:.2f}%'.format(f, precision, recall))

    finish_time = time.time()

    print('Time: {:.2f}s\n'.format(finish_time - start_time))