In [0]:
!wget -qq https://raw.githubusercontent.com/svinkapeppa/trash/master/conlleval.py
!wget -qq https://raw.githubusercontent.com/svinkapeppa/trash/master/utils.py
!wget -qq https://raw.githubusercontent.com/svinkapeppa/trash/master/install
!pip install git+https://www.github.com/keras-team/keras-contrib.git

In [0]:
!chmod 777 install
!./install

In [3]:
from keras.models import Model
from keras.layers import TimeDistributed, Conv1D, Dense, Embedding, Input, Dropout, LSTM, Bidirectional, MaxPooling1D, Flatten, concatenate
from keras.initializers import RandomUniform
from keras.optimizers import Nadam
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss

import utils
import random
import time
import math

Using TensorFlow backend.


In [0]:
NUM_EPOCHS = 200
BATCH_SIZE = 128
LSTM_DIM = 300
CHAR_EMB_DIM = 30
KERNEL_SIZE = 4
CHAR_FILTERS = 30
DROPOUT = 0.5
RECURRENT_DROPOUT = 0.3

In [72]:
train_sentences = utils.read_sentences('data/train')
valid_sentences = utils.read_sentences('data/valid')
test_sentences = utils.read_sentences('data/test')

print('Number of TRAIN sentences: {}'.format(len(train_sentences)))
print('Number of VALID sentences: {}'.format(len(valid_sentences)))
print('Number of TEST sentences: {}'.format(len(test_sentences)))

Number of TRAIN sentences: 14041
Number of VALID sentences: 3250
Number of TEST sentences: 3453


In [0]:
utils.convert_tags(train_sentences)
utils.convert_tags(valid_sentences)
utils.convert_tags(test_sentences)

In [0]:
tag_idx, idx_tag = utils.create_tag_mapping([train_sentences, valid_sentences, test_sentences])

In [0]:
word_idx, idx_word, word_embeddings = utils.create_word_mapping('glove/glove.6B.100d.txt')

In [0]:
char_idx, idx_char = utils.create_char_mapping([train_sentences, valid_sentences, test_sentences])

In [0]:
case_idx, case_embeddings = utils.create_case_mapping()

In [0]:
max_word_length = utils.get_max_word_length([train_sentences, valid_sentences, test_sentences])

In [0]:
train_batches = utils.create_batches(train_sentences, BATCH_SIZE, max_word_length, word_idx, char_idx, tag_idx)
valid_batches = utils.create_batches(valid_sentences, BATCH_SIZE, max_word_length, word_idx, char_idx, tag_idx)
test_batches = utils.create_batches(test_sentences, BATCH_SIZE, max_word_length, word_idx, char_idx, tag_idx)

In [0]:
def CharCNNBiLSTM(word_vocab_size, case_vocab_size, char_vocab_size,
                  word_embeddings_dim, case_embeddings_dim, max_word_length,
                  word_embeddings, case_embeddings, tag_set_size):
    word_input = Input(shape=(None,))
    word_embeddings = Embedding(
        word_vocab_size, word_embeddings_dim, weights=[word_embeddings], trainable=True
    )(word_input)

    case_input = Input(shape=(None,))
    case_embeddings = Embedding(
        case_vocab_size, case_embeddings_dim, weights=[case_embeddings], trainable=True
    )(case_input)
    
    char_input = Input(shape=(None, max_word_length))
    char_embeddings = TimeDistributed(Embedding(
        char_vocab_size, CHAR_EMB_DIM, embeddings_initializer=RandomUniform(minval=-math.sqrt(0.1), maxval=math.sqrt(0.1))
    ))(char_input)
    char_embeddings = Dropout(DROPOUT)(char_embeddings)
    char_embeddings = TimeDistributed(Conv1D(
        kernel_size=KERNEL_SIZE, filters=CHAR_FILTERS, padding='same', activation='tanh', strides=1
    ))(char_embeddings)
    char_embeddings = TimeDistributed(MaxPooling1D(max_word_length))(char_embeddings)
    char_embeddings = TimeDistributed(Flatten())(char_embeddings)
    
    embeddings = concatenate([word_embeddings, case_embeddings, char_embeddings])
    embeddings = Dropout(DROPOUT)(embeddings)
    
    output = Bidirectional(LSTM(
        LSTM_DIM, return_sequences=True, dropout=DROPOUT, recurrent_dropout=RECURRENT_DROPOUT
    ))(embeddings)
    output = TimeDistributed(Dense(LSTM_DIM, activation='elu'))(output)
    output = CRF(tag_set_size, sparse_target=True)(output)
    
    return Model(inputs=[word_input, case_input, char_input], outputs=[output])

In [0]:
model = CharCNNBiLSTM(len(word_idx), len(case_idx), len(char_idx),
              len(word_embeddings[0]), len(case_idx), max_word_length,
              word_embeddings, case_embeddings, len(tag_idx))
model.compile(loss=crf_loss, optimizer=Nadam(clipnorm=5))

In [0]:
from conlleval import evaluate


def evaluate_model(model, batches, idx_tag):
    true_seqs, pred_seqs = [], []

    for batch in batches:
        target = batch['tag']
        target = target.reshape((target.shape[0], target.shape[1]))
        logits = model.predict([batch['word'], batch['case'], batch['char']], verbose=False).argmax(axis=-1)

        for seq_ind, seq_len in enumerate(batch['lengths']):
            true_seqs.append(' '.join([idx_tag[ind.item()] for ind in target[seq_ind, 1: seq_len + 1]]))
            pred_seqs.append(' '.join([idx_tag[ind.item()] for ind in logits[seq_ind, 1: seq_len + 1]]))

    return evaluate(true_seqs, pred_seqs, verbose=False)

In [83]:
for epoch in range(1, NUM_EPOCHS + 1):
    random.shuffle(train_batches)
    
    print('----------------------------------- EPOCH: {} -----------------------------------'.format(epoch))
    print('----------------------------------- Training -----------------------------------')
    
    start_time = time.time()

    for batch in train_batches:
        model.train_on_batch([batch['word'], batch['case'], batch['char']], batch['tag'])
    
    finish_time = time.time()
    
    print('Time: {:.2f}s'.format(finish_time - start_time))
    
    print('---------------------------------- Evaluating ----------------------------------')
    
    start_time = time.time()
    
    f, precision, recall = evaluate_model(model, train_batches, idx_tag)
    
    print('================================== Train Data ==================================')
    print('F1 = {:.2f}%, Precision = {:.2f}%, Recall = {:.2f}%'.format(f, precision, recall))
    
    f, precision, recall = evaluate_model(model, valid_batches, idx_tag)
    
    print('================================== Valid Data ==================================')
    print('F1 = {:.2f}%, Precision = {:.2f}%, Recall = {:.2f}%'.format(f, precision, recall))
    
    f, precision, recall = evaluate_model(model, test_batches, idx_tag)
    
    print('================================== Test  Data ==================================')
    print('F1 = {:.2f}%, Precision = {:.2f}%, Recall = {:.2f}%'.format(f, precision, recall))
    
    finish_time = time.time()
    
    print('Time: {:.2f}s\n'.format(finish_time - start_time))

----------------------------------- EPOCH: 1 -----------------------------------
----------------------------------- Training -----------------------------------
Time: 34.65s
---------------------------------- Evaluating ----------------------------------
F1 = 62.85%, Precision = 72.42%, Recall = 67.29%
F1 = 67.87%, Precision = 75.68%, Recall = 71.56%
F1 = 60.31%, Precision = 70.61%, Recall = 65.05%
Time: 24.04s

----------------------------------- EPOCH: 2 -----------------------------------
----------------------------------- Training -----------------------------------
Time: 21.04s
---------------------------------- Evaluating ----------------------------------
F1 = 83.37%, Precision = 86.82%, Recall = 85.06%
F1 = 80.91%, Precision = 85.66%, Recall = 83.22%
F1 = 76.46%, Precision = 82.47%, Recall = 79.35%
Time: 22.29s

----------------------------------- EPOCH: 3 -----------------------------------
----------------------------------- Training -----------------------------------
Time

KeyboardInterrupt: ignored