In [1]:
import sys
import os
import fastText

from validation import compute_f1

from keras.models import load_model
from keras.callbacks import ModelCheckpoint, Callback

import models
import utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [3]:
trainSentences = utils.get_sentences_germeval('../data/GermEVAL/NER-de-train.tsv')
devSentences = utils.get_sentences_germeval('../data/GermEVAL/NER-de-dev.tsv')
testSentences = utils.get_sentences_germeval('../data/GermEVAL/NER-de-test.tsv')

# trainSentences = get_sentences('../data/CONLL/deu/deu_utf.train')
# devSentences = get_sentences('../data/CONLL/deu/deu_utf.testa')
# testSentences = get_sentences('../data/CONLL/deu/deu_utf.testb')

print(len(trainSentences))
print(len(devSentences))
print(len(testSentences))


24000
2200
5100


In [4]:
print(testSentences[0])

[['1951', 'O', 'O'], ['bis', 'O', 'O'], ['1953', 'O', 'O'], ['wurde', 'O', 'O'], ['der', 'O', 'O'], ['nördliche', 'O', 'O'], ['Teil', 'O', 'O'], ['als', 'O', 'O'], ['Jugendburg', 'O', 'O'], ['des', 'O', 'O'], ['Kolpingwerkes', 'B-OTH', 'O'], ['gebaut', 'O', 'O'], ['.', 'O', 'O']]


In [5]:
labelSet = set()
characters= set()
models.max_sequence_length = 0

for dataset in [trainSentences, devSentences, testSentences]:
    for sentence in dataset:
        for word, label, label_inner in sentence:
            for char in word:
                characters.add(char)
            labelSet.add(label)
        if len(sentence) > models.max_sequence_length:
            models.max_sequence_length = len(sentence)

In [6]:
print(len(labelSet))
print(models.max_sequence_length)

25
56


In [7]:
# :: Create a mapping for the labels ::
models.label2Idx = {"PADDING_TOKEN":0}
for label in labelSet:
    models.label2Idx[label] = len(models.label2Idx)

In [8]:
print(models.label2Idx)

{'B-OTH': 1, 'B-ORGpart': 2, 'I-LOCpart': 20, 'B-ORGderiv': 3, 'I-OTHderiv': 14, 'PADDING_TOKEN': 0, 'B-LOCpart': 4, 'B-PERpart': 17, 'I-PERpart': 5, 'I-PERderiv': 15, 'B-LOC': 6, 'I-LOCderiv': 7, 'O': 8, 'B-OTHderiv': 18, 'I-OTHpart': 19, 'B-ORG': 21, 'I-LOC': 9, 'I-ORGpart': 10, 'I-PER': 23, 'I-ORG': 22, 'I-ORGderiv': 11, 'I-OTH': 24, 'B-PERderiv': 25, 'B-LOCderiv': 12, 'B-PER': 13, 'B-OTHpart': 16}


In [9]:
# :: Hard coded case lookup ::
models.case2Idx = {'PADDING_TOKEN':0, 'numeric': 1, 'allLower':2, 'allUpper':3, 'initialUpper':4, 'other':5, 'mainly_numeric':6, 'contains_digit': 7}

In [10]:
print(models.case2Idx)

{'allUpper': 3, 'other': 5, 'mainly_numeric': 6, 'numeric': 1, 'contains_digit': 7, 'initialUpper': 4, 'PADDING_TOKEN': 0, 'allLower': 2}


In [11]:
print(type(trainSentences))

<class 'list'>


In [12]:
print(trainSentences[0])

[['Schartau', 'B-PER', 'O'], ['sagte', 'O', 'O'], ['dem', 'O', 'O'], ['"', 'O', 'O'], ['Tagesspiegel', 'B-ORG', 'O'], ['"', 'O', 'O'], ['vom', 'O', 'O'], ['Freitag', 'O', 'O'], [',', 'O', 'O'], ['Fischer', 'B-PER', 'O'], ['sei', 'O', 'O'], ['"', 'O', 'O'], ['in', 'O', 'O'], ['einer', 'O', 'O'], ['Weise', 'O', 'O'], ['aufgetreten', 'O', 'O'], [',', 'O', 'O'], ['die', 'O', 'O'], ['alles', 'O', 'O'], ['andere', 'O', 'O'], ['als', 'O', 'O'], ['überzeugend', 'O', 'O'], ['war', 'O', 'O'], ['"', 'O', 'O'], ['.', 'O', 'O']]


In [13]:
models.char2Idx={"PADDING_TOKEN":0}
for char in characters:
    models.char2Idx[char] = len(models.char2Idx)
models.char2Idx['UNKNOWN'] = len(models.char2Idx)
print(models.char2Idx)

{'œ': 1, 'j': 3, 'h': 245, 'η': 212, '\x99': 2, '·': 326, 'Á': 4, 'ế': 273, '⊃': 8, 'i': 7, '守': 9, 'ş': 10, 'ệ': 12, 'Y': 272, 'A': 13, 'έ': 14, 'ῦ': 15, 'ū': 17, 'p': 18, "'": 20, '«': 274, 'с': 25, '+': 249, '´': 27, '¸': 28, 'ε': 29, 'ό': 30, 'Ü': 31, 'â': 32, 'V': 33, 'Q': 34, 'ō': 158, 'В': 35, '§': 36, 'č': 39, 'G': 38, '£': 43, '▪': 41, 'κ': 208, 'ا': 44, '*': 45, 'e': 46, '傳': 54, '>': 48, 'û': 49, 'л': 50, 'q': 51, 'C': 53, '南': 279, 'PADDING_TOKEN': 0, '’': 55, '4': 56, 'ḳ': 57, '`': 280, 'ض': 58, 'в': 59, 'Z': 60, 'b': 61, 'B': 254, '[': 6, 'я': 255, 'ю': 62, 'ς': 63, '—': 64, '\x9a': 65, '!': 66, '_': 67, 'Č': 68, '›': 71, '冲': 75, 'Ö': 72, 'ń': 73, 'Ш': 223, '造': 76, 'É': 77, '“': 78, '$': 284, 'a': 299, '≘': 80, 'ي': 81, '妃': 84, 'R': 83, 'ē': 74, 'ö': 85, 'z': 86, 'φ': 87, '~': 88, 'Π': 89, 'g': 69, '2': 90, '²': 91, 'o': 92, '별': 95, '台': 306, 'з': 96, 'Ä': 97, 'а': 99, 'Â': 289, 'm': 100, '懿': 79, 'î': 159, '”': 11, 'O': 103, 'σ': 104, '6': 316, '″': 105, 'Œ': 106, '公

In [14]:
print(trainSentences[0]) 

[['Schartau', 'B-PER', 'O'], ['sagte', 'O', 'O'], ['dem', 'O', 'O'], ['"', 'O', 'O'], ['Tagesspiegel', 'B-ORG', 'O'], ['"', 'O', 'O'], ['vom', 'O', 'O'], ['Freitag', 'O', 'O'], [',', 'O', 'O'], ['Fischer', 'B-PER', 'O'], ['sei', 'O', 'O'], ['"', 'O', 'O'], ['in', 'O', 'O'], ['einer', 'O', 'O'], ['Weise', 'O', 'O'], ['aufgetreten', 'O', 'O'], [',', 'O', 'O'], ['die', 'O', 'O'], ['alles', 'O', 'O'], ['andere', 'O', 'O'], ['als', 'O', 'O'], ['überzeugend', 'O', 'O'], ['war', 'O', 'O'], ['"', 'O', 'O'], ['.', 'O', 'O']]


In [15]:
models.ft = fastText.load_model("../embeddings/wiki.de.bin")

In [16]:
print(models.nb_embedding_dims)
print(len(trainSentences[0]))

300
25


In [17]:
models.idx2Label = {v: k for k, v in models.label2Idx.items()}
print(len(models.label2Idx))
print(len(models.idx2Label))

26
26


# Test Model

In [18]:
# import importlib
# importlib.reload(models)

In [19]:
tmp_model_filename = 'tmp_lstm_bi-lstm.h5'
# checkpoint = ModelCheckpoint(tmp_model_filename, verbose=1, save_best_only = True, monitor = 'val_acc')
history = utils.F1History(tmp_model_filename, devSet = devSentences)
model = models.get_model_lstm()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, 52)     0                                            
__________________________________________________________________________________________________
casing_input (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
char_embedding (TimeDistributed (None, None, 52, 32) 10560       char_input[0][0]                 
__________________________________________________________________________________________________
words_input (InputLayer)        (None, None, 300)    0                                            
__________________________________________________________________________________________________
case_embed

In [20]:
# importlib.reload(utils)
# print(models.max_sequence_length)

In [21]:
model.fit_generator(
    utils.NerSequence(trainSentences[1:100], shuffle_data=True, batch_size=32), 
    validation_data = utils.NerSequence(devSentences[1:100], batch_size=256), 
    epochs = 5, callbacks = [history]
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe22c552e10>

In [22]:
print(history.acc)
print(history.f1_scores)

[0.8277416825294495, 0.9648268222808838, 0.9644660949707031, 0.9646464586257935, 0.9648268222808838]
[0, 0, 0, 0, 0]


In [23]:
model.load_weights(tmp_model_filename)

OSError: Unable to open file (unable to open file: name = 'tmp_lstm_bi-lstm.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [24]:
model.fit_generator(
    utils.NerSequence(trainSentences[1:100], shuffle_data=True, batch_size=2048), 
    validation_data = utils.NerSequence(devSentences[1:100], batch_size=256), 
    epochs = 5, callbacks = [history]
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe204420cc0>

In [25]:
true_labels, pred_labels = utils.predict_sequences(model, testSentences)
print(compute_f1(pred_labels, true_labels, models.idx2Label))

(0, 0.0, 0)


# Experiments

In [27]:
f = open('results_lstm.txt', 'a')
for run_i in range(10):
    print("Run " + str(run_i))
    
    tmp_model_filename = 'tmp_lstm_bi-lstm.' + str(run_i) + '.h5'

    history = utils.F1History(tmp_model_filename, devSet=devSentences)
    
    # model = get_model_3cnn()
    model = models.get_model_lstm()
    model.fit_generator(
        utils.NerSequence(trainSentences, shuffle_data=True, batch_size=16), 
        validation_data = utils.NerSequence(devSentences, batch_size=256), 
        epochs = 10, callbacks = [history]
    )
    
    model.load_weights(tmp_model_filename)
    
    model.fit_generator(
        utils.NerSequence(trainSentences, shuffle_data=True, batch_size=512), 
        validation_data = utils.NerSequence(devSentences, batch_size=256), 
        epochs = 10, callbacks = [history]
    )
    
    true_labels, pred_labels = utils.predict_sequences(model, testSentences)
    
    pre, rec, f1 = compute_f1(pred_labels, true_labels, models.idx2Label)
    f.write(str(run_i) + "\t" + str(pre) + "\t" + str(rec) +  "\t" + str(f1))
    f.write("\n")
    f.flush()
f.close()

Run 0
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, 52)     0                                            
__________________________________________________________________________________________________
casing_input (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
char_embedding (TimeDistributed (None, None, 52, 32) 10560       char_input[0][0]                 
__________________________________________________________________________________________________
words_input (InputLayer)        (None, None, 300)    0                                            
__________________________________________________________________________________________________
case

KeyboardInterrupt: 

# Save final model

In [None]:
import shutil, json
# copy file for best run
shutil.copyfile('tmp_lstm_bi-lstm.h5', '../models/final_model-lstm_germeval_outer.h5')
with open("../models/final_model-lstm_germeval.indexes", "w") as f:
    json.dump([models.idx2Label, models.label2Idx, models.char2Idx, models.case2Idx], f)

In [None]:
from keras_contrib.layers import CRF
def create_custom_objects():
    instanceHolder = {"instance": None}
    class ClassWrapper(CRF):
        def __init__(self, *args, **kwargs):
            instanceHolder["instance"] = self
            super(ClassWrapper, self).__init__(*args, **kwargs)
    def loss(*args):
        method = getattr(instanceHolder["instance"], "loss_function")
        return method(*args)
    def accuracy(*args):
        method = getattr(instanceHolder["instance"], "accuracy")
        return method(*args)
    return {"ClassWrapper": ClassWrapper ,"CRF": ClassWrapper, "loss": loss, "accuracy":accuracy}

finalmodel = load_model('../models/final_model-lstm_germeval_outer.h5', custom_objects=create_custom_objects())
true_labels, pred_labels = utils.predict_sequences(finalmodel, testSentences)
print(compute_f1(pred_labels, true_labels, models.idx2Label))

In [30]:
f = open('germeval_output.tsv', 'w', encoding='UTF-8')
for i_sent, sent in enumerate(testSentences):
    for i_tok, tok in enumerate(sent):
        if tok[0] == 'PADDING_TOKEN':
            break
        correctlabel = models.idx2Label[true_labels[i_sent][i_tok]]
        guessedlabel = models.idx2Label[pred_labels[i_sent][i_tok]]
        line = "\t".join([str(i_tok+1), tok[0], correctlabel, correctlabel, guessedlabel, guessedlabel])
        f.write(line + '\n')
    f.write('\n')
f.close

<function TextIOWrapper.close()>

In [29]:
model = models.get_model_lstm()

model.load_weights(tmp_model_filename)

model.fit_generator(
    utils.NerSequence(trainSentences, shuffle_data=True, batch_size=512), 
    validation_data = utils.NerSequence(devSentences, batch_size=256), 
    epochs = 10, callbacks = [history]
)

true_labels, pred_labels = utils.predict_sequences(model, testSentences)

pre, rec, f1 = compute_f1(pred_labels, true_labels, models.idx2Label)
f.write(str(run_i) + "\t" + str(pre) + "\t" + str(rec) +  "\t" + str(f1))
f.write("\n")
f.flush()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, 52)     0                                            
__________________________________________________________________________________________________
casing_input (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
char_embedding (TimeDistributed (None, None, 52, 32) 10560       char_input[0][0]                 
__________________________________________________________________________________________________
words_input (InputLayer)        (None, None, 300)    0                                            
__________________________________________________________________________________________________
case_embed