In [1]:
import sys
import os
import fastText

from validation import compute_f1

from keras.models import load_model
from keras.callbacks import ModelCheckpoint, Callback

import models
import utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
#%env CUDA_DEVICE_ORDER=PCI_BUS_ID
#%env CUDA_VISIBLE_DEVICES=0

In [4]:
trainSentences = utils.get_sentences_germeval('../../Resources/GermEVAL/NER-de-train.tsv', level2=True)
devSentences = utils.get_sentences_germeval('../../Resources/GermEVAL/NER-de-dev.tsv', level2=True)
testSentences = utils.get_sentences_germeval('../../Resources/GermEVAL/NER-de-test.tsv', level2=True)

print(len(trainSentences))
print(len(devSentences))
print(len(testSentences))


24000
2200
5100


In [5]:
print(testSentences[3])

[['Die', 'O'], ['These', 'O'], [',', 'O'], ['Schlatter', 'O'], ['sei', 'O'], ['Antisemit', 'O'], ['gewesen', 'O'], [',', 'O'], ['wurde', 'O'], ['seither', 'O'], ['in', 'O'], ['der', 'O'], ['theologischen', 'O'], ['Fachliteratur', 'O'], ['nicht', 'O'], ['mehr', 'O'], ['vertreten', 'O'], ['.', 'O']]


In [6]:
labelSet = set()
characters= set()
models.max_sequence_length = 0

for dataset in [trainSentences, devSentences, testSentences]:
    for sentence in dataset:
        for word, label in sentence:
            for char in word:
                characters.add(char)
            labelSet.add(label)
        if len(sentence) > models.max_sequence_length:
            models.max_sequence_length = len(sentence)

In [7]:
print(len(labelSet))
print(models.max_sequence_length)

18
56


In [8]:
# :: Create a mapping for the labels ::
models.label2Idx = {"PADDING_TOKEN":0}
for label in labelSet:
    models.label2Idx[label] = len(models.label2Idx)

In [9]:
print(models.label2Idx)

{'O': 1, 'B-OTHpart': 2, 'B-OTHderiv': 3, 'B-ORG': 4, 'B-ORGderiv': 5, 'B-ORGpart': 6, 'I-LOCderiv': 10, 'B-LOC': 7, 'I-PER': 8, 'B-LOCpart': 11, 'B-PER': 15, 'I-ORG': 14, 'B-LOCderiv': 17, 'PADDING_TOKEN': 0, 'B-PERderiv': 9, 'I-OTH': 16, 'I-LOC': 18, 'B-OTH': 13, 'B-PERpart': 12}


In [10]:
# :: Hard coded case lookup ::
models.case2Idx = {'PADDING_TOKEN':0, 'numeric': 1, 'allLower':2, 'allUpper':3, 'initialUpper':4, 'other':5, 'mainly_numeric':6, 'contains_digit': 7}

In [11]:
print(models.case2Idx)

{'other': 5, 'allUpper': 3, 'contains_digit': 7, 'numeric': 1, 'PADDING_TOKEN': 0, 'initialUpper': 4, 'mainly_numeric': 6, 'allLower': 2}


In [12]:
print(type(trainSentences))

<class 'list'>


In [13]:
print(trainSentences[0])

[['Schartau', 'O'], ['sagte', 'O'], ['dem', 'O'], ['"', 'O'], ['Tagesspiegel', 'O'], ['"', 'O'], ['vom', 'O'], ['Freitag', 'O'], [',', 'O'], ['Fischer', 'O'], ['sei', 'O'], ['"', 'O'], ['in', 'O'], ['einer', 'O'], ['Weise', 'O'], ['aufgetreten', 'O'], [',', 'O'], ['die', 'O'], ['alles', 'O'], ['andere', 'O'], ['als', 'O'], ['überzeugend', 'O'], ['war', 'O'], ['"', 'O'], ['.', 'O']]


In [14]:
models.char2Idx={"PADDING_TOKEN":0, "<S>":1, "</S>":2, "<W>":3, "</W>":4}
for char in characters:
    models.char2Idx[char] = len(models.char2Idx)
models.char2Idx['UNKNOWN'] = len(models.char2Idx)
print(models.char2Idx)

{'冲': 5, '→': 6, '루': 7, 'И': 280, 'з': 8, 'Ł': 240, 'ā': 9, '/': 10, '博': 11, 'k': 169, 'á': 12, 'ź': 14, 'κ': 19, 'ν': 16, 'ß': 17, 'у': 38, '佐': 20, 'ь': 21, 'Z': 22, ':': 24, '殿': 26, '!': 27, 'ē': 33, 'ú': 30, '太': 330, 'O': 32, '造': 274, 'а': 34, '\xad': 35, 'B': 37, 'A': 305, 'q': 39, 'å': 40, 'б': 285, 'ŏ': 41, 'Ц': 42, 'ـ': 44, 'Λ': 45, 'Œ': 47, 'ī': 251, '‘': 49, 'ž': 50, '柯': 87, 'ğ': 51, 'M': 283, 'о': 52, 'd': 53, '"': 154, '\x99': 54, '▪': 55, 'ǒ': 329, 'H': 56, 'ي': 57, '‹': 58, 'σ': 172, 'Ġ': 296, 'ý': 60, 'ʻ': 61, 'е': 62, 'м': 13, 's': 188, 'ι': 64, 'G': 65, '公': 67, 'Ž': 306, 'έ': 68, '−': 69, 'ḳ': 70, '³': 205, '²': 71, '×': 18, 'D': 72, 'α': 74, 'W': 15, 'ο': 79, '±': 77, '術': 81, 'w': 80, 'ж': 289, 'ب': 83, 'ě': 84, 'Е': 85, 'я': 76, 'œ': 260, 'h': 88, 'o': 281, 'Ż': 183, 'Т': 89, '’': 90, 'к': 91, 'İ': 137, '″': 92, 'PADDING_TOKEN': 0, 'ş': 187, 'ż': 94, 'ð': 297, '`': 96, '守': 98, 'Ş': 213, '대': 100, '©': 25, 'и': 101, '\x96': 102, 'У': 103, 'Š': 104, '동': 231, 

In [15]:
print(trainSentences[0]) 

[['Schartau', 'O'], ['sagte', 'O'], ['dem', 'O'], ['"', 'O'], ['Tagesspiegel', 'O'], ['"', 'O'], ['vom', 'O'], ['Freitag', 'O'], [',', 'O'], ['Fischer', 'O'], ['sei', 'O'], ['"', 'O'], ['in', 'O'], ['einer', 'O'], ['Weise', 'O'], ['aufgetreten', 'O'], [',', 'O'], ['die', 'O'], ['alles', 'O'], ['andere', 'O'], ['als', 'O'], ['überzeugend', 'O'], ['war', 'O'], ['"', 'O'], ['.', 'O']]


In [17]:
models.ft = fastText.load_model("../../fastText/wiki.de.bin")

In [18]:
print(models.nb_embedding_dims)
print(len(trainSentences[0]))

300
25


In [19]:
models.idx2Label = {v: k for k, v in models.label2Idx.items()}
print(len(models.label2Idx))
print(len(models.idx2Label))

19
19


# Compute Model

In [None]:
# import importlib
# importlib.reload(models)

In [21]:
tmp_model_filename = 'model_lstm_germeval_2nd-level.h5'
# checkpoint = ModelCheckpoint(tmp_model_filename, verbose=1, save_best_only = True, monitor = 'val_acc')
history = utils.F1History(tmp_model_filename, devSet = devSentences)
model = models.get_model_lstm()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, 52)     0                                            
__________________________________________________________________________________________________
casing_input (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
char_embedding (TimeDistributed (None, None, 52, 32) 10688       char_input[0][0]                 
__________________________________________________________________________________________________
words_input (InputLayer)        (None, None, 300)    0                                            
__________________________________________________________________________________________________
case_embed

In [22]:
#import importlib
#importlib.reload(utils)
# print(models.max_sequence_length)

In [23]:
model.fit_generator(
    utils.NerSequence(trainSentences, shuffle_data=True, batch_size=16), 
    validation_data = utils.NerSequence(devSentences, batch_size=256), 
    epochs = 10, callbacks = [history]
)

Epoch 1/10
New maximum F1 score: 0.45307443365695793 (before: 0) Saving to model_lstm_germeval_2nd-level.h5
Epoch 2/10
New maximum F1 score: 0.5217391304347825 (before: 0.45307443365695793) Saving to model_lstm_germeval_2nd-level.h5
Epoch 3/10
Epoch 4/10
New maximum F1 score: 0.5644768856447688 (before: 0.5217391304347825) Saving to model_lstm_germeval_2nd-level.h5
Epoch 5/10
New maximum F1 score: 0.5808080808080809 (before: 0.5644768856447688) Saving to model_lstm_germeval_2nd-level.h5
Epoch 6/10
Epoch 7/10
Epoch 8/10
New maximum F1 score: 0.5924050632911393 (before: 0.5808080808080809) Saving to model_lstm_germeval_2nd-level.h5
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efa94dd6f98>

In [24]:
print(history.acc)
print(history.f1_scores)

[0.9984171995249661, 0.998506489016793, 0.9984902602976019, 0.9983928684754805, 0.9985308504104614, 0.9982467536492782, 0.9985227326913313, 0.9985470893166282, 0.9983847397023982, 0.9984659064899791]
[0.45307443365695793, 0.5217391304347825, 0.5189504373177842, 0.5644768856447688, 0.5808080808080809, 0.5476190476190477, 0.555256064690027, 0.5924050632911393, 0.5773672055427251, 0.5839416058394161]


In [25]:
model.load_weights(tmp_model_filename)

In [26]:
model.fit_generator(
    utils.NerSequence(trainSentences, shuffle_data=True, batch_size=2048), 
    validation_data = utils.NerSequence(devSentences, batch_size=256), 
    epochs = 10, callbacks = [history]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
New maximum F1 score: 0.5938242280285035 (before: 0.5924050632911393) Saving to model_lstm_germeval_2nd-level.h5
Epoch 7/10
Epoch 8/10
New maximum F1 score: 0.5957446808510638 (before: 0.5938242280285035) Saving to model_lstm_germeval_2nd-level.h5
Epoch 9/10
New maximum F1 score: 0.6018957345971563 (before: 0.5957446808510638) Saving to model_lstm_germeval_2nd-level.h5
Epoch 10/10
New maximum F1 score: 0.6052009456264775 (before: 0.6018957345971563) Saving to model_lstm_germeval_2nd-level.h5


<keras.callbacks.History at 0x7ef7d079e160>

In [27]:
model.load_weights(tmp_model_filename)

In [28]:
true_labels, pred_labels = utils.predict_sequences(model, testSentences[:1000])
print(compute_f1(pred_labels, true_labels, models.idx2Label))

(0.4943820224719101, 0.5238095238095238, 0.5086705202312138)


In [29]:
true_labels, pred_labels = utils.predict_sequences(model, testSentences)
print(compute_f1(pred_labels, true_labels, models.idx2Label))

(0.5936794582392777, 0.5106796116504855, 0.5490605427974948)


# Save final model

In [None]:
import shutil, json
# copy file for best run
shutil.copyfile(tmp_model_filename, '../models/final_model_germeval_inner.h5')
shutil.copyfile(tmp_model_filename + '.indexes', '../models/final_model_germeval_inner.indexes')

In [None]:
from keras_contrib.layers import CRF
def create_custom_objects():
    instanceHolder = {"instance": None}
    class ClassWrapper(CRF):
        def __init__(self, *args, **kwargs):
            instanceHolder["instance"] = self
            super(ClassWrapper, self).__init__(*args, **kwargs)
    def loss(*args):
        method = getattr(instanceHolder["instance"], "loss_function")
        return method(*args)
    def accuracy(*args):
        method = getattr(instanceHolder["instance"], "accuracy")
        return method(*args)
    return {"ClassWrapper": ClassWrapper ,"CRF": ClassWrapper, "loss": loss, "accuracy":accuracy}

finalmodel = load_model('../models/final_model_germeval_outer.h5', custom_objects=create_custom_objects())
true_labels, pred_labels = utils.predict_sequences(finalmodel, testSentences)
print(compute_f1(pred_labels, true_labels, models.idx2Label))

In [None]:
f = open('germeval_output.tsv', 'w', encoding='UTF-8')
for i_sent, sent in enumerate(testSentences):
    for i_tok, tok in enumerate(sent):
        if tok[0] == 'PADDING_TOKEN':
            break
        correctlabel = models.idx2Label[true_labels[i_sent][i_tok]]
        guessedlabel = models.idx2Label[pred_labels[i_sent][i_tok]]
        line = "\t".join([str(i_tok+1), tok[0], correctlabel, correctlabel, guessedlabel, guessedlabel])
        f.write(line + '\n')
    f.write('\n')
f.close