In [1]:
import sys
import os
import fastText

from validation import compute_f1

from keras.models import load_model
from keras.callbacks import ModelCheckpoint, Callback

import models
import utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
#%env CUDA_DEVICE_ORDER=PCI_BUS_ID
#%env CUDA_VISIBLE_DEVICES=0

In [3]:
trainSentences = utils.get_sentences_germeval('../../Resources/GermEVAL/NER-de-train.tsv', level2=True)
devSentences = utils.get_sentences_germeval('../../Resources/GermEVAL/NER-de-dev.tsv', level2=True)
testSentences = utils.get_sentences_germeval('../../Resources/GermEVAL/NER-de-test.tsv', level2=True)

print(len(trainSentences))
print(len(devSentences))
print(len(testSentences))


24000
2200
5100


In [4]:
print(testSentences[3])

[['Die', 'O'], ['These', 'O'], [',', 'O'], ['Schlatter', 'O'], ['sei', 'O'], ['Antisemit', 'O'], ['gewesen', 'O'], [',', 'O'], ['wurde', 'O'], ['seither', 'O'], ['in', 'O'], ['der', 'O'], ['theologischen', 'O'], ['Fachliteratur', 'O'], ['nicht', 'O'], ['mehr', 'O'], ['vertreten', 'O'], ['.', 'O']]


In [5]:
labelSet = set()
characters= set()
models.max_sequence_length = 0

for dataset in [trainSentences, devSentences, testSentences]:
    for sentence in dataset:
        for word, label in sentence:
            for char in word:
                characters.add(char)
            labelSet.add(label)
        if len(sentence) > models.max_sequence_length:
            models.max_sequence_length = len(sentence)

In [6]:
print(len(labelSet))
print(models.max_sequence_length)

18
56


In [7]:
# :: Create a mapping for the labels ::
models.label2Idx = {"PADDING_TOKEN":0}
for label in labelSet:
    models.label2Idx[label] = len(models.label2Idx)

In [8]:
print(models.label2Idx)

{'B-PERpart': 18, 'B-ORGpart': 2, 'B-OTH': 15, 'B-LOC': 3, 'B-LOCpart': 4, 'B-ORGderiv': 6, 'I-LOCderiv': 10, 'PADDING_TOKEN': 0, 'B-ORG': 8, 'I-PER': 16, 'B-OTHpart': 12, 'I-LOC': 11, 'O': 13, 'B-PERderiv': 9, 'B-PER': 1, 'I-ORG': 7, 'I-OTH': 17, 'B-LOCderiv': 5, 'B-OTHderiv': 14}


In [9]:
# :: Hard coded case lookup ::
models.case2Idx = {'PADDING_TOKEN':0, 'numeric': 1, 'allLower':2, 'allUpper':3, 'initialUpper':4, 'other':5, 'mainly_numeric':6, 'contains_digit': 7}

In [10]:
print(models.case2Idx)

{'allLower': 2, 'mainly_numeric': 6, 'contains_digit': 7, 'initialUpper': 4, 'numeric': 1, 'other': 5, 'PADDING_TOKEN': 0, 'allUpper': 3}


In [11]:
print(type(trainSentences))

<class 'list'>


In [12]:
print(trainSentences[0])

[['Schartau', 'O'], ['sagte', 'O'], ['dem', 'O'], ['"', 'O'], ['Tagesspiegel', 'O'], ['"', 'O'], ['vom', 'O'], ['Freitag', 'O'], [',', 'O'], ['Fischer', 'O'], ['sei', 'O'], ['"', 'O'], ['in', 'O'], ['einer', 'O'], ['Weise', 'O'], ['aufgetreten', 'O'], [',', 'O'], ['die', 'O'], ['alles', 'O'], ['andere', 'O'], ['als', 'O'], ['überzeugend', 'O'], ['war', 'O'], ['"', 'O'], ['.', 'O']]


In [13]:
models.char2Idx={"PADDING_TOKEN":0, "<S>":1, "</S>":2, "<W>":3, "</W>":4}
for char in characters:
    models.char2Idx[char] = len(models.char2Idx)
models.char2Idx['UNKNOWN'] = len(models.char2Idx)
print(models.char2Idx)

{'E': 5, 'М': 160, '.': 8, '€': 280, 'オ': 299, 'È': 9, 'W': 10, '別': 11, 'V': 12, 'т': 13, '冲': 234, 'É': 166, 'ḳ': 17, 'и': 15, 'o': 191, 'ı': 18, ',': 7, '▪': 170, 'П': 187, 'ć': 113, 'y': 19, 'В': 20, 'Œ': 330, 'β': 22, '[': 23, 'ب': 26, 'z': 25, 'И': 212, 'е': 27, '+': 28, '¤': 29, 'φ': 169, 'Е': 31, 'œ': 32, '‚': 33, '–': 34, '½': 36, 'л': 38, 'ü': 40, '\xad': 41, 'Ş': 174, '鶴': 46, ']': 44, 'ǒ': 45, 'Ł': 47, 'T': 118, 'À': 326, '7': 48, '0': 192, 'ă': 49, 'Î': 50, 'Ü': 51, 'η': 291, 'Π': 58, 'r': 53, 'Ш': 54, 'ż': 292, '\x95': 295, 'а': 55, 'Š': 56, 'ź': 59, '公': 60, '東': 66, '_': 64, 'ë': 63, 'м': 127, 'ū': 65, 'n': 67, 'ό': 68, 'ß': 265, '算': 71, 'р': 70, '佐': 72, '懿': 73, 'ن': 217, 'к': 245, 'Λ': 74, '傳': 78, 'æ': 79, 'ـ': 61, '→': 178, 'Ö': 80, 'U': 81, 'λ': 62, 'б': 82, '~': 83, 'j': 310, '士': 84, 'ã': 88, '?': 87, '柯': 89, 'π': 90, 'f': 91, 'č': 92, 'ö': 182, '‘': 93, 'Ø': 94, '5': 95, 'ř': 96, 'н': 57, '⋅': 98, 'ŏ': 103, 'α': 100, 'У': 101, '\x96': 102, '©': 16, 'Ä': 104, 

In [14]:
print(trainSentences[0]) 

[['Schartau', 'O'], ['sagte', 'O'], ['dem', 'O'], ['"', 'O'], ['Tagesspiegel', 'O'], ['"', 'O'], ['vom', 'O'], ['Freitag', 'O'], [',', 'O'], ['Fischer', 'O'], ['sei', 'O'], ['"', 'O'], ['in', 'O'], ['einer', 'O'], ['Weise', 'O'], ['aufgetreten', 'O'], [',', 'O'], ['die', 'O'], ['alles', 'O'], ['andere', 'O'], ['als', 'O'], ['überzeugend', 'O'], ['war', 'O'], ['"', 'O'], ['.', 'O']]


In [15]:
models.ft = fastText.load_model("../../fastText/wiki.de.bin")

In [16]:
print(models.nb_embedding_dims)
print(len(trainSentences[0]))

300
25


In [17]:
models.idx2Label = {v: k for k, v in models.label2Idx.items()}
print(len(models.label2Idx))
print(len(models.idx2Label))

19
19


# Compute Model

In [18]:
# import importlib
# importlib.reload(models)

In [19]:
tmp_model_filename = 'model_lstm_germeval_2nd-level.h5'
# checkpoint = ModelCheckpoint(tmp_model_filename, verbose=1, save_best_only = True, monitor = 'val_acc')
history = utils.F1History(tmp_model_filename, devSet = devSentences)
model = models.get_model_lstm_v2()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, 52)     0                                            
__________________________________________________________________________________________________
casing_input (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
char_embedding (TimeDistributed (None, None, 52, 32) 10688       char_input[0][0]                 
__________________________________________________________________________________________________
words_input (InputLayer)        (None, None, 300)    0                                            
__________________________________________________________________________________________________
case_embed

In [20]:
#import importlib
#importlib.reload(utils)
# print(models.max_sequence_length)

In [21]:
model.fit_generator(
    utils.NerSequence(trainSentences, shuffle_data=True, batch_size=16), 
    validation_data = utils.NerSequence(devSentences, batch_size=256), 
    epochs = 10, callbacks = [history]
)

Epoch 1/10
New maximum F1 score: 0.3269961977186312 (before: 0) Saving to model_lstm_germeval_2nd-level.h5
Epoch 2/10
New maximum F1 score: 0.5134328358208955 (before: 0.3269961977186312) Saving to model_lstm_germeval_2nd-level.h5
Epoch 3/10

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



New maximum F1 score: 0.5734265734265733 (before: 0.5583756345177665) Saving to model_lstm_germeval_2nd-level.h5
Epoch 6/10
New maximum F1 score: 0.5903083700440529 (before: 0.5734265734265733) Saving to model_lstm_germeval_2nd-level.h5
Epoch 7/10

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





<keras.callbacks.History at 0x7efd4053c470>

In [22]:
print(history.acc)
print(history.f1_scores)

[0.9983441495895385, 0.998514611937783, 0.9984578015587546, 0.9984821412780068, 0.9984172201156616, 0.9983766248009421, 0.9982386298613115, 0.9984009803425182, 0.9985227277062156, 0.9982954668998718]
[0.3269961977186312, 0.5134328358208955, 0.5112359550561798, 0.5583756345177665, 0.5734265734265733, 0.5903083700440529, 0.5608695652173913, 0.5688073394495413, 0.6013986013986014, 0.5657015590200447]


In [23]:
model.load_weights(tmp_model_filename)

In [24]:
model.fit_generator(
    utils.NerSequence(trainSentences, shuffle_data=True, batch_size=2048), 
    validation_data = utils.NerSequence(devSentences, batch_size=256), 
    epochs = 10, callbacks = [history]
)

Epoch 1/10
New maximum F1 score: 0.602803738317757 (before: 0.6013986013986014) Saving to model_lstm_germeval_2nd-level.h5
Epoch 2/10
Epoch 3/10
New maximum F1 score: 0.6031746031746033 (before: 0.602803738317757) Saving to model_lstm_germeval_2nd-level.h5
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
New maximum F1 score: 0.6045454545454546 (before: 0.6031746031746033) Saving to model_lstm_germeval_2nd-level.h5
Epoch 10/10


<keras.callbacks.History at 0x7efd3863cb70>

In [27]:
model.load_weights(tmp_model_filename)

In [28]:
true_labels, pred_labels = utils.predict_sequences(model, testSentences)
print(compute_f1(pred_labels, true_labels, models.idx2Label))

(0.5807127882599581, 0.537864077669903, 0.5584677419354839)


# Save final model

In [None]:
import shutil, json
# copy file for best run
shutil.copyfile(tmp_model_filename, '../models/final_model_germeval_inner.h5')
shutil.copyfile(tmp_model_filename + '.indexes', '../models/final_model_germeval_inner.indexes')