In [1]:
import sys
import os
import numpy as np 
from keras.utils import to_categorical
from validation import compute_f1
from keras.models import Model
from keras.layers import TimeDistributed,Conv1D,Dense,Embedding,Input,Dropout,LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate
# from prepro import readfile,createBatches,createMatrices,iterate_minibatches,addCharInformatioin,padding
from keras.utils import plot_model,Progbar
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform
from keras.optimizers import Adam
import keras.backend as K
from sklearn.metrics import f1_score
from keras.callbacks import ModelCheckpoint, Callback
from keras_contrib.layers import CRF
from numpy import newaxis
from random import shuffle
import math
import sklearn
import subprocess
import fastText
import pickle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def getCasing(word, caseLookup):
    
    if word == 'PADDING_TOKEN':
        return(caseLookup['PADDING_TOKEN'])
    
    casing = 'other'
    
    numDigits = 0
    for char in word:
        if char.isdigit():
            numDigits += 1
            
    digitFraction = numDigits / float(len(word))
    
    if word.isdigit(): #Is a digit
        casing = 'numeric'
    elif digitFraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower(): #All lower case
        casing = 'allLower'
    elif word.isupper(): #All upper case
        casing = 'allUpper'
    elif word[0].isupper(): #is a title, initial char upper, then all lower
        casing = 'initialUpper'
    elif numDigits > 0:
        casing = 'contains_digit'
   
    return caseLookup[casing]

In [3]:
def tag_dataset(dataset):
    correctLabels = []
    predLabels = []
    b = Progbar(len(dataset))
    for i,data in enumerate(dataset):    
        tokens, casing,char, labels = data
        tokens = np.asarray([tokens])     
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = model.predict([tokens, casing,char], verbose=False)[0]   
        pred = pred.argmax(axis=-1) #Predict the classes            
        correctLabels.append(labels)
        predLabels.append(pred)
        b.update(i)
    return predLabels, correctLabels

In [4]:
# changing all deriv and part to misc. with BIO
def modify_labels(dataset):
    bad_labels = ['I-PERderiv','I-OTHpart','B-ORGderiv', 'I-OTH','B-OTHpart','B-LOCderiv','I-LOCderiv','I-OTHderiv','B-PERderiv','B-OTHderiv','B-PERpart','I-PERpart','I-LOCpart','B-LOCpart','I-ORGpart','I-ORGderiv','B-ORGpart','B-OTH']
    for sentence in dataset:
        for word in sentence:
            label = word[1]
            if label in bad_labels:
                first_char = label[0]
                if first_char == 'B' :
                    word[1] = 'B-MISC'
                else:
                    word[1] = 'I-MISC'
    return dataset
                

In [5]:
def get_sentences_germeval(path):
    sentences = []
    with open(path, 'r', encoding = 'UTF-8') as f:
        sentence = []
        for line in f:
            
            line = line.strip()
            
            # append sentence
            if len(line) == 0:
                if len(sentence):
                    sentences.append(sentence)
                sentence = []
                continue
            
            # get sentence tokens
            splits = line.split()
            if splits[0] == '#':
                continue
            temp = [splits[1],splits[2]]
            sentence.append(temp)
        
        # append last
        if len(sentence):
            sentences.append(sentence)    
    return sentences

In [6]:
# preproecessing data from Conll
def get_sentences_conll(filename):
    '''
        -DOCSTART- -X- -X- O

    EU NNP B-NP B-ORG
    rejects VBZ B-VP O
    German JJ B-NP B-MISC
    call NN I-NP O
    to TO B-VP O
    boycott VB I-VP O
    British JJ B-NP B-MISC
    lamb NN I-NP O
    . . O O
    
    '''
    
    '''
    read file
    return format :
    [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ]
    '''
    f = open(filename,'rb')
    sentences = []
    sentence = []
    for line in f:
        splits = line.split()
        try:
            word=splits[0].decode()
            if word=='-DOCSTART-':
                continue
            label=splits[-1].decode()
            temp=[word,label]
            sentence.append(temp)
        except Exception as e:
            if len(sentence)!=0:
                sentences.append(sentence)
                sentence=[]
    return sentences



In [7]:
trainSentences = get_sentences_germeval('../data/GermEVAL/NER-de-train.tsv')
devSentences = get_sentences_germeval('../data/GermEVAL/NER-de-dev.tsv')
testSentences = get_sentences_germeval('../data/GermEVAL/NER-de-test.tsv')

# trainSentences = get_sentences('../data/CONLL/deu/deu_utf.train')
# devSentences = get_sentences('../data/CONLL/deu/deu_utf.testa')
# testSentences = get_sentences('../data/CONLL/deu/deu_utf.testb')

print(len(trainSentences))
print(len(devSentences))
print(len(testSentences))


24000
2200
5100


In [8]:
print(testSentences[0])

[['1951', 'O'], ['bis', 'O'], ['1953', 'O'], ['wurde', 'O'], ['der', 'O'], ['nördliche', 'O'], ['Teil', 'O'], ['als', 'O'], ['Jugendburg', 'O'], ['des', 'O'], ['Kolpingwerkes', 'B-OTH'], ['gebaut', 'O'], ['.', 'O']]


In [9]:
labelSet = set()
characters= set()
max_sequence_length = 0

for dataset in [trainSentences, devSentences, testSentences]:
    for sentence in dataset:
        for word, label in sentence:
            for char in word:
                characters.add(char)
            labelSet.add(label)
        if len(sentence) > max_sequence_length:
            max_sequence_length = len(sentence)

In [10]:
print(len(labelSet))
print(max_sequence_length)

25
56


In [11]:
# :: Create a mapping for the labels ::
label2Idx = {"PADDING_TOKEN":0}
for label in labelSet:
    label2Idx[label] = len(label2Idx)

In [12]:
print(label2Idx)

{'B-OTHpart': 1, 'I-LOCderiv': 13, 'B-OTH': 3, 'B-PER': 14, 'I-PERderiv': 25, 'B-ORGderiv': 4, 'O': 5, 'B-LOCderiv': 15, 'B-LOC': 16, 'B-PERderiv': 17, 'I-ORGpart': 18, 'I-ORG': 2, 'I-PER': 19, 'B-ORG': 20, 'PADDING_TOKEN': 0, 'B-OTHderiv': 7, 'B-ORGpart': 22, 'I-OTHpart': 23, 'I-LOCpart': 24, 'I-ORGderiv': 8, 'I-PERpart': 21, 'I-OTH': 9, 'I-OTHderiv': 10, 'B-LOCpart': 6, 'I-LOC': 11, 'B-PERpart': 12}


In [13]:
# :: Hard coded case lookup ::
case2Idx = {'PADDING_TOKEN':0, 'numeric': 1, 'allLower':2, 'allUpper':3, 'initialUpper':4, 'other':5, 'mainly_numeric':6, 'contains_digit': 7}
caseEmbeddings = np.identity(len(case2Idx), dtype='float32')

In [14]:
print(caseEmbeddings)
print(case2Idx)

[[1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]
{'numeric': 1, 'allUpper': 3, 'mainly_numeric': 6, 'allLower': 2, 'contains_digit': 7, 'PADDING_TOKEN': 0, 'initialUpper': 4, 'other': 5}


In [15]:
print(type(trainSentences))

<class 'list'>


In [16]:
print(trainSentences[0])

[['Schartau', 'B-PER'], ['sagte', 'O'], ['dem', 'O'], ['"', 'O'], ['Tagesspiegel', 'B-ORG'], ['"', 'O'], ['vom', 'O'], ['Freitag', 'O'], [',', 'O'], ['Fischer', 'B-PER'], ['sei', 'O'], ['"', 'O'], ['in', 'O'], ['einer', 'O'], ['Weise', 'O'], ['aufgetreten', 'O'], [',', 'O'], ['die', 'O'], ['alles', 'O'], ['andere', 'O'], ['als', 'O'], ['überzeugend', 'O'], ['war', 'O'], ['"', 'O'], ['.', 'O']]


In [17]:
char2Idx={"PADDING_TOKEN":0}
for char in characters:
    char2Idx[char] = len(char2Idx)
char2Idx['UNKNOWN'] = len(char2Idx)
print(char2Idx)

{'υ': 1, 'ε': 2, '²': 3, '¹': 4, 'L': 5, 'ğ': 6, '+': 52, 'Î': 7, 'Е': 110, 'ệ': 8, 'î': 9, '\x92': 286, 'Π': 11, 'à': 12, 'ź': 13, 'á': 14, '傳': 217, 'O': 16, '▪': 17, '寝': 325, 'м': 18, '„': 231, '東': 312, 'G': 21, 'B': 22, 'д': 25, 'Ġ': 27, '³': 26, 'ế': 235, '\x9a': 28, 'ي': 169, ']': 30, '公': 31, 'Ö': 32, 'Ü': 33, 'γ': 314, '太': 35, '별': 36, 'У': 284, '`': 37, 'Å': 38, '算': 40, 'ċ': 255, 'ý': 42, 'â': 174, 'Ş': 44, 'h': 45, 'С': 46, '\xad': 47, "'": 48, 'ą': 24, '$': 50, 'c': 51, '殿': 201, '別': 53, 'ó': 54, 'ø': 282, 'κ': 200, 'Ł': 55, 'ß': 56, 'ě': 57, '¤': 58, 'b': 59, '›': 60, 'û': 61, 't': 62, 'ú': 63, 'p': 64, 'ρ': 65, '−': 67, '«': 118, 'б': 68, 'z': 69, 'є': 70, '†': 287, 'ъ': 72, 'w': 96, 'F': 73, '博': 75, '’': 76, 'П': 78, '南': 79, 'ŏ': 15, 'ē': 80, 'ά': 295, 'Â': 82, '冲': 83, 'λ': 181, 'ī': 84, 'с': 293, 'α': 85, '©': 242, '·': 10, '%': 89, 'и': 90, '루': 259, '7': 92, 'é': 97, 'ῦ': 98, 'a': 244, 'ǒ': 100, 'φ': 99, '_': 106, '#': 102, 'E': 103, '&': 104, '柯': 185, 'œ': 10

In [18]:
print(trainSentences[0]) 

[['Schartau', 'B-PER'], ['sagte', 'O'], ['dem', 'O'], ['"', 'O'], ['Tagesspiegel', 'B-ORG'], ['"', 'O'], ['vom', 'O'], ['Freitag', 'O'], [',', 'O'], ['Fischer', 'B-PER'], ['sei', 'O'], ['"', 'O'], ['in', 'O'], ['einer', 'O'], ['Weise', 'O'], ['aufgetreten', 'O'], [',', 'O'], ['die', 'O'], ['alles', 'O'], ['andere', 'O'], ['als', 'O'], ['überzeugend', 'O'], ['war', 'O'], ['"', 'O'], ['.', 'O']]


In [19]:
ft = fastText.load_model("../embeddings/wiki.de.bin")
# ft = fastText.load_model("../embeddings/cc.de.300.bin")

nb_embedding_dims = ft.get_dimension()

In [20]:
print(nb_embedding_dims)
print(len(trainSentences[0]))

300
25


In [21]:
print(trainSentences[0])

[['Schartau', 'B-PER'], ['sagte', 'O'], ['dem', 'O'], ['"', 'O'], ['Tagesspiegel', 'B-ORG'], ['"', 'O'], ['vom', 'O'], ['Freitag', 'O'], [',', 'O'], ['Fischer', 'B-PER'], ['sei', 'O'], ['"', 'O'], ['in', 'O'], ['einer', 'O'], ['Weise', 'O'], ['aufgetreten', 'O'], [',', 'O'], ['die', 'O'], ['alles', 'O'], ['andere', 'O'], ['als', 'O'], ['überzeugend', 'O'], ['war', 'O'], ['"', 'O'], ['.', 'O']]


In [22]:
def createBatches(dataset, batch_size):
    batches = []
    temp = []
    i = 0
    for item in dataset:
        temp.append(item)
        i += 1
        if i == batch_size:
            batches.append(temp)
            temp = []
            i = 0
    if len(temp) > 0:
        batches.append(temp)
    return batches

In [23]:
batch_size = 32
# train_batches = createBatches(trainSentences, batch_size)
# dev_batches = createBatches(devSentences, batch_size)
# test_batches = createBatches(testSentences, batch_size)

In [25]:
def generator(data, shuffle_data = False):
    
    print("Creating batches ...")
    batches = createBatches(data, batch_size)
    
    if shuffle_data:
        print("Shuffling ...")
        shuffle(batches)
    
    while True:
        for batch in batches:
            word_embeddings = []
            case_embeddings = []
            char_embeddings = []
            
            output_labels = []
            
            # batches made according to the size of the sentences. len(batch) gives the size of current batch
            for index in range(len(batch)): 
                sentence = batch[index]
    #             print(sentence)
                
                temp_word= []
                temp_casing = []
                temp_char= []
                
                temp_output=[]
                
                # padding
                words_to_pad = max_sequence_length - len(sentence)
                for i in range(words_to_pad):
                    sentence.append(['PADDING_TOKEN', 'PADDING_TOKEN'])
                
                # create data input for words
                for word in sentence:
                    word, label = word
                    temp_output.append(label2Idx[label])
                    
                    casing = getCasing(word, case2Idx)
                    temp_casing.append(casing)
                        
                    if word == 'PADDING_TOKEN':
                        temp_char2=np.array([char2Idx['PADDING_TOKEN']])
                        temp_char.append(temp_char2)
                        word_vector = [0] * nb_embedding_dims
                        temp_word.append(word_vector)
                    else:
                        # char
                        temp_char2=[]
                        for char in word:
                            if char in char2Idx.keys():
                                temp_char2.append(char2Idx[char])
                            else:
                                temp_char2.append(char2Idx['UNKNOWN']) # To incorporate the words which are not in the vocab
                        temp_char2 = np.array(temp_char2)
                        temp_char.append(temp_char2)
                        
                        # word
                        word_vector = ft.get_word_vector(word.lower())
                        # word_vector = ft.get_word_vector(word)
                        temp_word.append(word_vector)
                        
                temp_char = pad_sequences(temp_char, 52)
                word_embeddings.append(temp_word)
                case_embeddings.append(temp_casing)
                char_embeddings.append(temp_char)
                temp_output = to_categorical(temp_output, len(label2Idx))
                output_labels.append(temp_output)
    #             output_labels = to_categorical()
    #             output_labels = np.array(output_labels)
    #             output_labels = output_labels[...,newaxis]

    #             print(np.array(word_embeddings).shape)
    #             print(np.array(case_embeddings).shape)
    #             print(np.array(char_embeddings).shape)
    #             print(output_labels.shape)
    #             print("******************\n\n")
            yield ([np.array(word_embeddings), np.array(case_embeddings), np.array(char_embeddings)], np.array(output_labels))

def get_label_from_categorical(a):
    labels = []
    for label in a:
        label = np.ndarray.tolist(label)
        label = np.argmax(label)
        labels.append(label)
    return(labels)

def predict_batches(batch):
    steps = 0
    true_labels = []
    pred_labels = []
    for input_data, output_data in generator(batch):
        pred_labels_batch = model.predict(input_data)
        for s in pred_labels_batch:
            pred_labels.append(get_label_from_categorical(s))
        for s in output_data:
            true_labels.append(get_label_from_categorical(s))
        steps += 1
        if steps == math.ceil(len(batch) / batch_size):
            break
    return(true_labels, pred_labels)


def predict_batches_ignore_padding(batch):
    steps = 0
    true_labels = []
    pred_labels = []
    for input_data, output_data in generator(batch):
        
        pred_labels_batch = model.predict(input_data)
        for s_id, s in enumerate(output_data):
            not_padded_true = []
            not_padded_pred = []
            predicted_labels = get_label_from_categorical(pred_labels_batch[s_id])
            for t_id, t in enumerate(get_label_from_categorical(s)):
                if t != 0:
                    not_padded_true.append(t)
                    not_padded_pred.append(predicted_labels[t_id])
            true_labels.append(not_padded_true)
            pred_labels.append(not_padded_pred)
            
        steps += 1
        if steps == math.ceil(len(batch) / batch_size):
            break
    return(true_labels, pred_labels)

In [26]:
idx2Label = {v: k for k, v in label2Idx.items()}
print(len(label2Idx))
print(len(idx2Label))

26
26


In [27]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [31]:
nb_char_embedding_dims = 52
def get_model_lstm():
    words_input = Input(shape=(None, nb_embedding_dims), dtype='float32', name='words_input')
    casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
    casing = Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0], weights=[caseEmbeddings], trainable=False, name = 'case_embed')(casing_input)
    character_input=Input(shape=(None,nb_char_embedding_dims,),name='char_input')
    embed_char_out=TimeDistributed(Embedding(len(char2Idx),32,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input)
    char_lstm = TimeDistributed(Bidirectional(LSTM(50)))(embed_char_out)
    output = concatenate([words_input, casing, char_lstm])
    output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.5))(output)
    output = TimeDistributed(Dense(len(label2Idx)))(output)
    crf = CRF(len(label2Idx))
    output = crf(output)
    model = Model(inputs=[words_input, casing_input, character_input], outputs=[output])
    model.compile(loss=crf.loss_function, optimizer='nadam', metrics=[crf.accuracy])
    model.summary()
    return(model)

class F1History(Callback):
    def on_train_begin(self, logs={}):
        self.acc = []
        self.f1_scores = []
        self.max_f1 = 0

    def on_epoch_end(self, epoch, logs={}):
        self.acc.append(logs.get('val_acc'))
        true_labels, pred_labels = predict_batches_ignore_padding(devSentences)
        pre, rec, f1 = compute_f1(pred_labels, true_labels, idx2Label)
        self.f1_scores.append(f1)
        if epoch > -1 and f1 > self.max_f1:
            print("\nNew maximum F1 score: " + str(f1) + " (before: " + str(self.max_f1) + ") Saving to " + tmp_model_filename)
            self.max_f1 = f1
            model.save(tmp_model_filename)

In [None]:
tmp_model_filename = 'tmp_generator_NER_lstm_best.h5'
# checkpoint = ModelCheckpoint(tmp_model_filename, verbose=1, save_best_only = True, monitor = 'val_acc')
history = F1History()
model = get_model()
model.fit_generator(
    generator(trainSentences, shuffle_data=True), steps_per_epoch = math.ceil(len(trainSentences) / batch_size), 
    validation_data = generator(devSentences), validation_steps = math.ceil(len(devSentences) / batch_size), 
    epochs = 12, callbacks = [history]
)

In [None]:
print(history.acc)
print(history.f1_scores)

In [None]:
model.load_weights(tmp_model_filename)

In [None]:
true_labels, pred_labels = predict_batches_ignore_padding(testSentences)
print(compute_f1(pred_labels, true_labels, idx2Label))

# Experiments

In [None]:
f = open('results_lstm.txt', 'w')
for run_i in range(10):
    print("Run " + str(run_i))
    
    tmp_model_filename = 'tmp_generator_NER_lstm_best.' + str(run_i) + '.h5'
    # tmp_model_filename = 'tmp_generator_NER_best.h5'
    # checkpoint = ModelCheckpoint(tmp_model_filename, verbose=1, save_best_only = True, monitor = 'val_acc')
    history = F1History()
    
    model = get_model_lstm()
    model.fit_generator(
        generator(trainSentences, shuffle_data=True), steps_per_epoch = math.ceil(len(trainSentences) / batch_size), 
        validation_data = generator(devSentences), validation_steps = math.ceil(len(devSentences) / batch_size), 
        epochs = 15, callbacks = [history]
    )
    
    model.load_weights(tmp_model_filename)
    true_labels, pred_labels = predict_batches_ignore_padding(testSentences)
    
    pre, rec, f1 = compute_f1(pred_labels, true_labels, idx2Label)
    f.write(str(run_i) + "\t" + str(pre) + "\t" + str(rec) +  "\t" + str(f1))
    f.write("\n")
    f.flush()
f.close()

Run 0
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, 52)     0                                            
__________________________________________________________________________________________________
casing_input (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
char_embedding (TimeDistributed (None, None, 52, 32) 10560       char_input[0][0]                 
__________________________________________________________________________________________________
words_input (InputLayer)        (None, None, 300)    0                                            
__________________________________________________________________________________________________
case

Creating batches ...Creating batches ...

Shuffling ...Epoch 1/15


New maximum F1 score: 0.743232484076433 (before: 0) Saving to tmp_generator_NER_lstm_best.1.h5
Epoch 2/15

# Save final model

In [None]:
import shutil, json
# copy file for best run
shutil.copyfile('tmp_generator_NER_best.0.h5', 'final_model_germeval.h5')
with open("final_model_germeval.indexes", "w") as f:
    json.dump([idx2Label, label2Idx, char2Idx, case2Idx], f)

In [None]:
true_labels, pred_labels = predict_batches(test_batches)
print(compute_f1(pred_labels, true_labels, idx2Label))

In [None]:
print(pred_labels)

In [None]:
print(testSentences[len(testSentences)-1])

In [None]:
print(true_labels[len(testSentences)-1])
print(pred_labels[len(testSentences)-1])

In [None]:
idx2Label[14]

In [None]:
print(len(testSentences))
print(len(pred_labels))

In [None]:
type(train_batches[0])