In [1]:
import sys
import os
import numpy as np 
from keras.utils import to_categorical
from validation import compute_f1
from keras.models import Model
from keras.layers import TimeDistributed,Conv1D,Dense,Embedding,Input,Dropout,LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform
import keras.backend as K
from keras.callbacks import ModelCheckpoint
from keras_contrib.layers import CRF
from numpy import newaxis
import sklearn
import subprocess
import fastText

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def getCasing(word, caseLookup):
    """Map a token to the index of its casing category.

    Categories are tested in this priority order:

    1. ``numeric``        - every character is a digit
    2. ``mainly_numeric`` - more than half of the characters are digits
    3. ``allLower``       - ``word.islower()``
    4. ``allUpper``       - ``word.isupper()``
    5. ``initialUpper``   - the first character is upper case
    6. ``contains_digit`` - at least one digit is present
    7. ``other``          - none of the above (also used for empty tokens)

    Args:
        word: Token string to classify.
        caseLookup: Dict mapping casing-category names to integer indices.

    Returns:
        ``caseLookup[category]`` for the first category that matches.

    Raises:
        KeyError: If the selected category is missing from ``caseLookup``.
    """
    # An empty token has no characters to inspect. Without this guard the
    # digit-fraction division below raises ZeroDivisionError.
    if not word:
        return caseLookup['other']

    numDigits = sum(1 for char in word if char.isdigit())
    digitFraction = numDigits / float(len(word))

    if word.isdigit():
        casing = 'numeric'
    elif digitFraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower():
        casing = 'allLower'
    elif word.isupper():
        casing = 'allUpper'
    elif word[0].isupper():
        # Only the first character is checked; the rest may be mixed case.
        casing = 'initialUpper'
    elif numDigits > 0:
        casing = 'contains_digit'
    else:
        casing = 'other'

    return caseLookup[casing]


def createMatrices(sentences, label2Idx, case2Idx,char2Idx):
    """Convert annotated sentences into parallel per-token feature lists.

    Args:
        sentences: Iterable of sentences; each sentence is an iterable of
            ``(word, chars, label)`` triples.
        label2Idx: Dict mapping label strings to integer indices.
        case2Idx: Dict mapping casing-category names to integer indices
            (passed through to ``getCasing``).
        char2Idx: Dict mapping characters to integer indices; must contain
            the key ``'UNKNOWN'`` if any character is not in the dict.

    Returns:
        A list with one entry per sentence, each of the form
        ``[words, casingIndices, charIndexLists, labelIndices]``. The words
        are stored as given (they are not mapped to indices here).
    """
    def encodeChars(chars):
        # Characters missing from the vocabulary fall back to 'UNKNOWN'.
        return [char2Idx[c] if c in char2Idx else char2Idx['UNKNOWN']
                for c in chars]

    dataset = []
    for sentence in sentences:
        tokens, casings, charIds, labelIds = [], [], [], []
        for word, char, label in sentence:
            encoded = encodeChars(char)
            tokens.append(word)
            casings.append(getCasing(word, case2Idx))
            charIds.append(encoded)
            labelIds.append(label2Idx[label])
        dataset.append([tokens, casings, charIds, labelIds])

    return dataset

def padding(Sentences, maxlen=52):
    """Pad every word's character-index list to a fixed width, in place.

    The original implementation scanned all words to compute the longest
    one but never used the result; the width passed to ``pad_sequences``
    was always the literal 52. That dead scan is removed and the width is
    exposed as ``maxlen`` (default 52, the width of ``char_input`` in
    ``get_model``).

    Args:
        Sentences: List of sentence records; element ``[2]`` of each record
            is the list of per-word character-index lists.
        maxlen: Target number of character slots per word. Shorter words are
            padded at the end (``padding='post'``); longer words are
            truncated by ``pad_sequences`` using its default truncation.

    Returns:
        The same ``Sentences`` list, with element ``[2]`` of every record
        replaced by the output of ``pad_sequences``.
    """
    for sentence in Sentences:
        sentence[2] = pad_sequences(sentence[2], maxlen, padding='post')
    return Sentences



In [3]:
def tag_dataset(dataset):
    """Run the global ``model`` over a dataset one sentence at a time.

    Args:
        dataset: Sequence of ``(tokens, casing, char, labels)`` records.
            Each record is wrapped into a batch of size one before being
            passed to ``model.predict``.

    Returns:
        ``(predLabels, correctLabels)``: the per-sentence argmax predictions
        and the gold label sequences, in dataset order.
    """
    # Progbar is not imported anywhere else in this notebook; import it here
    # so the function does not fail with NameError on first use.
    from keras.utils import Progbar

    correctLabels = []
    predLabels = []
    progress = Progbar(len(dataset))
    for i, (tokens, casing, char, labels) in enumerate(dataset):
        # Add a leading batch axis of size one to every input.
        tokens = np.asarray([tokens])
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = model.predict([tokens, casing, char], verbose=False)[0]
        pred = pred.argmax(axis=-1)  # class index per token
        correctLabels.append(labels)
        predLabels.append(pred)
        # Report the number of completed sentences so the bar reaches 100%.
        progress.update(i + 1)
    return predLabels, correctLabels

In [4]:
# Collapse all "deriv", "part" and OTH labels into MISC, keeping the B-/I- prefix
def modify_labels(dataset):
    """Rewrite derived/partial/OTH entity labels to MISC, in place.

    Args:
        dataset: List of sentences; each sentence is a list of mutable
            ``[word, label]`` pairs.

    Returns:
        The same ``dataset`` object. Every label listed in ``bad_labels`` is
        replaced by ``'B-MISC'`` if it starts with ``'B'`` and by
        ``'I-MISC'`` otherwise. Other labels are left unchanged.
    """
    bad_labels = (
        'I-PERderiv', 'I-OTHpart', 'B-ORGderiv', 'I-OTH', 'B-OTHpart',
        'B-LOCderiv', 'I-LOCderiv', 'I-OTHderiv', 'B-PERderiv', 'B-OTHderiv',
        'B-PERpart', 'I-PERpart', 'I-LOCpart', 'B-LOCpart', 'I-ORGpart',
        'I-ORGderiv', 'B-ORGpart', 'B-OTH',
    )
    for sentence in dataset:
        for pair in sentence:
            tag = pair[1]
            if tag not in bad_labels:
                continue
            # Keep the BIO position marker, replace the entity type.
            pair[1] = 'B-MISC' if tag[0] == 'B' else 'I-MISC'
    return dataset
                

In [5]:
def get_sentences_germeval(path):
    """Read a GermEval-style TSV file into sentences of ``[token, label]``.

    Blank lines separate sentences. Lines whose first whitespace-separated
    field is ``#`` are skipped. For every other line the second and third
    fields are kept as ``[token, label]``.

    Args:
        path: Path of the file to read (opened as UTF-8 text).

    Returns:
        List of sentences, each a non-empty list of ``[token, label]`` lists.
    """
    sentences = []
    current = []
    with open(path, 'r', encoding = 'UTF-8') as handle:
        for raw in handle:
            stripped = raw.strip()

            if not stripped:
                # A blank line closes the sentence collected so far.
                if current:
                    sentences.append(current)
                current = []
                continue

            fields = stripped.split()
            if fields[0] == '#':
                continue
            current.append([fields[1], fields[2]])

    # Flush the last sentence when the file does not end with a blank line.
    if current:
        sentences.append(current)
    return sentences

In [6]:
# Preprocessing data from CoNLL
def get_sentences_conll(filename):
    '''
    Read a CoNLL-style file into sentences of [word, label] pairs.

    Expected input (whitespace-separated columns, blank line between
    sentences):

        -DOCSTART- -X- -X- O

        EU NNP B-NP B-ORG
        rejects VBZ B-VP O
        German JJ B-NP B-MISC
        call NN I-NP O
        to TO B-VP O
        boycott VB I-VP O
        British JJ B-NP B-MISC
        lamb NN I-NP O
        . . O O

    Return format (one such list per sentence):
    [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ]

    The first column is the word and the last column is the label.
    Lines starting with -DOCSTART- are skipped.

    Raises:
        UnicodeDecodeError: if a word or label is not valid UTF-8. This was
            previously swallowed and silently treated as a sentence break.
    '''
    sentences = []
    sentence = []
    # Binary mode keeps the original tokenisation: bytes.split() splits on
    # ASCII whitespace only. The context manager closes the handle.
    with open(filename, 'rb') as f:
        for line in f:
            splits = line.split()
            if not splits:
                # Blank line: close the current sentence, if any.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            word = splits[0].decode()
            if word == '-DOCSTART-':
                continue
            sentence.append([word, splits[-1].decode()])
    # Keep the last sentence when the file has no trailing blank line.
    if sentence:
        sentences.append(sentence)
    return sentences



In [7]:
# Read the GermEval test split into a list of sentences (each a list of
# [token, label] pairs) and show how many sentences were loaded.
testSentences = get_sentences_germeval('../data/GermEVAL/NER-de-test.tsv')
print(len(testSentences))


5100


In [8]:
# Inspect the first sentence to check the [token, label] structure.
print(testSentences[0])

[['1951', 'O'], ['bis', 'O'], ['1953', 'O'], ['wurde', 'O'], ['der', 'O'], ['nördliche', 'O'], ['Teil', 'O'], ['als', 'O'], ['Jugendburg', 'O'], ['des', 'O'], ['Kolpingwerkes', 'B-OTH'], ['gebaut', 'O'], ['.', 'O']]


In [49]:
# Load the index mappings saved alongside the trained model. The file holds
# a JSON list: [idx2Label, label2Idx, char2Idx, case2Idx].
import json
# Use a context manager so the file handle is closed after reading.
with open("final_model_germeval.indexes", "r") as index_file:
    indexMappings = json.load(index_file)
# JSON object keys are always strings; convert them back to integer indices.
idx2Label = {int(k):v for k,v in indexMappings[0].items()}
label2Idx = indexMappings[1]
char2Idx = indexMappings[2]
case2Idx = indexMappings[3]

In [50]:
# Show the loaded mappings as a sanity check: label<->index, character->index
# and casing-category->index.
print(idx2Label)
print(label2Idx)
print(char2Idx)
print(case2Idx)

{0: 'B-OTHderiv', 1: 'I-OTH', 2: 'B-ORGpart', 3: 'B-LOCderiv', 4: 'I-PER', 5: 'B-ORGderiv', 6: 'I-PERpart', 7: 'I-OTHpart', 8: 'I-ORGpart', 9: 'I-LOC', 10: 'I-ORG', 11: 'B-PERderiv', 12: 'B-LOCpart', 13: 'O', 14: 'I-LOCderiv', 15: 'I-PERderiv', 16: 'B-PERpart', 17: 'B-ORG', 18: 'I-OTHderiv', 19: 'I-ORGderiv', 20: 'I-LOCpart', 21: 'B-OTHpart', 22: 'B-OTH', 23: 'B-LOC', 24: 'B-PER'}
{'B-OTH': 22, 'I-ORGderiv': 19, 'B-ORGderiv': 5, 'B-PERpart': 16, 'I-PERpart': 6, 'I-LOCpart': 20, 'I-OTH': 1, 'I-PERderiv': 15, 'B-OTHderiv': 0, 'I-ORGpart': 8, 'I-ORG': 10, 'O': 13, 'B-PERderiv': 11, 'B-LOCderiv': 3, 'B-ORG': 17, 'I-PER': 4, 'B-LOC': 23, 'I-OTHderiv': 18, 'B-ORGpart': 2, 'I-OTHpart': 7, 'I-LOCderiv': 14, 'B-PER': 24, 'B-OTHpart': 21, 'B-LOCpart': 12, 'I-LOC': 9}
{'œ': 271, 'є': 281, '▪': 292, 'ệ': 305, 'р': 25, '×': 14, 'æ': 273, 'É': 72, '\x9a': 93, 'o': 222, '©': 13, 'K': 303, 'i': 39, '»': 158, 'ά': 133, 'D': 141, '루': 285, '›': 314, 'ю': 113, 'á': 274, 'ε': 316, '\x95': 263, 'd': 0, 'ć'

In [21]:
# One-hot casing embeddings: an identity matrix with one row per entry of
# case2Idx. get_model() loads it as fixed (non-trainable) Embedding weights.
caseEmbeddings = np.identity(len(case2Idx), dtype='float32')

In [25]:
# Load the fastText model that provides the word vectors.
# Alternative embedding file, currently disabled:
# ft = fastText.load_model("../embeddings/wiki.de.bin")
ft = fastText.load_model("../embeddings/cc.de.300.bin")

# Word-vector dimensionality; sets the feature size of words_input in get_model().
nb_embedding_dims = ft.get_dimension()

In [26]:
def createBatches(dataset):
    """Group sentences of equal length into batches.

    Sentences are bucketed by ``len(sentence)`` in one pass, so every batch
    can be stacked into a rectangular array without padding along the token
    axis. The number of distinct lengths is printed.

    Args:
        dataset: Iterable of sentences (any objects supporting ``len``).

    Returns:
        List of batches ordered by ascending sentence length. Each batch is
        a list of the sentences with that length, in their original order.
    """
    by_length = {}
    for sentence in dataset:
        by_length.setdefault(len(sentence), []).append(sentence)
    print(len(by_length))
    # Sorting makes the batch order deterministic instead of relying on the
    # iteration order of a set.
    return [by_length[length] for length in sorted(by_length)]

In [27]:
# Bucket the test sentences by length; the printed number is the count of
# distinct sentence lengths, i.e. the number of batches.
test_batches = createBatches(testSentences)

47


In [34]:
def generator(batches: 'list of training/dev sentences- batches already created'):
    """Yield model inputs and one-hot targets for each batch, cycling forever.

    Relies on the notebook globals ``ft``, ``case2Idx``, ``char2Idx`` and
    ``label2Idx``.

    Args:
        batches: List of batches as produced by ``createBatches``; every
            batch is a list of equal-length sentences, and every sentence is
            a list of ``[word, label]`` pairs.

    Yields:
        ``([word_vectors, casing_indices, char_indices], one_hot_labels)``
        as numpy arrays for one batch at a time. After the last batch the
        iteration restarts from the first one, so the caller must decide
        when to stop. Nothing is yielded if ``batches`` is empty.
    """
    # With no batches the loop below would spin forever without yielding.
    if len(batches) == 0:
        return

    # Width of the one-hot targets; must match the model's output layer,
    # which get_model() sizes with len(label2Idx).
    num_labels = len(label2Idx)

    while True:
        for batch in batches:
            word_embeddings = []
            case_embeddings = []
            char_embeddings = []
            output_labels = []

            for sentence in batch:
                sent_casing = []
                sent_chars = []
                sent_words = []
                sent_labels = []
                for word, label in sentence:
                    sent_casing.append(getCasing(word, case2Idx))
                    # Characters missing from the vocabulary map to 'UNKNOWN'.
                    char_ids = [char2Idx[c] if c in char2Idx else char2Idx['UNKNOWN']
                                for c in word]
                    sent_chars.append(np.array(char_ids))
                    # word_vector = ft.get_word_vector(word.lower())
                    sent_words.append(ft.get_word_vector(word))
                    sent_labels.append(label2Idx[label])
                # 52 is the character width expected by char_input in get_model().
                char_embeddings.append(pad_sequences(sent_chars, 52))
                word_embeddings.append(sent_words)
                case_embeddings.append(sent_casing)
                output_labels.append(to_categorical(sent_labels, num_labels))

            yield ([np.array(word_embeddings), np.array(case_embeddings), np.array(char_embeddings)], np.array(output_labels))

def get_label_from_categorical(a):
    """Return the argmax class index of every row in ``a``.

    Args:
        a: Iterable of numpy arrays, one score/one-hot vector per token.

    Returns:
        List holding the index of the largest entry of each row.
    """
    return [np.argmax(np.ndarray.tolist(row)) for row in a]

def predict_batches(batch):
    """Predict labels for every batch and collect gold and predicted indices.

    Uses the notebook-level ``model`` and ``generator``.

    Args:
        batch: List of batches as produced by ``createBatches``.

    Returns:
        ``(true_labels, pred_labels)``: two lists with one entry per
        sentence, each a list of class indices. Both lists are empty when
        ``batch`` is empty.
    """
    from itertools import islice

    true_labels = []
    pred_labels = []
    # Return early on empty input rather than starting the generator;
    # previously this case never terminated.
    if len(batch) == 0:
        return (true_labels, pred_labels)

    # generator() cycles forever; take exactly one pass over the batches.
    for input_data, output_data in islice(generator(batch), len(batch)):
        pred_labels_batch = model.predict(input_data)
        pred_labels.extend(get_label_from_categorical(s) for s in pred_labels_batch)
        true_labels.extend(get_label_from_categorical(s) for s in output_data)
    return (true_labels, pred_labels)

# File holding the best weights; loaded later via model.load_weights().
tmp_model_filename = 'tmp_generator_NER_best.h5'
# Checkpoint callback that keeps only the best model by 'val_acc'. It is not
# passed to any fit call in the cells visible here.
# NOTE(review): the model is compiled with metrics=[crf.accuracy]; the logged
# metric name may not be 'val_acc' - confirm before reusing for training.
checkpoint = ModelCheckpoint(tmp_model_filename, verbose=1, save_best_only = True, monitor = 'val_acc')

In [30]:
# Order GPUs by PCI bus id and expose only device 1 to this process.
# NOTE(review): presumably this must run before TensorFlow initialises its
# GPU devices - confirm the cell order on a fresh kernel.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [31]:
def get_model():
    """Build and compile the word + casing + char-CNN BiLSTM-CRF tagger.

    Reads the notebook globals ``nb_embedding_dims``, ``caseEmbeddings``,
    ``char2Idx`` and ``label2Idx``. Prints the model summary as a side effect.

    NOTE(review): weights are later loaded into this model from an HDF5 file,
    so the layer creation order presumably has to match the notebook that
    trained it - confirm before reordering any statement.

    Returns:
        The compiled Keras ``Model`` with inputs
        ``[words_input, casing_input, char_input]``.
    """
    # Word input: one vector of size nb_embedding_dims per token.
    words_input = Input(shape=(None, nb_embedding_dims), dtype='float32', name='words_input')
    # Casing input: one integer class per token, looked up in a frozen
    # embedding initialised from caseEmbeddings.
    casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
    casing = Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0], weights=[caseEmbeddings], trainable=False, name = 'case_embed')(casing_input)
    # Character input: 52 character indices per token, each embedded into 32
    # dimensions.
    character_input=Input(shape=(None,52,),name='char_input')
    embed_char_out=TimeDistributed(Embedding(len(char2Idx),32,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input)
    # Three parallel character CNNs (kernel sizes 3, 4 and 5, 32 filters
    # each), applied to every token.
    kernel_sizes = (3, 4, 5)
    conv_blocks = []
    for sz in kernel_sizes:
        conv = TimeDistributed(Conv1D(
                             kernel_size=sz,
                             filters=32,
                             padding="same",
                             activation="relu",
                             strides=1))(embed_char_out)
        # Pool size 52 equals the character width, so pooling spans the
        # whole character axis of a token.
        conv = TimeDistributed(MaxPooling1D(52))(conv)
        conv = TimeDistributed(Flatten())(conv)
        conv_blocks.append(conv)
    # Per-token features: word vector + casing embedding + three char-CNN outputs.
    output = concatenate([words_input, casing, conv_blocks[0], conv_blocks[1], conv_blocks[2]])
    output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.5))(output)
    # Project to one score per label, then decode with a CRF layer.
    output = TimeDistributed(Dense(len(label2Idx)))(output)
    crf = CRF(len(label2Idx))
    output = crf(output)
    model = Model(inputs=[words_input, casing_input, character_input], outputs=[output])
    # The CRF layer supplies its own loss and accuracy metric.
    model.compile(loss=crf.loss_function, optimizer='nadam', metrics=[crf.accuracy])
    model.summary()
    return(model)

In [32]:
# Rebuild the architecture, then load the saved weights from
# tmp_model_filename into it.
model = get_model()
model.load_weights(tmp_model_filename)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, 52)     0                                            
__________________________________________________________________________________________________
char_embedding (TimeDistributed (None, None, 52, 32) 10528       char_input[0][0]                 
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, None, 52, 32) 3104        char_embedding[0][0]             
__________________________________________________________________________________________________
time_distributed_4 (TimeDistrib (None, None, 52, 32) 4128        char_embedding[0][0]             
__________________________________________________________________________________________________
time_distr

In [51]:
# Predict on all test batches and score the predictions.
# NOTE(review): compute_f1 comes from the local `validation` module; the
# three printed values are presumably (precision, recall, F1) - confirm.
true_labels, pred_labels = predict_batches(test_batches)
print(compute_f1(true_labels, pred_labels, idx2Label))

(0.7745224991906766, 0.8343504795117699, 0.8033240997229917)
