# Multilingual NER

### Imports

In [4]:
from gensim.models import KeyedVectors
from nltk.corpus.reader.conll import ConllCorpusReader
import re
from math import floor
import numpy as np
import sys
import pandas as pd
import itertools
import matplotlib.pyplot as plt
from keras.layers.recurrent import LSTM
from keras.models import Sequential, Model
from keras.layers import Dense, Bidirectional, Flatten, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix

### Define Preprocessing Functions

In [5]:
nums_regex = re.compile(r'0+')
def clean_sents(iob_sents):
    """Normalise tokens and filter out short sentences.

    Sentences with fewer than five tokens are discarded. For the remaining
    sentences every token's word (element 0) is lower-cased, alphabetic
    characters are kept, digits become ``'0'`` and every other character is
    dropped; runs of zeros are then collapsed into a single ``'0'``.
    Elements 1 and 2 of each token are passed through untouched.

    Args:
        iob_sents: iterable of sentences, each a sequence of token tuples
            with at least three elements.

    Returns:
        A list of sentences, each a list of ``(word, tag, tag)`` triples.
    """
    def _normalise(raw_word):
        # Letters survive as-is, digits are masked, anything else vanishes.
        kept = ''.join(
            ch if ch.isalpha() else '0'
            for ch in raw_word.lower()
            if ch.isalpha() or ch.isdigit()
        )
        return nums_regex.sub('0', kept)

    return [
        [(_normalise(token[0]), token[1], token[2]) for token in sentence]
        for sentence in iob_sents
        if len(sentence) > 4
    ]


In [6]:
def windows(data, window_size):
    """Yield consecutive, non-overlapping ``(start, end)`` index pairs.

    The pairs cover ``data`` from left to right in steps of ``window_size``;
    the final pair is shortened so that ``end`` never exceeds ``len(data)``.
    Empty ``data`` yields nothing.

    Args:
        data: any sized sequence (only ``len(data)`` is used).
        window_size: positive integer width of each window.

    Yields:
        ``(start, end)`` tuples suitable for slicing ``data[start:end]``.

    Raises:
        ValueError: if ``window_size`` is smaller than 1. Because this is a
            generator, the error surfaces on the first iteration.
    """
    if window_size < 1:
        # With a non-positive size the window can never advance, which used
        # to produce an endless stream of (0, 0) pairs on non-empty data.
        raise ValueError("window_size must be a positive integer")
    total = len(data)
    for start in range(0, total, window_size):
        yield start, min(start + window_size, total)

def get_padded_sentence_features(sentences, num_features, max_length, wv):
    """Embed every sentence as a fixed-length matrix of word vectors.

    Each sentence is truncated or right-padded to ``max_length`` tokens.
    Words found in ``wv`` use their stored vector, empty words and padding
    positions are all-zero, and any other word gets a fresh random vector
    drawn from ``np.random.uniform(-0.25, 0.25)`` (seed NumPy's global RNG
    beforehand for reproducible output).

    Args:
        sentences: sequence of sentences; each sentence is a sequence of
            token tuples whose first element is the word and whose last
            element is the label.
        num_features: dimensionality of the word vectors.
        max_length: number of token positions per sentence.
        wv: embedding lookup supporting ``word in wv`` and
            ``wv.get_vector(word)`` (e.g. a gensim ``KeyedVectors``).

    Returns:
        ``(features, labels)`` where ``features`` has shape
        ``(len(sentences), max_length, num_features)`` and ``labels`` has
        shape ``(len(sentences), max_length)`` with ``'O'`` at padded
        positions.
    """
    n_sents = len(sentences)
    if n_sents == 0:
        # Same empty shapes the function has always returned.
        return np.empty((0, max_length, num_features)), np.empty((0, max_length))

    # Allocate the result once instead of re-copying a growing array per
    # sentence; zero rows double as padding / empty-word vectors.
    features = np.zeros((n_sents, max_length, num_features))
    labels = []
    for i, sent in enumerate(sentences):
        if i % 100 == 0:
            print("Processed", i, "of", n_sents)
        sent_labels = ['O'] * max_length
        for j, token in enumerate(sent[:max_length]):
            this_word = token[0]
            # `in wv` works across gensim versions (the `.vocab` attribute
            # no longer exists in gensim 4).
            if this_word in wv:
                features[i, j] = wv.get_vector(this_word)
            elif this_word != '':
                # random vector for unknown
                features[i, j] = np.random.uniform(-0.25, 0.25, num_features)
            sent_labels[j] = token[-1]
        labels.append(sent_labels)

    return features, np.array(labels)


def get_features(sentences, num_features, window_size, wv):
    """Build per-token context-window features for a list of sentences.

    Every sentence is handed to ``get_sentence_features`` and the per-token
    results are concatenated in sentence order.

    Args:
        sentences: sequence of sentences (sequences of token tuples).
        num_features: dimensionality of the word vectors.
        window_size: number of token positions in each context window.
        wv: embedding lookup forwarded to ``get_sentence_features``.

    Returns:
        ``(features, labels)`` where ``features`` has shape
        ``(total_tokens, window_size, num_features)`` and ``labels`` holds
        one label per token.
    """
    # Collect per-sentence pieces and join them once at the end; stacking
    # onto a growing array inside the loop re-copies everything each time.
    feature_chunks = [np.empty((0, window_size, num_features))]
    label_chunks = []
    total = len(sentences)
    for i, sentence in enumerate(sentences):
        if i % 100 == 0:
            print("Processed", i, "of", total)
        sent_features, sent_labels = get_sentence_features(sentence, num_features, window_size, wv)
        feature_chunks.append(sent_features)
        label_chunks.append(sent_labels)

    features = np.vstack(feature_chunks)
    labels = np.concatenate(label_chunks) if label_chunks else np.empty((0))
    return features, labels


def get_sentence_features(sentence, num_features, window_size, wv):
    """Build one context window of word vectors per token of ``sentence``.

    For token ``j`` the window spans ``window_size`` positions starting
    ``window_size // 2`` tokens before ``j``; positions that fall outside the
    sentence are filled with an empty padding token. Words found in ``wv``
    use their stored vector, empty/padding words are all-zero, and any other
    word gets a fresh random vector from ``np.random.uniform(-0.25, 0.25)``.

    Args:
        sentence: list of token tuples; element 0 is the word and the last
            element is the label.
        num_features: dimensionality of the word vectors.
        window_size: number of positions per window (odd sizes are centred
            on the token; even sizes place one extra position on the left).
        wv: embedding lookup supporting ``word in wv`` and
            ``wv.get_vector(word)`` (e.g. a gensim ``KeyedVectors``).

    Returns:
        ``(features, labels)`` where ``features`` has shape
        ``(len(sentence), window_size, num_features)`` and ``labels`` holds
        the label of each token.
    """
    n_words = len(sentence)
    if n_words == 0:
        return np.empty((0, window_size, num_features)), np.empty((0))

    # Pad the sentence once on both sides. This covers every overhang case,
    # including windows that extend past both ends of a short sentence
    # (previously that window was computed but never assigned).
    pad_token = ('', '', 'O')
    left = window_size // 2
    right = window_size - left - 1
    padded = [pad_token] * left + list(sentence) + [pad_token] * right

    # Zero-initialised so padding / empty words need no explicit write.
    features = np.zeros((n_words, window_size, num_features))
    labels = []
    for j in range(n_words):
        for k, word in enumerate(padded[j:j + window_size]):
            this_word = word[0]
            # `in wv` works across gensim versions (the `.vocab` attribute
            # no longer exists in gensim 4).
            if this_word in wv:
                features[j, k] = wv.get_vector(this_word)
            elif this_word != '':
                # random vector for unknown
                features[j, k] = np.random.uniform(-0.25, 0.25, num_features)
        labels.append(sentence[j][-1])
    return features, np.array(labels)

### Load Word Embeddings in English

In [11]:
# English word vectors (word2vec text format); reused by every English
# feature-extraction cell in this notebook.
wv = KeyedVectors.load_word2vec_format(
    fname='data/wiki.multi.en.vec.txt',
    binary=False,
)


In [16]:
# Sequence-shape settings shared by the feature builders.
max_length = 70   # token positions per padded sentence
window_size = 7   # positions per context window (get_features only)

# The embedding matrix fixes the per-word feature dimensionality.
pretrained_weights = wv.vectors
vocab_size, embedding_size = pretrained_weights.shape
num_features = embedding_size
print('Result embedding shape:', pretrained_weights.shape)


Result embedding shape: (200000, 300)


### Load the English Training Sentences

In [4]:
# Reader over the English training file in data/.
# NOTE(review): the final column is labelled 'chunk', presumably so that
# iob_sents() returns the NE tag as the third tuple element — confirm
# against the file's column layout.
corpus = ConllCorpusReader(
    'data',
    fileids=['eng.train.txt'],
    columntypes=('words', 'pos', 'ne', 'chunk'),
)

In [None]:
# Normalise the English training tokens; sentences with fewer than five
# tokens are dropped by clean_sents.
sents = clean_sents(iob_sents=corpus.iob_sents())
print("Number of sentences:", len(sents))

In [7]:

# Build padded sentence-level features for the English training set and
# cache them on disk. (get_features() is the per-token window alternative.)
X, y = get_padded_sentence_features(
    sentences=sents,
    num_features=num_features,
    max_length=max_length,
    wv=wv,
)

np.save(file='data/eng.X.train.npy', arr=X)
np.save(file='data/eng.y.train.npy', arr=y)

Processed 0 of 11376
Processed 100 of 11376
Processed 200 of 11376
Processed 300 of 11376
Processed 400 of 11376
Processed 500 of 11376
Processed 600 of 11376
Processed 700 of 11376
Processed 800 of 11376
Processed 900 of 11376
Processed 1000 of 11376
Processed 1100 of 11376
Processed 1200 of 11376
Processed 1300 of 11376
Processed 1400 of 11376
Processed 1500 of 11376
Processed 1600 of 11376
Processed 1700 of 11376
Processed 1800 of 11376
Processed 1900 of 11376
Processed 2000 of 11376
Processed 2100 of 11376
Processed 2200 of 11376
Processed 2300 of 11376
Processed 2400 of 11376
Processed 2500 of 11376
Processed 2600 of 11376
Processed 2700 of 11376
Processed 2800 of 11376
Processed 2900 of 11376
Processed 3000 of 11376
Processed 3100 of 11376
Processed 3200 of 11376
Processed 3300 of 11376
Processed 3400 of 11376
Processed 3500 of 11376
Processed 3600 of 11376
Processed 3700 of 11376
Processed 3800 of 11376
Processed 3900 of 11376
Processed 4000 of 11376
Processed 4100 of 11376
Proc

### Load German Test Data

In [7]:
# German word vectors from the same multilingual embedding set as the
# English (`.en.`) and Spanish (`.es.`) files. This cell previously loaded
# the English file, so German tokens were looked up in English vectors.
wv_de = KeyedVectors.load_word2vec_format('data/wiki.multi.de.vec.txt', binary=False)

In [9]:
# Reader over the German `testa` file in data/ (five columns per token).
# NOTE(review): the final column is labelled 'chunk', presumably so that
# iob_sents() returns the NE tag as the third tuple element — confirm.
corpus_de_test = ConllCorpusReader(
    'data',
    fileids=['deu.testa.txt'],
    columntypes=('words', 'srl', 'pos', 'ne', 'chunk'),
)

In [10]:
# Normalise the German `testa` tokens (short sentences are dropped).
# NOTE(review): the recorded output shows tokens such as 'groer' and
# 'knnen', i.e. umlauts/ß appear to be lost — check the encoding of the
# corpus file before trusting embedding lookups for these words.
sents_de_test = clean_sents(iob_sents=corpus_de_test.iob_sents())

[[('groer', 'NN', 'O'), ('fotowettbeweb', 'NN', 'O'), ('', '$(', 'O'), ('nordendler', 'NN', 'I-ORG'), ('', '$(', 'O'), ('laden', 'VVFIN', 'O'), ('die', 'ART', 'O'), ('nordendler', 'NN', 'I-MISC'), ('ein', 'ART', 'O')], [('einen', 'ART', 'O'), ('fotowettbewerb', 'NN', 'O'), ('mit', 'APPR', 'O'), ('dem', 'ART', 'O'), ('thema', 'NN', 'O'), ('', '$(', 'O'), ('leben', 'NN', 'O'), ('und', 'KON', 'O'), ('feiern', 'VVFIN', 'O'), ('im', 'APPRART', 'O'), ('nordend', 'NN', 'I-LOC'), ('und', 'KON', 'O'), ('mit', 'APPR', 'O'), ('den', 'ART', 'O'), ('nordendlern', 'NN', 'I-LOC'), ('', '$(', 'O'), ('hat', 'VAFIN', 'O'), ('der', 'ART', 'O'), ('vorstand', 'NN', 'O'), ('des', 'ART', 'O'), ('karnevalclubs', 'NN', 'O'), ('ausgeschrieben', 'VVPP', 'O'), ('', '$.', 'O')], [('teilnehmen', 'NN', 'O'), ('knnen', 'VMFIN', 'O'), ('alle', 'PIDAT', 'O'), ('mitglieder', 'NN', 'O'), ('und', 'KON', 'O'), ('freunde', 'NN', 'O'), ('der', 'ART', 'O'), ('', '$(', 'O'), ('nordendler', 'NN', 'I-ORG'), ('', '$(', 'O'), ('so

In [11]:
# Padded sentence-level features for the German `testa` split.
X_de_test, y_de_test = get_padded_sentence_features(
    sentences=sents_de_test,
    num_features=num_features,
    max_length=max_length,
    wv=wv_de,
)

Processed 0 of 2644
Processed 100 of 2644
Processed 200 of 2644
Processed 300 of 2644
Processed 400 of 2644
Processed 500 of 2644
Processed 600 of 2644
Processed 700 of 2644
Processed 800 of 2644
Processed 900 of 2644
Processed 1000 of 2644
Processed 1100 of 2644
Processed 1200 of 2644
Processed 1300 of 2644
Processed 1400 of 2644
Processed 1500 of 2644
Processed 1600 of 2644
Processed 1700 of 2644
Processed 1800 of 2644
Processed 1900 of 2644
Processed 2000 of 2644
Processed 2100 of 2644
Processed 2200 of 2644
Processed 2300 of 2644
Processed 2400 of 2644
Processed 2500 of 2644
Processed 2600 of 2644


In [12]:
# Cache the German `testa` features and labels on disk.
np.save(file='data/deu.X.testa.npy', arr=X_de_test)
np.save(file='data/deu.y.testa.npy', arr=y_de_test)

### Load German Train Data

In [17]:
# Reader over the German training file in data/ (five columns per token).
# NOTE(review): the final column is labelled 'chunk', presumably so that
# iob_sents() returns the NE tag as the third tuple element — confirm.
corpus_de_train = ConllCorpusReader(
    'data',
    fileids=['deu.train.txt'],
    columntypes=('words', 'srl', 'pos', 'ne', 'chunk'),
)

In [18]:
# Normalise the German training tokens (short sentences are dropped).
sents_de_train = clean_sents(iob_sents=corpus_de_train.iob_sents())

In [19]:
# Padded sentence-level features for the German training split.
X_de_train, y_de_train = get_padded_sentence_features(
    sentences=sents_de_train,
    num_features=num_features,
    max_length=max_length,
    wv=wv_de,
)

Processed 0 of 10995
Processed 100 of 10995
Processed 200 of 10995
Processed 300 of 10995
Processed 400 of 10995
Processed 500 of 10995
Processed 600 of 10995
Processed 700 of 10995
Processed 800 of 10995
Processed 900 of 10995
Processed 1000 of 10995
Processed 1100 of 10995
Processed 1200 of 10995
Processed 1300 of 10995
Processed 1400 of 10995
Processed 1500 of 10995
Processed 1600 of 10995
Processed 1700 of 10995
Processed 1800 of 10995
Processed 1900 of 10995
Processed 2000 of 10995
Processed 2100 of 10995
Processed 2200 of 10995
Processed 2300 of 10995
Processed 2400 of 10995
Processed 2500 of 10995
Processed 2600 of 10995
Processed 2700 of 10995
Processed 2800 of 10995
Processed 2900 of 10995
Processed 3000 of 10995
Processed 3100 of 10995
Processed 3200 of 10995
Processed 3300 of 10995
Processed 3400 of 10995
Processed 3500 of 10995
Processed 3600 of 10995
Processed 3700 of 10995
Processed 3800 of 10995
Processed 3900 of 10995
Processed 4000 of 10995
Processed 4100 of 10995
Proc

In [20]:
# Cache the German training features and labels on disk.
np.save(file='data/deu.X.train.npy', arr=X_de_train)
np.save(file='data/deu.y.train.npy', arr=y_de_train)

### Load English Test Data

In [13]:
# Reader over the English `testa` file in data/ (same column labels as the
# English training reader).
corpus_en_test = ConllCorpusReader(
    'data',
    fileids=['eng.testa.txt'],
    columntypes=('words', 'pos', 'ne', 'chunk'),
)

In [14]:
# Normalise the English `testa` tokens (short sentences are dropped).
sents_en_test = clean_sents(iob_sents=corpus_en_test.iob_sents())

In [15]:
# Padded sentence-level features for the English `testa` split.
X_en_test, y_en_test = get_padded_sentence_features(
    sentences=sents_en_test,
    num_features=num_features,
    max_length=max_length,
    wv=wv,
)

Processed 0 of 2701
Processed 100 of 2701
Processed 200 of 2701
Processed 300 of 2701
Processed 400 of 2701
Processed 500 of 2701
Processed 600 of 2701
Processed 700 of 2701
Processed 800 of 2701
Processed 900 of 2701
Processed 1000 of 2701
Processed 1100 of 2701
Processed 1200 of 2701
Processed 1300 of 2701
Processed 1400 of 2701
Processed 1500 of 2701
Processed 1600 of 2701
Processed 1700 of 2701
Processed 1800 of 2701
Processed 1900 of 2701
Processed 2000 of 2701
Processed 2100 of 2701
Processed 2200 of 2701
Processed 2300 of 2701
Processed 2400 of 2701
Processed 2500 of 2701
Processed 2600 of 2701
Processed 2700 of 2701


In [16]:
# Cache the English `testa` features and labels on disk.
np.save(file='data/eng.X.testa.npy', arr=X_en_test)
np.save(file='data/eng.y.testa.npy', arr=y_en_test)

### Load Spanish Test Data

In [21]:
# Spanish word vectors (word2vec text format), used for both Spanish splits.
wv_es = KeyedVectors.load_word2vec_format(
    fname='data/wiki.multi.es.vec.txt',
    binary=False,
)

In [22]:
# Reader over the Spanish `testa` file in data/ (three columns per token).
# NOTE(review): the third column is labelled 'chunk', presumably so that
# iob_sents() returns the NE tag as the third tuple element — confirm.
corpus_es_test = ConllCorpusReader(
    'data',
    fileids=['esp.testa.txt'],
    columntypes=('words', 'pos', 'chunk'),
)

In [23]:
# Normalise the Spanish `testa` tokens (short sentences are dropped).
sents_es_test = clean_sents(iob_sents=corpus_es_test.iob_sents())

In [24]:
# Padded sentence-level features for the Spanish `testa` split.
X_es_test, y_es_test = get_padded_sentence_features(
    sentences=sents_es_test,
    num_features=num_features,
    max_length=max_length,
    wv=wv_es,
)

Processed 0 of 1588
Processed 100 of 1588
Processed 200 of 1588
Processed 300 of 1588
Processed 400 of 1588
Processed 500 of 1588
Processed 600 of 1588
Processed 700 of 1588
Processed 800 of 1588
Processed 900 of 1588
Processed 1000 of 1588
Processed 1100 of 1588
Processed 1200 of 1588
Processed 1300 of 1588
Processed 1400 of 1588
Processed 1500 of 1588


In [25]:
# Cache the Spanish `testa` features and labels on disk.
np.save(file='data/esp.X.testa.npy', arr=X_es_test)
np.save(file='data/esp.y.testa.npy', arr=y_es_test)

### Load Spanish Train Data

In [26]:
# Reader over the Spanish training file in data/ (three columns per token).
# NOTE(review): the third column is labelled 'chunk', presumably so that
# iob_sents() returns the NE tag as the third tuple element — confirm.
corpus_es_train = ConllCorpusReader(
    'data',
    fileids=['esp.train.txt'],
    columntypes=('words', 'pos', 'chunk'),
)

In [27]:
# Normalise the Spanish training tokens (short sentences are dropped).
sents_es_train = clean_sents(iob_sents=corpus_es_train.iob_sents())

In [28]:
# Padded sentence-level features for the Spanish training split.
X_es_train, y_es_train = get_padded_sentence_features(
    sentences=sents_es_train,
    num_features=num_features,
    max_length=max_length,
    wv=wv_es,
)

Processed 0 of 7036
Processed 100 of 7036
Processed 200 of 7036
Processed 300 of 7036
Processed 400 of 7036
Processed 500 of 7036
Processed 600 of 7036
Processed 700 of 7036
Processed 800 of 7036
Processed 900 of 7036
Processed 1000 of 7036
Processed 1100 of 7036
Processed 1200 of 7036
Processed 1300 of 7036
Processed 1400 of 7036
Processed 1500 of 7036
Processed 1600 of 7036
Processed 1700 of 7036
Processed 1800 of 7036
Processed 1900 of 7036
Processed 2000 of 7036
Processed 2100 of 7036
Processed 2200 of 7036
Processed 2300 of 7036
Processed 2400 of 7036
Processed 2500 of 7036
Processed 2600 of 7036
Processed 2700 of 7036
Processed 2800 of 7036
Processed 2900 of 7036
Processed 3000 of 7036
Processed 3100 of 7036
Processed 3200 of 7036
Processed 3300 of 7036
Processed 3400 of 7036
Processed 3500 of 7036
Processed 3600 of 7036
Processed 3700 of 7036
Processed 3800 of 7036
Processed 3900 of 7036
Processed 4000 of 7036
Processed 4100 of 7036
Processed 4200 of 7036
Processed 4300 of 7036


In [29]:
# Cache the Spanish training features and labels on disk.
np.save(file='data/esp.X.train.npy', arr=X_es_train)
np.save(file='data/esp.y.train.npy', arr=y_es_train)