# Multilingual NER
## Load Word Embeddings in Training Language

In [1]:
from gensim.models import KeyedVectors
# Word vectors for the training language, stored in word2vec text format
# (hence binary=False).
en_vectors_path = 'data/wiki.multi.en.vec.txt'
wv = KeyedVectors.load_word2vec_format(en_vectors_path, binary=False)


In [2]:
# Embedding matrix behind the loaded vectors; its two dimensions are unpacked
# as (vocabulary size, vector dimensionality).
pretrained_weights = wv.vectors
embedding_shape = pretrained_weights.shape
vocab_size, embedding_size = embedding_shape
print('Result embedding shape:', embedding_shape)

def word2idx(word):
    """Return the index stored on the vocabulary entry of ``word``.

    Looks ``word`` up in ``wv.vocab``; a missing word propagates whatever
    error that lookup raises.
    """
    vocab_entry = wv.vocab[word]
    return vocab_entry.index
def idx2word(idx):
    """Return the word stored at position ``idx`` of ``wv.index2word``."""
    index_to_word = wv.index2word
    return index_to_word[idx]

Result embedding shape: (200000, 300)


## Load the Training Sentences

In [3]:
from nltk.corpus.reader.conll import ConllCorpusReader

# Column roles handed to the reader, in file order.
# NOTE(review): 'ne' and 'chunk' look swapped relative to the usual CoNLL-2003
# column order; presumably deliberate so that iob_sents() yields the entity
# tag in its third slot -- confirm before changing.
train_columns = ('words', 'pos', 'ne', 'chunk')
corpus = ConllCorpusReader('data', fileids=['eng.train.txt'], columntypes=train_columns)

In [13]:
import re
# Matches a run of one or more zeros so digit sequences collapse to a single '0'.
nums_regex = re.compile(r'0+')


def clean_sents(iob_sents):
    """Normalise tokens and drop sentences with fewer than five tokens.

    Each kept token is rebuilt as ``(cleaned_word, token[1], token[2])`` where
    ``cleaned_word`` is the lower-cased word with:

    * alphabetic characters kept,
    * every digit replaced by ``'0'`` and runs of zeros collapsed to one ``'0'``,
    * all other characters removed (so a punctuation-only token becomes ``''``).

    Args:
        iob_sents: iterable of sentences, each a sequence of token tuples whose
            first three items are read.

    Returns:
        list of cleaned sentences (lists of 3-tuples), in input order.
    """
    def _normalise(token_text):
        # Build the cleaned word character by character.
        kept = []
        for ch in token_text.lower():
            if ch.isalpha():
                kept.append(ch)
            elif ch.isdigit():
                kept.append('0')
        return nums_regex.sub('0', ''.join(kept))

    return [
        [(_normalise(tok[0]), tok[1], tok[2]) for tok in sent]
        for sent in iob_sents
        if len(sent) > 4
    ]

# Read the raw sentences from the corpus reader, then normalise them.
raw_sents = corpus.iob_sents()
sents = clean_sents(raw_sents)
print("Number of sentences:", len(sents))

Number of sentences: 11376


In [16]:
from math import floor
import numpy as np
def windows(data, window_size):
    """Yield ``(start, end)`` index pairs that tile ``data`` without overlap.

    Consecutive pairs cover ``data[start:end]`` slices of ``window_size``
    items; the final pair is shorter when ``len(data)`` is not a multiple of
    the window size. Nothing is yielded for empty ``data``. A fractional
    ``window_size`` is rounded up to the next whole number of items.

    Args:
        data: any sized sequence (only ``len(data)`` is used).
        window_size: positive number of items per window.

    Raises:
        ValueError: if ``window_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration step.
    """
    from math import ceil  # local import: only ``floor`` is imported at file level

    # A non-positive size would never advance and the generator would loop forever.
    if window_size <= 0:
        raise ValueError("window_size must be positive, got %r" % (window_size,))

    total = len(data)
    step = ceil(window_size)
    for start in range(0, total, step):
        yield start, min(start + step, total)
        
def get_features(sentences, num_features, window_size, wv):
    """Build windowed features and labels for every token of every sentence.

    Calls ``get_sentence_features`` once per sentence, in order, and stacks
    the results. A single-line progress bar is redrawn in place on stdout.

    Args:
        sentences: sized sequence of sentences, passed one by one to
            ``get_sentence_features``.
        num_features: size of each token vector.
        window_size: number of tokens per context window.
        wv: word-vector object forwarded to ``get_sentence_features``.

    Returns:
        ``(features, labels)``: ``features`` has shape
        ``(total_tokens, window_size, num_features)`` and ``labels`` is the
        1-D concatenation of the per-sentence label arrays.
    """
    total = len(sentences)
    # Seed both lists with empty arrays so an empty input still returns
    # correctly shaped arrays and dtype promotion matches repeated appends.
    feature_chunks = [np.empty((0, window_size, num_features))]
    label_chunks = [np.empty((0))]

    last_percent = -1
    for i, sentence in enumerate(sentences):
        percent = int(i / total * 100)
        if percent != last_percent:
            # '\r' returns to the line start so the bar is redrawn, not appended;
            # the bar is 20 characters wide, i.e. one '=' per 5%.
            print("\r[%-20s] %d%%" % ('=' * (percent // 5), percent), end='', flush=True)
            last_percent = percent
        sent_features, sent_labels = get_sentence_features(sentence, num_features, window_size, wv)
        feature_chunks.append(sent_features)
        label_chunks.append(sent_labels)
    if total:
        print("\r[%-20s] 100%%" % ('=' * 20))

    # Stack once at the end: stacking inside the loop re-copies everything
    # accumulated so far on every sentence.
    return np.vstack(feature_chunks), np.concatenate(label_chunks)

def get_sentence_features(sentence, num_features, window_size, wv):
    """Build one context window of token vectors per token of ``sentence``.

    For token ``j`` the window holds the ``window_size // 2`` tokens before
    it, the token itself and the same number after it. Positions that fall
    outside the sentence are filled with the padding token ``('', '', 'O')``.
    The window therefore has ``2 * (window_size // 2) + 1`` entries, which
    equals ``window_size`` only for odd sizes; an even size fails with a
    shape ``ValueError`` for any non-empty sentence.

    Vector chosen for each word in a window:
      * word in ``wv.vocab``  -> ``wv.get_vector(word)``
      * empty string          -> zeros
      * anything else         -> fresh uniform random vector in [-0.25, 0.25)

    Args:
        sentence: sequence of token tuples; item 0 is the word and the last
            item is the label.
        num_features: length of each token vector.
        window_size: number of tokens per window.
        wv: object exposing ``vocab`` (supports ``in``) and ``get_vector``.

    Returns:
        ``(features, labels)``: ``features`` is a float array of shape
        ``(len(sentence), window_size, num_features)`` and ``labels`` is a
        1-D array with the label of each token.
    """
    half = window_size // 2
    span = 2 * half + 1
    pad_token = ('', '', 'O')
    # Pad both ends once so every window is a plain slice. This also covers
    # sentences shorter than the window, which need padding on both sides.
    padded = [pad_token] * half + list(sentence) + [pad_token] * half

    features = np.empty((len(sentence), window_size, num_features))
    for j in range(len(sentence)):
        window_vectors = []
        for token in padded[j:j + span]:
            this_word = token[0]
            if this_word in wv.vocab:
                window_vectors.append(wv.get_vector(this_word))
            elif this_word == '':
                window_vectors.append(np.zeros(num_features))
            else:
                # random vector for unknown words
                window_vectors.append(np.random.uniform(-0.25, 0.25, num_features))
        features[j] = window_vectors

    # Appending to an empty float array keeps the result a float array for an
    # empty sentence and a string array otherwise.
    labels = np.append(np.empty((0)), [token[-1] for token in sentence])
    return features, labels

In [None]:
import pandas as pd
# Tokens per context window, and the per-token vector size (taken from the
# loaded embeddings).
window_size = 7
num_features = embedding_size
X, y = get_features(
    sentences=sents,
    num_features=num_features,
    window_size=window_size,
    wv=wv,
)
# y = np.asarray(pd.get_dummies(label_values), dtype = np.float32)
# X = X.reshape((len(X), window_size, num_features))

Processed 0 of 11376
Processed 100 of 11376
Processed 200 of 11376
Processed 300 of 11376
Processed 400 of 11376
Processed 500 of 11376
Processed 600 of 11376
Processed 700 of 11376
Processed 800 of 11376
Processed 900 of 11376
Processed 1000 of 11376
Processed 1100 of 11376
Processed 1200 of 11376
Processed 1300 of 11376
Processed 1400 of 11376
Processed 1500 of 11376
Processed 1600 of 11376
Processed 1700 of 11376
Processed 1800 of 11376
Processed 1900 of 11376
Processed 2000 of 11376
Processed 2100 of 11376
Processed 2200 of 11376
Processed 2300 of 11376
Processed 2400 of 11376
Processed 2500 of 11376
Processed 2600 of 11376
Processed 2700 of 11376
Processed 2800 of 11376
Processed 2900 of 11376
Processed 3000 of 11376
Processed 3100 of 11376
Processed 3200 of 11376
Processed 3300 of 11376
Processed 3400 of 11376
Processed 3500 of 11376
Processed 3600 of 11376
Processed 3700 of 11376
Processed 3800 of 11376
Processed 3900 of 11376
Processed 4000 of 11376
Processed 4100 of 11376
Proc

In [None]:
import pickle
# Persist the feature and label arrays next to the raw data.
for target_path, array in (
    ('data/preprocessed.eng.X.train', X),
    ('data/preprocessed.eng.y.train', y),
):
    np.save(target_path, array)

In [None]:
from keras.layers.recurrent import LSTM
from keras.models import Sequential, Model
from keras.layers import Dense, Bidirectional, Flatten, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
dropout = 0.05
# Distinct label values seen in the training labels; their count sizes the
# softmax layer built in create_model().
label_set = set(y)
num_classes = len(label_set)
print("Classes:", label_set)

print("num_classes:", num_classes)

def create_model():
    """Build and compile the window classifier.

    Layer stack, in order: bidirectional LSTM over the
    ``(window_size, num_features)`` input with concatenated directions,
    dropout, a tanh dense layer of ``num_features`` units, and a softmax
    dense layer of ``num_classes`` units. Reads the module-level
    ``num_features``, ``window_size``, ``dropout`` and ``num_classes``.

    Returns:
        The compiled ``Sequential`` model (rmsprop, categorical
        cross-entropy, accuracy metric).
    """
    layers = [
        Bidirectional(LSTM(units=num_features), input_shape=(window_size, num_features,), merge_mode='concat'),
        Dropout(dropout),
        Dense(num_features, activation='tanh'),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential()
    for layer in layers:
        network.add(layer)
    network.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    return network

# scikit-learn style wrapper; it is handed create_model as its build function.
model = KerasClassifier(build_fn=create_model)

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
# model.fit(X, y)
# print("Score:", model.score(X, y))
# kfold = KFold(n_splits=5, shuffle=True)  # StratifiedKFold(n_splits=5, shuffle=True)
# results = cross_val_score(model, X, y, cv=kfold)
# print("Score:", results.mean())

## Evaluate Multilingual Capabilities

### Load Target Language

In [None]:
# Vectors for the target language (German). This must not be the English
# file: German tokens looked up in English vectors would mostly be out of
# vocabulary and receive random vectors.
# NOTE(review): assumes the German file follows the same naming scheme as the
# English one ('wiki.multi.<lang>.vec.txt') -- confirm the path exists.
wv_de = KeyedVectors.load_word2vec_format('data/wiki.multi.de.vec.txt', binary=False)

In [None]:
# Column roles for the five-column German file, in file order.
# NOTE(review): the 'srl' / 'ne' / 'chunk' assignment looks chosen so that
# iob_sents() returns the entity tag in its third slot -- confirm against the
# data file before changing.
target_columns = ('words', 'srl', 'pos', 'ne', 'chunk')
corpus_de = ConllCorpusReader('data', fileids=['deu.testa.txt'], columntypes=target_columns)

### Train on All Data

In [None]:
# Fit the wrapped classifier on the complete training arrays (no hold-out).
model.fit(X, y)

### Test on Target Language

In [None]:
# Apply the same token normalisation used for the training sentences.
raw_sents_de = corpus_de.iob_sents()
sents_de = clean_sents(raw_sents_de)
print(sents_de[:10])

In [None]:
# Same window and vector sizes as training, but looked up in the
# target-language vectors.
X_de, y_de = get_features(
    sentences=sents_de,
    num_features=num_features,
    window_size=window_size,
    wv=wv_de,
)

In [None]:
# Compare the label inventories of the two data sets.
english_labels = set(y)
german_labels = set(y_de)
print("English classes:", english_labels)
print("German classes:", german_labels)

In [None]:
# Score the fitted classifier on the German features and labels.
score_de = model.score(X_de, y_de)
print("German Score from English Training:", score_de)

In [None]:
# Spot-check the first ten windows: predictions next to the true labels.
preview = slice(None, 10)
pred_de_10 = model.predict(X_de[preview])
print("Prediction:", pred_de_10)
print("True:", y_de[preview])

## Analyze the Results

In [None]:
import itertools
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Plot a confusion matrix as an annotated heat map using pyplot.

    Args:
        cm: 2-D array of counts, rows = true labels, columns = predictions.
        classes: tick labels, in the same order as the rows/columns of `cm`.
        normalize: when True, each row is divided by its sum so cells show
            per-true-class fractions. Rows that sum to zero are left as
            zeros instead of producing NaN.
        title: title of the plot.
        cmap: colour map passed to `plt.imshow`.

    Returns:
        None; the matrix is drawn through the pyplot state machine.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero; a class with no true
        # samples would otherwise yield 0/0 = NaN cells and a NaN threshold.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Fractions get two decimals, raw counts are shown as integers.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

### English Results

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
# english only results: hold out 15% of the windows, refit, and evaluate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)
model.fit(X_train, y_train, epochs=1, shuffle=True)
y_pred = model.predict(X_test)
# One label list shared by the matrix and the plot so their axes line up.
class_labels = list(set(y))
cnf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
plot_confusion_matrix(
    cnf_matrix,
    classes=class_labels,
    normalize=True,
    title='Normalized confusion matrix',
)

### German Results

In [None]:
# Predict every German window with the most recently fitted model.
y_pred_de = model.predict(X_de)
# One label list shared by the matrix and the plot so their axes line up.
class_labels_de = list(set(y_de))
cnf_matrix = confusion_matrix(y_de, y_pred_de, labels=class_labels_de)
plot_confusion_matrix(
    cnf_matrix,
    classes=class_labels_de,
    normalize=True,
    title='Normalized confusion matrix',
)