# NER Model

In [30]:
from gensim.models import KeyedVectors
from nltk.corpus.reader.conll import ConllCorpusReader
import nltk
import re
from math import floor
import numpy as np
import sys
import pandas as pd
import itertools
import matplotlib.pyplot as plt
from keras.layers.recurrent import LSTM
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Bidirectional, Flatten, Dropout, TimeDistributed
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers.normalization import BatchNormalization
from keras.utils import to_categorical
import keras.backend as K
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [21]:
# Sentence-length limit: passed to clean_sents as the longest sentence to keep
# and to get_padded_sentence_features as the padded length of every sentence.
max_length = 70
# Word-vector size: passed to get_padded_sentence_features and create_model.
num_features = 300

In [32]:
def get_padded_sentence_features(sentences, num_features, max_length, wv):
    """Turn tokenised sentences into one fixed-size array of word vectors.

    Each sentence becomes ``max_length`` rows of ``num_features`` values:

    * a token found in the embedding vocabulary gets ``wv.get_vector(token)``;
    * the empty string ``''`` gets an all-zero row;
    * any other (out-of-vocabulary) token gets a fresh random row drawn
      uniformly from [-0.25, 0.25);
    * positions past the end of the sentence are zero padding, and tokens
      past ``max_length`` are dropped.

    Args:
        sentences: sequence of token sequences.
        num_features: length of every word vector.
        max_length: number of token positions kept per sentence.
        wv: embedding object offering ``get_vector(token)`` and a vocabulary
            mapping, either ``key_to_index`` (gensim >= 4) or ``vocab``.

    Returns:
        Float array of shape ``(len(sentences), max_length, num_features)``.
    """
    # gensim 4 replaced KeyedVectors.vocab with key_to_index; fall back to
    # .vocab so older gensim releases keep working.
    vocab = wv.key_to_index if hasattr(wv, 'key_to_index') else wv.vocab

    total = len(sentences)
    # Allocate the result once; zero rows double as padding and as the
    # representation of '' tokens. Growing the array with np.vstack on every
    # sentence re-copied everything built so far.
    features = np.zeros((total, max_length, num_features))
    for i, sent in enumerate(sentences):
        if i % 100 == 0:
            print("Processed", i, "of", total)
        for j, this_word in enumerate(sent[:max_length]):
            if this_word in vocab:
                features[i, j] = wv.get_vector(this_word)
            elif this_word != '':
                # random vector for unknown
                features[i, j] = np.random.uniform(-0.25, 0.25, num_features)

    return features

In [3]:
nums_regex = re.compile(r'0+')


def _clean_word(word):
    """Lower-case *word*, drop non-alphanumerics, collapse digit runs to '0'."""
    kept = ''.join(
        ch if ch.isalpha() else '0'
        for ch in word.lower()
        if ch.isalpha() or ch.isdigit()
    )
    # Every digit is now '0'; squeeze each run of them to a single '0'.
    return nums_regex.sub('0', kept)


def clean_sents(sents, max_length):
    """Filter sentences by length and normalise their words.

    Sentences with fewer than 5 tokens or more than ``max_length`` tokens are
    dropped. In the sentences that remain, each word is lower-cased, stripped
    of every character that is neither a letter nor a digit, and has each run
    of digits replaced by a single ``'0'`` (so ``'10,000'`` becomes ``'0'``
    and pure punctuation becomes ``''``).

    Tokens may be plain strings or tuples whose first item is the word (for
    example ``(word, pos, tag)``). A string token yields the cleaned string;
    a tuple token yields a tuple with the cleaned word followed by the
    remaining items unchanged. Calling ``.lower()`` on the whole token while
    also reading ``token[1]``/``token[2]`` failed on tuples and mangled
    strings, so the two token kinds are handled separately.

    Args:
        sents: iterable of sentences, each a sequence of tokens.
        max_length: longest sentence (in tokens) to keep.

    Returns:
        List of cleaned sentences, in input order.
    """
    cleaned = []
    for sent in sents:
        if not 4 < len(sent) <= max_length:
            continue
        new_sent = []
        for token in sent:
            if isinstance(token, str):
                new_sent.append(_clean_word(token))
            else:
                new_sent.append((_clean_word(token[0]), *token[1:]))
        cleaned.append(new_sent)
    return cleaned

In [7]:
def arr2label(cats, labels):
    """Map per-token score vectors to label names.

    For every score vector in every sentence of ``cats`` the index of the
    largest score is looked up in ``labels``. The result is a single flat
    list covering all sentences in order (it is not grouped per sentence).
    """
    return [
        labels[np.argmax(scores)]
        for sentence in cats
        for scores in sentence
    ]


## Load the Data

In [27]:
# Load word vectors stored in word2vec *text* format (binary=False).
# Presumably the English embeddings, judging by the file name - confirm.
wv = KeyedVectors.load_word2vec_format('data/wiki.multi.en.vec.txt', binary=False)

In [28]:
# Second set of word vectors in word2vec text format.
# Presumably the Italian embeddings, judging by the file name - confirm.
# NOTE(review): wv_it is not used by any code visible in this notebook section.
wv_it = KeyedVectors.load_word2vec_format('data/wiki.multi.it.vec.txt', binary=False)

In [10]:
# Tag inventory. The list position of a tag is its integer id; arr2label uses
# the same list to turn an argmax index back into a tag name.
# NOTE(review): the order presumably has to match the encoding used to build
# the training labels - confirm.
classes = ['B-MISC', 'I-MISC', 'I-LOC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'B-ORG', 'O']
num_classes = len(classes)
encoded_classes = range(num_classes)
# tag name -> integer id
class2idx = {label: idx for idx, label in enumerate(classes)}

## Load the Model

In [23]:
# Module-level hyperparameters read by create_model.
dropout = 0.1  # input dropout of the LSTM layer
recurrent_dropout = 0.3  # recurrent-state dropout of the LSTM layer
# NOTE(review): create_model has its own hidden_nodes parameter (default 100),
# so this global is not what the function reads.
hidden_nodes = 100
# Number of time steps in the model's input_shape; same value as max_length.
window_size = 70

def create_model(num_classes, num_features, hidden_nodes=100):
    """Build and compile the bidirectional-LSTM + CRF tagging model.

    Layers, in order: a bidirectional LSTM over inputs of shape
    ``(window_size, num_features)`` returning the full sequence with the two
    directions concatenated, a per-time-step dense ReLU layer, and a CRF
    output layer. The module-level ``dropout``, ``recurrent_dropout`` and
    ``window_size`` values are read when the function is called.

    Args:
        num_classes: number of output tags (size of the CRF layer).
        num_features: length of each input word vector; also used as the
            number of LSTM units per direction.
        hidden_nodes: width of the time-distributed dense layer.

    Returns:
        The model, compiled with the ``rmsprop`` optimizer and the CRF
        layer's own loss function and accuracy metric.
    """
    model = Sequential()
    model.add(Bidirectional(
        LSTM(units=num_features, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout),
        input_shape=(window_size, num_features,),
        merge_mode='concat'))
    # Disabled alternatives kept for reference:
#     model.add(Dropout(dropout))
#     model.add(BatchNormalization())
    model.add(TimeDistributed(Dense(hidden_nodes, activation='relu')))
    # add a CRF layer to enforce NER IOB rules
    crf = CRF(num_classes, sparse_target=False)
    model.add(crf)
    # summary() prints the table itself and returns None, so wrapping it in
    # print() emitted a stray "Summary: None" line after the table.
    print("Summary:")
    model.summary()
    model.compile(optimizer='rmsprop', loss=crf.loss_function, metrics=[crf.accuracy])
    # Disabled alternative: plain softmax output instead of the CRF layer.
#     model.add(Dense(num_classes, activation='softmax'))
#     model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Open the saved training arrays memory-mapped and read-only (mmap_mode='r')
# instead of reading them fully into memory up front.
# NOTE(review): presumably X is (sentences, window_size, num_features) and y
# is one-hot per token (the CRF is built with sparse_target=False) - confirm.
X_train = np.load('data/full.X.train.npy', mmap_mode='r')
y_train = np.load('data/full.y.train.npy', mmap_mode='r')
# Build a fresh model and train it for 10 epochs in batches of 50.
model = create_model(num_classes, num_features)
model.fit(X_train, y_train, batch_size=50, epochs=10)
# Disabled alternative to training: load previously saved weights.
# save_load_utils.load_all_weights(model,'models/full_train.h5', include_optimizer=False)

## Test the Model

In [55]:
# English test sentence; the trailing comment holds an alternative Spanish
# sentence that can be swapped in.
test_sent1 = 'The United States of America, often called USA, is the country where we live.' # 'La multinacional española Telefónica ha impuesto un récord mundial al poner en servicio tres millones de nuevas líneas en el estado brasileño de Sao Paulo desde que asumió el control de la operadora Telesp hace 20 meses, anunció hoy el presidente de Telefónica do Brasil, Fernando Xavier Ferreira.'
# Italian test sentence and its English rendering.
# NOTE(review): test_sent2 and test_sent3 are not used by the cells visible here.
test_sent2 = 'Sai la semplicità di Messer Nicia, che benché sia dottore, egli è el più semplice e il più sciocco uomo di Firenze'
test_sent3 = 'You know the simplicity of Messer Nicia, that although he is a doctor, he is the simplest and the silliest man in Florence'

In [56]:
# Keep the raw tokens in test1: the following cells use len(test1) to trim the
# padded predictions and pair test1[i] with labels1[i]. clean_sents returns a
# batch (list of sentences), so its result goes into a separate variable
# instead of overwriting the token list.
test1 = nltk.word_tokenize(test_sent1)  # tokenize the sentence
cleaned1 = clean_sents([test1], max_length)  # one-sentence batch, normalised
test_X1 = get_padded_sentence_features(cleaned1, num_features, max_length, wv)  # process the sentence

Processed 0 of 1


In [57]:
pred1 = model.predict(test_X1)  # predict
# arr2label returns one flat list of tag names for every position of every
# sentence in the batch, padding positions included.
labels1 = arr2label(pred1, classes)  # get labels
# NOTE(review): this slice only trims the padding if test1 holds the tokens of
# the single sentence that was predicted - confirm what test1 contains here.
labels1 = labels1[:len(test1)]  # remove padding
print(labels1)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [58]:
# Pair each entry of test1 with the label predicted at the same position.
tags = [(token, labels1[pos]) for pos, token in enumerate(test1)]

In [59]:
# Display the (test1 entry, predicted label) pairs.
print(tags)

[('The', 'O'), ('United', 'O'), ('States', 'O'), ('of', 'O'), ('America', 'O'), (',', 'O'), ('often', 'O'), ('called', 'O'), ('USA', 'O'), (',', 'O'), ('is', 'O'), ('the', 'O'), ('country', 'O'), ('where', 'O'), ('we', 'O'), ('live', 'O'), ('.', 'O')]
