# NER Model

In [5]:
from gensim.models import KeyedVectors
from nltk.corpus.reader.conll import ConllCorpusReader
import re
from math import floor
import numpy as np
import sys
import pandas as pd
import itertools
import matplotlib.pyplot as plt
from keras.layers.recurrent import LSTM
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Bidirectional, Flatten, Dropout, TimeDistributed
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers.normalization import BatchNormalization
from keras.utils import to_categorical
import keras.backend as K
from keras_contrib.layers import CRF
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [6]:
# Sentence length, in tokens (presumably passed as the `max_length` argument
# of the helper functions in this notebook).
max_length = 70
# Length of a single word vector.
num_features = 300
# Original (misspelled) name, kept as an alias so existing references still work.
num_featuers = num_features

In [2]:
def get_padded_sentence_features(sentences, num_features, max_length, wv):
    """Convert sentences into a zero-padded array of word vectors.

    Args:
        sentences: sequence of sentences, each an indexable sequence of
            word strings.
        num_features: length of every word vector.
        max_length: number of word slots per sentence. Longer sentences are
            truncated; shorter ones are padded with zero vectors.
        wv: embedding lookup exposing ``get_vector(word)`` and a vocabulary
            mapping named ``key_to_index`` (gensim >= 4) or ``vocab``
            (older gensim).

    Returns:
        A float array of shape ``(len(sentences), max_length, num_features)``.
        Words found in the vocabulary get their vector, the empty string
        gets zeros, and any other word gets a fresh random vector drawn
        uniformly from [-0.25, 0.25).
    """
    # gensim 4 renamed ``vocab`` to ``key_to_index``; accept either one.
    vocab = getattr(wv, "key_to_index", None)
    if vocab is None:
        vocab = wv.vocab

    total = len(sentences)
    # Allocate the result once, already zero-filled: padding slots and empty
    # words need no further work, and this avoids re-copying the whole array
    # for every sentence.
    features = np.zeros((total, max_length, num_features))
    for i, sent in enumerate(sentences):
        if i % 100 == 0:
            print("Processed", i, "of", total)
        for j in range(min(len(sent), max_length)):
            this_word = sent[j]
            if this_word in vocab:
                features[i, j] = wv.get_vector(this_word)
            elif this_word != '':
                # random vector for unknown
                features[i, j] = np.random.uniform(-0.25, 0.25, num_features)

    return features

In [3]:
# Collapses a run of '0' placeholders into a single '0'.
nums_regex = re.compile(r'0+')


def clean_sents(sents, max_length):
    """Filter sentences by length and normalise the text of every token.

    Args:
        sents: iterable of sentences; each sentence is a sequence of tokens
            and each token is an indexable triple whose first item is the
            word text. The other two items are passed through untouched.
        max_length: longest sentence (in tokens) that is kept.

    Returns:
        A list of sentences, each a list of ``(clean_word, token[1], token[2])``
        tuples. Sentences with fewer than 5 or more than ``max_length`` tokens
        are dropped. The word text is lower-cased, every digit becomes '0',
        every other non-letter character is removed, and runs of '0' are
        collapsed to a single '0'. A word may therefore clean down to ''.
    """
    cleaned = []
    for sent in sents:
        # Keep only sentences of 5..max_length tokens.
        if not 4 < len(sent) <= max_length:
            continue
        new_sent = []
        for word in sent:
            # Only the text part of the triple is normalised; calling
            # .lower() on the tuple itself would raise AttributeError.
            text = word[0].lower()
            new_word = ''.join(
                char if char.isalpha() else '0'
                for char in text
                if char.isalpha() or char.isdigit()
            )
            new_word = nums_regex.sub('0', new_word)
            new_sent.append((new_word, word[1], word[2]))
        cleaned.append(new_sent)
    return cleaned

In [None]:
def arr2label(cats, labels):
    """Map per-token score vectors to label names.

    Args:
        cats: nested sequence indexed as ``cats[sentence][token]``, where each
            innermost item is a vector of per-class scores.
        labels: sequence of label names, indexed by class position.

    Returns:
        One flat list holding the label with the highest score for every
        token, with all sentences concatenated in order (sentence boundaries
        are not preserved).
    """
    return [
        labels[np.argmax(token_scores)]
        for sent_scores in cats
        for token_scores in sent_scores
    ]


## Load the Data

In [None]:
# Word vectors read from a word2vec-format text file
# (presumably the English embeddings, going by the file name).
wv = KeyedVectors.load_word2vec_format(
    'data/wiki.multi.en.vec.txt',
    binary=False,
)

In [None]:
# Word vectors read from a word2vec-format text file
# (presumably the Italian embeddings, going by the file name).
wv_it = KeyedVectors.load_word2vec_format(
    'data/wiki.multi.it.vec.txt',
    binary=False,
)

In [None]:
# Open the saved arrays as read-only memory maps rather than reading them
# fully into memory.
X_en_train = np.load(
    'data/eng.X.train.npy',
    mmap_mode='r',
)
y_en_train_label = np.load(
    'data/eng.y.train.npy',
    mmap_mode='r',
)

In [None]:
# Label inventory; a label's position in this list is its integer class id.
classes = ['B-MISC', 'I-MISC', 'I-LOC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'B-ORG', 'O']
# Derived from the list so the count always matches the labels.
num_classes = len(classes)
encoded_classes = range(num_classes)
# Reverse lookup: label name -> integer class id.
class2idx = {label: idx for idx, label in enumerate(classes)}

## Load the Model

In [None]:
# Restore the saved model from disk; the CRF layer class is supplied through
# custom_objects so that the loader can resolve it by name.
model = load_model(
    'models/full_train.h5',
    custom_objects={'CRF': CRF},
)

## Test the Model

In [None]:
# Sample sentences for trying out the model: an English one, an Italian one,
# and an English version of the Italian one.
test_sent1 = (
    'The model that Steven and Tony created had never seen a sentence such as this!'
)
test_sent2 = (
    'Sai la semplicità di Messer Nicia, che benché sia dottore, egli è el più semplice e il più sciocco uomo di Firenze'
)
test_sent2_eng = (
    'You know the simplicity of Messer Nicia, that although he is a doctor, he is the simplest and the silliest man in Florence'
)