# Named Entity Recognition - demo

Let's start by importing all the important classes

In [None]:
import numpy as np
from keras import Input, Model
from keras.layers import Embedding, Dropout, TimeDistributed, Bidirectional, \
    LSTM, concatenate, Dense
from keras_contrib.layers import CRF
from keras.utils import to_categorical
from nlp_architect.data.sequential_tagging import SequentialTaggingDataset
from nlp_architect.utils.embedding import load_word_embeddings
from nlp_architect.utils.metrics import get_conll_scores

## Preparing the data

Load the dataset using the `SequentialTaggingDataset` data loader.
The files should be tagged in `BIO` format, and each token should appear on a separate line with its tags separated by tabs. For example: `A B-ENTITY`. Sentences should be separated by an empty line.

In [None]:
# Placeholders: point these at your own BIO-tagged train and test files.
train = '<path to train file>'
test = '<path to test file>'

# Upper bounds on tokens per sentence and characters per word. They are passed
# to the data loader here and reused as the model input shapes further down.
sentence_length = 50
word_length = 12

# NOTE(review): tag_field_no presumably selects which tab-separated column of
# each line holds the tag -- confirm that 4 matches the layout of your files.
dataset = SequentialTaggingDataset(train, test,
                                   max_sentence_length=sentence_length,
                                   max_word_length=word_length,
                                   tag_field_no=4)

Get the train and test sets - we have 2 inputs and 1 output (words and chars as inputs, and the entity type as output).

In [None]:
# Each split unpacks into three parts: word input, character input and tag output.
(x_train, x_char_train, y_train), (x_test, x_char_test, y_test) = (
    dataset.train,
    dataset.test,
)

Convert output matrices into 1-hot encoding

In [None]:
# Number of output classes: the size of the label vocabulary plus one
# (presumably an extra slot for the padding index -- TODO confirm).
num_y_labels = 1 + len(dataset.y_labels)

# One-hot encode the tag matrices of both splits with the same class count.
y_train, y_test = (to_categorical(labels, num_y_labels)
                   for labels in (y_train, y_test))

## Loading external word embedding model

In [None]:
embedding_path = '/<path to glove.6B>/glove.6B.100d.txt'

# load_word_embeddings returns the lookup used below (word -> vector) together
# with the vector size. Take `embedding_size` from the loaded file instead of
# hard-coding it, so the Embedding layer built later always matches the matrix
# even when a different embedding file (e.g. 300d) is used.
external_emb, emb_size = load_word_embeddings(embedding_path)
embedding_size = emb_size

# One row per vocabulary entry. Rows of words that are not found in the
# external embeddings are left as all-zeros.
embedding_matrix = np.zeros((dataset.word_vocab_size, emb_size))
for word, i in dataset.word_vocab.items():
    # The external embeddings are queried with the lower-cased word.
    embedding_vector = external_emb.get(word.lower())
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
embedding_matrix.shape

## Creating the model
The NER model we're going to build is depicted below:

![image.png](attachment:image.png)

We have 2 input sources (words and word characters), a bi-directional LSTM layer and a CRF layer for token classification.


In [None]:
# Model hyper-parameters. Vocabulary sizes and the label count come from the
# dataset object; `sentence_length`, `word_length`, `embedding_size` and
# `embedding_matrix` are defined in earlier cells of the notebook.
word_vocab_size = dataset.word_vocab_size
char_vocab_size = dataset.char_vocab_size
num_y_labels = len(dataset.y_labels) + 1
char_embedding_dims = 25  # size of each character embedding vector
word_lstm_dims = 25  # units of the character-level LSTM (per direction)
tagger_lstm_dims = 100  # units of the sentence-level LSTM (per direction)

# build word input: one entry per token position in the sentence
words_input = Input(shape=(sentence_length,), name='words_input')

# word embedding layer initialised from the pre-built embedding_matrix and
# frozen (trainable=False), so these weights are not updated during training
embedding_layer = Embedding(word_vocab_size,
                            embedding_size,
                            weights=[embedding_matrix],
                            input_length=sentence_length,
                            trainable=False)

word_embeddings = embedding_layer(words_input)
word_embeddings = Dropout(0.5)(word_embeddings)

# create word character embeddings: the input holds `word_length` characters
# for each of the `sentence_length` token positions
word_chars_input = Input(shape=(sentence_length, word_length), name='word_chars_input')
char_embedding_layer = Embedding(char_vocab_size, char_embedding_dims,
                                 input_length=word_length)
# TimeDistributed applies the wrapped layer to every token position, so each
# word's characters are embedded and then summarised by a bi-directional LSTM
# (no return_sequences, i.e. a single vector per word)
char_embeddings = TimeDistributed(char_embedding_layer)(word_chars_input)
char_embeddings = TimeDistributed(Bidirectional(LSTM(word_lstm_dims)))(char_embeddings)
char_embeddings = Dropout(0.5)(char_embeddings)

# create the final feature vectors by joining the word-level and
# character-level representations along the last (feature) axis
features = concatenate([word_embeddings, char_embeddings], axis=-1)

# encode using a bi-lstm; return_sequences=True keeps one output per token
bilstm = Bidirectional(LSTM(tagger_lstm_dims, return_sequences=True))(features)
bilstm = Dropout(0.5)(bilstm)

# classify the dense vectors with a CRF layer over `num_y_labels` classes;
# sparse_target=False because the targets are one-hot encoded (see the
# to_categorical conversion earlier in the notebook)
crf = CRF(num_y_labels, sparse_target=False)
predictions = crf(bilstm)

# compile the model; the loss and accuracy metric are the ones provided by
# the CRF layer itself
model = Model(inputs=[words_input, word_chars_input], outputs=predictions)
model.compile(loss=crf.loss_function,
              optimizer='adam',
              metrics=[crf.accuracy])

## Training
Set the batch size and the number of epochs, then fit the network on the training data.

In [None]:
# b: mini-batch size (reused for prediction in the evaluation cell)
# e: number of passes over the training data
b, e = 32, 1

# Train on both inputs (word ids and per-word character ids) against the
# one-hot encoded tags.
model.fit(
    x=[x_train, x_char_train],
    y=y_train,
    batch_size=b,
    epochs=e,
)

## Evaluation
Once the model has been trained, run CONLLEVAL to see how well it performs.

In [None]:
# Run the trained model on the test inputs, reusing the training batch size.
predictions = model.predict([x_test, x_char_test], batch_size=b)

# get_conll_scores receives the inverse of dataset.y_labels (keys and values
# swapped) -- presumably an id -> label mapping; verify against the dataset.
id_to_label = {v: k for k, v in dataset.y_labels.items()}

# Stored as `eval_scores` rather than `eval` so that the builtin eval() is not
# shadowed for the rest of the session.
eval_scores = get_conll_scores(predictions, y_test, id_to_label)

# The first element of the result holds precision, recall and F1, in that order.
print('Precision {}'.format(eval_scores[0][0]))
print('Recall {}'.format(eval_scores[0][1]))
print('F1 {}'.format(eval_scores[0][2]))