# Named entity recognition

In [2]:
import pandas as pd
from spacy import displacy
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential, Input
from keras.layers import Bidirectional, CuDNNLSTM, Embedding, Dropout, Dense,TimeDistributed, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
# Please download ner_dataset.csv from https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus
# The file is read from the current working directory, as a comma-separated
# file decoded as latin-1.
df = pd.read_csv("ner_dataset.csv",sep=",",encoding="latin-1")

In [4]:
# Preview the first rows to check the column layout (one word per row).
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


Sentences are stacked vertically, one word per row, and only the first row of each sentence has its `Sentence #` value filled in. We therefore have to fill each of the NaN rows with the identifier of the sentence it belongs to.

### Data preparation

In [5]:
# Sentence attribution: only the first row of each sentence carries its
# identifier, so propagate the last seen identifier down to the NaN rows.
# Series.ffill() is used instead of fillna(method="ffill"), which recent
# pandas versions deprecate.
df["Sentence #"] = df["Sentence #"].ffill()

In [6]:
# Collect, for every sentence, its list of words and the matching list of tags.
def agg_func(x):
    """Return the (words, tags) pair of lists for one sentence group."""
    words = x['Word'].values.tolist()
    labels = x['Tag'].values.tolist()
    return (words, labels)

sequences_tags = df.groupby("Sentence #").apply(agg_func)

In [7]:
# Split the (words, tags) pairs into two parallel lists.
sentences = [words for words, _ in sequences_tags]
tags = [labels for _, labels in sequences_tags]

### Tokenization

##### Sentences tokenization

In [8]:
# We need to keep every character of the sentences, so no character filter
# is applied (filters='').
# Note: the decoded output further down shows lower-cased tokens, i.e. the
# tokenizer lower-cases the words.
tokenizer_sent = Tokenizer(filters='')
tokenizer_sent.fit_on_texts(sentences)
# index -> word mapping; this is the tokenizer's own dict, not a copy.
index_word = tokenizer_sent.index_word

In [9]:
# Register the padding token: index 0 is the value used to pad the sequences.
index_word[0] = "ENDPAD"

In [10]:
# Number of known word indices, including the ENDPAD entry at index 0;
# used as the input_dim of the Embedding layer.
vocab_size = len(index_word)

In [11]:
# Convert every sentence into its sequence of word indices.
sent_seq = tokenizer_sent.texts_to_sequences(sentences)

##### Tags tokenization

In [12]:
# We need to keep every character of the tags, so no character filter is
# applied (filters='').
# Note: the decoded tags come out lower-cased (e.g. 'o', 'b-per'); the
# rendering code further down relies on that.
tokenizer_tags = Tokenizer(filters='')
tokenizer_tags.fit_on_texts(tags)
# index -> tag mapping; this is the tokenizer's own dict, not a copy.
index_tags = tokenizer_tags.index_word

In [13]:
# Register the padding tag: index 0 is the value used to pad the tag sequences.
index_tags[0] = "ENDPAD"

In [14]:
# Number of tag classes, including the ENDPAD entry at index 0;
# used as the width of the final softmax layer.
tags_size = len(index_tags)

In [15]:
# Convert every tag list into its sequence of tag indices.
tags_seq = tokenizer_tags.texts_to_sequences(tags)

### Padding

In [16]:
# Longest sentence and longest tag sequence; both serve as padding lengths.
max_len_seq = max(map(len, sent_seq))
max_len_tag = max(map(len, tags_seq))

In [17]:
# Pad at the end ('post') with 0, the ENDPAD index, so that every sequence
# has the same length.
X = pad_sequences(sent_seq,padding='post',value=0, maxlen=max_len_seq)
y = pad_sequences(tags_seq,padding='post',value=0, maxlen=max_len_tag)

### Modeling

In [18]:
# Labels are transformed into one-hot encoded vectors, one per time step
# (e.g. with 7 classes: [[2,6,0]] -> [[[0,0,1,0,0,0,0],[0,0,0,0,0,0,1],[1,0,0,0,0,0,0]]]).
# NOTE: this overwrites y, so re-running this cell on its own would one-hot
# encode the labels a second time; re-run the padding cell first.
y = to_categorical(y)

In [19]:
# Hold out 33% of the sentences for testing; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [20]:
# Sequence-labelling network: embedding -> dropout -> BiLSTM -> per-token softmax.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=max_len_seq),
    Dropout(0.3),
    Bidirectional(CuDNNLSTM(25, return_sequences=True)),
    TimeDistributed(Dense(tags_size, activation='softmax')),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 104, 100)          3181800   
_________________________________________________________________
dropout_1 (Dropout)          (None, 104, 100)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 104, 50)           25400     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 104, 18)           918       
Total params: 3,208,118
Trainable params: 3,208,118
Non-trainable params: 0
_________________________________________________________________


Layers :
     - Embedding, I embed each sentence (M words, max length = 104) into vectors of dimension D = 100.
     
     - Dropout, used to reduce overfitting (0.3 is the dropout rate): at every training step, 30% of the connections between the embedding layer and the LSTM are randomly turned off -> this forces the network to learn redundant paths instead of relying on a few connections.
     
     - LSTM, I used an LSTM layer since it is well suited to sequence tasks and can 'remember' the context words. The parameter return_sequences=True makes the output shape (None, M, N) instead of (None, N), where N is the number of output units (here 2 x 25 = 50, since we used a Bidirectional layer). We need the LSTM output at every time step (M of them), because we predict a named-entity tag for each one.
     
     - CuDNNLSTM, an LSTM layer optimized for GPU computing (cuDNN): it is very fast, but it requires a GPU to run.
     
     - Bidirectional, useful for sequence tasks: the sequence is read in both directions, so each word gets context from the words before it and from the words after it.
     
     - Dense, final layer is a fully connected layer with a softmax activation and the number of tags classes to output.
     
     - TimeDistributed, since we want to predict a named-entity tag for every word of the sequence, the Dense layer is applied to each time step through the TimeDistributed wrapper.

In [21]:
# categorical_crossentropy matches the one-hot encoded labels.
# NOTE(review): no masking is configured on the Embedding layer, so padded
# positions presumably count toward the loss and the accuracy - confirm.
model.compile(optimizer='adam',loss="categorical_crossentropy",metrics=["accuracy"])

In [22]:
# Train for 5 epochs; 20% of the training sentences are held out for validation.
model.fit(X_train, y_train, batch_size=128, epochs= 5, validation_split= 0.2)

Train on 25705 samples, validate on 6427 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x11a4997bfd0>

In [23]:
# Per-position class probabilities for every test sentence.
y_pred = model.predict(X_test)

In [24]:
# Keep the most probable tag index at every position (axis 2 is the class axis).
pred =  np.argmax(y_pred,axis=2)

In [38]:
def print_decode(seq_words,seq_tag,index_words,index_tags):
    """Print every decoded word next to its decoded tag, one pair per line.

    seq_words and seq_tag are aligned sequences of indices; index_words and
    index_tags map those indices back to their text. Printing stops at the
    first word that decodes to "ENDPAD". Words are left-aligned in a column
    30 characters wide.
    """
    for word_id, tag_id in zip(seq_words, seq_tag):
        word, tag = index_words[word_id], index_tags[tag_id]
        if word == "ENDPAD":
            return
        print("{:30} {}".format(word, tag))

In [40]:
# Show the predicted tags of the third test sentence, word by word.
print_decode(X_test[2],pred[2],index_word,index_tags)

the                            o
pact                           o
was                            o
initially                      o
approved                       o
after                          o
discussions                    o
between                        o
president                      b-per
bush                           i-per
and                            o
peruvian                       b-gpe
president                      b-per
alan                           i-per
garcia                         i-per
,                              o
but                            o
democrats                      o
in                             o
congress                       b-org
forced                         o
u.s.                           b-geo
officials                      o
to                             o
reopen                         o
negotiations                   o
and                            o
add                            o
stronger                       o
labor      

In [163]:
# Upper-cased entity labels that the displaCy rendering is restricted to.
entities = [
    'B-GEO', 'B-TIM', 'B-ORG', 'I-PER',
    'B-PER', 'I-ORG', 'B-GPE', 'I-GEO',
    'I-TIM', 'B-ART', 'B-EVE', 'I-ART',
    'I-EVE', 'B-NAT', 'I-GPE', 'I-NAT',
]

In [164]:
# One highlight colour per entry of `entities`, in the same order.
colors = [
    "#e6fc9c", "#fcb09c", "#73534a", "#aa9cfc",
    "#aa9cfc", "#8e15a1", "#15a135", "#a19f15",
    "#16dfe2", "#162fe2", "#ffa46d", "#b4ff6d",
    "#b86dff", "#ff6d6d", "#d3ff6d", "#ffce6d",
]

In [171]:
# displaCy options: the labels to show and the colour assigned to each label.
options = {
    "ents": entities,
    "colors": {label: color for label, color in zip(entities, colors)},
}

In [173]:
def render_NER(seq_words, seq_tag, index_words, index_tags, options = options):
    """Render one tagged sentence with displaCy's entity visualizer.

    Parameters
    ----------
    seq_words : sequence of word indices (may be padded).
    seq_tag : sequence of tag indices aligned with ``seq_words``.
    index_words : mapping from word index to word; padding decodes to "ENDPAD".
    index_tags : mapping from tag index to (lower-cased) tag.
    options : displaCy options dict, forwarded to ``displacy.render``.

    Decoding stops at the first word equal to "ENDPAD". Words are joined with
    single spaces, and every word whose tag is neither 'o' nor the padding tag
    becomes its own entity span, labelled with the upper-cased tag.
    The result is displayed in the notebook; nothing is returned.
    """
    words = []
    ents = []
    offset = 0  # character position where the next word starts in the text
    for idx_w, idx_t in zip(seq_words, seq_tag):
        w = index_words[idx_w]
        if w == "ENDPAD":
            break
        t = index_tags[idx_t]

        start = offset
        # The span ends right after the word itself: the separating space
        # must not be part of the highlighted entity.
        end = start + len(w)
        # 'o' marks a non-entity; "ENDPAD" is the padding class and is not an
        # entity either, even if the model predicts it for a real word.
        if t != 'o' and t != "ENDPAD":
            ents.append({'start': start, 'end': end, 'label': t.upper()})

        words.append(w)
        offset = end + 1  # skip the single space added by the join below

    res_dict = {"text": " ".join(words), "ents": ents, "title": None}
    displacy.render(res_dict, style="ent", manual = True, jupyter = True, options=options)

In [174]:
# Highlight the predicted entities of the third test sentence.
render_NER(X_test[2],pred[2],index_word,index_tags,options=options)

In [180]:
# Same rendering for another test sentence (index 10).
render_NER(X_test[10],pred[10],index_word,index_tags,options=options)