# Named Entity Recognition using LSTMs

### 1. Preparing the dataset

In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("data/ner_dataset.csv", encoding="latin1")
data = data.fillna(method="ffill")

In [3]:
print(data.head())

      Sentence           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1  Sentence: 1             of   IN   O
2  Sentence: 1  demonstrators  NNS   O
3  Sentence: 1           have  VBP   O
4  Sentence: 1        marched  VBN   O


In [4]:
words = list(set(data["Word"].values))

words.append("UNKNOWN")
words.append("ENDPAD")

n_words = len(words)
print(n_words)

35167


In [5]:
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags
print(n_tags)

17


(This is a helper class I found online)

In [6]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [7]:
getter = SentenceGetter(data)

In [8]:
sent = getter.get_next()

In [9]:
sentences = getter.sentences

In [10]:
max_len = 35
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [11]:
word2idx["sup"]

21370

In [12]:
from keras.preprocessing.sequence import pad_sequences
x_data = [[word2idx[w[0]] for w in s] for s in sentences]
x_data = pad_sequences(maxlen=max_len, sequences=x_data, padding="post", value=n_words - 1)

ModuleNotFoundError: No module named 'keras'

In [12]:
y_data = [[tag2idx[w[2]] for w in s] for s in sentences]
y_data = pad_sequences(maxlen=max_len, sequences=y_data, padding="post", value=tag2idx["O"])

In [13]:
print(x_data[9])
print(y_data[9])

[26414  1511 20273 15846 16937 18460 10160 12786 34276 27400 30633 18531
 31059 10960  5156 33694 14898  8133 11095  7673 15187 34276 24692 28330
 18788  2452  8203 35166 35166 35166 35166 35166 35166 35166 35166]
[ 9 11 11 11 15 11 11 11 11 11 11 11 11  3 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11 11 11 11 11 11 11]


In [14]:
from keras.utils import to_categorical
y_data = [to_categorical(i, num_classes=n_tags) for i in y_data]

In [15]:
print(x_data[1])
print(y_data[1])

[17231 23301 24901 19269 13616  8133 14443 14771  8133 35153 33525   941
 31059 24692 14479 33461 30544 17380 33694 23150 33331 17258   393 12037
  8203 35166 35166 35166 35166 35166 35166 35166 35166 35166 35166]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0

So, for every sentence, we have an integer-indexed matrix (which will be the input for the embedding layer). For each word, we have a corresponding one-hot-encoded class.

In [16]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2)

### 2. Building the model

In [29]:
from keras.models import Model, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional,CuDNNLSTM

model = Sequential()
model.add(Embedding(input_dim=n_words, output_dim=100, input_length=max_len))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

In [30]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 35, 100)           3516700   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 35, 200)           160800    
_________________________________________________________________
time_distributed_5 (TimeDist (None, 35, 17)            3417      
Total params: 3,680,917
Trainable params: 3,680,917
Non-trainable params: 0
_________________________________________________________________
None


### 3. Training 

In [31]:
model.fit(x_train, np.array(y_train), batch_size=30, epochs=10, validation_split=0.1, verbose=2)

Train on 34530 samples, validate on 3837 samples
Epoch 1/10
 - 202s - loss: 0.1781 - acc: 0.9546 - val_loss: 0.0822 - val_acc: 0.9763
Epoch 2/10
 - 198s - loss: 0.0714 - acc: 0.9792 - val_loss: 0.0717 - val_acc: 0.9794
Epoch 3/10
 - 194s - loss: 0.0594 - acc: 0.9826 - val_loss: 0.0674 - val_acc: 0.9804
Epoch 4/10
 - 195s - loss: 0.0533 - acc: 0.9842 - val_loss: 0.0646 - val_acc: 0.9809
Epoch 5/10
 - 194s - loss: 0.0492 - acc: 0.9853 - val_loss: 0.0638 - val_acc: 0.9811
Epoch 6/10
 - 194s - loss: 0.0454 - acc: 0.9864 - val_loss: 0.0638 - val_acc: 0.9811
Epoch 7/10
 - 221s - loss: 0.0420 - acc: 0.9874 - val_loss: 0.0695 - val_acc: 0.9792
Epoch 8/10
 - 215s - loss: 0.0385 - acc: 0.9884 - val_loss: 0.0664 - val_acc: 0.9804
Epoch 9/10
 - 211s - loss: 0.0356 - acc: 0.9892 - val_loss: 0.0691 - val_acc: 0.9800
Epoch 10/10
 - 219s - loss: 0.0326 - acc: 0.9901 - val_loss: 0.0708 - val_acc: 0.9795


<keras.callbacks.History at 0x29f61de44e0>

In [34]:
score,acc = model.evaluate(x_test, np.array(y_test), verbose = 2, batch_size = 15)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.07
acc: 0.98


### 4. Testing new inputs from user

In [37]:
import nltk

def get_matrix_ids(s):
    id_matrix = []
    w = nltk.word_tokenize(s)
    for i in w:
        if i in words:
            id_matrix.append(word2idx[i])
        else :
            id_matrix.append(word2idx["UNKNOWN"]) #Unknown token
    return id_matrix

In [91]:
def Tag_Sentence(s):
    tokens = nltk.word_tokenize(s)
    matrix_id = get_matrix_ids(s)
    matrix_id = pad_sequences(maxlen=max_len, sequences=[matrix_id], padding="post", value=n_words - 1)
    tags = []
    output = model.predict(matrix_id)
    for i in output[0]:
        indVal = np.argmax(i)
        tags.append([key for key, value in tag2idx.items() if value == indVal][0])
    for i in range(len(tokens)):
        print(tokens[i] + "\t" + "\t" + tags[i])

In [93]:
sentence = input()
Tag_Sentence(sentence)

I know a boy named Mayank, who lives in India.
I		O
know		O
a		O
boy		O
named		O
Mayank		B-per
,		O
who		O
lives		O
in		O
India		B-geo
.		O
