## import necessary items

In [1]:
import os 
import numpy as np
import pandas as pd
import random as rnd

## Loading and Preprocessing Dataset


In [2]:
data = pd.read_csv('../input/ner-dataset/ner_datasetreference.csv', encoding= 'unicode_escape')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [3]:
data['Sentence #']=data['Sentence #'].ffill(axis = 0) 
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [5]:
data_grouped = data.groupby(["Sentence #"])[['Word', 'POS', 'Tag']].agg(lambda x: list(x))
data_grouped.head()

Unnamed: 0_level_0,Word,POS,Tag
Sentence #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O..."
Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]"
Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo..."


In [6]:
sentences_list = data_grouped.Word.to_list()
ner_tags_list = data_grouped.Tag.to_list()
pos_tags_list = data_grouped.POS.to_list()

In [7]:
max_len = max([len(s) for s in sentences_list])
print("Maximum length: ", max_len)

words=list(set(data['Word'].values))
words.sort()
vocab={word: i for i,word in enumerate(words)}
vocab["<PAD>"] = len(vocab)
vocab["<UNK>"] = len(vocab)
reverse_vocab = {v: k for k, v in vocab.items()}

tags=list(set(data['Tag'].values))
tags.sort()
num_tags=len(tags)
tags_map={tag:i for i,tag in enumerate(tags)}
reverse_tag_map={v: k for k, v in tags_map.items()}

Maximum length:  104


In [8]:
def preprocess(vocab, tag_map, sentences, labels):
  sentences_int = []
  labels_int = []

  for sentence in sentences:
      s = [vocab[token] if token in vocab 
            else vocab['<UNK>']
            for token in sentence]
      sentences_int.append(s)

  for sentence in labels:
      l = [tag_map[label] for label in sentence] 
      labels_int.append(l) 
  return sentences_int, labels_int, len(sentences)

In [9]:
sentences_encoded, labels_encoded, data_length = preprocess(vocab, tags_map, sentences_list, ner_tags_list)

In [10]:
max_len=128
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

padded_sentences = pad_sequences(maxlen=max_len, sequences=sentences_encoded, padding="post", value=vocab["<PAD>"])
padded_tags = pad_sequences(maxlen=max_len, sequences=labels_encoded, padding="post", value=tags_map['O'])

print("Shape of padded sentences:", padded_sentences.shape)
print("Shape of padded labels:", padded_tags.shape)

Shape of padded sentences: (47959, 128)
Shape of padded labels: (47959, 128)


In [11]:
labels = [to_categorical(i, num_classes = num_tags) for i in  padded_tags]

## Splitting data into train, validation and test sets

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_val_test, y_train, y_val_test = train_test_split(padded_sentences, labels, test_size = 0.3)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size = 0.3)
y_train = np.array(y_train)
y_test = np.array(y_test)
y_val = np.array(y_val)

print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
print("X_val shape: ", X_val.shape)
print("y_val shape: ", y_val.shape)

X_train shape:  (33571, 128)
y_train shape:  (33571, 128, 17)
X_test shape:  (4317, 128)
y_test shape:  (4317, 128, 17)
X_val shape:  (10071, 128)
y_val shape:  (10071, 128, 17)


In [13]:
batch_size = 64
len(X_train)/batch_size

524.546875

## Building Model

In [14]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
from keras.layers.merge import add

In [25]:
embedding_dim = 64
vocab_size = len(vocab)
max_len = 128

input = Input(shape = (max_len,))
embedding = Embedding(input_dim = vocab_size+1, output_dim = embedding_dim, input_length = max_len)(input)
x = Bidirectional(LSTM(units=embedding_dim, return_sequences=True))(embedding)
x = LSTM(units=embedding_dim, return_sequences=True)(x)
x2 = LSTM(units=embedding_dim, return_sequences=True)(x)
x = add([x, x2])  # residual connection to the first biLSTM
output = TimeDistributed(Dense(num_tags, activation="softmax"))(x)  #TimeDistributed layer applies Dense layer to each time stamp

model = Model(input, output)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['mae', 'categorical_accuracy'])
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 128, 64)      2251584     input_3[0][0]                    
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 128, 128)     66048       embedding_2[0][0]                
__________________________________________________________________________________________________
lstm_7 (LSTM)                   (None, 128, 64)      49408       bidirectional_2[0][0]            
____________________________________________________________________________________________

## Training Model

In [26]:
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size = 64, epochs = 6)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


## Saving Model

In [18]:
model.save('ner_model.h5')

## Loading saved model

In [19]:
from keras.models import load_model
model = load_model('ner_model.h5')

## Evaluating predictions using test set

In [20]:
y_pred = model.predict(X_test) 

In [21]:
def evaluate_prediction(y_pred, y_test, pad):
    outputs = np.argmax(y_pred, axis=2)
    labels = np.argmax(y_test, axis=2)
    mask = labels != pad
    accuracy = np.sum(outputs == labels) / float(np.sum(mask))
    return accuracy

In [22]:
acc = evaluate_prediction(y_pred, y_test, vocab["<PAD>"])
print('Accuracy on test set:', acc)

Accuracy on test set: 0.9936334549455641


## Prediction on custom input

In [23]:
def predict(sentence, model, vocab, tag_map):
    s = [vocab[token] if token in vocab else vocab['<UNK>'] for token in sentence.split(' ')]
    len_s = len(s)
    if len_s<128:
        padding = [vocab["<PAD>"]]*(128-len_s)
        s = s + padding
    else:
        s = s[:128]
    batch_data = np.ones((1, len(s)))
    batch_data[0][:] = s
    sentence = np.array(batch_data).astype(int)
    output = model(sentence)
    outputs = np.argmax(output, axis=2)
    labels = list(tag_map.keys())
    pred = []
    for i in range(len(outputs[0])):
        idx = outputs[0][i] 
        pred_label = labels[idx]
        pred.append(pred_label)
    return pred

In [24]:
sentence = "Google LLC is an American multinational technology \
company that specializes in Internet-related services and products, \
which include online advertising technologies\
a search engine, cloud computing, software, and hardware."
predictions = predict(sentence, model, vocab, tags_map)
for x,y in zip(sentence.split(' '), predictions):
    if y != 'O':
        print(x, end="-->")
        print(y)

Google-->B-org
LLC-->I-org
American-->B-gpe
Internet-related-->B-org
