# Named Entity Recognition - RNN Demo

By [Akshaj Verma](https://akshajverma.com)

This notebook takes you through the basics of Named Entity Recognition using RNNs in PyTorch.

Named Entity Recognition sample example with 2 sentences of the same length.

In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


%matplotlib inline

torch.manual_seed(1)

<torch._C.Generator at 0x7f33a4033170>

## Prepare Data

In [2]:
training_data = [
    ("Ronaldo is from Portugal.".split(), ["PER", "O", "O", "LOC"]),
    ("Rooney is from England.".split(), ["PER", "O", "O", "LOC"])
]

sentence_list = [training_data[x][0] for x in range(len(training_data))]
tag_list = [training_data[x][1] for x in range(len(training_data))]

### The input sentences.

In [3]:
sentence_list

[['Ronaldo', 'is', 'from', 'Portugal.'], ['Rooney', 'is', 'from', 'England.']]

### The output tags.

In [4]:
tag_list

[['PER', 'O', 'O', 'LOC'], ['PER', 'O', 'O', 'LOC']]

### Clean the input data by converting it into lower case.

In [5]:
data_clean_list = []
for sentence, tags in training_data:
    clean_sentence = [x.lower().split('.')[0] for x in sentence]
    data_clean_list += [(clean_sentence, tags)]

    
sentence_clean_list = [data_clean_list[x][0] for x in range(len(data_clean_list))]

In [6]:
sentence_clean_list

[['ronaldo', 'is', 'from', 'portugal'], ['rooney', 'is', 'from', 'england']]

### Create a vocab for input words.

In [7]:
words = []
for sentence in sentence_clean_list:
    words += sentence
words = list(set(words))
print(f"Size of word-vocablury: {len(words)}\n")
print(words)

Size of word-vocablury: 6

['ronaldo', 'is', 'portugal', 'from', 'rooney', 'england']


### Create a dictionary for input <=> ID.

In [8]:
word2idx = {word: i for i, word in enumerate(words)}
print(word2idx)

{'ronaldo': 0, 'is': 1, 'portugal': 2, 'from': 3, 'rooney': 4, 'england': 5}


### Create a vocab for output tags.

In [9]:
tags = []
for tag in tag_list:
    tags += tag
tags = list(set(tags))
print(f"Size of tag-vocab: {len(tags)}\n")
print(tags)

Size of tag-vocab: 3

['O', 'LOC', 'PER']


### Create a dictionary for output <=> ID.

In [10]:
tag2idx = {word: i for i, word in enumerate(tags)}
print(tag2idx)

{'O': 0, 'LOC': 1, 'PER': 2}


### Encode the words into numbers.

In [11]:
sentence_clean_list, tag_list

([['ronaldo', 'is', 'from', 'portugal'], ['rooney', 'is', 'from', 'england']],
 [['PER', 'O', 'O', 'LOC'], ['PER', 'O', 'O', 'LOC']])

In [12]:
X = [[word2idx[w] for w in s] for s in sentence_clean_list]
X

[[0, 1, 3, 2], [4, 1, 3, 5]]

In [13]:
y = [[tag2idx[t] for t in s] for s in tag_list]
y

[[2, 0, 0, 1], [2, 0, 0, 1]]

## Neural Network

Input -> RNN -> Linear -> Softmax

### Define the model parameters

In [14]:
EMBEDDING_SIZE = 10
HIDDEN_SIZE = 20
LEARNING_RATE = 0.01
EPOCH = 10
BATCH_SIZE = 1

### Data Loader

In [15]:
class TrainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data

        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

In [16]:
train_data = TrainData(torch.Tensor(X).to(torch.int64), torch.Tensor(y).to(torch.long))
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE)

In [17]:
for x,y in train_loader:
    print(x.shape, y.shape)

torch.Size([1, 4]) torch.Size([1, 4])
torch.Size([1, 4]) torch.Size([1, 4])


## GRU MODEL

### Class for GRU

In [18]:
class GRUtagger(nn.Module):
    
    def __init__(self, embedding_size, vocab_size, hidden_size, target_size):
        super(GRUtagger, self).__init__()
        
        self.word_embeddings = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_size)
        self.gru = nn.GRU(input_size = embedding_size, hidden_size=hidden_size, batch_first = True)
        self.linear = nn.Linear(in_features = hidden_size, out_features=target_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        gru_out, _ = self.gru(embeds)
        linear_out = self.linear(gru_out)
        y_out = F.log_softmax(linear_out, dim=1)
        return y_out

In [19]:
gru_model = GRUtagger(embedding_size=EMBEDDING_SIZE, vocab_size=len(word2idx), hidden_size=HIDDEN_SIZE, target_size=len(tag2idx))
print(gru_model)

criterion = nn.NLLLoss()
optimizer = optim.SGD(gru_model.parameters(), lr = LEARNING_RATE)

GRUtagger(
  (word_embeddings): Embedding(6, 10)
  (gru): GRU(10, 20, batch_first=True)
  (linear): Linear(in_features=20, out_features=3, bias=True)
)


### See how the GRU output from the model looks. 

In [20]:
with torch.no_grad():
    for x_batch, y_batch in train_loader:
        print("Input:")
        print(x_batch)
        y_out = gru_model(x_batch)
        _, y_out_tags = torch.max(y_out.squeeze(), dim = 1)
        
        print("\nOutput:")
        print(y_out, y_out.shape)
        
        print("\nOutput Indices:")
        print(y_out_tags)
        

        print("\nActual Output:")
        print(y_batch, y_batch.shape)
        
        print("=" * 50)

Input:
tensor([[0, 1, 3, 2]])

Output:
tensor([[[-1.3501, -1.3673, -1.5080],
         [-1.2519, -1.4159, -1.2836],
         [-1.4130, -1.3701, -1.3147],
         [-1.5540, -1.3927, -1.4564]]]) torch.Size([1, 4, 3])

Output Indices:
tensor([0, 0, 2, 1])

Actual Output:
tensor([[2, 0, 0, 1]]) torch.Size([1, 4])
Input:
tensor([[4, 1, 3, 5]])

Output:
tensor([[[-1.3398, -1.4564, -1.4748],
         [-1.3094, -1.4155, -1.2893],
         [-1.4956, -1.3380, -1.3252],
         [-1.4106, -1.3404, -1.4697]]]) torch.Size([1, 4, 3])

Output Indices:
tensor([0, 2, 2, 1])

Actual Output:
tensor([[2, 0, 0, 1]]) torch.Size([1, 4])


### Train the GRU model.

**nn.NLLLoss()** expects input and target to be 2-dimensional and 1-dimensional respectively.

So, we will reshape the tensors as follows:  
* input tensor (y_pred) to a 2d tensor from a 3d tensor. So, from `[1, 4, 3]` to `[4, 3]`. 
* target tensor (y_batch) to a 1d tensor from a 2d tensor. So, from `[1, 4]` to `[4]`.

In [21]:
for e in range(1, EPOCH+1):
    epoch_loss = 0

    for x_batch, y_batch in train_loader:
        gru_model.zero_grad()
        
        y_pred = gru_model(x_batch)
        y_batch = y_batch.view(-1)
        y_pred = y_pred.view(-1, len(tag2idx))
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()    
        
        epoch_loss += loss.item()

    print(f'Epoch: {e+0:02} | Loss: {epoch_loss/len(train_loader):.5f}')

Epoch: 01 | Loss: 1.39782
Epoch: 02 | Loss: 1.39584
Epoch: 03 | Loss: 1.39386
Epoch: 04 | Loss: 1.39189
Epoch: 05 | Loss: 1.38993
Epoch: 06 | Loss: 1.38798
Epoch: 07 | Loss: 1.38603
Epoch: 08 | Loss: 1.38409
Epoch: 09 | Loss: 1.38216
Epoch: 10 | Loss: 1.38023


## LSTM Model

### Class for LSTM model

In [22]:
class LSTMtagger(nn.Module):
    
    def __init__(self, embedding_size, vocab_size, hidden_size, target_size):
        super(LSTMtagger, self).__init__()
        
        self.word_embeddings = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_size)
        self.lstm = nn.LSTM(input_size = embedding_size, hidden_size=hidden_size, batch_first = True)
        self.linear = nn.Linear(in_features = hidden_size, out_features=target_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds)
        linear_out = self.linear(lstm_out)
        y_out = F.log_softmax(linear_out, dim=1)
        return y_out

In [23]:
lstm_model = LSTMtagger(embedding_size=EMBEDDING_SIZE, vocab_size=len(word2idx), hidden_size=HIDDEN_SIZE, target_size=len(tag2idx))
print(lstm_model)

criterion = nn.NLLLoss()
optimizer = optim.SGD(lstm_model.parameters(), lr = LEARNING_RATE)

LSTMtagger(
  (word_embeddings): Embedding(6, 10)
  (lstm): LSTM(10, 20, batch_first=True)
  (linear): Linear(in_features=20, out_features=3, bias=True)
)


### See how the LSTM output from the model looks. 

In [24]:
with torch.no_grad():
    for x_batch, y_batch in train_loader:
        print("Input:")
        print(x_batch)
        y_out = lstm_model(x_batch)
        _, y_out_tags = torch.max(y_out.squeeze(), dim = 1)
        
        print("\nOutput:")
        print(y_out, y_out.shape)
        
        print("\nOutput Indices:")
        print(y_out_tags)
        
        print("\nActual Output:")
        print(y_batch, y_batch.shape)
        
        print("=" * 50)

Input:
tensor([[0, 1, 3, 2]])

Output:
tensor([[[-1.4406, -1.4216, -1.3859],
         [-1.3527, -1.3968, -1.3765],
         [-1.3716, -1.3627, -1.3308],
         [-1.3824, -1.3653, -1.4560]]]) torch.Size([1, 4, 3])

Output Indices:
tensor([2, 0, 2, 1])

Actual Output:
tensor([[2, 0, 0, 1]]) torch.Size([1, 4])
Input:
tensor([[4, 1, 3, 5]])

Output:
tensor([[[-1.3971, -1.3982, -1.3906],
         [-1.3705, -1.4080, -1.3985],
         [-1.4153, -1.3534, -1.3632],
         [-1.3632, -1.3864, -1.3933]]]) torch.Size([1, 4, 3])

Output Indices:
tensor([2, 0, 1, 0])

Actual Output:
tensor([[2, 0, 0, 1]]) torch.Size([1, 4])


### Train the LSTM model.

In [25]:
for e in range(1, EPOCH+1):
    epoch_loss = 0

    for x_batch, y_batch in train_loader:
        lstm_model.zero_grad()

        y_pred = lstm_model(x_batch)
        y_batch = y_batch.view(-1)
        y_pred = y_pred.view(-1, len(tag2idx))
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()    

        epoch_loss += loss.item()

    print(f'Epoch: {e+0:02} | Loss: {epoch_loss/len(train_loader):.5f}')

Epoch: 01 | Loss: 1.37968
Epoch: 02 | Loss: 1.37901
Epoch: 03 | Loss: 1.37834
Epoch: 04 | Loss: 1.37767
Epoch: 05 | Loss: 1.37701
Epoch: 06 | Loss: 1.37634
Epoch: 07 | Loss: 1.37567
Epoch: 08 | Loss: 1.37501
Epoch: 09 | Loss: 1.37434
Epoch: 10 | Loss: 1.37368


## STACKED LSTM

### Class for stacked LSTM model

In [26]:
class StackedLSTMtagger(nn.Module):
    
    def __init__(self, embedding_size, vocab_size, hidden_size, target_size):
        super(StackedLSTMtagger, self).__init__()
        
        self.word_embeddings = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_size)
        self.lstm = nn.LSTM(input_size = embedding_size, hidden_size=hidden_size, batch_first = True, num_layers = 10)
        self.linear = nn.Linear(in_features = hidden_size, out_features=target_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds)
        linear_out = self.linear(lstm_out)
        y_out = F.log_softmax(linear_out, dim=1)
        return y_out

In [27]:
stacked_lstm_model = StackedLSTMtagger(embedding_size=EMBEDDING_SIZE, vocab_size=len(word2idx), hidden_size=HIDDEN_SIZE, target_size=len(tag2idx))
print(stacked_lstm_model)

criterion = nn.NLLLoss()
optimizer = optim.SGD(stacked_lstm_model.parameters(), lr = LEARNING_RATE)

StackedLSTMtagger(
  (word_embeddings): Embedding(6, 10)
  (lstm): LSTM(10, 20, num_layers=10, batch_first=True)
  (linear): Linear(in_features=20, out_features=3, bias=True)
)


### See how the Stacked-LSTM output from the model looks. 

In [28]:
with torch.no_grad():
    for x_batch, y_batch in train_loader:
        print("Input:")
        print(x_batch)
        y_out = stacked_lstm_model(x_batch)
        _, y_out_tags = torch.max(y_out.squeeze(), dim = 1)
        
        print("\nOutput:")
        print(y_out, y_out.shape)
        
        print("\nOutput Indices:")
        print(y_out_tags)

        print("\nActual Output:")
        print(y_batch, y_batch.shape)
        
        print("=" * 50)

Input:
tensor([[0, 1, 3, 2]])

Output:
tensor([[[-1.4104, -1.3916, -1.3739],
         [-1.3869, -1.3862, -1.3844],
         [-1.3762, -1.3842, -1.3914],
         [-1.3721, -1.3831, -1.3957]]]) torch.Size([1, 4, 3])

Output Indices:
tensor([2, 2, 0, 0])

Actual Output:
tensor([[2, 0, 0, 1]]) torch.Size([1, 4])
Input:
tensor([[4, 1, 3, 5]])

Output:
tensor([[[-1.4104, -1.3916, -1.3739],
         [-1.3869, -1.3862, -1.3844],
         [-1.3762, -1.3842, -1.3914],
         [-1.3721, -1.3831, -1.3957]]]) torch.Size([1, 4, 3])

Output Indices:
tensor([2, 2, 0, 0])

Actual Output:
tensor([[2, 0, 0, 1]]) torch.Size([1, 4])


### Train the Stacked-LSTM model.

In [29]:
for e in range(1, EPOCH+1):
    epoch_loss = 0

    for x_batch, y_batch in train_loader:
        stacked_lstm_model.zero_grad()
        
        y_pred = stacked_lstm_model(x_batch)
        y_batch = y_batch.view(-1)
        y_pred = y_pred.view(-1, len(tag2idx))
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()    
        
        epoch_loss += loss.item()

    print(f'Epoch: {e+0:02} | Loss: {epoch_loss/len(train_loader):.5f}')

Epoch: 01 | Loss: 1.38000
Epoch: 02 | Loss: 1.37992
Epoch: 03 | Loss: 1.37984
Epoch: 04 | Loss: 1.37976
Epoch: 05 | Loss: 1.37968
Epoch: 06 | Loss: 1.37960
Epoch: 07 | Loss: 1.37952
Epoch: 08 | Loss: 1.37944
Epoch: 09 | Loss: 1.37935
Epoch: 10 | Loss: 1.37927
