# NER Sample Example | RNN | PyTorch


By [Akshaj Verma](https://akshajverma.com)

This notebook takes you through the basics of using 

Named Entity Recognition sample example with 4 sentences of different lengths. The data is padded to a constant number.

In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


%matplotlib inline

torch.manual_seed(1)

<torch._C.Generator at 0x7fe81d4bbe30>

## Prepare Data

In [2]:
training_data = [
    ("Ronaldo is from Portugal.".split(), ["PER", "O", "O", "LOC"]),
    ("Rooney is from England.".split(), ["PER", "O", "O", "LOC"]),
    ("Ronaldo was born in February.".split(), ["PER", "O", "O", "O", "DATE"]),
    ("MUFC is an English club.".split(), ["ORG", "O", "O", "LOC", "O"])
]

sentence_list = [training_data[x][0] for x in range(len(training_data))]
tag_list = [training_data[x][1] for x in range(len(training_data))]

### The input sentences.

In [3]:
sentence_list

[['Ronaldo', 'is', 'from', 'Portugal.'],
 ['Rooney', 'is', 'from', 'England.'],
 ['Ronaldo', 'was', 'born', 'in', 'February.'],
 ['MUFC', 'is', 'an', 'English', 'club.']]

### The output tags.

In [4]:
tag_list

[['PER', 'O', 'O', 'LOC'],
 ['PER', 'O', 'O', 'LOC'],
 ['PER', 'O', 'O', 'O', 'DATE'],
 ['ORG', 'O', 'O', 'LOC', 'O']]

### Clean the input data by converting it into lower case.

In [5]:
data_clean_list = []
for sentence, tags in training_data:
    clean_sentence = [x.lower().split('.')[0] for x in sentence]
    data_clean_list += [(clean_sentence, tags)]

    
sentence_clean_list = [data_clean_list[x][0] for x in range(len(data_clean_list))]

In [6]:
sentence_clean_list

[['ronaldo', 'is', 'from', 'portugal'],
 ['rooney', 'is', 'from', 'england'],
 ['ronaldo', 'was', 'born', 'in', 'february'],
 ['mufc', 'is', 'an', 'english', 'club']]

### Create a vocab for input words.

In [7]:
words = []
for sentence in sentence_clean_list:
    words += sentence
words = list(set(words))
print(f"Size of word-vocablury: {len(words)}\n")
print(words)

Size of word-vocablury: 14

['rooney', 'an', 'england', 'february', 'club', 'in', 'portugal', 'was', 'from', 'is', 'mufc', 'english', 'ronaldo', 'born']


### Create a dictionary for input <=> ID.

In [8]:
word2idx = {word: i for i, word in enumerate(words)}
print(word2idx)

{'rooney': 0, 'an': 1, 'england': 2, 'february': 3, 'club': 4, 'in': 5, 'portugal': 6, 'was': 7, 'from': 8, 'is': 9, 'mufc': 10, 'english': 11, 'ronaldo': 12, 'born': 13}


### Create a vocab for output tags.

In [9]:
tags = []
for tag in tag_list:
    tags += tag
tags = list(set(tags))
print(f"Size of tag-vocab: {len(tags)}\n")
print(tags)

Size of tag-vocab: 5

['DATE', 'ORG', 'LOC', 'PER', 'O']


### Create a dictionary for output <=> ID.

In [10]:
tag2idx = {word: i for i, word in enumerate(tags)}
print(tag2idx)

{'DATE': 0, 'ORG': 1, 'LOC': 2, 'PER': 3, 'O': 4}


### Encode the words to numbers.

In [11]:
sentence_clean_list, tag_list

([['ronaldo', 'is', 'from', 'portugal'],
  ['rooney', 'is', 'from', 'england'],
  ['ronaldo', 'was', 'born', 'in', 'february'],
  ['mufc', 'is', 'an', 'english', 'club']],
 [['PER', 'O', 'O', 'LOC'],
  ['PER', 'O', 'O', 'LOC'],
  ['PER', 'O', 'O', 'O', 'DATE'],
  ['ORG', 'O', 'O', 'LOC', 'O']])

In [12]:
X = [[word2idx[w] for w in s] for s in sentence_clean_list]
X

[[12, 9, 8, 6], [0, 9, 8, 2], [12, 7, 13, 5, 3], [10, 9, 1, 11, 4]]

In [13]:
y = [[tag2idx[t] for t in s] for s in tag_list]
y

[[3, 4, 4, 2], [3, 4, 4, 2], [3, 4, 4, 4, 0], [1, 4, 4, 2, 4]]

## Neural Network

Input -> RNN -> Linear -> Softmax

### Define the model parameters

In [14]:
EMBEDDING_SIZE = 10
HIDDEN_SIZE = 20
LEARNING_RATE = 0.01
EPOCH = 10
BATCH_SIZE = 2

### Data Loader

In [15]:
class TrainData(Dataset):
    
    def __init__(self, X_data, y_data, maxlen):
        self.X_data = X_data
        self.y_data = y_data
        self.maxlen = maxlen
        
#         self.X_data = self.pad_data(X_data)

        
    def __getitem__(self, index):
        self.X_data[index] = self.pad_data(self.X_data[index])
        self.y_data[index] = self.pad_data(self.y_data[index])        
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    
    def pad_data(self, s):
#         print(len(s))
        padded = np.zeros((self.maxlen,), dtype=np.int64)
        if len(s) > self.maxlen: 
            padded[:] = s[:self.maxlen]
        else: 
            padded[:len(s)] = s
        
        return padded

In [16]:
train_data = TrainData(X, y, maxlen=10)
# train_data = TrainData(torch.Tensor(X).to(torch.int64), torch.Tensor(y).to(torch.long))
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE)

In [17]:
for i, j in train_loader:
    print(i)
    print(j)
    print("=" * 50)


tensor([[12,  9,  8,  6,  0,  0,  0,  0,  0,  0],
        [ 0,  9,  8,  2,  0,  0,  0,  0,  0,  0]])
tensor([[3, 4, 4, 2, 0, 0, 0, 0, 0, 0],
        [3, 4, 4, 2, 0, 0, 0, 0, 0, 0]])
tensor([[12,  7, 13,  5,  3,  0,  0,  0,  0,  0],
        [10,  9,  1, 11,  4,  0,  0,  0,  0,  0]])
tensor([[3, 4, 4, 4, 0, 0, 0, 0, 0, 0],
        [1, 4, 4, 2, 4, 0, 0, 0, 0, 0]])


### Class for GRU

In [18]:
class GRUtagger(nn.Module):
    
    def __init__(self, embedding_size, vocab_size, hidden_size, target_size):
        super(GRUtagger, self).__init__()
        
        self.word_embeddings = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_size)
        self.gru = nn.GRU(input_size = embedding_size, hidden_size=hidden_size, batch_first = True)
        self.linear = nn.Linear(in_features = hidden_size, out_features=target_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        gru_out, _ = self.gru(embeds)
        linear_out = self.linear(gru_out)
        y_out = F.log_softmax(linear_out, dim=1)
        return y_out

In [19]:
model = GRUtagger(embedding_size=EMBEDDING_SIZE, vocab_size=len(word2idx), hidden_size=HIDDEN_SIZE, target_size=len(tag2idx))
print(model)

criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr = LEARNING_RATE)

GRUtagger(
  (word_embeddings): Embedding(14, 10)
  (gru): GRU(10, 20, batch_first=True)
  (linear): Linear(in_features=20, out_features=5, bias=True)
)


### See how the output from the model looks. 

In [20]:
with torch.no_grad():
    for x_batch, y_batch in train_loader:
               
        print("Input:")
        print(x_batch)
        y_out = model(x_batch)
        _, y_out_tags = torch.max(y_out.squeeze(), dim = 1)
        
        print("\nOutput:")
        print(y_out, y_out.shape)
        
        print("\nOutput Indices:")
        print(y_out_tags)
        
#         print("\nOutput Tags:")
#         for i in y_out_tags.tolist():
#             print(tags[i])

        print("\nActual Output:")
        print(y_batch, y_batch.shape)
        
#         print("\nActual Tags:")
#         for i in y_batch.squeeze().tolist():
#             print(tags[i])

        print("=" * 50)

Input:
tensor([[12,  9,  8,  6,  0,  0,  0,  0,  0,  0],
        [ 0,  9,  8,  2,  0,  0,  0,  0,  0,  0]])

Output:
tensor([[[-1.9535, -1.7529, -1.9320, -2.5259, -2.5453],
         [-2.2073, -2.0193, -2.2057, -2.3097, -2.4400],
         [-2.1645, -2.0583, -2.0287, -2.4821, -2.3742],
         [-2.1633, -2.1360, -2.1701, -2.2748, -2.2861],
         [-2.3657, -2.4347, -2.3414, -2.2723, -2.2370],
         [-2.4404, -2.5664, -2.4388, -2.2618, -2.2265],
         [-2.4669, -2.6229, -2.4977, -2.2492, -2.2306],
         [-2.4747, -2.6464, -2.5339, -2.2391, -2.2386],
         [-2.4753, -2.6558, -2.5559, -2.2322, -2.2465],
         [-2.4736, -2.6592, -2.5692, -2.2278, -2.2528]],

        [[-2.2117, -2.2562, -2.1769, -2.3999, -2.3958],
         [-2.2828, -2.1576, -2.3157, -2.2746, -2.4117],
         [-2.1468, -2.0347, -2.0531, -2.4283, -2.3692],
         [-2.2397, -2.1273, -2.2050, -2.3327, -2.3239],
         [-2.3316, -2.3280, -2.2991, -2.3171, -2.2706],
         [-2.3653, -2.4134, -2.3573, -2.2

### Train the model

**nn.NLLLoss()** expects input and target to be 2-dimensional and 1-dimensional respectively.

So, we will reshape the tensors as follows:  
* input tensor (y_pred) to a 2d tensor from a 3d tensor. So, from `[1, 4, 3]` to `[4, 3]`. 
* target tensor (y_batch) to a 1d tensor from a 2d tensor. So, from `[1, 4]` to `[4]`.

In [21]:
for e in range(1, EPOCH+1):
    for x_batch, y_batch in train_loader:
        model.zero_grad()
        
        y_pred = model(x_batch)
        y_batch = y_batch.view(-1)
        y_pred = y_pred.view(-1, len(tag2idx))
#         print(y_pred.shape)
#         print(y_batch.shape)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()    
#         _, y_pred_idx = torch.max(y_pred.squeeze(), dim = 1)
#         print(y_pred_idx, y_batch)

    print(f'Epoch {e}/{EPOCH} : loss = {loss.item()}') 
        
        

Epoch 1/10 : loss = 2.4094676971435547
Epoch 2/10 : loss = 2.3996338844299316
Epoch 3/10 : loss = 2.3900833129882812
Epoch 4/10 : loss = 2.380798578262329
Epoch 5/10 : loss = 2.371765613555908
Epoch 6/10 : loss = 2.3629705905914307
Epoch 7/10 : loss = 2.3543994426727295
Epoch 8/10 : loss = 2.346041440963745
Epoch 9/10 : loss = 2.3378841876983643
Epoch 10/10 : loss = 2.3299174308776855
