# Named Entity Recognition with GloVe-BiLSTM-Softmax model

In [13]:
from tqdm import tqdm_notebook as progress
import torch
import torch.nn as nn
from torch.nn.utils import rnn
import time

# 1 Read and process NER 2003 English Shared Task data

In [14]:
def read_data(fpath):
    """ Read NER 2003 English Shared Task data (CoNNL file format)
    Data format:
        All data files contain one word per line with empty lines
        representing sentence boundaries. At the end of each line there is 
        tag which states whether the current word is inside a named entity or not.
        The tag also encodes the type of named entity. Example
        Example:
            U.N. NNP I-NP I-ORG
        Each line contains four fields:
            [word] [POS tag] [chunk tag] [NE tag]
            
        Four different types of named entities: PERSON, LOCATION, ORGANIZATION, MISC.
        
    Args:
        fpath: path to data file
    Returns:
        sentences_words: list of sentences' words (one sentence is a list of words)
        sentences_tags: list of sentences' tags (one sentence is a list of tags corresponding to words)
    """
    with open(fpath, 'r') as f:
        lines = f.readlines()

    raw_sentences = []
    i_prev, i_next = 0, 0
    while i_next < len(lines):
        if lines[i_next] == '\n':
            raw_sentences.append(lines[i_prev:i_next])
            i_prev = i_next + 1
        i_next += 1

    sentences_words = []
    sentences_tags = []
    for sentence in raw_sentences:
        words = [string.split()[0] for string in sentence]
        tags = [string.split()[3] for string in sentence]
        if words != ['-DOCSTART-']:
            sentences_words.append(words)
            sentences_tags.append(tags)
    return list(zip(sentences_words, sentences_tags))

In [15]:
root = './data'

train_data = read_data('./data/train.txt')
val_data = read_data('./data/dev.txt')
test_data = read_data('./data/test.txt')

print('Loaded train ({}), dev ({}), test ({}) data'.format(len(train_data),
                                                          len(val_data),
                                                          len(test_data)))

Loaded train (14041), dev (3250), test (3453) data


In [16]:
word_to_idx = {}  # vocabulary
tag_to_idx = {}  # tagset

for i, (sentence, tags) in enumerate(train_data + val_data + test_data):
    for word in sentence:
        if word not in word_to_idx:
            word_to_idx[word] = len(word_to_idx)
    for tag in tags:
        if tag not in tag_to_idx:
            tag_to_idx[tag] = len(tag_to_idx)

vocab_size = len(word_to_idx)
print(vocab_size, 'words in vocabulary')

tagset_size = len(tag_to_idx)
print(tagset_size, 'tags (IOB2 tagging scheme):', tag_to_idx.keys())

30289 words in vocabulary
9 tags (IOB2 tagging scheme): dict_keys(['B-ORG', 'O', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC'])


# 2. Load GloVe embeddings

In [17]:
with open('./glove/glove.6B.100d.txt', 'r') as f:
    glove_raw = f.readlines()
    
glove = {line.split()[0]: torch.tensor([float(val) for val in line.split()[1:]])
         for line in progress(glove_raw)}

print('GloVe for {} words loaded'.format(len(glove)))

HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))


GloVe for 400000 words loaded


In [18]:
def get_gloves(seq, glove, strategy=3):
    """Get Global Vectors for Word Representation in a list of words
    Args:
        seq: list of words of size N
        glove: GloVe dictionary of 400000 words, the format
               {word: 100D vector representation}
        strategy: 1, 2 or 3
            1 -- load the embeddings for original capitalization of words.
                 If embedding for this word doesn’t exists, associate
                 it with <UNK> embedding.
            2 -- load the embeddings for lowercased capitalization of words.
                 If embedding for this lowercased word doesn’t exists,
                 associate it with <UNK> embedding.
            3 -- Для різної капіталізації слів завантажуємо одні і ті самі (lowercased)
                 вектори ембедінгів, але в нашому словнику це різні слова.
                 Тобто, “Hello” та “hello” відповідають фізично два вектори,
                 які ідентичні за своїми значеннями перед тренуванням
                 (якщо freeze_embeddings стоїть  False, то в процесі
                 тренування вони будуть змінюватись).
    Returns:
        out_seq: a list of [100] tensors (GloVes) of size N
    """
    assert strategy in [1,2,3]
    
    def associate_embedding(word):
        if strategy == 1:
            return glove['unk'] if word not in glove else glove[word]
        if strategy == 2:
            return glove['unk'] if word.lower() not in glove else glove[word.lower()]
        return glove['unk'] if word.lower() not in glove else \
                glove[word.lower()] if word not in glove else glove[word]
    
    out_seq = [associate_embedding(w) for w in seq]
    return out_seq, torch.stack(out_seq, 0)

embedding_dim = 100
strategy = 3

gloves, gloves_matrix = get_gloves(word_to_idx.keys(), glove, strategy)
word_to_glove = dict(zip(word_to_idx.keys(), gloves))
print('Loaded GloVe embeddings as dict vocabulary and embedding matrix of shape',
          tuple(gloves_matrix.shape))

print('Example embedding for `the`:\n', word_to_glove['the'])

Loaded GloVe embeddings as dict vocabulary and embedding matrix of shape (30289, 100)
Example embedding for `the`:
 tensor([-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344,
        -0.5755,  0.0875,  0.2879, -0.0673,  0.3091, -0.2638, -0.1323, -0.2076,
         0.3340, -0.3385, -0.3174, -0.4834,  0.1464, -0.3730,  0.3458,  0.0520,
         0.4495, -0.4697,  0.0263, -0.5415, -0.1552, -0.1411, -0.0397,  0.2828,
         0.1439,  0.2346, -0.3102,  0.0862,  0.2040,  0.5262,  0.1716, -0.0824,
        -0.7179, -0.4153,  0.2033, -0.1276,  0.4137,  0.5519,  0.5791, -0.3348,
        -0.3656, -0.5486, -0.0629,  0.2658,  0.3020,  0.9977, -0.8048, -3.0243,
         0.0125, -0.3694,  2.2167,  0.7220, -0.2498,  0.9214,  0.0345,  0.4674,
         1.1079, -0.1936, -0.0746,  0.2335, -0.0521, -0.2204,  0.0572, -0.1581,
        -0.3080, -0.4162,  0.3797,  0.1501, -0.5321, -0.2055, -1.2526,  0.0716,
         0.7056,  0.4974, -0.4206,  0.2615, -1.5380, -0.3022, -0.0734, -0.2831,
    

# 3-4. Train BiLSTM model on batches & test with micro-average Precision/Recall/F1/F0.5

### Model

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

class LSTMTagger(nn.Module):
    """
    Args:
        embedding_dim -- 100
        hidden_dim -- hidden state dimensionality
        vocab_size -- vocabulary size
        tagset_size -- tag set size
        pretrained_embeddings -- None or [vocab_size, embedding_dim] tensor
    """
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size,
                 pretrained_embeddings=None, strategy=1):

        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        if pretrained_embeddings is not None:
            self.word_embeddings = nn.Embedding.from_pretrained(pretrained_embeddings,
                                                                freeze=strategy!=3)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim//2,
                            bidirectional=True, batch_first=True)
        self.tagger = nn.Linear(hidden_dim, tagset_size)

    def forward(self, inputs, packed_inputs=False):
        if packed_inputs:
            embeds = rnn.PackedSequence(self.word_embeddings(inputs.data), inputs.batch_sizes)
        else:
            embeds = self.word_embeddings(inputs).view(len(inputs), 1, -1)

        lstm_out, _ = self.lstm(embeds)

        if packed_inputs:
            tag_space = self.tagger(lstm_out.data)
        else:
            tag_space = self.tagger(lstm_out.view(len(inputs), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

### Utilities

In [28]:
class AverageMeter(object):
    """Computes and stores the average and current value
    """
    def __init__(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def seq_to_idxs(seq, mapping):
    """Converts a sequence of elements to sequence of indices
       using given mapping

    Args:
        seq -- list of elements
        mapping -- {element: idx} dict
    """
    outs_seq = [torch.tensor(mapping[el], dtype=torch.long) for el in seq]
    outs_seq = torch.stack(outs_seq, 0)
    return outs_seq


def calculate_scores(outputs, targets, log=False):
    """Calculate per-class and micro-average precision, recall, F-1 score and F-0.5 score
    
        Args:
            outputs: tensor of log-softmax model outputs for the dataset
            targets: tensor of true tag indices for the dataset
            log: whether to print or not
        Returns:
            (micro-average) precision, recall, F-1 score, F-0.5 score
    """    
    def F_score(precision, recall, beta=1):
        return (1 + beta**2) * precision * recall / ((beta**2) * precision + recall + 1e-15)

    pred = outputs.max(dim=1)[1]
    TP, TN = torch.zeros(9), torch.zeros(9)
    FP, FN = torch.zeros(9), torch.zeros(9)
    if log:
        print('Tag\tSize\tPrecision\tRecall\t\tF1\tF0.5')
    for tag in tag_to_idx.keys():
        i = tag_to_idx[tag]
        TP[i] = (pred[targets==i]==i).sum()
        TN[i] = (pred[targets!=i]!=i).sum()
        FP[i] = (pred[targets!=i]==i).sum()
        FN[i] = (pred[targets==i]!=i).sum()

        precision = float(TP[i] / (TP[i] + FP[i] + 1e-15))
        recall = float(TP[i] / (TP[i] + FN[i] + 1e-15))
        if log:
            print('{}\t{}\t{:.4f}\t\t{:.4f}\t\t{:.4f}\t{:.4f}'.format(tag, len(targets[targets==i]),
                                                                    precision, recall,
                                                                    F_score(precision, recall, beta=1),
                                                                    F_score(precision, recall, beta=0.5)))

    MicroAvePrecision = float(TP.sum() / (TP.sum() + FP.sum()))
    MicroAveRecall = float(TP.sum() / (TP.sum() + FN.sum()))
    F1 = F_score(MicroAvePrecision, MicroAveRecall, beta=1)
    F05 = F_score(MicroAvePrecision, MicroAveRecall, beta=0.5)
    if log:
        print('{}\t{}\t{:.4f}\t\t{:.4f}\t\t{:.4f}\t{:.4f}'.format('MICRO', len(targets),
                                                                  MicroAvePrecision,
                                                                  MicroAveRecall,
                                                                  F1, F05))
    return MicroAvePrecision, MicroAveRecall, F1, F05

### Trainer

In [29]:
class Trainer():
    def __init__(self, model, loss_fn, optimizer, word_to_idx, tag_to_idx):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.word_to_idx = word_to_idx
        self.tag_to_idx = tag_to_idx
    
    def run(self, num_epochs,
            train_data, train_batch_size, val_data, val_batch_size,
            log_interval=50):
        for epoch in range(1, num_epochs+1):
            self.__train(epoch, train_data, train_batch_size, log_interval)
            self.__validate(epoch, val_data, val_batch_size)
        
    def __train(self, epoch, train_data, batch_size, log_interval=50):
        losses = AverageMeter()
        batch_time = AverageMeter()
        batch_start = 0
        batch_idx = 0
        while batch_start < len(train_data):
            end = time.time()
            self.optimizer.zero_grad()

            batch = train_data[batch_start:batch_start + batch_size]
            inputs_packed, targets_packed = self._prepare_batch(batch)
            outputs = self.model(inputs_packed, True)

            loss = self.loss_fn(outputs, targets_packed.data)
            loss.backward()
            self.optimizer.step()
            
            losses.update(loss.item())
            batch_time.update(time.time() - end)

            batch_start += batch_size
            batch_idx += 1
            
            if batch_idx % log_interval == 0:
                print('Train Epoch: {}\t[{:>5}/{:<5}]\tTime: {:.2f} ({:.2f})\tLoss: {:.4f} ({:.4f})'.format(epoch,
                    min(batch_start, len(train_data)), len(train_data),
                    batch_time.val, batch_time.avg,
                    losses.val, losses.avg))
        print('====> Train. {}\tTotal time: {:.2f}\tAverage loss: {:.4f}'.format(
              epoch, batch_time.sum, losses.avg))
            
            
    def __validate(self, epoch, val_data, batch_size):
        losses = AverageMeter()
        batch_time = AverageMeter()
        F1scores = AverageMeter()
        F05scores = AverageMeter()
        with torch.no_grad():
            batch_start = 0
            batch_idx = 0
            while batch_start < len(val_data):
                end = time.time()

                batch = val_data[batch_start:batch_start + batch_size]
                inputs_packed, targets_packed = self._prepare_batch(batch)
                outputs = self.model(inputs_packed, True)

                loss = self.loss_fn(outputs, targets_packed.data)
                _, _, F1, F05 = calculate_scores(outputs, targets_packed.data, log=False)
                F1scores.update(F1)
                F05scores.update(F05)

                losses.update(loss.item())
                batch_time.update(time.time() - end)

                batch_start += batch_size
                batch_idx += 1
        print('====> Valid. {}\tTotal time: {:.2f}\tAverage loss: {:.4f}\tF-1: {:.4f}\tF-0.5: {:.4f}\t'.format(
              epoch, batch_time.sum, losses.avg, F1scores.avg, F05scores.avg))
            
    def _prepare_batch(self, batch):
        inputs_batch = [seq_to_idxs(seq[0], self.word_to_idx) for seq in batch]
        targets_batch = [seq_to_idxs(seq[1], self.tag_to_idx) for seq in batch]

        order = sorted(enumerate(inputs_batch), key=lambda x: len(x[1]), reverse=True)
        inputs_batch = [inputs_batch[order_[0]] for order_ in order]
        targets_batch = [targets_batch[order_[0]] for order_ in order]

        inputs_packed = rnn.pack_sequence(inputs_batch)
        targets_packed = rnn.pack_sequence(targets_batch)
        return inputs_packed, targets_packed
    
    def test(self, test_data):
        with torch.no_grad():
            inputs_packed, targets_packed = self._prepare_batch(test_data)
            outputs = self.model(inputs_packed, True)
        calculate_scores(outputs, targets_packed.data, log=True);

### Run

In [31]:
hidden_dim = 64
learning_rate = 0.1

model = LSTMTagger(embedding_dim=embedding_dim,
                   hidden_dim=hidden_dim,
                   vocab_size=vocab_size,
                   tagset_size=tagset_size,
                   pretrained_embeddings=gloves_matrix,
                   strategy=3)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

train_batch_size = 128
val_batch_size = 128

trainer = Trainer(model, loss_function, optimizer, word_to_idx, tag_to_idx)
trainer.run(8, train_data, train_batch_size, val_data, val_batch_size)
trainer.test(test_data)

Train Epoch: 1	[ 6400/14041]	Time: 0.07 (0.07)	Loss: 0.1615 (0.2438)
Train Epoch: 1	[12800/14041]	Time: 0.09 (0.07)	Loss: 0.1274 (0.1818)
====> Train. 1	Total time: 8.05	Average loss: 0.1747
====> Valid. 1	Total time: 0.62	Average loss: 0.2585	F-1: 0.9333	F-0.5: 0.9333	
Train Epoch: 2	[ 6400/14041]	Time: 0.07 (0.07)	Loss: 0.1114 (0.0981)
Train Epoch: 2	[12800/14041]	Time: 0.09 (0.07)	Loss: 0.1156 (0.1006)
====> Train. 2	Total time: 8.13	Average loss: 0.0997
====> Valid. 2	Total time: 0.61	Average loss: 0.2370	F-1: 0.9401	F-0.5: 0.9401	
Train Epoch: 3	[ 6400/14041]	Time: 0.07 (0.07)	Loss: 0.1196 (0.0871)
Train Epoch: 3	[12800/14041]	Time: 0.09 (0.07)	Loss: 0.1169 (0.0910)
====> Train. 3	Total time: 8.07	Average loss: 0.0904
====> Valid. 3	Total time: 0.62	Average loss: 0.2625	F-1: 0.9344	F-0.5: 0.9344	
Train Epoch: 4	[ 6400/14041]	Time: 0.07 (0.07)	Loss: 0.0853 (0.0831)
Train Epoch: 4	[12800/14041]	Time: 0.08 (0.07)	Loss: 0.0968 (0.0856)
====> Train. 4	Total time: 7.86	Average loss: 0.0

# 5. Compare the performances (F1 and F0.5 scores) for each strategy of loading the embeddings

In [30]:
for strategy in [1, 2, 3]:
    print('Strategy', strategy)
    
    embedding_dim = 100

    gloves, gloves_matrix = get_gloves(word_to_idx.keys(), glove, strategy)
    word_to_glove = dict(zip(word_to_idx.keys(), gloves))
    print('Loaded GloVe embeddings as dict vocabulary and embedding matrix of shape',
              tuple(gloves_matrix.shape))
    
    hidden_dim = 64
    learning_rate = 0.1

    model = LSTMTagger(embedding_dim=embedding_dim,
                       hidden_dim=hidden_dim,
                       vocab_size=vocab_size,
                       tagset_size=tagset_size,
                       pretrained_embeddings=gloves_matrix,
                       strategy=strategy)
    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_batch_size = 128
    val_batch_size = 128

    trainer = Trainer(model, loss_function, optimizer, word_to_idx, tag_to_idx)
    trainer.run(5, train_data, train_batch_size, val_data, val_batch_size, log_interval=200)
    trainer.test(test_data)
    print()

Strategy 1
Loaded GloVe embeddings as dict vocabulary and embedding matrix of shape (30289, 100)
====> Train. 1	Total time: 4.68	Average loss: 0.5169
====> Valid. 1	Total time: 0.58	Average loss: 0.4581	F-1: 0.8290	F-0.5: 0.8290	
====> Train. 2	Total time: 4.83	Average loss: 0.4432
====> Valid. 2	Total time: 0.57	Average loss: 0.4529	F-1: 0.8452	F-0.5: 0.8452	
====> Train. 3	Total time: 5.19	Average loss: 0.4540
====> Valid. 3	Total time: 0.59	Average loss: 0.4684	F-1: 0.8477	F-0.5: 0.8477	
====> Train. 4	Total time: 5.07	Average loss: 0.4350
====> Valid. 4	Total time: 0.57	Average loss: 0.4518	F-1: 0.8560	F-0.5: 0.8560	
====> Train. 5	Total time: 5.23	Average loss: 0.4456
====> Valid. 5	Total time: 0.57	Average loss: 0.4452	F-1: 0.8569	F-0.5: 0.8569	
Tag	Size	Precision	Recall		F1	F0.5
B-ORG	1661	0.2868		0.0704		0.1131	0.1776
O	38323	0.9124		0.9811		0.9455	0.9254
B-MISC	702	0.5000		0.0028		0.0057	0.0139
B-PER	1617	0.8904		0.0402		0.0769	0.1702
I-PER	1156	0.7136		0.6704		0.6913	0.7045
B

> The performance is quite good for all strategies, however the last two perfom better straightway.