# BiLSTM for PoS Tagging

A multi-layer bi-directional LSTM followed by a Conditional Random Field (CRF) for Part-of-Speech (PoS) Tagging.  

In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

SEED = 515
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Preparing Data

The dataset is Universal Dependencies English Web Treebank (UDPOS).  
This dataset actually has two different sets of tags, [universal dependency (UD) tags](https://universaldependencies.org/u/pos/) and [Penn Treebank (PTB) tags](https://www.sketchengine.eu/penn-treebank-tagset/).  

In [2]:
from torchtext.data import Field, BucketIterator

TEXT = Field(lower=True, include_lengths=True)
# Because the set of possible tags is finite, do NOT use unknown token for it. 
UD_TAGS = Field(unk_token=None, include_lengths=True)
PTB_TAGS = Field(unk_token=None, include_lengths=True)

In [3]:
from torchtext.datasets import UDPOS

fields = [('text', TEXT), ('udtags', UD_TAGS), ('ptbtags', PTB_TAGS)]
train_data, valid_data, test_data = UDPOS.splits(fields=fields, root='data/')

In [4]:
print(train_data[0].text)
print(train_data[0].udtags)
print(train_data[0].ptbtags)

['al', '-', 'zaman', ':', 'american', 'forces', 'killed', 'shaikh', 'abdullah', 'al', '-', 'ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'qaim', ',', 'near', 'the', 'syrian', 'border', '.']
['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT']
['NNP', 'HYPH', 'NNP', ':', 'JJ', 'NNS', 'VBD', 'NNP', 'NNP', 'NNP', 'HYPH', 'NNP', ',', 'DT', 'NN', 'IN', 'DT', 'NN', 'IN', 'DT', 'NN', 'IN', 'NNP', ',', 'IN', 'DT', 'JJ', 'NN', '.']


In [5]:
TEXT.build_vocab(train_data, min_freq=2, 
                 vectors="glove.6B.100d", vectors_cache="vector_cache", 
                 unk_init=torch.Tensor.normal_)

UD_TAGS.build_vocab(train_data)
PTB_TAGS.build_vocab(train_data)

print(len(TEXT.vocab), len(UD_TAGS.vocab), len(PTB_TAGS.vocab))
print(UD_TAGS.vocab.itos)

8866 18 51
['<pad>', 'NOUN', 'PUNCT', 'VERB', 'PRON', 'ADP', 'DET', 'PROPN', 'ADJ', 'AUX', 'ADV', 'CCONJ', 'PART', 'NUM', 'SCONJ', 'X', 'INTJ', 'SYM']


In [6]:
BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE, device=device)

In [7]:
for batch in train_iterator:
    batch_text, batch_text_lens = batch.text
    batch_tags, batch_tags_lens = batch.udtags
    break

print(batch_text)
print(batch_text_lens)
print(batch_tags)
print(batch_tags_lens)

print(batch_text_lens == batch_tags_lens)

tensor([[  27,   56,  116,  ...,  127,    9, 3715],
        [  12,  244,    4,  ...,    4,   76,    1],
        [  73,   13,    1,  ...,    1, 1904,    1],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0')
tensor([19, 16,  2, 20, 44, 11, 29, 13, 10, 38, 22, 71, 17,  7, 15, 12,  7, 10,
        12, 29, 20,  5, 42, 20, 25, 11, 11,  4, 22, 16, 31, 28,  2, 24, 60, 18,
         4,  7,  4, 17, 26, 38, 34,  5,  2,  6,  1,  4, 23, 24, 33,  9, 16,  1,
        20, 27, 26, 23, 20, 13, 14, 20, 29, 14,  7, 13,  6, 23, 15, 11, 14, 27,
        31, 18,  2, 38, 52,  2,  2,  5,  7, 22,  7, 12, 16, 12,  5, 42, 18, 19,
        15,  8, 11, 13,  3, 33,  7,  4,  7,  1, 25, 48, 20, 11,  2, 26, 22, 19,
        21,  4, 12,  9, 33, 16, 15, 25, 10, 36,  3,  9,  5, 20, 17, 14,  4,  2,
        19,  1], device='cuda:0')
tensor([[14, 13,  8,  ...,  1,  4,  7],
        [ 4,  1,  2,  .

In [8]:
# Get the last elments
print(batch_text[batch_tags_lens-1, torch.arange(BATCH_SIZE)])
print(batch_tags[batch_tags_lens-1, torch.arange(BATCH_SIZE)])

tensor([   3,    3,    4,    3,    3,   31,    3,    3,    3,    3,    3,    3,
           3,    3,    3,    3,    3,    3,    3,   48,   51,    3,    3,    3,
          18,    3,    3, 5557,    3,    3,    3,   48,   16,    3,    3,    3,
           3,    3,   37,    3,    3,    3,    3,   51,    4,    3,    0,   37,
           3,    3,    3,   51,    3,  732,    3,    3,    3,    3,    3,    3,
           3,    3,    3,    3,    3,    3,  777,   51,    3,    3,    3,    3,
           3,    3,   51,    3,   18, 3211,    0,    3,   31,    3, 3128,   37,
           3,    3,    3,    3,    3,    3,    3,   37,   51,    3,  154,    3,
           3,   37,  464,  812,    3,    3,    3,    3,    4,    3,    3,    3,
           3,    3,    3,    3,    3,   18,    3,    3,    3,    3,   51,   31,
        2450,    3,    3,    3,  146,    4,    3, 3715], device='cuda:0')
tensor([ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  2,  7

## Building the Model

A Seq2Seq model  
* The elements in two sequences are not matched one by one  
* The two sequences may have different lengths  

A PoS-tagger  
* The elements in two sequences are strictly matched one by one  
* The two sequences have definitely the same length  

### Conditional Random Field (CRF)

In [9]:
class CRF(nn.Module):
    def __init__(self, tag_dim: int, pad_idx: int):
        super().__init__()
        # These special indices all refer to tags, rather than the source sentence
        self.pad_idx = pad_idx

        # from_sos[j] is the score of transitioning from <sos> to j
        self.from_sos = nn.Parameter(torch.randn(tag_dim))
        # transitions[i, j] is the score of transitioning from i to j
        self.transitions = nn.Parameter(torch.randn(tag_dim, tag_dim))
        # to_eos[i] is the score of transitioning from i to <eos>
        self.to_eos = nn.Parameter(torch.randn(tag_dim))

        # no transitions from/to <pad>
        self.transitions.data[pad_idx, :] = -1e4
        self.transitions.data[:, pad_idx] = -1e4

    def _compute_scores(self, feats: torch.Tensor, seq_lens: torch.Tensor, mask: torch.Tensor, tags: torch.Tensor):
        """
        The result is actually log(scores). 
        """
        # feats: (step, batch, tag_dim)
        # mask/tags: (step, batch)
        # feat_scores: (step, batch) -> (batch)
        feat_scores = feats.gather(dim=-1, index=tags.unsqueeze(-1)).squeeze(-1)
        feat_scores = feat_scores.masked_fill(mask, 0).sum(dim=0)

        # trans_scores: (step-1, batch) -> (batch)
        trans_scores = self.transitions[tags[:-1], tags[1:]]
        trans_scores = trans_scores.masked_fill(mask[1:], 0).sum(dim=0)
        trans_scores += (self.from_sos[tags[0]] + self.to_eos[tags[seq_lens-1, torch.arange(tags.size(1))]])
        # scores: (batch)
        return feat_scores + trans_scores

    def _compute_partitions(self, feats: torch.Tensor, mask: torch.Tensor):
        """
        The result is actually log(partitions)
        """
        # Transition from <sos> -> Emission
        # Note: The first element in every sample is assumed to be NOT masked
        # alphas: (batch, tag_dim)
        alphas = self.from_sos.repeat(feats.size(1), 1) + feats[0]

        for t in range(1, feats.size(0)):
            # Transition -> Emission
            # alphas: (batch, tag_dim) -> (batch, tag_dim, 1)
            # feats[t]: (batch, tag_dim) -> (batch, 1, tag_dim)
            # next_alphas: (batch, tag_dim)
            next_alphas = torch.logsumexp(alphas.unsqueeze(2) + self.transitions + feats[t].unsqueeze(1), dim=1)
            # Keep the original alpha-values if masked
            # Note: Both the transition and emission at this step rely on mask[t]
            alphas = torch.where(mask[t].unsqueeze(-1), alphas, next_alphas)
            
        # Transition to <eos>
        # partitions: (batch)
        partitions = torch.logsumexp(alphas + self.to_eos, dim=1)
        return partitions

    def _viterbi_decode(self, feats: torch.Tensor, mask: torch.Tensor):
        # Transition from <sos> -> Emission
        # alphas: (batch, tag_dim)
        alphas = self.from_sos.repeat(feats.size(1), 1) + feats[0]
        # best_paths: (step=1, batch, tag_dim)
        best_paths = torch.arange(feats.size(-1), device=feats.device).repeat(feats.size(1), 1).unsqueeze(0)

        for t in range(1, feats.size(0)):
            # Transition -> Emission
            # alphas: (batch, tag_dim) -> (batch, tag_dim, 1)
            # feats[t]: (batch, tag_dim) -> (batch, 1, tag_dim)
            # next_alphas: (batch, tag_dim)
            next_alphas, indices = torch.max(alphas.unsqueeze(2) + self.transitions + feats[t].unsqueeze(1), dim=1)

            # Keep the original alpha-values if masked
            # Note: Both the transition and emission at this step rely on mask[t]
            alphas = torch.where(mask[t].unsqueeze(-1), alphas, next_alphas)

            next_best_paths = []
            for i in range(feats.size(1)):
                if not mask[t, i].item():
                    # If NOT masked, select paths according to indices, and this step traverses all tags
                    next_best_path = torch.cat([best_paths[:, i, indices[i]], 
                                                torch.arange(feats.size(-1), device=feats.device).unsqueeze(0)], dim=0)
                else:
                    # If masked, keep the original paths, and add <pad> at this step
                    next_best_path = torch.cat([best_paths[:, i], 
                                                torch.full((1, feats.size(-1)), fill_value=self.pad_idx, dtype=torch.long, device=feats.device)], dim=0)
                # next_best_path: (step, tag_dim) -> (step, 1, tag_dim)
                next_best_paths.append(next_best_path.unsqueeze(1))
            # best_paths: (step, batch, tag_dim)
            best_paths = torch.cat(next_best_paths, dim=1)
            
        # Transition to <eos>
        scores, indices = torch.max(alphas + self.to_eos, dim=1)
        # scores: (batch)
        # best_paths: (step, batch)
        return scores, best_paths[:, torch.arange(feats.size(1)), indices]

    def neg_loglikelihood(self, feats: torch.Tensor, seq_lens: torch.Tensor, mask: torch.Tensor, tags: torch.Tensor):
        scores = self._compute_scores(feats, seq_lens, mask, tags)
        partitions = self._compute_partitions(feats, mask)
        return partitions - scores

    def forward(self, feats: torch.Tensor, seq_lens: torch.Tensor, mask: torch.Tensor, tags: torch.Tensor):
        return self.neg_loglikelihood(feats, seq_lens, mask, tags)

In [10]:
VOC_DIM = len(TEXT.vocab)
EMB_DIM = 100
HID_DIM = 128
TAG_DIM = len(UD_TAGS.vocab)

N_LAYERS = 2
BIDIRECT = True
DROPOUT = 0.25
TEXT_PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]


emb = nn.Embedding(VOC_DIM, EMB_DIM, padding_idx=TEXT_PAD_IDX).to(device)
rnn = nn.LSTM(EMB_DIM, HID_DIM, num_layers=N_LAYERS, bidirectional=BIDIRECT, dropout=DROPOUT).to(device)
hid2tag = nn.Linear(HID_DIM*2 if BIDIRECT else HID_DIM, TAG_DIM).to(device)


mask = (batch_text == TEXT_PAD_IDX)
print(mask.size())
embedded = emb(batch_text)
# Pack sequence
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, batch_text_lens, enforce_sorted=False)
# hidden: (num_layers*num_directions, batch, hid_dim)
packed_outs, (hidden, cell) = rnn(packed_embedded)
# Unpack sequence
# outs: (step, batch, hid_dim)
outs, out_lens = nn.utils.rnn.pad_packed_sequence(packed_outs)

# feats: (step, batch, tag_dim)
feats = hid2tag(outs)
print(feats.size())

torch.Size([71, 128])
torch.Size([71, 128, 18])


In [11]:
crf = CRF(TAG_DIM, TAG_PAD_IDX).to(device)
scores = crf._compute_scores(feats, batch_text_lens, mask, batch_tags)
print(scores.size())
partitions = crf._compute_partitions(feats, mask)
print(partitions.size())

decoded_scores, best_paths = crf._viterbi_decode(feats, mask)
print(decoded_scores.size())
print(best_paths.size())

losses = crf(feats, batch_text_lens, mask, batch_tags)
print(losses.size())

torch.Size([128])
torch.Size([128])
torch.Size([128])
torch.Size([71, 128])
torch.Size([128])


In [12]:
# Check the treatment on padding positions
padded_feats = torch.cat([feats, torch.randn(1, BATCH_SIZE, TAG_DIM, device=device)], dim=0)
padded_mask = torch.cat([mask, torch.ones(1, BATCH_SIZE, dtype=bool, device=device)], dim=0)
padded_tags = torch.cat([batch_tags, torch.full((1, BATCH_SIZE), fill_value=TAG_PAD_IDX, dtype=torch.long, device=device)], dim=0)
padded_scores = crf._compute_scores(padded_feats, batch_text_lens, padded_mask, padded_tags)
padded_partitions = crf._compute_partitions(padded_feats, padded_mask)
padded_decoded_scores, padded_best_paths = crf._viterbi_decode(padded_feats, padded_mask)

print(feats.size())
print(padded_feats.size())
print((padded_scores == scores).all())
print((padded_partitions == partitions).all())
print((padded_decoded_scores == decoded_scores).all())
print((padded_best_paths[:-1] == best_paths).all())

print((best_paths[-1] == TAG_PAD_IDX).all())
print((padded_best_paths[-1] == TAG_PAD_IDX).all())

torch.Size([71, 128, 18])
torch.Size([72, 128, 18])
tensor(True, device='cuda:0')
tensor(True, device='cuda:0')
tensor(True, device='cuda:0')
tensor(True, device='cuda:0')
tensor(False, device='cuda:0')
tensor(True, device='cuda:0')


### BiLSTM-CRF PoS-Tagger

In [13]:
class PoSTagger(nn.Module):
    def __init__(self, voc_dim, emb_dim, hid_dim, tag_dim, n_layers, bidirect, dropout, text_pad_idx, tag_pad_idx):
        super().__init__()
        self.emb = nn.Embedding(voc_dim, emb_dim, padding_idx=text_pad_idx)
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, 
                           bidirectional=bidirect, dropout=dropout)
        self.hid2tag = nn.Linear(hid_dim*2 if bidirect else hid_dim, tag_dim)
        self.crf = CRF(tag_dim, tag_pad_idx)
        self.dropout = nn.Dropout(dropout)

    def _get_rnn_features(self, text: torch.Tensor, seq_lens: torch.Tensor):
        # embedded: (step, batch, emb_dim)
        embedded = self.dropout(self.emb(text))
        # Pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, seq_lens, enforce_sorted=False)
        # hidden: (num_layers*num_directions, batch, hid_dim)
        packed_outs, (hidden, cell) = self.rnn(packed_embedded)
        # Unpack sequence
        # outs: (step, batch, hid_dim)
        outs, out_lens = nn.utils.rnn.pad_packed_sequence(packed_outs)

        # feats: (step, batch, tag_dim)
        feats = self.hid2tag(self.dropout(outs))
        return feats

    def forward(self, text: torch.Tensor, seq_lens: torch.Tensor, tags: torch.Tensor):
        # text/mask: (step, batch)
        mask = (text == self.emb.padding_idx)
        feats = self._get_rnn_features(text, seq_lens)
        
        # losses: (batch)
        losses = self.crf(feats, seq_lens, mask, tags)
        return losses

    def decode(self, text: torch.Tensor, seq_lens: torch.Tensor):
        # text/mask: (step, batch)
        mask = (text == self.emb.padding_idx)
        feats = self._get_rnn_features(text, seq_lens)

        decoded_scores, best_paths = self.crf._viterbi_decode(feats, mask)
        return decoded_scores, best_paths

In [14]:
tagger = PoSTagger(VOC_DIM, EMB_DIM, HID_DIM, TAG_DIM, N_LAYERS, 
                   BIDIRECT, DROPOUT, TEXT_PAD_IDX, TAG_PAD_IDX).to(device)
losses = tagger(batch_text, batch_text_lens, batch_tags)
print(losses.size())

torch.Size([128])


In [15]:
decoded_scores, best_paths = tagger.decode(batch_text, batch_text_lens)
print(decoded_scores.size())
print(best_paths.size())

torch.Size([128])
torch.Size([71, 128])


In [16]:
# Check if data are mixed across different samples in a batch.
tagger.eval()
max_len_012 = batch_text_lens[0:3].max()
losses_012 = tagger(batch_text[:max_len_012, 0:3], batch_text_lens[0:3], batch_tags[:max_len_012, 0:3])
max_len_123 = batch_text_lens[1:4].max()
losses_123 = tagger(batch_text[:max_len_123, 1:4], batch_text_lens[1:4], batch_tags[:max_len_123, 1:4])

losses_012[1:] - losses_123[:2]

tensor([0., 0.], device='cuda:0', grad_fn=<SubBackward0>)

## Training the Model

In [16]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.1)

def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


tagger = PoSTagger(VOC_DIM, EMB_DIM, HID_DIM, TAG_DIM, N_LAYERS, 
                   BIDIRECT, DROPOUT, TEXT_PAD_IDX, TAG_PAD_IDX).to(device)

tagger.apply(init_weights)
print(f'The model has {count_parameters(tagger):,} trainable parameters')

The model has 1,522,370 trainable parameters


In [17]:
# Initialize Embeddings with Pre-Trained Vectors
print(TEXT.vocab.vectors.size())
print(tagger.emb.weight.size())

tagger.emb.weight.data.copy_(TEXT.vocab.vectors)

TEXT_UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
tagger.emb.weight.data[TEXT_UNK_IDX].zero_()
tagger.emb.weight.data[TEXT_PAD_IDX].zero_()

print(tagger.emb.weight[:5, :8])

torch.Size([8866, 100])
torch.Size([8866, 100])
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344],
        [-0.3398,  0.2094,  0.4635, -0.6479, -0.3838,  0.0380,  0.1713,  0.1598],
        [-0.1077,  0.1105,  0.5981, -0.5436,  0.6740,  0.1066,  0.0389,  0.3548]],
       device='cuda:0', grad_fn=<SliceBackward>)


In [18]:
optimizer = optim.AdamW(tagger.parameters())

In [19]:
def train_epoch(tagger, iterator, optimizer):
    tagger.train()
    epoch_loss = 0
    epoch_acc = 0
    for batch in iterator:
        # Forward pass & Calculate loss
        text, text_lens = batch.text
        tags, tags_lens = batch.udtags
        losses = tagger(text, text_lens, tags)
        loss = losses.mean()

        # Backward propagation
        optimizer.zero_grad()
        loss.backward()
        # Update weights
        optimizer.step()
        # Accumulate loss and acc
        epoch_loss += loss.item()

        decoded_scores, best_paths = tagger.decode(text, text_lens)
        non_padding = (tags != tagger.crf.pad_idx)
        epoch_acc += (best_paths == tags)[non_padding].sum().item() / non_padding.sum().item()
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

def eval_epoch(tagger, iterator):
    tagger.eval()
    epoch_loss = 0
    epoch_acc = 0
    with torch.no_grad():
        for batch in iterator:
            # Forward pass & Calculate loss
            text, text_lens = batch.text
            tags, tags_lens = batch.udtags
            losses = tagger(text, text_lens, tags)
            loss = losses.mean()
            
            # Accumulate loss and acc
            epoch_loss += loss.item()

            decoded_scores, best_paths = tagger.decode(text, text_lens)
            non_padding = (tags != tagger.crf.pad_idx)
            epoch_acc += (best_paths == tags)[non_padding].sum().item() / non_padding.sum().item()
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

In [20]:
import time
N_EPOCHS = 10
best_valid_loss = np.inf

for epoch in range(N_EPOCHS):
    t0 = time.time()
    train_loss, train_acc = train_epoch(tagger, train_iterator, optimizer)
    valid_loss, valid_acc = eval_epoch(tagger, valid_iterator)
    epoch_secs = time.time() - t0

    epoch_mins, epoch_secs = int(epoch_secs // 60), int(epoch_secs % 60)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(tagger.state_dict(), 'models/tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 6m 58s
	Train Loss: 21.626 | Train Acc: 59.94%
	 Val. Loss: 7.016 |  Val. Acc: 81.02%
Epoch: 02 | Epoch Time: 7m 9s
	Train Loss: 7.661 | Train Acc: 85.35%
	 Val. Loss: 4.852 |  Val. Acc: 86.72%
Epoch: 03 | Epoch Time: 6m 56s
	Train Loss: 5.551 | Train Acc: 89.41%
	 Val. Loss: 4.191 |  Val. Acc: 86.73%
Epoch: 04 | Epoch Time: 6m 54s
	Train Loss: 4.553 | Train Acc: 91.22%
	 Val. Loss: 3.834 |  Val. Acc: 87.27%
Epoch: 05 | Epoch Time: 6m 59s
	Train Loss: 3.953 | Train Acc: 92.42%
	 Val. Loss: 3.632 |  Val. Acc: 88.01%
Epoch: 06 | Epoch Time: 6m 57s
	Train Loss: 3.532 | Train Acc: 93.10%
	 Val. Loss: 3.501 |  Val. Acc: 88.77%
Epoch: 07 | Epoch Time: 6m 59s
	Train Loss: 3.191 | Train Acc: 93.75%
	 Val. Loss: 3.326 |  Val. Acc: 88.64%
Epoch: 08 | Epoch Time: 7m 3s
	Train Loss: 2.972 | Train Acc: 94.19%
	 Val. Loss: 3.264 |  Val. Acc: 88.92%
Epoch: 09 | Epoch Time: 6m 49s
	Train Loss: 2.753 | Train Acc: 94.58%
	 Val. Loss: 3.279 |  Val. Acc: 89.04%
Epoch: 10 | Epoch Ti

In [21]:
tagger.load_state_dict(torch.load('models/tut2-model.pt'))

valid_loss, valid_acc = eval_epoch(tagger, valid_iterator)
test_loss, test_acc = eval_epoch(tagger, test_iterator)

print(f'Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Val. Loss: 3.126 | Val. Acc: 89.13%
Test Loss: 3.204 | Test Acc: 89.20%


## Check Embeddings
* The Embeddings of `<unk>` and `<pad>` tokens
    * Because the `padding_idx` has been passed to `nn.Embedding`, so the `<pad>` embedding will remain zeros throughout training.  
    * While the `<unk>` embedding will be learned.

In [22]:
print(tagger.emb.weight[:5, :8])

tensor([[-0.1400,  0.1221, -0.0143,  0.0239,  0.0237,  0.1102,  0.1046,  0.1522],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.2850, -0.3341,  0.7358, -0.4719, -0.0563,  0.0646, -0.4973,  0.3815],
        [-0.5033,  0.3224,  0.6275, -0.7844, -0.4480, -0.0719,  0.3924,  0.2554],
        [-0.2419,  0.2174,  0.7135, -0.6660,  0.4973,  0.0330,  0.2495,  0.4371]],
       device='cuda:0', grad_fn=<SliceBackward>)
