In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

SEED = 515
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Preparing Data

In [2]:
from torchtext.data import Field, BucketIterator

# Refs: https://github.com/pytorch/text/blob/master/test/sequence_tagging.py
TEXT = Field(lower=True, include_lengths=True)
# There exist unknown token... (e.g., 'I-PRT')
CHUNK_TAGS = Field(include_lengths=True)

In [3]:
from torchtext.datasets import CoNLL2000Chunking

fields = [('text', TEXT), (None, None), ('tags', CHUNK_TAGS)]
train_data, valid_data, test_data = CoNLL2000Chunking.splits(fields=fields, root='data/')

In [4]:
print(train_data[0].text)
print(train_data[0].tags)

['construction', 'of', 'apartments', 'and', 'other', 'multi-family', 'dwellings', 'slipped', '2.2', '%', 'to', 'an', 'annual', 'rate', 'of', '1,022,000', 'following', 'a', '3.5', '%', 'decline', 'in', 'august', '.']
['B-NP', 'B-PP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP', 'B-VP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'B-PP', 'B-NP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'I-NP', 'B-PP', 'B-NP', 'O']


In [5]:
TEXT.build_vocab(train_data, min_freq=2, 
                 vectors="glove.6B.100d", vectors_cache="vector_cache", 
                 unk_init=torch.Tensor.normal_)

CHUNK_TAGS.build_vocab(train_data)

print(len(TEXT.vocab), len(CHUNK_TAGS.vocab))
print(CHUNK_TAGS.vocab.itos)

8389 23
['<unk>', '<pad>', 'I-NP', 'B-NP', 'O', 'B-VP', 'B-PP', 'I-VP', 'B-ADVP', 'B-SBAR', 'B-ADJP', 'I-ADJP', 'B-PRT', 'I-ADVP', 'I-PP', 'I-CONJP', 'I-SBAR', 'B-CONJP', 'B-INTJ', 'B-LST', 'I-INTJ', 'I-UCP', 'B-UCP']


In [6]:
BATCH_SIZE = 128

device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE, device=device)

In [7]:
for batch in train_iterator:
    batch_text, batch_text_lens = batch.text
    batch_tags, batch_tags_lens = batch.tags
    break

print(batch_text)
print(batch_text_lens)
print(batch_tags)
print(batch_tags_lens)

tensor([[   8,   64, 4318,  ...,   80,    8, 2301],
        [   0, 6733, 2373,  ..., 1133,   32, 2891],
        [   2, 3291,   32,  ...,    2, 2370,   14],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:3')
tensor([26, 11, 33, 21, 27, 15, 11, 33, 14,  9, 24, 27, 11, 25, 28, 34, 13, 22,
        31, 31, 32,  5, 14, 42, 17, 24, 29, 32, 37, 21, 14, 23, 26, 29,  3, 10,
        44, 23, 27, 24, 22, 12, 19, 39, 30, 16, 21, 31, 12, 23, 43, 26, 25, 27,
        15, 27, 33,  9, 15,  9, 10, 53,  6,  9, 41, 25, 11, 18, 20, 13, 18,  9,
        20, 51, 14, 16, 13, 28, 22, 29, 24, 32, 18, 17, 18, 31, 34, 32, 26, 10,
        20,  2, 41,  6, 24, 24, 21, 26, 27, 21, 14, 31, 20, 33, 14, 28, 33, 37,
        19, 22, 23, 26, 21,  7, 20,  8, 38, 18, 13, 22, 10, 37, 19, 16,  6, 18,
        27, 31], device='cuda:3')
tensor([[6, 3, 3,  ..., 3, 6, 3],
        [3, 2, 5,  ..., 2, 3,

## Building the Model - BiLSTM

In [8]:
class PoSTagger(nn.Module):
    def __init__(self, voc_dim, emb_dim, hid_dim, tag_dim, n_layers, bidirect, dropout, pad_idx):
        super().__init__()
        self.emb = nn.Embedding(voc_dim, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, 
                           bidirectional=bidirect, dropout=dropout)
        self.fc = nn.Linear(hid_dim*2 if bidirect else hid_dim, tag_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lens):
        # text: (step, batch)
        embedded = self.dropout(self.emb(text))
        # Pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lens, enforce_sorted=False)
        # hidden: (num_layers*num_directions, batch, hid_dim)
        packed_outs, (hidden, cell) = self.rnn(packed_embedded)
        # Unpack sequence
        # outs: (step, batch, hid_dim)
        outs, out_lens = nn.utils.rnn.pad_packed_sequence(packed_outs)

        # preds: (step, batch, tag_dim)
        preds = self.fc(self.dropout(outs))
        return preds

In [9]:
VOC_DIM = len(TEXT.vocab)
EMB_DIM = 100
HID_DIM = 128
TAG_DIM = len(CHUNK_TAGS.vocab)

N_LAYERS = 2
BIDIRECT = True
DROPOUT = 0.25
TEXT_PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
TAG_PAD_IDX = CHUNK_TAGS.vocab.stoi[CHUNK_TAGS.pad_token]


tagger = PoSTagger(VOC_DIM, EMB_DIM, HID_DIM, TAG_DIM, 
                   N_LAYERS, BIDIRECT, DROPOUT, TEXT_PAD_IDX).to(device)
preds = tagger(batch_text, batch_text_lens)

print(batch_text.size())
print(preds.size())

torch.Size([53, 128])
torch.Size([53, 128, 23])


## Training the Model - BiLSTM

In [10]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.1)

def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


tagger = PoSTagger(VOC_DIM, EMB_DIM, HID_DIM, TAG_DIM, 
                   N_LAYERS, BIDIRECT, DROPOUT, TEXT_PAD_IDX).to(device)

tagger.apply(init_weights)
print(f'The model has {count_parameters(tagger):,} trainable parameters')

The model has 1,475,595 trainable parameters


In [11]:
# Initialize Embeddings with Pre-Trained Vectors
print(TEXT.vocab.vectors.size())
print(tagger.emb.weight.size())

tagger.emb.weight.data.copy_(TEXT.vocab.vectors)

TEXT_UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
tagger.emb.weight.data[TEXT_UNK_IDX].zero_()
tagger.emb.weight.data[TEXT_PAD_IDX].zero_()

print(tagger.emb.weight[:5, :8])

torch.Size([8389, 100])
torch.Size([8389, 100])
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.1077,  0.1105,  0.5981, -0.5436,  0.6740,  0.1066,  0.0389,  0.3548],
        [-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344],
        [-0.3398,  0.2094,  0.4635, -0.6479, -0.3838,  0.0380,  0.1713,  0.1598]],
       device='cuda:3', grad_fn=<SliceBackward>)


In [12]:
loss_func = nn.CrossEntropyLoss(ignore_index=TAG_PAD_IDX, reduction='mean')
optimizer = optim.AdamW(tagger.parameters())

In [13]:
def train_epoch(tagger, iterator, optimizer, loss_func):
    tagger.train()
    epoch_loss = 0
    epoch_acc = 0
    for batch in iterator:
        # Forward pass
        text, text_lens = batch.text
        tags, tags_lens = batch.tags
        preds = tagger(text, text_lens)

        # Calculate loss
        preds_flattened = preds.view(-1, preds.size(-1))
        tags_flattened = tags.flatten()
        loss = loss_func(preds_flattened, tags_flattened)

        # Backward propagation
        optimizer.zero_grad()
        loss.backward()
        # Update weights
        optimizer.step()
        # Accumulate loss and acc
        epoch_loss += loss.item()
        non_padding = (tags_flattened != loss_func.ignore_index)
        epoch_acc += (preds_flattened.argmax(dim=-1) == tags_flattened)[non_padding].sum().item() / non_padding.sum().item()
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

def eval_epoch(tagger, iterator, loss_func):
    tagger.eval()
    epoch_loss = 0
    epoch_acc = 0
    with torch.no_grad():
        for batch in iterator:
            # Forward pass
            text, text_lens = batch.text
            tags, tags_lens = batch.tags
            preds = tagger(text, text_lens)

            # Calculate loss
            preds_flattened = preds.view(-1, preds.size(-1))
            tags_flattened = tags.flatten()
            loss = loss_func(preds_flattened, tags_flattened)
            
            # Accumulate loss and acc
            epoch_loss += loss.item()
            non_padding = (tags_flattened != loss_func.ignore_index)
            epoch_acc += (preds_flattened.argmax(dim=-1) == tags_flattened)[non_padding].sum().item() / non_padding.sum().item()
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

In [14]:
import time
N_EPOCHS = 10
best_valid_loss = np.inf

for epoch in range(N_EPOCHS):
    t0 = time.time()
    train_loss, train_acc = train_epoch(tagger, train_iterator, optimizer, loss_func)
    valid_loss, valid_acc = eval_epoch(tagger, valid_iterator, loss_func)
    epoch_secs = time.time() - t0

    epoch_mins, epoch_secs = int(epoch_secs // 60), int(epoch_secs % 60)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(tagger.state_dict(), 'models/tut5-model-1.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 3s
	Train Loss: 1.443 | Train Acc: 51.89%
	 Val. Loss: 0.667 |  Val. Acc: 80.78%
Epoch: 02 | Epoch Time: 0m 3s
	Train Loss: 0.555 | Train Acc: 83.67%
	 Val. Loss: 0.393 |  Val. Acc: 88.49%
Epoch: 03 | Epoch Time: 0m 4s
	Train Loss: 0.388 | Train Acc: 88.65%
	 Val. Loss: 0.308 |  Val. Acc: 90.93%
Epoch: 04 | Epoch Time: 0m 3s
	Train Loss: 0.312 | Train Acc: 90.77%
	 Val. Loss: 0.264 |  Val. Acc: 92.22%
Epoch: 05 | Epoch Time: 0m 3s
	Train Loss: 0.267 | Train Acc: 92.03%
	 Val. Loss: 0.237 |  Val. Acc: 93.05%
Epoch: 06 | Epoch Time: 0m 3s
	Train Loss: 0.237 | Train Acc: 92.89%
	 Val. Loss: 0.223 |  Val. Acc: 93.36%
Epoch: 07 | Epoch Time: 0m 4s
	Train Loss: 0.214 | Train Acc: 93.56%
	 Val. Loss: 0.212 |  Val. Acc: 93.84%
Epoch: 08 | Epoch Time: 0m 3s
	Train Loss: 0.195 | Train Acc: 94.17%
	 Val. Loss: 0.200 |  Val. Acc: 94.14%
Epoch: 09 | Epoch Time: 0m 3s
	Train Loss: 0.177 | Train Acc: 94.66%
	 Val. Loss: 0.188 |  Val. Acc: 94.27%
Epoch: 10 | Epoch Time: 0m 4

In [15]:
tagger.load_state_dict(torch.load('models/tut5-model-1.pt'))

valid_loss, valid_acc = eval_epoch(tagger, valid_iterator, loss_func)
test_loss, test_acc = eval_epoch(tagger, test_iterator, loss_func)

print(f'Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Val. Loss: 0.185 | Val. Acc: 94.37%
Test Loss: 0.183 | Test Acc: 94.74%


## Building the Model - BiLSTM-CRF

In [16]:
from torchcrf import CRF

class PoSTagger(nn.Module):
    def __init__(self, voc_dim, emb_dim, hid_dim, tag_dim, n_layers, bidirect, dropout, text_pad_idx):
        super().__init__()
        self.emb = nn.Embedding(voc_dim, emb_dim, padding_idx=text_pad_idx)
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, 
                           bidirectional=bidirect, dropout=dropout)
        self.hid2tag = nn.Linear(hid_dim*2 if bidirect else hid_dim, tag_dim)
        self.crf = CRF(tag_dim)
        self.dropout = nn.Dropout(dropout)

    def _get_rnn_features(self, text: torch.Tensor, seq_lens: torch.Tensor):
        # embedded: (step, batch, emb_dim)
        embedded = self.dropout(self.emb(text))
        # Pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, seq_lens, enforce_sorted=False)
        # hidden: (num_layers*num_directions, batch, hid_dim)
        packed_outs, (hidden, cell) = self.rnn(packed_embedded)
        # Unpack sequence
        # outs: (step, batch, hid_dim)
        outs, out_lens = nn.utils.rnn.pad_packed_sequence(packed_outs)

        # feats: (step, batch, tag_dim)
        feats = self.hid2tag(self.dropout(outs))
        return feats

    def forward(self, text: torch.Tensor, seq_lens: torch.Tensor, tags: torch.Tensor):
        # text/mask: (step, batch)
        mask = (text == self.emb.padding_idx)
        feats = self._get_rnn_features(text, seq_lens)
        
        # losses: (batch)
        losses = -self.crf(feats, tags, mask=(~mask).type(torch.uint8), reduction='none')
        return losses

    def decode(self, text: torch.Tensor, seq_lens: torch.Tensor):
        # text/mask: (step, batch)
        mask = (text == self.emb.padding_idx)
        feats = self._get_rnn_features(text, seq_lens)

        best_paths = self.crf.decode(feats, mask=(~mask).type(torch.uint8))
        return best_paths

## Training the Model - BiLSTM-CRF

In [17]:
def init_weights(m):
    for name, param in m.named_parameters():
        # NOTE: The CRF parameters have already been initialized. 
        if not name.startswith('crf'):
            nn.init.normal_(param.data, mean=0, std=0.1)

def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


tagger = PoSTagger(VOC_DIM, EMB_DIM, HID_DIM, TAG_DIM, N_LAYERS, 
                   BIDIRECT, DROPOUT, TEXT_PAD_IDX).to(device)

tagger.apply(init_weights)
print(f'The model has {count_parameters(tagger):,} trainable parameters')

The model has 1,476,170 trainable parameters


In [18]:
# Initialize Embeddings with Pre-Trained Vectors
print(TEXT.vocab.vectors.size())
print(tagger.emb.weight.size())

tagger.emb.weight.data.copy_(TEXT.vocab.vectors)

TEXT_UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
tagger.emb.weight.data[TEXT_UNK_IDX].zero_()
tagger.emb.weight.data[TEXT_PAD_IDX].zero_()

print(tagger.emb.weight[:5, :8])

torch.Size([8389, 100])
torch.Size([8389, 100])
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.1077,  0.1105,  0.5981, -0.5436,  0.6740,  0.1066,  0.0389,  0.3548],
        [-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344],
        [-0.3398,  0.2094,  0.4635, -0.6479, -0.3838,  0.0380,  0.1713,  0.1598]],
       device='cuda:3', grad_fn=<SliceBackward>)


In [19]:
optimizer = optim.AdamW(tagger.parameters())

In [20]:
def train_epoch(tagger, iterator, optimizer):
    tagger.train()
    epoch_loss = 0
    epoch_acc = 0
    for batch in iterator:
        # Forward pass & Calculate loss
        text, text_lens = batch.text
        tags, tags_lens = batch.tags
        losses = tagger(text, text_lens, tags)
        loss = losses.mean()

        # Backward propagation
        optimizer.zero_grad()
        loss.backward()
        # Update weights
        optimizer.step()
        # Accumulate loss and acc
        epoch_loss += loss.item()

        best_paths = tagger.decode(text, text_lens)
        best_paths = torch.tensor([path + [TAG_PAD_IDX]*(tags.size(0)-len(path)) for path in best_paths], device=device).T
        non_padding = (tags != TAG_PAD_IDX)
        epoch_acc += (best_paths == tags)[non_padding].sum().item() / non_padding.sum().item()
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

def eval_epoch(tagger, iterator):
    tagger.eval()
    epoch_loss = 0
    epoch_acc = 0
    with torch.no_grad():
        for batch in iterator:
            # Forward pass & Calculate loss
            text, text_lens = batch.text
            tags, tags_lens = batch.tags
            losses = tagger(text, text_lens, tags)
            loss = losses.mean()
            
            # Accumulate loss and acc
            epoch_loss += loss.item()

            best_paths = tagger.decode(text, text_lens)
            best_paths = torch.tensor([path + [TAG_PAD_IDX]*(tags.size(0)-len(path)) for path in best_paths], device=device).T
            non_padding = (tags != TAG_PAD_IDX)
            epoch_acc += (best_paths == tags)[non_padding].sum().item() / non_padding.sum().item()
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

In [21]:
import time
N_EPOCHS = 10
best_valid_loss = np.inf

for epoch in range(N_EPOCHS):
    t0 = time.time()
    train_loss, train_acc = train_epoch(tagger, train_iterator, optimizer)
    valid_loss, valid_acc = eval_epoch(tagger, valid_iterator)
    epoch_secs = time.time() - t0

    epoch_mins, epoch_secs = int(epoch_secs // 60), int(epoch_secs % 60)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(tagger.state_dict(), 'models/tut5-model-2.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 19s
	Train Loss: 33.339 | Train Acc: 55.11%
	 Val. Loss: 15.287 |  Val. Acc: 81.67%
Epoch: 02 | Epoch Time: 0m 20s
	Train Loss: 12.655 | Train Acc: 84.45%
	 Val. Loss: 9.080 |  Val. Acc: 88.67%
Epoch: 03 | Epoch Time: 0m 18s
	Train Loss: 8.909 | Train Acc: 88.95%
	 Val. Loss: 7.097 |  Val. Acc: 91.06%
Epoch: 04 | Epoch Time: 0m 17s
	Train Loss: 7.144 | Train Acc: 90.99%
	 Val. Loss: 6.036 |  Val. Acc: 92.41%
Epoch: 05 | Epoch Time: 0m 19s
	Train Loss: 6.076 | Train Acc: 92.24%
	 Val. Loss: 5.463 |  Val. Acc: 92.96%
Epoch: 06 | Epoch Time: 0m 19s
	Train Loss: 5.355 | Train Acc: 93.18%
	 Val. Loss: 4.979 |  Val. Acc: 93.66%
Epoch: 07 | Epoch Time: 0m 19s
	Train Loss: 4.774 | Train Acc: 93.83%
	 Val. Loss: 4.641 |  Val. Acc: 93.89%
Epoch: 08 | Epoch Time: 0m 20s
	Train Loss: 4.325 | Train Acc: 94.34%
	 Val. Loss: 4.422 |  Val. Acc: 94.27%
Epoch: 09 | Epoch Time: 0m 19s
	Train Loss: 3.978 | Train Acc: 94.80%
	 Val. Loss: 4.287 |  Val. Acc: 94.38%
Epoch: 10 | Epoc

In [22]:
tagger.load_state_dict(torch.load('models/tut5-model-2.pt'))

valid_loss, valid_acc = eval_epoch(tagger, valid_iterator)
test_loss, test_acc = eval_epoch(tagger, test_iterator)

print(f'Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Val. Loss: 4.198 | Val. Acc: 94.53%
Test Loss: 4.082 | Test Acc: 94.80%
