In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

SEED = 515
random.seed(SEED)
np.random.seed(SEED)
# This function also invokes `torch.cuda.manual_seed_all(SEED)`.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Preparing Data
In [Sequence to Sequence Learning with Neural Networks](https://arxiv.org/abs/1409.3215), th authors find it beneficial to reverse the order of the input which they believe "introduces many short term dependencies in the data that make the optimization problem much easier". 

In [9]:
import spacy
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de_rev(text):
    """
    Tokenize German text and reverse it. 
    """
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def Tokenize_en(text):
    """
    Tokenize English text.
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [33]:
from torchtext.data import Field, BucketIterator

SRC = Field(tokenize=tokenize_de_rev, init_token='<sos>', eos_token='<eos>', 
            lower=True, include_lengths=True)
TRG = Field(tokenize=Tokenize_en, init_token='<sos>', eos_token='<eos>', 
            lower=True, include_lengths=True)

In [34]:
from torchtext.datasets import Multi30k

train_data, valid_data, test_data = Multi30k.splits(exts=['.de', '.en'], 
                                                    # fields=[SRC, TRG], 
                                                    fields=[('src', SRC), ('trg', TRG)], 
                                                    root='data/')

In [35]:
print(train_data[0].src)
print(train_data[0].trg)

['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei']
['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [36]:
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

len(SRC.vocab), len(TRG.vocab)

(7855, 5893)

In [37]:
BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE, device=device)

In [39]:
for batch in train_iterator:
    batch_src, batch_src_lens = batch.src
    batch_trg, batch_trg_lens = batch.trg
    break
print(batch_src)
print(batch_src_lens)
print(batch_trg)
print(batch_trg_lens)

tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [  4,   4,   4,  ...,   4,   4,   4],
        [547, 441, 187,  ..., 284, 123, 120],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]])
tensor([13, 15, 18,  8, 17, 13, 11, 11, 15, 16, 23, 21, 16, 13,  9, 10, 11, 15,
        18, 13, 17, 14, 11, 20, 18, 19, 17, 10, 20, 19, 13, 17, 13, 14, 19, 10,
        21,  9, 13, 14, 14, 14, 13, 18, 11, 17, 14, 13, 14,  9, 13, 11, 13, 13,
        12, 25, 12,  9, 15, 11, 14, 11, 24, 17, 12, 10, 14, 15, 15, 15, 12, 10,
        23, 11, 20, 17, 16, 19, 14, 16,  8, 15, 15, 22, 14, 12, 13, 12, 13, 17,
        21, 12,  9, 14, 15, 17, 12, 13, 15, 23, 18, 17, 19, 12, 11, 13, 22, 16,
        12, 11, 16, 10, 19, 16, 11, 17, 18, 12, 18, 15, 19, 18, 21, 12, 11, 11,
        16, 14])
tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [ 48,  16,  16,  ...,   4,   4,   4],
        [112,  50,  70,  ...,  14,  34,  

# Build Model
## Encoder

In [46]:
class Encoder(nn.Module):
    def __init__(self, in_dim, emb_dim, hid_dim, n_layers, dropout, pad_idx):
        super().__init__()
        self.emb = nn.Embedding(in_dim, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_lens):
        # src: (step, batch)
        embedded = self.dropout(self.emb(src))

        # Pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_lens, enforce_sorted=False)
        # hidden/cell: (num_layers*num_directions, batch, hid_dim)
        packed_outs, (hidden, cell) = self.rnn(packed_embedded)
        return hidden, cell

In [47]:
SRC_IN_DIM = len(SRC.vocab)
TRG_IN_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
ENC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
DEC_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

encoder = Encoder(SRC_IN_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT, ENC_PAD_IDX)
hidden, cell = encoder(batch_src, batch_src_lens)

print(batch_src.size())
print(hidden.size())
print(cell.size())

torch.Size([25, 128])
torch.Size([2, 128, 512])
torch.Size([2, 128, 512])


## Decoder

In [51]:
class Decoder(nn.Module):
    def __init__(self, in_dim, emb_dim, hid_dim, n_layers, dropout, pad_idx):
        super().__init__()
        self.emb = nn.Embedding(in_dim, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, dropout=dropout)
        # The output dimension equals the input dimension for the decoder.
        self.fc = nn.Linear(hid_dim, in_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, hidden, cell):
        """
        One-step forward. 
        """
        # trg: (step=1, batch)
        embedded = self.dropout(self.emb(trg))

        # outs: (step=1, batch, hid_dim)
        # hidden/cell: (num_layers*num_directions, batch, hid_dim)
        outs, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # preds: (step=1, batch, out_dim=in_dim)
        preds = self.fc(outs)
        return preds, hidden, cell

In [56]:
decoder = Decoder(TRG_IN_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT, DEC_PAD_IDX)
preds, hidden, cell = decoder(batch_trg[0].unsqueeze(0), hidden, cell)

print(batch_trg.size())
print(preds.size())
print(hidden.size())
print(cell.size())

torch.Size([28, 128])
torch.Size([1, 128, 5893])
torch.Size([2, 128, 512])
torch.Size([2, 128, 512])


## Seq2Seq

In [75]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, src_lens, trg, teacher_forcing_ratio=0.5):
        # src: (step, batch)
        # trg: (step, batch)
        hidden, cell = encoder(src, src_lens)
        
        preds = []
        # The first input to the decoder is the <sos> token. 
        # trg_t: (step=1, batch)
        trg_t = trg[0].unsqueeze(0)
        for t in range(1, trg.size(0)):
            # preds_t: (step=1, batch, trg_out_dim)
            preds_t, hidden, cell = decoder(trg_t, hidden, cell)
            # top1: (step=1, batch)
            top1 = preds_t.argmax(dim=-1)
            if np.random.rand() < teacher_forcing_ratio:
                trg_t = trg[t].unsqueeze(0)
            else:
                trg_t = top1
            preds.append(preds_t)
        # preds: (step-1, batch, trg_out_dim)
        return torch.cat(preds, dim=0)

In [79]:
model = Seq2Seq(encoder, decoder)
preds = model(batch_src, batch_src_lens, batch_trg)

print(batch_trg.size())
print(preds.size())

torch.Size([28, 128])
torch.Size([27, 128, 5893])


# Train Model

In [82]:
def init_weights(m: nn.Module):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


encoder = Encoder(SRC_IN_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT, ENC_PAD_IDX)
decoder = Decoder(TRG_IN_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT, DEC_PAD_IDX)
model = Seq2Seq(encoder, decoder).to(device)

model.apply(init_weights)
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 13,899,013 trainable parameters


In [83]:
loss_func = nn.CrossEntropyLoss(ignore_index=DEC_PAD_IDX, reduction='mean')
optimizer = optim.AdamW(model.parameters())

In [84]:
def train_epoch(model, iterator, optimizer, loss_func, clip):
    model.train()
    epoch_loss = 0
    for batch in iterator:
        # Forward pass
        batch_src, batch_src_lens = batch.src
        batch_trg, batch_trg_lens = batch.trg
        # preds: (step-1, batch, trg_out_dim)
        preds = model(batch_src, batch_src_lens, batch_trg)
        
        # Calculate loss
        preds_flattened = preds.view(-1, preds.size(-1))
        batch_trg_flattened = batch_trg[1:].flatten()
        loss = loss_func(preds_flattened, batch_trg_flattened)

        # Backward propagation
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)

        # Update weights
        optimizer.step()
        # Accumulate loss
        epoch_loss += loss.item()
    return epoch_loss/len(iterator)

def eval_epoch(model, iterator, loss_func):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in iterator:
            # Forward pass
            batch_src, batch_src_lens = batch.src
            batch_trg, batch_trg_lens = batch.trg
            # preds: (step-1, batch, trg_out_dim)
            preds = model(batch_src, batch_src_lens, batch_trg)
            
            # Calculate loss
            preds_flattened = preds.view(-1, preds.size(-1))
            batch_trg_flattened = batch_trg[1:].flatten()
            loss = loss_func(preds_flattened, batch_trg_flattened)
            
            # Accumulate loss and acc
            epoch_loss += loss.item()
    return epoch_loss/len(iterator)

In [86]:
import time
N_EPOCHS = 10
CLIP = 1
best_valid_loss = np.inf

for epoch in range(N_EPOCHS):
    t0 = time.time()
    train_loss = train_epoch(model, train_iterator, optimizer, loss_func, CLIP)
    valid_loss = eval_epoch(model, valid_iterator, loss_func)
    epoch_secs = time.time() - t0

    epoch_mins, epoch_secs = int(epoch_secs // 60), int(epoch_secs % 60)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'models/tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {np.exp(valid_loss):7.3f}')

Epoch: 01 | Epoch Time: 12m 32s


NameError: name 'math' is not defined

In [90]:
model.load_state_dict(torch.load('models/tut1-model.pt'))

valid_loss = eval_epoch(model, valid_iterator, loss_func)
test_loss = eval_epoch(model, test_iterator, loss_func)

print(f'Val. Loss: {valid_loss:.3f} |  Val. PPL: {np.exp(valid_loss):7.3f}')
print(f'Test Loss: {test_loss:.3f} |  Test PPL: {np.exp(test_loss):7.3f}')

Val. Loss: 4.832 |  Val. PPL: 125.522
Test Loss: 4.834 |  Test PPL: 125.724
