In [1]:
import os
import sys 
import time
from typing import List

import json 

import math 

from matplotlib import pyplot as plt 

import numpy as np 

import torch 
from torch import nn, Tensor
from torch.nn import Linear, Transformer
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from transformers import BertTokenizer

# Data: EN-DE

First, let's get a feel for the data. 

In [2]:
# Using this for convenience just for now
en_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
de_tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")
de_tokenizer.bos_token = "[BOS]"
de_tokenizer.eos_token = "[EOS]"
print(en_tokenizer)
print(de_tokenizer)

PreTrainedTokenizer(name_or_path='bert-base-cased', vocab_size=28996, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
PreTrainedTokenizer(name_or_path='bert-base-german-cased', vocab_size=30000, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[BOS]', 'eos_token': '[EOS]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [3]:
with open("data/raw_data/mmt_wmt17_train.en") as f: 
    en_data = [x.strip() for x in f.readlines()]

with open("data/raw_data/mmt_wmt17_train.de") as f: 
    # we're going to use sep as bos and 
    de_data = [de_tokenizer.bos_token + x.strip() + de_tokenizer.eos_token for x in f.readlines()]
    
en_de_data = [(x, y) for x,y in zip(en_data, de_data)]

In [4]:
[print(x) for x in en_de_data[:10]]

('Two young, White males are outside near many bushes.', '[BOS]Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.[EOS]')
('Several men in hard hats are operating a giant pulley system.', '[BOS]Mehrere Männer mit Schutzhelmen bedienen ein Antriebsradsystem.[EOS]')
('A little girl climbing into a wooden playhouse.', '[BOS]Ein kleines Mädchen klettert in ein Spielhaus aus Holz.[EOS]')
('A man in a blue shirt is standing on a ladder cleaning a window.', '[BOS]Ein Mann in einem blauen Hemd steht auf einer Leiter und putzt ein Fenster.[EOS]')
('Two men are at the stove preparing food.', '[BOS]Zwei Männer stehen am Herd und bereiten Essen zu.[EOS]')
('A man in green holds a guitar while the other man observes his shirt.', '[BOS]Ein Mann in grün hält eine Gitarre, während der andere Mann sein Hemd ansieht.[EOS]')
('A man is smiling at a stuffed lion', '[BOS]Ein Mann lächelt einen ausgestopften Löwen an.[EOS]')
('A trendy girl talking on her cellphone while gliding slowly down th

[None, None, None, None, None, None, None, None, None, None]

In [5]:
# We'll be using torch.utils.DataLoader a lot 
loader = DataLoader(en_de_data[:5], batch_size=3)

for x in loader:
    print(x)

[('Two young, White males are outside near many bushes.', 'Several men in hard hats are operating a giant pulley system.', 'A little girl climbing into a wooden playhouse.'), ('[BOS]Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.[EOS]', '[BOS]Mehrere Männer mit Schutzhelmen bedienen ein Antriebsradsystem.[EOS]', '[BOS]Ein kleines Mädchen klettert in ein Spielhaus aus Holz.[EOS]')]
[('A man in a blue shirt is standing on a ladder cleaning a window.', 'Two men are at the stove preparing food.'), ('[BOS]Ein Mann in einem blauen Hemd steht auf einer Leiter und putzt ein Fenster.[EOS]', '[BOS]Zwei Männer stehen am Herd und bereiten Essen zu.[EOS]')]


en_lens = [len(x) for x in en_tokenizer(en_data)['input_ids']]
plt.hist(en_lens)
plt.title("Lengths of input sequences")
plt.show()

de_lens = [len(x) for x in de_tokenizer(de_data)['input_ids']]
plt.hist(de_lens)
plt.title("Lengths of output sequences")
plt.show()

In [6]:
# Based on our data, we declare the following parameters: 

SRC_SEQ_LEN = 55
TGT_SEQ_LEN = 50

# Model Architecture

First, let's define how we're doing positional encodings. We're going to use learned positional encodings

In [7]:
class PositionalEmbedding(nn.Module): 
    def __init__(self, vocab_size, d_embedding, max_seq_len): 
        super().__init__()
        self.max_seq_len = max_seq_len
        self.token_embeddings = nn.Embedding(vocab_size, d_embedding)
        self.pos_embeddings = nn.Embedding(max_seq_len, d_embedding)
    
    def forward(self, x : Tensor) -> Tensor: 
        """
        Args: 
            x : Tensor, shape [batch_size, seq_len]
        """
        a = self.token_embeddings(x)
        
        positions = torch.arange(x.size(1)).expand(x.shape[0], -1)
        b = self.pos_embeddings(positions)
        return a + b

In [8]:
class TransformerModel(nn.Module): 
    def __init__(self, d_model, dim_ff, nhead, num_encoder_layers, num_decoder_layers, 
                 src_vocab_size, tgt_vocab_size, src_max_len, tgt_max_len, dropout, 
                activation="gelu"): 
        super().__init__()
        self.src_embedding = PositionalEmbedding(src_vocab_size, d_model, src_max_len)
        self.tgt_embedding = PositionalEmbedding(tgt_vocab_size, d_model, tgt_max_len)
        
        self.transformer = Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, 
                                      num_decoder_layers=num_decoder_layers, dim_feedforward=dim_ff, 
                                      dropout=dropout, activation=activation, batch_first=True)
        
        self.lm_head = nn.Linear(d_model, tgt_vocab_size)
                
        self.nhead = nhead
    
    def forward(self, src, tgt, src_padding_mask, tgt_padding_mask): 
        batch_size = src.size(0)
        tgt_seq_len = tgt.size(-1)

        src_vecs = self.src_embedding(src)
        tgt_vecs = self.tgt_embedding(tgt)
                
        clm_mask = get_clm_mask(self.nhead*batch_size, tgt_seq_len)
                
        # Note that in pytorch, mask[i,j]=1 means don't attend, so we flip 
        # the outputs of huggingface tokenizer 
        x = self.transformer(src=src_vecs, tgt=tgt_vecs, tgt_mask=clm_mask, 
                             src_key_padding_mask=src_padding_mask==0, 
                             tgt_key_padding_mask=tgt_padding_mask==0)
        
        out = self.lm_head(x)
        
        return out 
        
        

In [9]:
# This is a quick method for making causal attention masks 
def clm_mask(batch_size, size): 
    attn_shape = (1, size, size)
    mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return (torch.from_numpy(mask)==1).expand(batch_size, -1, -1)

print("Causal language modelling mask:")
print(clm_mask(1, 10))

Causal language modelling mask:
tensor([[[False,  True,  True,  True,  True,  True,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True,  True,  True,  True,  True],
         [False, False, False,  True,  True,  True,  True,  True,  True,  True],
         [False, False, False, False,  True,  True,  True,  True,  True,  True],
         [False, False, False, False, False,  True,  True,  True,  True,  True],
         [False, False, False, False, False, False,  True,  True,  True,  True],
         [False, False, False, False, False, False, False,  True,  True,  True],
         [False, False, False, False, False, False, False, False,  True,  True],
         [False, False, False, False, False, False, False, False, False,  True],
         [False, False, False, False, False, False, False, False, False, False]]])


In [10]:
def get_clm_mask(batch_size, seq_len): 
    attn_shape = (batch_size, seq_len, seq_len)
    mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(mask)==1

In [11]:
# Let's make sure everything works
toy_model = TransformerModel(d_model=16, dim_ff=64, nhead=2, num_encoder_layers=3, num_decoder_layers=3, 
                            src_vocab_size=en_tokenizer.vocab_size, tgt_vocab_size=de_tokenizer.vocab_size, 
                            src_max_len=SRC_SEQ_LEN, tgt_max_len=TGT_SEQ_LEN, dropout=0.1)

toy_model.eval()

batch = en_tokenizer(en_data[0:5], return_tensors="pt", padding='max_length', max_length=SRC_SEQ_LEN)
batch_out = de_tokenizer(de_data[0:5], return_tensors="pt", padding='max_length', max_length=TGT_SEQ_LEN)

batch_size = batch['input_ids'].size(0)

out = toy_model(src=batch["input_ids"], tgt=batch_out["input_ids"], 
                    src_padding_mask=batch["attention_mask"], 
                   tgt_padding_mask=batch_out["attention_mask"])
print(out.shape)

torch.Size([5, 50, 30000])


# Training Utilities

In [12]:
def inv_sqrt_lambda(d_model, num_warmup_steps): 
    return lambda step: min(math.pow(step+1, -0.5), (step+1) * math.pow((num_warmup_steps+1), -1.5))

### LR scheduler demo

num_warmup_steps = 2000
lr = 0.05
lrs = []
dummy_model = nn.Linear(1,1)
optimizer = AdamW(dummy_model.parameters(), lr)

scheduler = LambdaLR(optimizer, lr_lambda=inv_sqrt_lambda(512, num_warmup_steps))

for _ in range(25_000):
    dummy_input = torch.zeros((1, 1))
    dummy_loss = dummy_model(dummy_input)
    dummy_loss.backward()
    optimizer.step()
    scheduler.step()
    lrs.append(scheduler.get_last_lr())

plt.plot(lrs)
plt.show()

## Evaluation

In [13]:
def evaluate(model : TransformerModel, eval_data: List[str], src_tokenizer, 
             tgt_tokenizer, eval_batch_size) -> float: 
    model.eval()
    
    loss_fn = nn.CrossEntropyLoss(reduction='sum', label_smoothing=0.1)
    
    total_loss = 0 
    loader = DataLoader(eval_data, eval_batch_size, drop_last=False)
        
    with torch.no_grad():
        print("EVALUATING")
        for srcs, tgts in tqdm(loader):
            src_tokens = src_tokenizer(list(srcs), return_tensors="pt", padding=True)
            tgt_tokens = tgt_tokenizer(list(tgts), return_tensors="pt", padding=True)
            
            out = model(src_tokens['input_ids'], tgt_tokens['input_ids'], 
                       src_tokens['attention_mask'], tgt_tokens['attention_mask'])
            
            loss = loss_fn(out[:, :-1, :], tgt_tokens['input_ids'])
            
            total_loss += loss.item()
        
    return total_loss/len(eval_src)

In [14]:
def train(model: TransformerModel, train_data: List[str], eval_data: List[str], optimizer, scheduler, 
          num_steps, batch_size, eval_batch_size, src_tokenizer, tgt_tokenizer, write_dir: str, grad_clip=0.5, 
          log_steps=100, eval_steps=1000): 
    
    writer = SummaryWriter(log_dir=write_dir)
    
    loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)
    
    loader = DataLoader(train_data, batch_size=batch_size, drop_last=True, shuffle=True)
    
    i = 0 
    epoch = 0 
    total_loss = 0
    start_time = time.time()
    print(f"EPOCH {epoch}, STEP {i}")
    while i<=num_steps:
        for srcs, tgts in loader: 
            src_tokens = src_tokenizer(list(srcs), return_tensors="pt", padding=True)
            tgt_tokens = tgt_tokenizer(list(tgts), return_tensors="pt", padding=True)
            print(srcs)
            print(tgts)
            print(src_tokens['input_ids'])
            print(tgt_tokens['input_ids'])
            print(de_tokenizer.decode(src_tokens['input_ids'].tolist()[0]))
            
            output = model(src_tokens['input_ids'], tgt_tokens['input_ids'], src_tokens["attention_mask"], 
                          tgt_tokens["attention_mask"])
            
            print(output[:, :, :].shape)
            
            flattened_outs = output[:, :-1, :].view(-1, tgt_tokenizer.vocab_size)
            aligned_seqlen = tgt_tokens['input_ids'].size(1) - 1
            batch_size = tgt_tokens['input_ids'].size(0)
            target = tgt_tokens['input_ids'][:, 1:].view(aligned_seqlen*batch_size)
            
            loss = loss_fn(flattened_outs, target)
            
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm(model.parameters(), grad_clip)
            i+=1 
            
            if i%log_steps==0: 
                last_lr = scheduler.get_last_lr()
                avg_loss = total_loss/log_steps 
                ms_per_batch = (time.time()-start_time)*1_000 /log_steps
                
                writer.add_scalar('lr', last_lr, i)
                writer.add_scalar('loss/train', avg_loss, i)
                writer.add_scalar('ms/batch', ms_per_batch, i)
                
                
            if i%eval_steps==0: 
                val_loss = evaluate(model, eval_data, src_tokenizer, tgt_tokenizer, eval_batch_size)
                
                writer.add_scalar('loss/val', val_loss, i)
    

# A Synthetic Task

We're going to train our model on a very simple task: the copy task, where we have an alphabet of 5 characters (!, @, #, $, ^), and the model's job is to copy the source sequence. 

In [15]:
from random import choices, randrange
def data_gen(): 
    alphabet = ['!', '@', '#', '$', '^']
    copy = []
    for _ in range(10_000): 
        k = randrange(1, 15)
        seq = "".join(choices(alphabet, k=k))
        copy.append((seq, de_tokenizer.bos_token+seq+de_tokenizer.eos_token))
    return copy

copy = data_gen()

copy_train = copy[:8000]
copy_val = copy[8000:9000]
copy_test = copy[9000:]

for pair in copy_train[:10]:
    print(pair)

('$', '[BOS]$[EOS]')
('@', '[BOS]@[EOS]')
('$', '[BOS]$[EOS]')
('@^!^@#^', '[BOS]@^!^@#^[EOS]')
('!^@^#@', '[BOS]!^@^#@[EOS]')
('!^#@!^!###!', '[BOS]!^#@!^!###![EOS]')
('^!#!', '[BOS]^!#![EOS]')
('#$$#', '[BOS]#$$#[EOS]')
('#$!^$#!#@', '[BOS]#$!^$#!#@[EOS]')
('#!$$$^^^$!!!@^', '[BOS]#!$$$^^^$!!!@^[EOS]')


Now we train the simple model 

In [16]:
D_MODEL = 16
toy_model = TransformerModel(d_model=D_MODEL, dim_ff=64, nhead=2, num_encoder_layers=3, num_decoder_layers=3, 
                            src_vocab_size=en_tokenizer.vocab_size, tgt_vocab_size=de_tokenizer.vocab_size, 
                            src_max_len=15, tgt_max_len=17, dropout=0.1)

optimizer = AdamW(toy_model.parameters(), lr=1/math.sqrt(D_MODEL))
num_warmup_steps=100 
scheduler = LambdaLR(optimizer, lr_lambda=inv_sqrt_lambda(D_MODEL, num_warmup_steps))

train(model=toy_model, train_data=copy_train, eval_data=copy_val, optimizer=optimizer, scheduler=scheduler, 
     num_steps=1000, batch_size=4, eval_batch_size=50, src_tokenizer=de_tokenizer, tgt_tokenizer=de_tokenizer, 
     write_dir = "runs/test1", log_steps=100, eval_steps=500)

EPOCH 0, STEP 0
('!$^!^^', '!###^#!^!$!@', '^^#@', '!!$!#$$$##')
('[BOS]!$^!^^[EOS]', '[BOS]!###^#!^!$!@[EOS]', '[BOS]^^#@[EOS]', '[BOS]!!$!#$$$##[EOS]')
tensor([[    3, 26982, 26992, 26999, 26982, 26999, 26999,     4,     0,     0,
             0,     0,     0,     0],
        [    3, 26982, 26990, 26990, 26990, 26999, 26990, 26982, 26999, 26982,
         26992, 26982, 26991,     4],
        [    3, 26999, 26999, 26990, 26991,     4,     0,     0,     0,     0,
             0,     0,     0,     0],
        [    3, 26982, 26982, 26992, 26982, 26990, 26992, 26992, 26992, 26990,
         26990,     4,     0,     0]])
tensor([[    3, 26984,    35, 10053, 26985, 26982, 26992, 26999, 26982, 26999,
         26999, 26984,    55, 10053, 26985,     4,     0,     0,     0,     0,
             0,     0],
        [    3, 26984,    35, 10053, 26985, 26982, 26990, 26990, 26990, 26999,
         26990, 26982, 26999, 26982, 26992, 26982, 26991, 26984,    55, 10053,
         26985,     4],
        [    

IndexError: index out of range in self