<a href="https://colab.research.google.com/github/sysgenerated/transformer-from-scratch/blob/main/TransformerFromScratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PyTorch Transformer From Scratch

This notebook implements a BERT-style transformer encoder from standard PyTorch modules and trains it on a masked-word prediction task.

It includes the following sections:
0. Import Libraries
1. Initialize Variables
2. Create Data Loader
3. Load Dataset
4. Create Transformer
5. Create Model
6. Train Model
7. Evaluate Model

## 0. Import Libraries

In [22]:
from torch.utils.data import Dataset
import torch.nn.functional as F
from collections import Counter
from os.path import exists
import torch.optim as optim
import torch.nn as nn
import requests
import numpy as np
import random
import torch
import math
import re

## 1. Initialize Variables

In [2]:
print("Initializing...")

n_vocab = 40000                 # Maximum number of distinct tokens kept in the vocabulary (one token per word)
batch_size = 128                # Number of sequences per batch (passed to the DataLoader)
seq_len = 20                    # Number of tokens in every training sequence
n_encoders = 8                  # Number of stacked encoder layers
n_heads = 8                     # Number of attention heads within each encoder layer
embed_size = 128                # Length of each token embedding vector
inner_ff_size = embed_size * 4  # Hidden width of the feed-forward sub-layer inside each encoder layer
dropout = 0.1                   # Dropout probability used during training
n_workers = 12                  # Number of DataLoader worker processes

optim_kwargs = {'lr':2e-3, 'weight_decay':1e-4, 'betas':(.9,.999)}  # Keyword arguments forwarded to the Adam optimizer

n_iteration = 30000             # Number of training iterations
print_each = 5                  # Print training results every `print_each` iterations


Initializing...


## 2. Create Data Loader

In [3]:
class SentencesDataset(Dataset):
    """Fixed-length, randomly masked token sequences for masked-word prediction.

    Each item starts at sentence `index` and keeps appending the following
    sentences (wrapping around to the first one) until `seq_len` token ids are
    available; the result is truncated to exactly `seq_len` ids.

    Args:
        sentences: list of sentences, each a list of word strings.
        vocab: list of known words; position in the list becomes the word id.
        seq_len: number of token ids in every returned sequence.

    Attributes:
        vocab: dict mapping word -> id (the three special tokens are appended last).
        rvocab: dict mapping id -> word.
        IGNORE_IDX, OUT_OF_VOCAB_IDX, MASK_IDX: ids of '<ignore>', '<oov>', '<mask>'.
    """

    def __init__(self, sentences, vocab, seq_len):
        self.sentences = sentences
        tokens = list(vocab) + ['<ignore>', '<oov>', '<mask>']
        self.vocab = {word: idx for idx, word in enumerate(tokens)}   # word -> int
        self.rvocab = {idx: word for word, idx in self.vocab.items()} # int -> word
        self.seq_len = seq_len

        self.IGNORE_IDX = self.vocab['<ignore>']         # Target id for positions that do not contribute to the loss
        self.OUT_OF_VOCAB_IDX = self.vocab['<oov>']      # Input id for words missing from the vocabulary
        self.MASK_IDX = self.vocab['<mask>']             # Input id that hides a word the model must predict

    def __getitem__(self, index, p_random_mask=0.15):
        """Build one masked training example.

        Args:
            index: index of the first sentence to read.
            p_random_mask: probability of masking each individual token.

        Returns:
            dict with two int64 tensors of shape [seq_len]:
            'input'  - token ids, with masked positions replaced by MASK_IDX;
            'target' - the original id at masked positions, IGNORE_IDX elsewhere.

        Raises:
            ValueError: if the dataset holds no tokens at all, because the
                sequence could then never be filled.
        """
        n_sentences = len(self)
        token_ids = []
        empty_streak = 0                                 # Consecutive sentences that contributed no token
        while len(token_ids) < self.seq_len:
            # A full pass over the data without a single token means the loop
            # could never terminate (this also covers an empty dataset).
            if empty_streak >= n_sentences:
                raise ValueError('cannot build a sequence: the dataset contains no tokens')
            sentence_ids = self.get_sentence_idx(index % n_sentences)
            empty_streak = 0 if sentence_ids else empty_streak + 1
            token_ids.extend(sentence_ids)
            index += 1

        # The loop above guarantees at least seq_len ids, so truncation is the
        # only adjustment needed; no padding is ever required.
        token_ids = token_ids[:self.seq_len]

        inputs, targets = [], []
        for token_id in token_ids:                       # One random draw per token, in sequence order
            if random.random() < p_random_mask:
                inputs.append(self.MASK_IDX)             # Hide the word ...
                targets.append(token_id)                 # ... and ask the model to recover it
            else:
                inputs.append(token_id)
                targets.append(self.IGNORE_IDX)          # Unmasked positions are excluded from the loss

        # Build integer tensors directly; going through float32 would corrupt very large ids.
        return {'input': torch.tensor(inputs, dtype=torch.long),
                'target': torch.tensor(targets, dtype=torch.long)}

    def __len__(self):
        """Return the number of sentences (not the number of tokens)."""
        return len(self.sentences)

    def get_sentence_idx(self, index):
        """Return sentence `index` as a list of token ids, using OUT_OF_VOCAB_IDX for unknown words."""
        return [self.vocab.get(word, self.OUT_OF_VOCAB_IDX) for word in self.sentences[index]]


def get_batch(loader, loader_iter):
    """Fetch the next batch, restarting the iterator once it is exhausted.

    Args:
        loader: iterable that can be iterated repeatedly (e.g. a DataLoader).
        loader_iter: the iterator currently being consumed.

    Returns:
        (batch, iterator) - the iterator is a fresh one over `loader` when the
        previous one had run out, otherwise it is `loader_iter` itself.
    """
    exhausted = object()                      # Unique marker that no real batch can be identical to
    batch = next(loader_iter, exhausted)
    if batch is exhausted:
        loader_iter = iter(loader)            # Start a new pass over the data
        batch = next(loader_iter)
    return batch, loader_iter


## 3. Load Dataset

In [25]:
# Read training data from text file
print('loading text...')
url = "https://raw.githubusercontent.com/sysgenerated/transformer-from-scratch/main/europarl30k.fr.txt"
response = requests.get(url, timeout=60)                    # Bounded wait instead of hanging forever on a stalled connection
response.raise_for_status()                                 # Fail loudly rather than tokenizing an HTTP error page
sentences = response.text.lower().split('\n')

# Split sentences into words
print('tokenizing sentences...')
special_chars = ',?;.:/*!+-()[]{}"\'&'
special_char_pattern = re.compile(f'[{re.escape(special_chars)}]')
# Surround every special character with spaces so it becomes its own token.
# The replacement is a raw string: '\g' is not a valid escape in a normal string literal.
sentences = [special_char_pattern.sub(r' \g<0> ', s).split(' ') for s in sentences]
sentences = [[w for w in s if len(w)] for s in sentences]   # [[words in sentence 1] ... [words in sentence n]]


# Create a hacky tokenizer that uses word count
print('creating/loading vocab...')
pth = 'vocab.txt'
if not exists(pth):
    words = [w for s in sentences for w in s]
    vocab = Counter(words).most_common(n_vocab)             # Keep the N most frequent words
    vocab = [w[0] for w in vocab]
    # Explicit UTF-8 (the corpus is French) and a context manager so the file is flushed and closed
    with open(pth, 'w', encoding='utf-8') as vocab_file:
        vocab_file.write('\n'.join(vocab))
else:
    # NOTE: an existing vocab.txt is reused as-is, even if n_vocab or the corpus changed since it was written
    with open(pth, encoding='utf-8') as vocab_file:
        vocab = vocab_file.read().split('\n')


# Create dataset and data loader
print('creating dataset...')
dataset = SentencesDataset(sentences, vocab, seq_len)
kwargs = {'num_workers':n_workers, 'shuffle':True,  'drop_last':True, 'pin_memory':True, 'batch_size':batch_size}
data_loader = torch.utils.data.DataLoader(dataset, **kwargs)


loading text...
tokenizing sentences...
creating/loading vocab...
creating dataset...




## 4. Create Transformer

In [26]:
def attention(q, k, v, mask = None, dropout = None):
    """Scaled dot-product attention: softmax(Q K^T / sqrt(d)) V.

    Args:
        q, k, v: tensors whose last two dimensions are [sequence, per-head dim];
            in this notebook they are 4d, [batch, head, sequence, dim per head],
            eg [128, 8, 20, 16].
        mask: optional tensor broadcastable to the score matrix; positions where
            mask == 0 are set to -1e3 before the softmax so they receive
            (numerically) no attention weight.
        dropout: optional callable applied to the attention weights.

    Returns:
        Tensor with the same leading dimensions as q and the last dimension of v;
        eg [128, 8, 20, 16].
    """
    head_dim = q.shape[-1]
    weights = q.matmul(k.transpose(-2, -1))                 # Pairwise dot products; eg [128, 8, 20, 20]
    weights.div_(math.sqrt(head_dim))                       # Scale by sqrt(d) so the magnitude does not grow with the head size

    if mask is not None:
        weights = weights.masked_fill(mask == 0, -1e3)

    weights = F.softmax(weights, dim = -1)                  # Each row becomes a distribution over key positions
    if dropout is not None:
        weights = dropout(weights)

    return weights.matmul(v)                                # Weighted sum of the value vectors


class MultiHeadAttention(nn.Module):
    """Multi-head attention with a single fused q|k|v projection.

    Args:
        n_heads: number of attention heads; eg 8.
        out_dim: embedding size of the input and the output; eg 128.
            Must be divisible by n_heads.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if out_dim is not divisible by n_heads.
    """

    def __init__(self, n_heads, out_dim, dropout=0.1):
        super().__init__()
        if out_dim % n_heads != 0:
            # Otherwise split_heads would reshape the embedding into the wrong layout (or fail obscurely)
            raise ValueError(f'out_dim ({out_dim}) must be divisible by n_heads ({n_heads})')

        # Concatenated linear layers of learnable weights, q|k|v
        self.linear = nn.Linear(out_dim, out_dim*3)             # Rows [0, d) produce q, [d, 2d) produce k, [2d, 3d) produce v

        self.n_heads = n_heads                                  # eg 8
        self.out_dim = out_dim                                  # eg 128
        self.out_dim_per_head = out_dim // n_heads              # eg 128 // 8 = 16
        self.out = nn.Linear(out_dim, out_dim)                  # Final projection that mixes the heads back together
        self.dropout = nn.Dropout(dropout)

    def split_heads(self, t):
        """Reshape [batch, sequence, out_dim] into [batch, sequence, n_heads, out_dim_per_head]."""
        return t.reshape(t.shape[0], -1, self.n_heads, self.out_dim_per_head)

    def forward(self, x, y=None, mask=None):
        """Attend from `x` to `y` (or to `x` itself when `y` is None).

        Args:
            x: query source, [batch, sequence, out_dim]; eg [128, 20, 128].
            y: optional key/value source, [batch, source sequence, out_dim].
                When omitted this is plain self-attention over x.
            mask: optional tensor passed to `attention`; it must broadcast
                against the score tensor [batch, n_heads, sequence, source sequence].

        Returns:
            Tensor of shape [batch, sequence, out_dim].
        """
        if y is None:
            # Self-attention: one fused matmul, then cut the result into q|k|v
            q, k, v = self.linear(x).chunk(3, dim=-1)
        else:
            # Cross-attention: queries come from x, keys and values from y.
            # The same fused weights are used, sliced by output rows.
            d = self.out_dim
            weight, bias = self.linear.weight, self.linear.bias
            q = F.linear(x, weight[:d], bias[:d])
            k, v = F.linear(y, weight[d:], bias[d:]).chunk(2, dim=-1)

        # [batch, sequence, out_dim] -> [batch, n_heads, sequence, out_dim_per_head]; eg [128, 8, 20, 16]
        q, k, v = [self.split_heads(t).transpose(1, 2) for t in (q, k, v)]

        scores = attention(q, k, v, mask, self.dropout)         # [128, 8, 20, 16]
        # Move heads next to the per-head dim and merge them; [128, 20, 128]
        scores = scores.transpose(1, 2).contiguous().view(scores.shape[0], -1, self.out_dim)
        return self.out(scores)                                 # [128, 20, 128]

In [27]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        inp_dim: size of the input and output vectors; eg 128.
        inner_dim: size of the hidden layer; eg 128 * 4.
        dropout: dropout probability applied to the hidden activations.
    """

    def __init__(self, inp_dim, inner_dim, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(in_features=inp_dim, out_features=inner_dim)   # Expand; eg 128 -> 512
        self.linear2 = nn.Linear(in_features=inner_dim, out_features=inp_dim)   # Project back; eg 512 -> 128
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Apply the network to the last dimension of `x`; the shape is preserved."""
        hidden = self.linear1(x)
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear2(hidden)


class EncoderLayer(nn.Module):
    """One pre-norm encoder layer: self-attention and feed-forward, each inside a residual connection.

    Args:
        n_heads: number of attention heads; eg 8.
        inner_transformer_size: embedding size flowing through the layer; eg 128.
        inner_ff_size: hidden size of the feed-forward sub-layer; eg 128 * 4.
        dropout: dropout probability used in both sub-layers and on their outputs.
    """

    def __init__(self, n_heads, inner_transformer_size, inner_ff_size, dropout=0.1):
        super().__init__()
        width = inner_transformer_size
        self.mha = MultiHeadAttention(n_heads, width, dropout)
        self.ff = FeedForward(width, inner_ff_size, dropout)
        self.norm1 = nn.LayerNorm(width)                    # Applied before attention
        self.norm2 = nn.LayerNorm(width)                    # Applied before the feed-forward network
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Transform `x` of shape [batch, seq_len, embedding] (eg [128, 20, 128]); the shape is preserved.

        `mask` is forwarded unchanged to the attention sub-layer.
        """
        # Sub-layer 1: normalize, attend, then add back onto the un-normalized input
        attended = self.mha(self.norm1(x), mask=mask)
        x = x + self.dropout1(attended)

        # Sub-layer 2: normalize, feed-forward, then add back again
        transformed = self.ff(self.norm2(x))
        return x + self.dropout2(transformed)

In [30]:
class PositionalEmbedding(nn.Module):
    """Fixed sinusoidal positional encoding.

    For position `pos` and even column index `i`:
        pe[pos, i]     = sin(pos / 10000 ** (i / embed_size))
        pe[pos, i + 1] = cos(pos / 10000 ** (i / embed_size))
    so each sin/cos pair shares one frequency, as in "Attention Is All You Need".

    Args:
        embed_size: length of each encoding vector; eg 128. Odd sizes are
            supported (the last column is then a sin column without a cos partner).
        max_seq_len: number of positions to precompute.
    """

    def __init__(self, embed_size, max_seq_len = 80):
        super().__init__()
        self.embed_size = embed_size

        position = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)   # [max_seq_len, 1]
        even_index = torch.arange(0, embed_size, 2, dtype=torch.float32)         # 0, 2, 4, ...
        # 1 / 10000 ** (i / embed_size), computed in log space
        inv_freq = torch.exp(even_index * (-math.log(10000.0) / embed_size))
        angles = position * inv_freq                                             # [max_seq_len, ceil(embed_size / 2)]

        pe = torch.zeros(max_seq_len, embed_size)
        pe[:, 0::2] = torch.sin(angles)
        pe[:, 1::2] = torch.cos(angles[:, :embed_size // 2])                     # One fewer cos column when embed_size is odd
        pe = pe.unsqueeze(0)                                                     # [1, max_seq_len, embed_size] so it broadcasts over the batch

        # A buffer moves with the module across devices and is saved with it, but is never trained
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return the encodings for the first x.size(1) positions.

        Args:
            x: tensor whose second dimension is the sequence length; eg [128, 20, 128].

        Returns:
            Tensor of shape [1, x.size(1), embed_size], meant to be added to `x` by broadcasting.

        Raises:
            ValueError: if the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(f'sequence length {seq_len} exceeds max_seq_len {self.pe.size(1)}')
        return self.pe[:, :seq_len]


class Transformer(nn.Module):
    """Encoder-only transformer that predicts a vocabulary distribution for every input position.

    Args:
        n_encoders: number of stacked encoder layers; eg 8.
        n_heads: number of attention heads in each encoder layer; eg 8.
        embed_size: token embedding size; eg 128.
        inner_ff_size: hidden size of each feed-forward sub-layer; eg 128 * 4.
        n_embeddings: vocabulary size, including special tokens.
        seq_len: longest sequence the positional encoding is built for; eg 20.
        dropout: dropout probability used throughout the encoder layers.
    """

    def __init__(self, n_encoders, n_heads, embed_size, inner_ff_size, n_embeddings, seq_len, dropout=.1):
        super().__init__()

        self.embeddings = nn.Embedding(n_embeddings, embed_size)  # Lookup table [n_embeddings, embed_size]
        self.pe = PositionalEmbedding(embed_size, seq_len)        # Fixed table [1, seq_len, embed_size]

        # Stack of identical (but independently initialized) encoder layers
        self.encoders = nn.ModuleList(
            [EncoderLayer(n_heads, embed_size, inner_ff_size, dropout) for _ in range(n_encoders)]
        )

        self.norm = nn.LayerNorm(embed_size)                      # Normalizes over the embedding (last) dimension
        self.linear = nn.Linear(embed_size, n_embeddings, bias=False) # Projects every embedding to vocabulary logits

    def forward(self, x, mask=None):
        """Compute vocabulary logits for every token.

        Args:
            x: integer tensor of token ids, [batch, sequence]; eg [128, 20].
            mask: optional attention mask handed to every encoder layer. It must
                broadcast against the attention scores
                [batch, n_heads, sequence, sequence]; entries equal to 0 are masked out.

        Returns:
            Float tensor of logits, [batch, sequence, n_embeddings]; eg [128, 20, 40003].
        """
        x = self.embeddings(x)                                    # Token ids -> embeddings; [128, 20, 128]
        x = x + self.pe(x)                                        # Positional encoding broadcasts over the batch
        for encoder in self.encoders:                             # Every layer preserves the shape [128, 20, 128]
            x = encoder(x, mask=mask)
        x = self.norm(x)                                          # Final normalization (the layers are pre-norm)
        return self.linear(x)


## 5. Create Model

In [33]:
# Create an instance of the previously defined Transformer class and push it to the GPU when one is available
print('initializing model...')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')   # Fall back to CPU instead of crashing without a GPU
model = Transformer(n_encoders, n_heads, embed_size, inner_ff_size, len(dataset.vocab), seq_len, dropout)
model = model.to(device)

# Create Adam optimizer and CrossEntropyLoss objects
print('initializing optimizer and loss...')
optimizer = optim.Adam(model.parameters(), **optim_kwargs)
loss_model = nn.CrossEntropyLoss(ignore_index=dataset.IGNORE_IDX)   # Targets equal to IGNORE_IDX (unmasked positions) do not contribute to the loss

initializing model...
initializing optimizer and loss...


## 6. Train Model

In [34]:
print("Training...")
model.train()
device = next(model.parameters()).device                    # Train on whichever device the model already lives on
batch_iter = iter(data_loader)

for it in range(n_iteration):
    batch, batch_iter = get_batch(data_loader, batch_iter)  # Get batch (restarts the loader when it runs out)

    # non_blocking lets the copy overlap with compute when the loader uses pinned memory
    masked_input = batch["input"].to(device, non_blocking=True)
    masked_target = batch["target"].to(device, non_blocking=True)

    output = model(masked_input)                            # Logits; [batch, sequence, vocabulary]

    # CrossEntropyLoss expects [N, classes] logits and [N] targets, so flatten batch and sequence together
    output_v = output.view(-1,output.shape[-1])
    target_v = masked_target.reshape(-1)
    loss = loss_model(output_v, target_v)                   # Compute the cross entropy loss over masked positions only

    loss.backward()                                         # Compute gradients
    optimizer.step()                                        # Apply gradients

    if it % print_each == 0:
        # The value labelled "Δw" is the summed absolute gradient of the embedding table,
        # so it must be read before the gradients are reset below.
        print("Iteration:", it,
              " | Loss", np.round(loss.item(),2),
              " | Δw:", round(model.embeddings.weight.grad.abs().sum().item(),3))

    optimizer.zero_grad()                                   # Reset gradients

Training...
Iteration: 0  | Loss 10.31  | Δw: 1.6
Iteration: 5  | Loss 8.8  | Δw: 0.125
Iteration: 10  | Loss 7.74  | Δw: 0.064
Iteration: 15  | Loss 6.68  | Δw: 0.052
Iteration: 20  | Loss 6.78  | Δw: 0.042
Iteration: 25  | Loss 6.71  | Δw: 0.044
Iteration: 30  | Loss 6.68  | Δw: 0.042
Iteration: 35  | Loss 6.57  | Δw: 0.045
Iteration: 40  | Loss 7.04  | Δw: 0.051
Iteration: 45  | Loss 6.38  | Δw: 0.045
Iteration: 50  | Loss 6.45  | Δw: 0.035
Iteration: 55  | Loss 6.79  | Δw: 0.041
Iteration: 60  | Loss 6.76  | Δw: 0.038
Iteration: 65  | Loss 6.46  | Δw: 0.037
Iteration: 70  | Loss 6.31  | Δw: 0.039
Iteration: 75  | Loss 6.75  | Δw: 0.039
Iteration: 80  | Loss 6.34  | Δw: 0.038
Iteration: 85  | Loss 6.26  | Δw: 0.041
Iteration: 90  | Loss 6.24  | Δw: 0.05
Iteration: 95  | Loss 6.39  | Δw: 0.061
Iteration: 100  | Loss 6.28  | Δw: 0.049
Iteration: 105  | Loss 6.62  | Δw: 0.068
Iteration: 110  | Loss 6.6  | Δw: 0.086
Iteration: 115  | Loss 6.58  | Δw: 0.093
Iteration: 120  | Loss 6.28  |

KeyboardInterrupt: ignored

## 7. Evaluate Model

In [None]:
print("Saving embeddings...")
N = min(3000, len(dataset.rvocab))                          # Export at most 3000 rows; never more than the vocabulary holds
embedding_rows = model.embeddings.weight.detach().cpu().numpy()[0:N]
np.savetxt("values.tsv", np.round(embedding_rows, 2), delimiter="\t", fmt="%1.2f")
s = [dataset.rvocab[i] for i in range(N)]                   # Row i of names.tsv labels row i of values.tsv
# Explicit UTF-8 (the vocabulary is French) and a context manager so the file is flushed and closed
with open("names.tsv", "w", encoding="utf-8") as names_file:
    names_file.write("\n".join(s))

print("End.")

References:

https://hyugen-ai.medium.com/transformers-in-pytorch-from-scratch-for-nlp-beginners-ff3b3d922ef7  
https://github.com/Whiax/BERT-Transformer-Pytorch  
