### Build the Encoder part of a Transformer from scratch, based on the paper
* This notebook implements a Transformer consisting of multiple Encoder layers from scratch
* Each layer and its functionality is implemented in PyTorch following the paper [Attention Is All You Need](https://arxiv.org/pdf/1706.03762.pdf)

In [59]:
import torch
import torch.nn as nn
import torch.utils.data as data
import math
import copy
from typing import List

In [60]:
class MultiHeadAttention(torch.nn.Module):
    """Multi-head scaled dot-product attention with an optional causal mask."""

    def __init__(self, d_in: int, d_out: int, block_size: int, dropout: float, num_heads: int):
        """
        initialize the MultiHeadAttention layer

        args:
            d_in (int): dimension of input
            d_out (int): dimension of output
            block_size (int): maximum sequence length the causal mask supports
            dropout (float): dropout probability applied to the attention weights
            num_heads (int): number of heads in the MultiHeadAttention layer
        returns:
            None
        """
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # dimension for each head

        # parameter matrices for query, key and value
        self.W_query = torch.nn.Linear(d_in, d_out, bias=False)
        self.W_key = torch.nn.Linear(d_in, d_out, bias=False)
        self.W_value = torch.nn.Linear(d_in, d_out, bias=False)

        # linear layer that mixes the concatenated head outputs
        self.out_proj = torch.nn.Linear(d_out, d_out)

        # dropout applied to the attention probabilities
        self.dropout = torch.nn.Dropout(dropout)

        # strictly upper-triangular matrix of ones: entry (i, j) is 1 when j > i,
        # i.e. when position j lies after position i. It is built once at the
        # largest supported size and sliced down in forward().
        self.register_buffer('mask', torch.triu(torch.ones(block_size, block_size), diagonal=1))

    def split_heads(self, x: torch.Tensor):
        """
        Splits the feature dimension (d_out) into num_heads sections of head_dim features
        args:
            x: input tensor of the shape (batch_size, seq_length, d_out)
        returns:
            a tensor of the shape (batch_size, num_heads, seq_length, head_dim)
        """
        batch_size, seq_length, _ = x.size()
        # view() creates the head axis; transpose() moves it in front of the sequence axis
        return x.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

    def combine_heads(self, x):
        """
        Merges the per-head features back into a single feature dimension
        args:
            x: tensor of the shape (batch_size, num_heads, seq_length, head_dim)
        returns:
            a tensor with the shape (batch_size, seq_length, d_out) where num_heads * head_dim = d_out
        """
        batch_size, _, seq_length, _ = x.size()

        # move num_heads next to head_dim, then flatten the two into d_out;
        # contiguous() is required because view() cannot reshape a transposed tensor
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_out)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """
        Computes softmax(Q K^T / sqrt(head_dim)) V, with dropout on the attention weights
        args:
            Q, K, V: tensors of the shape (batch_size, num_heads, seq_length, head_dim)
            mask: optional boolean tensor broadcastable to the score matrix;
                  True marks positions that must not be attended to
        returns:
            a tensor of the shape (batch_size, num_heads, query_length, head_dim)
        """
        # one (query_length x key_length) score matrix per batch item and head
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # masked positions become -inf, so softmax assigns them zero weight
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask, float("-inf"))

        attn_probs = torch.softmax(attn_scores, dim=-1)
        attn_probs = self.dropout(attn_probs)

        # weighted sum of the value vectors
        return torch.matmul(attn_probs, V)

    def forward(self, Q, K, V, if_mask: bool):
        """
        Projects the inputs, runs attention per head and recombines the heads
        args:
            Q, K, V: tensors of the shape (batch_size, seq_length, d_in)
            if_mask (bool): when True, each position may only attend to itself
                            and to earlier positions
        returns:
            a tensor of the shape (batch_size, query_length, d_out)
        raises:
            ValueError: if if_mask is True and a sequence is longer than block_size
        """
        query_length, key_length = Q.size(1), K.size(1)

        # Apply linear transformations and split heads
        Q = self.split_heads(self.W_query(Q))
        K = self.split_heads(self.W_key(K))
        V = self.split_heads(self.W_value(V))

        mask = None
        if if_mask:
            max_length = self.mask.size(0)
            if query_length > max_length or key_length > max_length:
                raise ValueError(
                    f"sequence length ({max(query_length, key_length)}) exceeds "
                    f"block_size ({max_length}) supported by the causal mask"
                )
            # the two leading singleton axes broadcast over batch and heads
            mask = self.mask[:query_length, :key_length].bool().unsqueeze(0).unsqueeze(0)

        # Perform scaled dot-product attention
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        # Combine heads and apply output transformation
        return self.out_proj(self.combine_heads(attn_output))
    
   

In [61]:
# implement PositionWiseFeedForward as two linear layers with a ReLU activation in between
class PositionWiseFeedForward(nn.Module):
    """Feed-forward sub-layer: linear expansion, ReLU, linear projection back."""

    def __init__(self, d_out, d_ff):
        """
        args:
            d_out: feature dimension of both the input and the output
            d_ff: width of the hidden layer between the two linear maps
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=d_out, out_features=d_ff)
        self.fc2 = nn.Linear(in_features=d_ff, out_features=d_out)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply fc1, then ReLU, then fc2 to the last dimension of x."""
        expanded = self.fc1(x)
        activated = self.relu(expanded)
        return self.fc2(activated)

In [62]:
# implement PositionalEncoding using Sin and Cos functions
# an easier implementation as mentioned in the paper is to
# use embedding layer to encode positions to vectors having d_out dimension
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings."""

    def __init__(self, d_out, max_seq_length):
        """
        build the positional encoding matrix, filling even columns with sin
        and odd columns with cos
        args:
            d_out (int): the output dimension, or the "total" model dimension
                         (may be even or odd)
            max_seq_length: the max length of the input sequence the model can handle
        """
        super().__init__()

        # one row per position, one column per feature
        pe = torch.zeros(max_seq_length, d_out)

        # column vector of positions 0 .. max_seq_length - 1
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)

        # one frequency per even column index: 10000^(-2i / d_out)
        div_term = torch.exp(torch.arange(0, d_out, 2).float() * -(math.log(10000.0) / d_out))
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # when d_out is odd there is one fewer odd column than even columns,
        # so drop the last frequency for the cosine half
        pe[:, 1::2] = torch.cos(angles[:, : d_out // 2])

        # stored as a buffer of shape (1, max_seq_length, d_out): it moves with
        # the module and is saved in the state_dict, but is not trained
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """
        add the positional encoding to x
        args:
            x: tensor of the shape (batch_size, seq_length, d_out), e.g. token embeddings
        returns:
            a tensor of the same shape as x; the first seq_length rows of self.pe
            are broadcast over the batch dimension
        """
        return x + self.pe[:, :x.size(1)]

In [63]:
# implement EncoderLayer as described by the paper
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each followed by add & norm."""

    def __init__(self, d_in, d_out, max_seq_len, num_heads, d_ff, dropout):
        """
        args:
            d_in: input dimension
            d_out: output dimension, or model's feature dimension
            max_seq_len: the maximum length of the sequences the model can handle
            num_heads: the number of heads
            d_ff: the internal dimension used in PositionWiseFeedForward linear transformation
            dropout: dropout probability used in dropout layers

        The input is added to the attention output in forward(), so that
        residual sum only lines up when d_in equals d_out.
        """
        super().__init__()

        self.self_attn = MultiHeadAttention(
            d_in=d_in,
            d_out=d_out,
            block_size=max_seq_len,
            dropout=dropout,
            num_heads=num_heads,
        )
        self.feed_forward = PositionWiseFeedForward(d_out, d_ff)
        self.norm1 = nn.LayerNorm(d_out)
        self.norm2 = nn.LayerNorm(d_out)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, if_mask):
        """
        args:
            x: input tensor, used as query, key and value of the self-attention
            if_mask: forwarded to MultiHeadAttention to switch its mask on or off
        returns:
            the tensor after both sub-layers (attention, then feed-forward), each
            wrapped as LayerNorm(input + Dropout(sub_layer(input)))
        """
        # sub-layer 1: self-attention with residual connection
        attended = self.self_attn(x, x, x, if_mask)
        hidden = self.norm1(x + self.dropout(attended))

        # sub-layer 2: position-wise feed-forward with residual connection
        transformed = self.feed_forward(hidden)
        return self.norm2(hidden + self.dropout(transformed))

In [64]:
# this class implemented the encoder transformer by integrating encoder embedding, position encoding layers with
# multiple encoder layers together, and finally, output the raw results as tensors in shape (batch_size, seq_length, vocab_size)
class Encode_Transformer(nn.Module):
    """Token embedding + positional encoding + a stack of EncoderLayers + vocabulary projection."""

    def __init__(self, vocab_size, d_in, d_out, num_heads, num_layers, d_ff, max_seq_length, dropout):
        """
        args:
            vocab_size: number of token ids; size of the embedding table and of the output layer
            d_in: input dimension passed to every EncoderLayer
            d_out: embedding / model feature dimension
            num_heads: number of attention heads per layer
            num_layers: number of stacked EncoderLayers
            d_ff: hidden width of the feed-forward sub-layer
            max_seq_length: longest sequence the positional encoding and mask support
            dropout: dropout probability used throughout the model
        """
        super().__init__()

        # token ids -> d_out vectors, then add position information
        self.encoder_embedding = nn.Embedding(vocab_size, d_out)
        self.positional_encoding = PositionalEncoding(d_out, max_seq_length)

        # the repeated encoder blocks
        self.encoder_layers = nn.ModuleList(
            [
                EncoderLayer(d_in, d_out, max_seq_length, num_heads, d_ff, dropout)
                for _ in range(num_layers)
            ]
        )

        # projects d_out features to one raw score per vocabulary entry
        self.fc = nn.Linear(d_out, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """
        args:
            src: integer tensor of token ids, shape (batch_size, seq_length)
        returns:
            raw scores of the shape (batch_size, seq_length, vocab_size),
            suitable for CrossEntropyLoss after flattening
        """
        embedded = self.encoder_embedding(src)
        hidden = self.dropout(self.positional_encoding(embedded))

        # NOTE(review): the mask is switched on for the first layer only; every
        # later layer attends over all positions. Confirm this is intended when
        # training on right-shifted targets.
        for index, enc_layer in enumerate(self.encoder_layers):
            hidden = enc_layer(hidden, index == 0)

        return self.fc(hidden)

### Test Transformer
#### Define tokenizer, dataset and dataloader to load data (the implementation of `GPTDataset` is from [the link](https://github.com/rasbt/LLMs-from-scratch/blob/3eb9358cbe013af00c37c1c321de0c4e83c689da/ch03/01_main-chapter-code/multihead-attention.ipynb))
* This is a commonly used implementation of a PyTorch Dataset that implements the two required methods, `__len__` and `__getitem__`
* The text is tokenized into token ids and then cut by a sliding window into pairs of chunks: `input_chunk` and `target_chunk`. Each pair is returned when the DataLoader wrapping this dataset is iterated
* Every `target_chunk` is shifted one position to the right relative to its corresponding `input_chunk`

In [65]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors for next-token prediction."""

    def __init__(self, txt, tokenizer, max_length, stride):
        """
        args:
            txt: the raw text to tokenize
            tokenizer: object whose encode(txt) returns a list of token ids
            max_length: number of tokens in every input and target chunk
            stride: distance in tokens between the starts of consecutive windows
        """
        self.tokenizer = tokenizer

        # Tokenize the entire text once
        token_ids = tokenizer.encode(txt)

        # Window starts; the last window stops one token early so that its
        # target (the input shifted right by one) still fits inside the text
        starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length]) for start in starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1]) for start in starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the idx-th (input_ids, target_ids) pair."""
        return self.input_ids[idx], self.target_ids[idx]

# This function was implemented to load data to the GPU device
# def collate_fn(batch):
#     data = [item[0].to(device) for item in batch]
#     target = [item[1].to(device) for item in batch]
#     return data, target

#### Read file, encode the content with the gpt2 tokenizer, and load input and target tokens
* Here we use the gpt2 tokenizer from the tiktoken package. The advantage of this tokenizer is that a word not contained in the vocabulary is further split into sub-words or characters, so we do not need to handle out-of-vocabulary words.
* We define the dataset using the `GPTDataset` class, which returns the `input_chunk` and `target_chunk` as the input and labels for training
* We then define a DataLoader using this dataset and a `batch_size`
* The gpt2 tokenizer has a vocabulary size of 50257, and I use 256 as the size of the embedding layer for the encoder. In addition, I set the number of heads to 2 for the attention layer

In [66]:
file_path = "the-verdict.txt"
def get_dataloader(file_path: str, encoding: str="utf-8", batch_size:int =4, max_length:int=256, stride:int=128):
    """
    Read a text file, tokenize it with the gpt2 tokenizer and wrap it in a DataLoader.

    args:
        file_path: path of the text file to read
        encoding: text encoding used to open the file
        batch_size: number of (input, target) pairs per batch
        max_length: number of tokens in every input and target chunk
        stride: distance in tokens between the starts of consecutive chunks
    returns:
        a DataLoader over a GPTDataset; each batch is a pair of tensors
        (input_chunks, target_chunks) where the targets are the inputs
        shifted by one position
    raises:
        OSError: (e.g. FileNotFoundError, PermissionError) if the file cannot be read
    """
    # Errors from open()/read() are deliberately not caught: without the text
    # there is nothing to tokenize, so the caller must see the real cause.
    with open(file_path, "r", encoding=encoding) as f:
        raw_text = f.read()

    # tokenize the content of the file and build the input and target chunks
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDataset(raw_text, tokenizer, max_length, stride)
    dataloader = DataLoader(dataset, batch_size=batch_size)

    return dataloader

In [67]:
# number of tokens per training sequence; also the model's maximum sequence length
max_length = 4

# keyword arguments for Encode_Transformer, kept in one place so the same
# settings can be reused when the model is rebuilt from a saved state_dict
model_config = dict(
    vocab_size=50257,
    max_seq_length=max_length,
    d_in=256,
    d_out=256,
    num_heads=2,
    num_layers=2,
    d_ff=2048,
    dropout=0.1,
)

et = Encode_Transformer(**model_config)

In [68]:
import torch.optim as optim

# NOTE(review): ignore_index=0 excludes targets with token id 0 from the loss.
# GPTDataset does no padding, so confirm id 0 is not a real token that should be scored.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(et.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
text_file = "the-verdict.txt"

# sequence length and vocabulary size come from model_config so that the data,
# the reshape below and the model cannot drift apart
dataloader = get_dataloader(
    text_file,
    batch_size=8,
    max_length=model_config["max_seq_length"],
    stride=5,
)

et.train()

for epoch in range(10):
    for x, y in dataloader:
        optimizer.zero_grad()

        # format output as shape (batch_size * seq_len, vocab_size)
        output = et(x).view(-1, model_config["vocab_size"])

        # format y as (batch_size * seq_len,) using view(-1) to
        # get all the items in one dimension
        y = y.view(-1)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
    # reports the loss of the last batch of the epoch, not an epoch average
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Epoch: 1, Loss: 8.372800827026367
Epoch: 2, Loss: 7.001795768737793
Epoch: 3, Loss: 6.156527042388916
Epoch: 4, Loss: 5.566163063049316
Epoch: 5, Loss: 4.95515775680542
Epoch: 6, Loss: 4.574779987335205
Epoch: 7, Loss: 3.970862627029419
Epoch: 8, Loss: 3.5937862396240234
Epoch: 9, Loss: 3.2626678943634033
Epoch: 10, Loss: 3.1146388053894043


### Save models and reload model
* follow the good practice to save the model's `state_dict` instead of the model itself, and rebuild the model from its class definition, and load the parameters from the saved `state_dict`

In [69]:
from pathlib import Path

# where the trained parameters are written
model_path = './models/encoder_transformer_v2.pth'

# make sure the target directory exists; a pre-existing directory is not an error
dir_path = Path('./models')
dir_path.mkdir(exist_ok=True, parents=True)

# save only the parameters and buffers (state_dict), not the pickled model object
trained_state = et.state_dict()
torch.save(trained_state, model_path)

In [70]:
# The rebuilt model must use the same hyperparameters as the trained one
# (vocab_size=50257, d_in=d_out=256, max_seq_length=4, num_heads=2,
# num_layers=2, d_ff=2048, dropout=0.1); all of them are carried by model_config.

test_model_path = './models/encoder_transformer_test.pth'

# rebuild the architecture from its class definition, then restore the weights
model = Encode_Transformer(**model_config)
saved_state = torch.load(model_path)
model.load_state_dict(saved_state)

<All keys matched successfully>

In [72]:
# load the first batch of text and test the results
text_file = "the-verdict.txt"

# the same gpt2 tokenizer that get_dataloader uses, needed here to decode ids back to text
tokenizer = tiktoken.get_encoding("gpt2")

dl = get_dataloader(
    text_file,
    batch_size=8,
    max_length=model_config["max_seq_length"],
    stride=5,
)
x, _ = next(iter(dl))

# switch dropout off: a freshly constructed module starts in training mode
model.eval()
with torch.no_grad():
    rs = model(x)

# softmax is monotonic, so argmax over the raw scores picks the same token ids
rs_ids = torch.argmax(rs, dim=-1)

# iterate over the rows actually present instead of assuming a batch of 8
print([tokenizer.decode(ids.tolist()) for ids in x])
print([tokenizer.decode(ids.tolist()) for ids in rs_ids])

['I HAD always', ' Jack Gisburn', ' a cheap genius--', ' a good fellow enough', 'so it was no', ' surprise to me to', ' that, in the', ' of his glory,']
[' justAD always--', ' Gisburn had', '------and', ' little-- up--', '- was no--', ' to me to me', ', and the the', ' his,, so']


### Future work
* Separate the dataset into training, validation and test datasets for a "real" model training process
* Use PyTorch Lightning to track and store the best model by defining the monitored metrics, and to automatically handle GPU usage depending on the available computing resources