In [1]:
import os
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from datasets import load_dataset
from tqdm import tqdm
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

In [2]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
raw_train_dataset=load_dataset("Helsinki-NLP/opus-100","en-ms",split='train')
raw_validate_dataset=load_dataset("Helsinki-NLP/opus-100","en-ms",split='validation')
raw_test_dataset=load_dataset("Helsinki-NLP/opus-100","en-ms",split="test")

In [4]:

dataset_en = []     
dataset_my = []
file_count = 1  

In [5]:
for data in tqdm(raw_train_dataset["translation"]):
    dataset_en.append(data["en"].replace('\n'," "))
    dataset_my.append(data["ms"].replace('\n'," "))
    if len(dataset_en)==50000:
        with open(f'./dataset-en/file{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(dataset_en))
            dataset_en
        with open(f'./dataset-my/file{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(dataset_my))
            dataset_my = []
        file_count += 1    


100%|██████████| 1000000/1000000 [00:00<00:00, 2129062.39it/s]


In [6]:
path_en=[str(file) for file in Path('./dataset-en').glob("**/*.txt")]
path_my=[str(file) for file in Path('./dataset-my').glob("**/*.txt")]

In [7]:
toknizer_en=Tokenizer(BPE(unk_token="[UNK]"))
#Additional special tokens are created such as [UNK] - to represent Unknown words, [PAD] - Padding token to maintain same sequence length across the model.
# [CLS] - token to denote start of sentence, [SEP] - token to denote end of sentence.
trainer_en=BpeTrainer(min_frequency=2,special_tokens=["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"])
toknizer_en.pre_tokenizer=Whitespace()
toknizer_en.train(files=path_en,trainer=trainer_en)
toknizer_en.save("./tokenizer_en/tokenizer_en.json")

                     

In [8]:
toknizer_my=Tokenizer(BPE(unk_token="[UNK]"))
trainer_my=BpeTrainer(min_frequency=2,special_tokens=["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"])
toknizer_my.pre_tokenizer=Whitespace()
toknizer_my.train(files=path_my,trainer=trainer_my)
toknizer_my.save("./tokenizer_my/tokenizer_my.json")
                          

In [9]:
toknizer_en=Tokenizer.from_file("./tokenizer_en/tokenizer_en.json")
toknizer_my=Tokenizer.from_file("./tokenizer_my/tokenizer_my.json")

In [10]:
source_vocab_size=toknizer_en.get_vocab_size()
target_vocab_ize=toknizer_my.get_vocab_size()

In [11]:
CLS_ID = torch.tensor([toknizer_my.token_to_id("[CLS]")], dtype=torch.int64).to(device)
SEP_ID = torch.tensor([toknizer_my.token_to_id("[SEP]")], dtype=torch.int64).to(device)
PAD_ID = torch.tensor([toknizer_my.token_to_id("[PAD]")], dtype=torch.int64).to(device)

In [12]:
# This class takes raw dataset and max_seq_len (maximum length of a sequence in the entire dataset).
class EncodeDataset(Dataset):
    def __init__(self, raw_dataset, max_seq_len):
        super().__init__()
        self.raw_dataset = raw_dataset
        self.max_seq_len = max_seq_len
    
    def __len__(self):
        return len(self.raw_dataset)

    def __getitem__(self, index):
        
        # Fetching raw text for the given index that consists of source and target pair.
        raw_text = self.raw_dataset[index]
        
        # Separating text to source and target text and will be later used for encoding.
        source_text = raw_text["en"]
        target_text = raw_text["ms"]

        # Encoding source text with source tokenizer(tokenizer_en) and target text with target tokenizer(tokenizer_my).
        source_text_encoded = torch.tensor(toknizer_en.encode(source_text).ids, dtype = torch.int64).to(device)    
        target_text_encoded = torch.tensor(toknizer_my.encode(target_text).ids, dtype = torch.int64).to(device)

        # To train the model, the sequence lenth of each input sequence should be equal max seq length. 
        # Hence additional number of padding will be added to the input sequence if the length is less than the max_seq_len.
        num_source_padding = self.max_seq_len - len(source_text_encoded) - 2 
        num_target_padding = self.max_seq_len - len(target_text_encoded) - 1 

        encoder_padding = torch.tensor([PAD_ID] * num_source_padding, dtype = torch.int64).to(device)
        decoder_padding = torch.tensor([PAD_ID] * num_target_padding, dtype = torch.int64).to(device)
        
        # encoder_input has the first token as start of sentence - CLS_ID, followed by source encoding which is then followed by the end of sentence token - SEP.
        # To reach the required max_seq_len, addition PAD token will be added at the end.        
        encoder_input = torch.cat([CLS_ID, source_text_encoded, SEP_ID, encoder_padding]).to(device)    

        # decoder_input has the first token as start of sentence - CLS_ID, followed by target encoding.
        # To reach the required max_seq_len, addition PAD token will be added at the end. There is no end of sentence token - SEP in decoder_input.
        decoder_input = torch.cat([CLS_ID, target_text_encoded, decoder_padding ]).to(device)           
        
        # target_label has the first token as target encoding followed by end of sentence token - SEP. There is no start of sentence token - CLS in target label.
        # To reach the required max_seq_len, addition PAD token will be added at the end. 
        target_label = torch.cat([target_text_encoded,SEP_ID,decoder_padding]).to(device)               
        
        # As we've added extra padding token with input encoding, during training, we don't want this token to be trained by model as there is nothing to learn in this token.
        # So, we'll use encoder mask to nullify the padding token value prior to calculating output of self attention in encoder block.
        encoder_mask = (encoder_input != PAD_ID).unsqueeze(0).unsqueeze(0).int().to(device)             
        
        # We also don't want any token to get influenced by the future token during the decoding stage. Hence, Causal mask is being implemented during masked multihead attention to handle this. 
        decoder_mask = (decoder_input != PAD_ID).unsqueeze(0).unsqueeze(0).int() & causal_mask(decoder_input.size(0)).to(device) 

        return {
            'encoder_input': encoder_input,
            'decoder_input': decoder_input,
            'target_label': target_label,
            'encoder_mask': encoder_mask,
            'decoder_mask': decoder_mask,
            'source_text': source_text,
            'target_text': target_text
        }

# Causal mask will make sure any token that comes after the current token will be masked, meaning the value will be replaced by -ve infinity which will be converted to zero or close to zero after softmax function. 
# Hence the model will just ignore these value or willn't be able to learn anything from these values.
def causal_mask(size):
  # dimension of causal mask (batch_size, seq_len, seq_len)
  mask = torch.triu(torch.ones(1, size, size), diagonal = 1).type(torch.int)
  return mask == 0

# To calculate the max sequence lenth in the entire training dataset for the source and target dataset.
max_seq_len_source = 0
max_seq_len_target = 0

for data in raw_train_dataset["translation"]:
    enc_ids = toknizer_en.encode(data["en"]).ids
    dec_ids = toknizer_my.encode(data["ms"]).ids
    max_seq_len_source = max(max_seq_len_source, len(enc_ids))
    max_seq_len_target = max(max_seq_len_target, len(dec_ids))
    
print(f'max_seqlen_source: {max_seq_len_source}')   #530
print(f'max_seqlen_target: {max_seq_len_target}')   #526

# To simplify the training process, we'll just take single max_seq_len and add 20 to cover the additional length of tokens such as PAD, CLS, SEP in the sequence.
max_seq_len = 550

# Instantiate the EncodeRawDataset class and create the encoded train and validation-dataset.
train_dataset = EncodeDataset(raw_train_dataset["translation"], max_seq_len)
val_dataset = EncodeDataset(raw_validate_dataset["translation"], max_seq_len)

# Creating DataLoader wrapper for both training and validation dataset. This dataloader will be used later stage during training and validation of our LLM model.
train_dataloader = DataLoader(train_dataset, batch_size = 10, shuffle = True, generator=torch.device("cuda"))
val_dataloader = DataLoader(val_dataset, batch_size = 1, shuffle = True, generator=torch.device("cuda"))


max_seqlen_source: 1358
max_seqlen_target: 563


In [13]:
class EmabadingLayer(nn.Module):
    def __init__(self, d_model:int, vocab_size:int):
        super().__init__()
        self.d_model=d_model
        self.embading=nn.Embedding(vocab_size,d_model)
    def forward(self,input):
        embading_output=self.embading(input)*math.sqrt(self.d_model)
        return embading_output
    
    


In [14]:
class positional_encoding(nn.Module):
     def __init__(self, d_model:int, max_seq_len:int,dropout_rate:float):
        super().__init__()
        ##dropout to prevent overfiting
        self.dropout=nn.Dropout(dropout_rate)
        pe=torch.zeros(max_seq_len,d_model)
        pos=torch.arange(0,max_seq_len,dtype=torch.float).unsqueeze(1)
        #calc angles for sin and cos
        div_term=torch.exp(torch.arange(0,d_model,2).float()** (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(pos * div_term)
        pe[:, 1::2] = torch.cos(pos * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

     def forward(self, input_embdding):
        input_embdding = input_embdding + (self.pe[:, :input_embdding.shape[1], :]).requires_grad_(False)   # to prevent from calculating gradient
        return self.dropout(input_embdding)

In [15]:
class MultiheadAttention(nn.Module):
    def __init__(self,d_model:int,num_heads:int,dropout_rate:float):
        super().__init__()
        self.dropout=nn.Dropout(dropout_rate)
        self.num_heads=num_heads
        assert d_model %num_heads==0, "d_model must be divisible by number of heads"
        self.d_k=d_model//num_heads
        self.W_q=nn.Linear(d_model,d_model,bias=False)
        self.W_k=nn.Linear(d_model,d_model,bias=False)
        self.W_v=nn.Linear(d_model,d_model,bias=False)
        self.W_o=nn.Linear(d_model,d_model,bias=False)

    def Forward(self,q,k,v,encoder_mask):
        query=self.W_q(q)
        key=self.W_k(k)
        value=self.W_v(v)
        query=query.view(query.shape[0],query.shape[1],self.num_heads,self.d_k).transpose(1,2)
        key = key.view(key.shape[0], key.shape[1], self.num_heads ,self.d_k).transpose(1,2)
        value = value.view(value.shape[0], value.shape[1], self.num_heads ,self.d_k).transpose(1,2)
        attention_score = (query @ key.transpose(-2,-1))/math.sqrt(self.d_k)

        if encoder_mask is not None:
            attention_score.masked_fill_(encoder_mask==0,-1e9)
        attention_score=attention_score.softmax(dim=-1)
        if self.dropout is not None:
            attention_score=self.dropout(attention_score)


        attention_output=attention_score @ value

        attention_output=attention_output.transpose(1,2).contiguous().view(attention_output.shape[0], -1, self.num_heads * self.d_k) 

        multihead_output=self.W_o(attention_output)

        return multihead_output       


