## Byte Pair Encoding

In [2]:
# !pip install tiktoken

In [3]:
import importlib
import importlib.metadata
import tiktoken

print("Tiktoken version: ", importlib.metadata.version("tiktoken"))

Tiktoken version:  0.8.0


In [4]:
# Load tiktoken's "gpt2" encoding; this tokenizer object is reused by the
# encode/decode cells that follow.
tokenizer = tiktoken.get_encoding("gpt2")

In [5]:
# Sample text containing the "<|endoftext|>" marker and a made-up word.
# NOTE(review): the two adjacent string literals are concatenated without a
# separator, so the text reads "terracesof" (see the decoded output further
# down) -- confirm whether a space after "terraces" was intended.
text =(
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace."
)

# "<|endoftext|>" is passed via allowed_special so it is accepted in the input;
# in the printed result it appears to map to the single ID 50256.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [6]:
# Round trip: decode the token IDs back to text to check that encoding lost nothing,
# including the made-up word "someunknownPlace".
strings = tokenizer.decode(integers)

print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


#### Creating input-target pairs

In [7]:
# Read the corpus with an explicit encoding. Without it, open() uses the
# platform's locale encoding, so the same file could decode differently
# (or fail to decode) on another machine.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Tokenize the whole text and report how many token IDs it produced.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [8]:
# Drop the first 50 token IDs; the examples below work on the remainder.
enc_sample = enc_text[50:]

In [9]:
# Number of tokens in one input window.
context_size = 4

# x is the input window; y is the same window shifted right by one token,
# so y[k] is the token that follows x[k].
x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]

# The extra spaces after "y:" line the second list up under the shifted x.
print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [10]:
# Show each next-token prediction task contained in one window:
# the first i tokens are the context, token i is the target.
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(f"{context}: -----> {desired}")

[290]: -----> 4920
[290, 4920]: -----> 2241
[290, 4920, 2241]: -----> 287
[290, 4920, 2241, 287]: -----> 257


The list to the left of the arrow holds the input tokens; the value to the right of the arrow is the target token the model should predict.

Repeating the above step with decoded text results as follows:

In [11]:
# Same context/target pairs as the previous cell, decoded back to text.
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    # `desired` is a single ID, so it is wrapped in a list before decoding,
    # matching the list argument used for `context`.
    print(f"{tokenizer.decode(context)}: -----> {tokenizer.decode([desired])}")

 and: ----->  established
 and established: ----->  himself
 and established himself: ----->  in
 and established himself in: ----->  a


### Implementation of a Data Loader with PyTorch

<div class="alert alert-block alert-warning">
<h4>Steps</h4>
1. Tokenize the text <br>
2. Use a sliding window of context size to chunk the token IDs into (possibly overlapping) sequences <br>
3. Return the total number of rows in the dataset <br>
4. Return a single row from the dataset
</div>

In [12]:
#!pip install torch

In [13]:
#!pip install torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [14]:
#import torch
#print(torch.__version__)
#print(torch.cuda.is_available())

In [15]:
import torch
from torch.utils.data import Dataset, DataLoader

In [16]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID chunks.

    The text is tokenized once, then cut into windows of ``max_length`` tokens
    whose start positions are ``stride`` tokens apart. Each target chunk is the
    input chunk shifted right by one token, so ``target[k]`` is the token that
    follows ``input[k]``.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object with ``encode(text, allowed_special=...)`` returning
            a list of integer token IDs.
        max_length: Number of tokens per chunk; must be positive.
        stride: Distance between the starts of consecutive chunks; must be
            positive. A stride smaller than ``max_length`` gives overlapping
            chunks.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.

    Note:
        If the text has ``max_length`` tokens or fewer, no complete
        (input, target) pair exists and the dataset is empty.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        # Fail early with a clear message: a non-positive max_length would
        # build malformed pairs and a negative stride would silently yield
        # an empty dataset.
        if max_length <= 0:
            raise ValueError(f"max_length must be a positive integer, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Stop at len - max_length so the shifted target chunk
        # (which ends one token later) is always full length.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i: i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


<div class="alert alert-block alert-success">
<h4>Creating the DataLoader</h4>
1. Initialize the tokenizer <br>
2. Create the dataset <br>
3. drop_last=True drops the last batch if it is shorter than the specified batch_size, to prevent loss spikes during training <br>
4. num_workers sets the number of CPU processes to use for preprocessing
</div>

In [17]:
def create_dataloader_v1(
    text,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    Args:
        text: Raw text; it is tokenized with tiktoken's "gpt2" encoding.
        batch_size: Number of (input, target) pairs per batch.
        max_length: Tokens per window (the context length).
        stride: Step between the starts of consecutive windows; windows
            overlap when this is smaller than ``max_length``.
        shuffle: Forwarded to ``DataLoader``; whether to shuffle the windows.
        drop_last: Forwarded to ``DataLoader``; drops a final batch that is
            shorter than ``batch_size``.
        num_workers: Forwarded to ``DataLoader`` as its worker count.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``text``.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    bpe = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(text, bpe, max_length, stride)
    return DataLoader(windows, **loader_options)

Testing the dataset and dataloader with batch_size=1 and context_size=4

In [18]:
# Length of the raw text in characters (raw_text is a str), as opposed to the
# token count printed earlier from enc_text.
print(len(raw_text))

20479


In [19]:
# torch has to be imported before create_dataloader_v1 is called, because
# GPTDatasetV1.__init__ uses torch.tensor.
import torch
print("Pytorch version: ", torch.__version__)

# One window per batch, with consecutive windows starting one token apart.
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

# Pull only the first batch; as the output shows, it is a two-element list of
# [inputs, targets] tensors.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

Pytorch version:  2.5.1+cu124
[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [20]:
# stride equals max_length here, so consecutive windows do not overlap:
# each input row starts where the previous one ended.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
# Each batch unpacks into an inputs tensor and a targets tensor.
inputs, targets = next(data_iter)
print("Inputs: \n", inputs)
print("Targets: \n", targets)

Inputs: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Targets: 
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


### Vector Embeddings

<div class="alert alert-success">
Random numbers and one-hot encoding fail to capture the semantic meaning of the tokens they represent. Without semantic meaning, the relationships between words (semantic relationships) are lost.<br>
We have to train a neural network to create vector embeddings. Creating vector embeddings is not easy and is an expensive task.
</div>

In [21]:
#!pip install gensim

In [22]:
import gensim.downloader as api
# Load the pretrained "word2vec-google-news-300" vectors by name.
# NOTE(review): presumably this downloads the model on first use and may be
# slow or large -- confirm before re-running the notebook from scratch.
model = api.load("word2vec-google-news-300")

In [23]:
# Alias only: word_vectors and model refer to the same object.
word_vectors = model

# Indexing by word returns that word's vector; its shape is printed below.
print(word_vectors["king"].shape)

(300,)


In [24]:
# Word-vector arithmetic: king + woman - man, showing the 3 nearest words.
# `negative` is given as a list, the documented argument form, instead of
# relying on a bare string being coerced to a single-element list.
print(word_vectors.most_similar(positive=["king", "woman"], negative=["man"], topn=3))

[('queen', 0.7118191123008728), ('monarch', 0.6189674735069275), ('princess', 0.5902430415153503)]


#### Creating Token Embeddings

##### Testing with 4 words with a vocab size of 6 and 3 dimensions

In [25]:
# Four example token IDs; each must be a valid index into the 6-entry toy
# vocabulary defined in the next code cell.
input_ids = torch.tensor([2,3,5,1])

#### Notes
1. Assuming a list of 4 words represented using above 4 tokens
2. Assuming there are only 6 total words in the vocabulary
3. Assuming we want to create an output embedding of size 3 (GPT-3 has an embedding size of 12,288 dimensions).


In [26]:
# Toy sizes: 6 tokens in the vocabulary, 3 values per embedding vector.
vocab_size = 6
output_dim = 3

# Seed the RNG so the randomly initialized weights printed below are repeatable.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [27]:
# The weight matrix has one row per token ID (vocab_size rows) and
# output_dim columns.
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


###### Applying the above embedding to a single token ID

In [None]:
# Embed the single token ID 3; judging by the multi-ID example in the next
# cell, this should return the row at index 3 of embedding_layer.weight.
print(embedding_layer(torch.tensor([3])))

In [28]:
# Embedding lookup: each ID in input_ids selects the matching row of the
# weight matrix, in order (compare with the weights printed above).
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


#### Positional Encoding

There are two types of positional embeddings, Absolute Embedding and Relative Embedding.


As an experiment, we use an embedding dimension of 256 with the full vocabulary size of 50257 from the BPE tokenizer (tiktoken).

In [32]:
# Rebinds vocab_size and output_dim from the toy values used earlier to the
# sizes for this experiment: a 50257-entry vocabulary and 256-dimensional vectors.
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [33]:
max_length = 4  # context length: tokens per input window
# stride == max_length, so the windows in the batch do not overlap.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)

# Only the first batch is needed for the embedding demonstration.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [34]:
# The batch holds batch_size rows of max_length token IDs each.
print("Token IDs: \n", inputs)
print("\nInputs Shape: \n", inputs.shape)

Token IDs: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs Shape: 
 torch.Size([8, 4])


For each batch the embedding generated would be 8x4x256

In [35]:
# Every token ID is replaced by its output_dim-sized vector, which adds a
# trailing dimension to the batch shape.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [36]:
context_length = max_length

# One learnable vector per position in the window (context_length rows),
# with the same width as the token embeddings so the two can be added.
# NOTE(review): "embdding" in the name looks like a typo for "embedding";
# renaming it requires updating the cell that calls this layer as well.
pos_embdding_layer = torch.nn.Embedding(context_length, output_dim)

In [37]:
# torch.arange(max_length) supplies the position indices 0 .. max_length - 1,
# giving one embedding vector per position.
pos_embeddings = pos_embdding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [38]:
# pos_embeddings has no batch dimension; the addition broadcasts it so the same
# position vectors are added to every row of the batch (shapes printed above
# and below).
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


The final input_embeddings tensor will be the input to the Transformer. At this point it still contains randomly initialized vectors; during training of the LLM, these embedding weights are optimized along with the rest of the model.