# Data Loading Pipeline

1. Tokenize the text.
2. Use a sliding window over the token IDs to create input/target sequences.
3. Load the sequences into a `DataLoader`.
4. Create the token embedding layer.
5. Create the positional embedding layer.
6. Add the positional embeddings to the token embeddings.

In [18]:
import tiktoken
from tiktoken.core import Encoding
import torch
from torch.utils.data import DataLoader, Dataset

class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once. A window of ``max_length`` token IDs is then
    moved across the token sequence in increments of ``step``. For each window
    the target is the same window shifted one token to the right, so
    ``target[k]`` is the token that follows ``input[k]`` in the text.

    A text with ``max_length`` tokens or fewer yields an empty dataset, because
    every window needs one extra token for the shifted target.
    """

    def __init__(self, text: str, tokenizer: Encoding, max_length: int, step: int):
        """Tokenize ``text`` and pre-compute every input/target window.

        Args:
            text: Raw text to tokenize.
            tokenizer: Object whose ``encode(text, allowed_special=...)``
                returns a sequence of integer token IDs.
            max_length: Number of tokens in each input (and target) window.
            step: Offset between the starts of consecutive windows.

        Raises:
            ValueError: If ``max_length`` or ``step`` is not positive.
        """
        # Non-positive values would otherwise yield empty or mismatched
        # windows (or an empty dataset) without any error.
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if step <= 0:
            raise ValueError(f"step must be positive, got {step}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Stop at len - max_length so the shifted target window
        # (which ends at i + max_length) never runs past the last token.
        for i in range(0, len(token_ids) - max_length, step):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of pre-computed windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [19]:
def create_dataloader_v1(
        text: str,
        batch_size: int,
        max_length: int,
        step: int,
        shuffle: bool = True,
        drop_last: bool = True,
        num_workers: int = 0,
):
    """Build a ``DataLoader`` over sliding-window samples of ``text``.

    The text is tokenized with tiktoken's ``"gpt2"`` encoding and wrapped in a
    ``GPTDataset``; the remaining arguments are forwarded to ``DataLoader``.

    Args:
        text: Raw text to tokenize.
        batch_size: Passed to ``DataLoader`` as ``batch_size``.
        max_length: Window length handed to ``GPTDataset``.
        step: Window offset handed to ``GPTDataset``.
        shuffle: Passed to ``DataLoader`` as ``shuffle``.
        drop_last: Passed to ``DataLoader`` as ``drop_last``.
        num_workers: Passed to ``DataLoader`` as ``num_workers``.

    Returns:
        The configured ``DataLoader``.
    """
    windows = GPTDataset(
        text,
        tiktoken.get_encoding("gpt2"),
        max_length,
        step,
    )
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [20]:
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
# NOTE(review): assumes the corpus file is UTF-8 encoded — confirm.
with open("../../data/the_verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

max_length = 4
# step == max_length, so consecutive windows do not overlap.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, step=max_length)

# Peek at the first batch. shuffle defaults to True in create_dataloader_v1,
# so the printed rows differ from run to run.
data_iter = iter(dataloader)
input_ids, target_ids = next(data_iter)
print(input_ids)
print(target_ids)

tensor([[ 6653,  3656, 27846,   379],
        [   12, 12239,    13,   198],
        [39136,   278,   290,   285],
        [  607,    13,   314,  2391],
        [  890,   276,   284,   910],
        [34537,   526,   198,   198],
        [ 2106,    13,  1375,  4762],
        [19713, 14676,    25,  9675]])
tensor([[ 3656, 27846,   379,   683],
        [12239,    13,   198,   198],
        [  278,   290,   285,  4185],
        [   13,   314,  2391,   531],
        [  276,   284,   910,    25],
        [  526,   198,   198,     1],
        [   13,  1375,  4762,   287],
        [14676,    25,  9675,   284]])


In [26]:
# Embedding hyperparameters.
vocab_size = 50257       # rows in the token table (one per token ID)
output_dim = 256         # width of every embedding vector
context_length = 1024    # rows in the position table (one per position index)

# Lookup tables: token ID -> vector, and position index -> vector.
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)
position_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)

# Embed the sample batch and the first max_length position indices.
token_embeddings = token_embedding_layer(input_ids)
position_embeddings = position_embedding_layer(torch.arange(0, max_length))


In [27]:
# Build input embeddings for the first batch only; the loop exits after one pass.
for batch in dataloader:
    x, y = batch  # x: input token IDs, y: shifted target IDs (unused here)

    # The position lookup uses only the indices 0..max_length-1, not the batch.
    position_embeddings = position_embedding_layer(torch.arange(0, max_length))
    token_embeddings = token_embedding_layer(x)

    # Broadcasting adds the same positional vectors to every sequence in the batch.
    input_embeddings = token_embeddings + position_embeddings
    break

In [28]:
# Expect (batch_size, max_length, output_dim) = (8, 4, 256) with the settings above.
print(input_embeddings.shape)

torch.Size([8, 4, 256])
