## Step 1: Tokenization

In [11]:
# Load the raw text that the following cells tokenize.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print(f"Number of characters in the file: {len(raw_text)}")

Number of characters in the file: 20479


In [12]:
# We will use tiktoken, which was used by GPT2
# NOTE: `import importlib` alone does not guarantee that the `metadata`
# submodule is loaded, so import the submodule explicitly (this also binds
# the top-level name `importlib`).
import importlib.metadata
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.11.0


In [13]:
# Initialize the tokenizer with tiktoken's "gpt2" encoding
tokenizer = tiktoken.get_encoding("gpt2")

In [14]:
# Turn the full contents of "the-verdict.txt" into token ids and report how many there are
special_tokens = {"<|endoftext|>"}
integers = tokenizer.encode(raw_text, allowed_special=special_tokens)
token_count = len(integers)
print(token_count)

5145


In [15]:
# Decode the token ids back into text and check the round trip against the original raw text
strings = tokenizer.decode(integers)
print(len(strings))
# Compare the content, not just the length: two different strings can have the same length
print(strings == raw_text)

20479
True


In [16]:
# Encode a made-up phrase, then show the ids, the text piece behind each id,
# and the fully decoded string
unknown_text = "Akwirw ier"
integers = tokenizer.encode(unknown_text)
pieces = list(map(lambda token_id: tokenizer.decode([token_id]), integers))
print(integers)
print(pieces)
print(tokenizer.decode(integers))
    

[33901, 86, 343, 86, 220, 959]
['Ak', 'w', 'ir', 'w', ' ', 'ier']
Akwirw ier


## Step 2: Create input-target pairs using DataLoaders

##### STEPS:
<ol>
<li>Tokenize the entire text</li>
<li>Use a sliding window to chunk the book into overlapping sequences of length <code>max_length</code></li>
<li>Return the total number of rows in the dataset</li>
<li>Return a single row from the dataset</li>
</ol>

<b>For an efficient data loader implementation, we will use PyTorch's built-in Dataset and DataLoader classes.</b>

In [17]:
import torch
from torch.utils.data import Dataset


# Dataset class that a DataLoader can index to pull (input, target) windows out of the text
class GPTDataset(Dataset):
    """Sliding-window dataset of next-token-prediction pairs.

    The text is tokenized once. Each row is a pair of 1-D tensors of
    ``max_length`` token ids: the input window, and the target window, which
    is the same window shifted one token to the right.

    Args:
        text: The text to tokenize.
        tokenizer: Object with an ``encode(text, allowed_special=...)`` method
            that returns a list of integer token ids.
        max_length: Number of tokens in every input and target window.
        stride: Number of tokens between the starts of consecutive windows.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not a positive integer.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        if max_length < 1 or stride < 1:
            raise ValueError("max_length and stride must be positive integers")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Slide a window over the tokens. The range stops at
        # len(token_ids) - max_length so that the shifted target window
        # (which needs one extra token) always fits.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            # Targets are the inputs shifted by one position, same length.
            target_chunk = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) rows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]
        

In [None]:
# The Dataset class above is wrapped in a DataLoader, which serves its rows in batches
from torch.utils.data import DataLoader
import tiktoken
def dataloader_v1(text, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over sliding-window token chunks of ``text``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDataset``; the remaining arguments are forwarded unchanged.

    Args:
        text: The text to turn into a dataset.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Forwarded to ``GPTDataset``.
        stride: Forwarded to ``GPTDataset``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The ``DataLoader`` built over the ``GPTDataset``.
    """
    windowed_dataset = GPTDataset(
        text=text,
        tokenizer=tiktoken.get_encoding("gpt2"),
        max_length=max_length,
        stride=stride,
    )
    return DataLoader(
        dataset=windowed_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [19]:
# Convert dataloader into a python iterator to fetch next entry
import torch
with open("the-verdict.txt", mode="r", encoding="utf-8") as verdict_file:
    raw_text = verdict_file.read()

# One row per batch, 4-token windows advancing one token at a time, in file order
dataloader = dataloader_v1(
    text=raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

# Keep the iterator in its own name so further batches can be pulled from it
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

RuntimeError: Broken pipe