In [1]:
from importlib.metadata import version

# Report the installed version of each package this notebook relies on.
for package in ("torch", "tiktoken"):
    print(f"{package} version:", version(package))

torch version: 2.0.1+cu118
tiktoken version: 0.7.0


## Exercise 2.1

> EXERCISE 2.1 BYTE PAIR ENCODING OF UNKNOWN WORDS
- Try the BPE tokenizer from the tiktoken library on the unknown words "Akwirw ier"
and print the individual token IDs. Then, call the decode function on each of the
resulting integers in this list to reproduce the mapping shown in Figure 2.11. Lastly,
call the decode method on the token IDs to check whether it can reconstruct the
original input, "Akwirw ier".


In [2]:
import tiktoken

# Load tiktoken's "gpt2" encoding (the BPE tokenizer the exercise asks for);
# the module-level `tokenizer` is reused by the cells that follow.
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
# Encode the made-up words from the exercise and show the resulting token IDs.
unknown_words = "Akwirw ier"
integers = tokenizer.encode(unknown_words)
print(integers)

[33901, 86, 343, 86, 220, 959]


In [4]:
# Decode every ID on its own to see which substring it stands for.
for token_id in integers:
    piece = tokenizer.decode([token_id])
    print(f"{token_id} -> {piece}")

33901 -> Ak
86 -> w
343 -> ir
86 -> w
220 ->  
959 -> ier


In [5]:
# Reverse check: encoding the decoded piece "Ak" by itself should return its single ID.
tokenizer.encode("Ak")

[33901]

In [6]:
# Reverse check for the piece "w" (first occurrence in the token list).
tokenizer.encode("w")

[86]

In [7]:
# Reverse check for the piece "ir".
tokenizer.encode("ir")

[343]

In [8]:
# Reverse check for the second "w" in the token list; the same piece maps to the same ID.
tokenizer.encode("w")

[86]

In [9]:
# Reverse check for the single-space piece that separates the two words.
tokenizer.encode(" ")

[220]

In [10]:
# Reverse check for the piece "ier".
tokenizer.encode("ier")

[959]

In [11]:
# Round-trip check: decode the IDs produced by the encode cell (In [3]) rather
# than a hand-copied list, so this cell cannot drift from the actual encoding.
tokenizer.decode(integers)

'Akwirw ier'

## Exercise 2.2

Exercise 2.2 DATA LOADERS WITH DIFFERENT STRIDES AND CONTEXT SIZES

- To develop more intuition for how the data loader works, try to run it with different
settings such as max_length=2 and stride=2 and max_length=8 and stride=2.

In [12]:
import torch 
# Report the version of the imported torch module (matches the "torch version"
# line printed at the top of the notebook).
print(f'PyTorch version: {torch.__version__}')

PyTorch version: 2.0.1+cu118


In [13]:
# A dataset for batched inputs and targets

from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensor pairs.

    The text is tokenized once, then cut into windows of ``max_length`` IDs
    whose start offsets are ``stride`` apart. Each target window is the
    matching input window shifted one position to the right.

    Args:
        txt: Raw text handed to ``tokenizer.encode``.
        tokenizer: Object with an ``encode(txt)`` method; assumed to return a
            flat sequence of integer token IDs.
        max_length: Number of token IDs in every input and target window.
        stride: Distance between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(txt)                           #A

        # Stopping at len(token_ids) - max_length leaves one extra ID after
        # every input window, so each shifted target window is full length.
        window_starts = range(0, len(token_ids) - max_length, stride)  #B

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + 1 + max_length])
            for start in window_starts
        ]

    def __len__(self):                                              #C
        """Return how many (input, target) windows were built."""
        return len(self.input_ids)

    def __getitem__(self, idx):                                     #D
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        inputs = self.input_ids[idx]
        targets = self.target_ids[idx]
        return inputs, targets
    
#A Tokenize the entire text
#B Use a sliding window to chunk the book into overlapping sequences of max_length
#C Return the total number of rows in the dataset
#D Return a single row from the dataset

In [14]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
        stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    Args:
        txt: Raw text; tokenized with tiktoken's "gpt2" encoding.
        batch_size: Number of windows per batch.
        max_length: Number of token IDs in every input and target window.
        stride: Distance between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader(shuffle=...)``.
        drop_last: Forwarded to ``DataLoader(drop_last=...)``.
        num_workers: Forwarded to ``DataLoader(num_workers=...)``.

    Returns:
        A ``DataLoader`` over ``GPTDatasetV1(txt, tokenizer, max_length, stride)``.
    """
    tokenizer = tiktoken.get_encoding("gpt2")                       #A
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)      #B
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,                                        #C
        # Forward the caller's value. It used to be hard-coded to 0, which
        # silently ignored the num_workers argument of this function.
        num_workers=num_workers                                     #D
    )

    return dataloader

#A Initialize the tokenizer
#B Create dataset
#C drop_last=True drops the last batch if it is shorter than the specified batch_size to prevent loss 
    # spikes during training
#D The number of CPU processes to use for preprocessing

In [15]:
# Read the whole text into memory, then compare its size in characters
# against its size in token IDs.
with open("the-verdict.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

token_count = len(tokenizer.encode(raw_text))
print(f'Original text length: {len(raw_text)}')
print(f"Tokenized text length: {token_count}")

Original text length: 20479
Tokenized text length: 5145


### max_length = 2, stride = 2

In [17]:
# stride equals max_length, so consecutive input windows do not overlap;
# shuffle=False keeps the batches in text order.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=2,
    stride=2,
    shuffle=False,
)

In [18]:
# Number of batches. With batch_size=1 this equals the number of windows:
# for the 5145 token IDs reported above, range(0, 5145 - 2, 2) has 2572 starts.
len(dataloader)  

2572

In [19]:
# Step through the loader by hand; data_iter is reused by the next cell.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)  # two-element list: the input_ids tensor first, then the target_ids tensor

[tensor([[ 40, 367]]), tensor([[ 367, 2885]])]


In [20]:
# Advance the same iterator to get the batch that follows first_batch.
second_batch = next(data_iter)
print(second_batch)

[tensor([[2885, 1464]]), tensor([[1464, 1807]])]


> Notice that each tensor consists of 2 elements (because `max_length=2`), 
and because `stride=2`, the `input_ids` in `second_batch` start at 2885: the window moved forward by two tokens.