# Chapter 2

## 2.2 tokenizing text

In [2]:
import urllib.request
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")
file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x112290770>)

In [11]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
print("Total number of chars:", len(text))
print(raw_text[:99])

Total number of chars: 31
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


### text sample sizes

In [5]:
import re
text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


### when to remove whitespace

In [37]:
text = "Hello, world. Is this-- a test?"
regex_string = r'([,.:;?_!()\']|--|\s)'
REGEX_STRING = regex_string
result = re.split(regex_string, text)
reesult = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'Is', ' ', 'this', '--', '', ' ', 'a', ' ', 'test', '?', '']


In [13]:
preprocessed = re.split(regex_string, raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4606


In [14]:
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


## 2.3 converting tokens into token IDs

In [15]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1158


In [17]:
vocab = {token:integer for integer, token in enumerate(all_words)}
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('"', 1)
('"Ah', 2)
('"Be', 3)
('"Begin', 4)
('"By', 5)
('"Come', 6)
('"Destroyed', 7)
('"Don', 8)
('"Gisburns"', 9)
('"Grindles', 10)
('"Hang', 11)
('"Has', 12)
('"How', 13)
('"I', 14)
('"If', 15)
('"It', 16)
('"Jack', 17)
('"Money', 18)
('"Moon-dancers"', 19)
('"Mr', 20)
('"Mrs', 21)
('"My', 22)
('"Never', 23)
('"Of', 24)
('"Oh', 25)
('"Once', 26)
('"Only', 27)
('"Or', 28)
('"That', 29)
('"The', 30)
('"Then', 31)
('"There', 32)
('"This', 33)
('"We', 34)
('"Well', 35)
('"What', 36)
('"When', 37)
('"Why', 38)
('"Yes', 39)
('"You', 40)
('"but', 41)
('"deadening', 42)
('"dragged', 43)
('"effects"', 44)
('"interesting"', 45)
('"lift', 46)
('"obituary"', 47)
('"strongest', 48)
('"strongly"', 49)
('"sweetly"', 50)


In [18]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()} #B

    def encode(self, text): #C
        preprocessed = re.split(regex_string, text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids): #D
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(regex_string, r'\1', text) #E
        return text

### a mispelling 'sad', resulting in 'token' not recognised in vocab

In [23]:
tokenizer = SimpleTokenizerV1(vocab)
text = """It's the last he painted, you know.
       Mrs, Gisburn sad with pardonale pride."""
ids = tokenizer.encode(text)
print(ids)

KeyError: 'sad'

### corrected 'sad' to 'said'

In [25]:
tokenizer = SimpleTokenizerV1(vocab)
text = """It's the last he painted, you know.
       Mrs, Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[95, 51, 880, 1015, 633, 564, 776, 54, 1154, 627, 56, 104, 54, 82, 881, 1136, 784, 823, 56]


In [26]:
print(tokenizer.decode(ids))

It ' s the last he painted , you know . Mrs , Gisburn said with pardonable pride .


### now formally introducing a token not recognised int he vocab

In [28]:
text = """Hello, do you like tea?"""
print(tokenizer.decode(text))

KeyError: 'H'

## 2.4 adding special context tokens

In [38]:
EOT_TOKEN = "<|endoftext|>"
UNKNOWN_TOKEN = "<|unk|>"

In [39]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend([EOT_TOKEN, UNKNOWN_TOKEN])
vocab = {token:integer for integer, token in enumerate(all_tokens)}

print(len(vocab.items()))

1160


In [40]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1155)
('your', 1156)
('yourself', 1157)
('<|endoftext|>', 1158)
('<|unk|>', 1159)


In [59]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s, i in vocab.items()}

    def encode(self, text):
        print('here')
        preprocessed = re.split(regex_string, text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        preprocessed = [item if item in self.str_to_int #A
                        else UNKNOWN_TOKEN for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(regex_string, r'\1', text) #B
        return text


In [60]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = f" {EOT_TOKEN} ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [61]:
tokenizer = SimpleTokenizerV2(vocab)
ids = tokenizer.encode(text)
print(ids)

here
[1159, 54, 386, 1154, 659, 1002, 59, 1158, 94, 1015, 984, 1011, 752, 1015, 1159, 56]


In [62]:
print(tokenizer.decode(ids))

<|unk|> , do you like tea ? <|endoftext|> In the sunlit terraces of the <|unk|> .


## 2.5 byte pair encoding

In [63]:
pip freeze | grep tiktoken

tiktoken==0.7.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.7.0


In [65]:
tokenizer = tiktoken.get_encoding("gpt2")

In [66]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunkonPlace."
)
integers = tokenizer.encode(text, allowed_special={EOT_TOKEN})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 2954, 261, 27271, 13]


In [67]:
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunkonPlace.


### BPE

Try the BPE tokenizer from the tiktoken library on the unknown words “Akwirw ier” and print the individual token IDs. 

In [68]:
string = "Akwirw ier"
tokens = tokenizer.encode(string)
print(tokens)

[33901, 86, 343, 86, 220, 959]


Then, call the decode function on each of the resulting integers in this list to reproduce the mapping shown in figure 2.11.

In [70]:
for token in tokens:
    print(token, tokenizer.decode([token]))

33901 Ak
86 w
343 ir
86 w
220  
959 ier


Lastly, call the decode method on the token IDs to check whether it can reconstruct the original input, “Akwirw ier.”

In [71]:
print(tokenizer.decode(tokens))

Akwirw ier


## 2.6 data smapling using the sliding window approach

In [72]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [73]:
enc_sample = enc_text[50:]

In [74]:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [75]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "---->", desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [77]:
for i in range(1, context_size+1):
    context = tokenizer.decode(enc_sample[:i])
    desired = tokenizer.decode([enc_sample[i]])
    print(context, "---->", desired)

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


In [78]:
pip freeze | grep torch

torch==2.3.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch

In [3]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt) #A

        for i in range(0, len(token_ids) - max_length, stride): #B
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self): #C
        return len(self.input_ids)
    
    def __getitem__(self, idx): #D
        return self.input_ids[idx], self.target_ids[idx]

In [4]:
def create_dataloader_v1(txt, 
                         batch_size=4, 
                         max_length=256,
                         stride=128,
                         shuffle=True,
                         drop_last=True,
                         num_workers=0):
  tokenizer = tiktoken.get_encoding("gpt2")
  dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
  dataloader = DataLoader(
      dataset,
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers
  )

  return dataloader

In [37]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [39]:
type(first_batch[0])

torch.Tensor

In [40]:
tokenizer = tiktoken.get_encoding("gpt2")

# Decode and print the decoded representation of the first batch
decoded_texts = [tokenizer.decode_batch(batch.tolist()) for batch in first_batch]
print(decoded_texts)

[['I HAD always'], [' HAD always thought']]


In [41]:
second_bath = next(data_iter)
print(second_bath)
decoded_texts = [tokenizer.decode_batch(batch.tolist()) for batch in second_bath]
print(decoded_texts)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]
[[' HAD always thought'], ['AD always thought Jack']]


### `max_length=` and `stride=2`

In [46]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=2, stride=2, shuffle=False
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)
decoded_texts = [tokenizer.decode_batch(batch.tolist()) for batch in first_batch]
print(decoded_texts)

[tensor([[ 40, 367]]), tensor([[ 367, 2885]])]
[['I H'], [' HAD']]


In [47]:
second_bath = next(data_iter)
print(second_bath)
decoded_texts = [tokenizer.decode_batch(batch.tolist()) for batch in second_bath]
print(decoded_texts)

[tensor([[2885, 1464]]), tensor([[1464, 1807]])]
[['AD always'], [' always thought']]


In [48]:
third_bath = next(data_iter)
print(third_bath)
decoded_texts = [tokenizer.decode_batch(batch.tolist()) for batch in third_bath]
print(decoded_texts)

[tensor([[1807, 3619]]), tensor([[3619,  402]])]
[[' thought Jack'], [' Jack G']]


In [49]:
fourth_bath = next(data_iter)
print(fourth_bath)
decoded_texts = [tokenizer.decode_batch(batch.tolist()) for batch in fourth_bath]
print(decoded_texts)

[tensor([[402, 271]]), tensor([[  271, 10899]])]
[[' Gis'], ['isburn']]


### `max_length=8` and `stride=2`

In [50]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=8, stride=2, shuffle=False
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)
decoded_texts = [tokenizer.decode_batch(batch.tolist()) for batch in first_batch]
print(decoded_texts)

[tensor([[  40,  367, 2885, 1464, 1807, 3619,  402,  271]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])]
[['I HAD always thought Jack Gis'], [' HAD always thought Jack Gisburn']]


In [51]:
second_bath = next(data_iter)
print(second_bath)
decoded_texts = [tokenizer.decode_batch(batch.tolist()) for batch in second_bath]
print(decoded_texts)

[tensor([[ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138]]), tensor([[ 1464,  1807,  3619,   402,   271, 10899,  2138,   257]])]
[['AD always thought Jack Gisburn rather'], [' always thought Jack Gisburn rather a']]
