## Tokenizing Text

In [1]:
import os
import urllib.request

if not(os.path.exists("the-verdict.txt")):
    url = ("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt")
    file_path = "the-verdict.txt"
    urllib.request.urlretrieve(url, file_path)


In [2]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [4]:
len(raw_text)

20479

In [None]:
import re

text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text) ## splits on whitespace
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [5]:
result = re.split(r'([,.]|\s)', text)

In [6]:
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [None]:
## takes care of punctuations, -- and whitespaces
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [token for token in preprocessed if token.strip()]
print(preprocessed)


['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--', 'deploring', 'his', 'unaccountable', 'abdication', '.', '"', 'Of', 'course', 'it', "'", 's', 'going', 'to', 'send', 'the', 'value', 'of', 'my', 'picture', "'", 'way', 'up', ';', 'but', 'I', 'don', "'", 't', 'think', 'of', 'that', ',

## Converting tokens into token IDs

In [31]:
'''make a vocabulary'''
all_words = sorted(set(preprocessed))

vocab_size = len(all_words)

In [23]:
vocab = {token:integer for integer, token in enumerate(all_words)}
vocab

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '--': 6,
 '.': 7,
 ':': 8,
 ';': 9,
 '?': 10,
 'A': 11,
 'Ah': 12,
 'Among': 13,
 'And': 14,
 'Are': 15,
 'Arrt': 16,
 'As': 17,
 'At': 18,
 'Be': 19,
 'Begin': 20,
 'Burlington': 21,
 'But': 22,
 'By': 23,
 'Carlo': 24,
 'Chicago': 25,
 'Claude': 26,
 'Come': 27,
 'Croft': 28,
 'Destroyed': 29,
 'Devonshire': 30,
 'Don': 31,
 'Dubarry': 32,
 'Emperors': 33,
 'Florence': 34,
 'For': 35,
 'Gallery': 36,
 'Gideon': 37,
 'Gisburn': 38,
 'Gisburns': 39,
 'Grafton': 40,
 'Greek': 41,
 'Grindle': 42,
 'Grindles': 43,
 'HAD': 44,
 'Had': 45,
 'Hang': 46,
 'Has': 47,
 'He': 48,
 'Her': 49,
 'Hermia': 50,
 'His': 51,
 'How': 52,
 'I': 53,
 'If': 54,
 'In': 55,
 'It': 56,
 'Jack': 57,
 'Jove': 58,
 'Just': 59,
 'Lord': 60,
 'Made': 61,
 'Miss': 62,
 'Money': 63,
 'Monte': 64,
 'Moon-dancers': 65,
 'Mr': 66,
 'Mrs': 67,
 'My': 68,
 'Never': 69,
 'No': 70,
 'Now': 71,
 'Nutley': 72,
 'Of': 73,
 'Oh': 74,
 'On': 75,
 'Once': 76,
 'Only': 77,
 '

In [32]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self, raw_text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
        preprocessed = [token for token in preprocessed if token.strip()]
        
        ids=[
            self.str_to_int[s] 
            for s in preprocessed
        ]
        return ids

    def decode(self, ids):
        text = ' '.join([self.int_to_str[i] for i in ids])
        ## replace spaces before specified punctuations
        return re.sub(r'\s+([,.:;?_!"()\'])', r'\1', text)


In [33]:
tokenizer = SimpleTokenizerV1(vocab)

ids = tokenizer.encode(raw_text)

text = tokenizer.decode(ids)

print(ids)


[53, 44, 149, 1003, 57, 38, 818, 115, 256, 486, 6, 1002, 115, 500, 435, 392, 6, 908, 585, 1077, 709, 508, 961, 1016, 663, 1016, 535, 987, 5, 568, 988, 538, 722, 549, 496, 5, 533, 514, 370, 549, 748, 5, 661, 115, 841, 1102, 5, 157, 397, 547, 568, 115, 1066, 727, 988, 84, 7, 3, 99, 53, 818, 1003, 585, 1120, 530, 208, 85, 734, 34, 7, 4, 1, 93, 538, 722, 549, 496, 1, 6, 987, 1077, 1089, 988, 1112, 242, 585, 7, 53, 244, 535, 67, 7, 37, 100, 6, 549, 602, 25, 897, 6, 326, 549, 1042, 116, 7, 1, 73, 297, 585, 2, 850, 498, 1016, 866, 988, 1059, 722, 697, 769, 2, 1083, 1051, 9, 239, 53, 359, 2, 970, 998, 722, 987, 5, 66, 7, 83, 6, 988, 646, 1016, 16, 584, 145, 53, 998, 722, 7, 1, 93, 1116, 5, 727, 67, 7, 100, 2, 850, 633, 5, 693, 586, 114, 847, 114, 177, 1002, 994, 1088, 827, 568, 156, 389, 1069, 722, 677, 7, 14, 585, 1077, 711, 731, 988, 67, 7, 101, 1097, 688, 7, 45, 711, 988, 410, 50, 28, 5, 180, 988, 602, 40, 36, 882, 5, 929, 663, 209, 38, 2, 850, 1, 65, 1, 1016, 856, 5, 1108, 976, 568, 539, 4

In [34]:
print(text)

I HAD always thought Jack Gisburn rather a cheap genius -- though a good fellow enough -- so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera.( Though I rather thought it would have been Rome or Florence.)" The height of his glory" -- that was what the women called it. I can hear Mrs. Gideon Thwing -- his last Chicago sitter -- deploring his unaccountable abdication." Of course it' s going to send the value of my picture' way up; but I don' t think of that, Mr. Rickham -- the loss to Arrt is all I think of." The word, on Mrs. Thwing' s lips, multiplied its_ rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn' s" Moon-dancers" to say, with tears in her eyes:" We shall not look upon its like again"? Well! --

## Adding special context tokens

In [35]:
text = "Hello, world!"
tokenizer.encode(text)

KeyError: 'Hello'

In [26]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self, raw_text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
        preprocessed = [token for token in preprocessed if token.strip()]
        
        ids=[
            self.str_to_int[s] 
            if s in self.str_to_int
            else self.str_to_int["<|unk|>"]
            for s in preprocessed
        ]
        return ids

    def decode(self, ids):
        text = ' '.join([self.int_to_str[i] for i in ids])
        ## replace spaces before specified punctuations
        return re.sub(r'\s+([,.:;?_!"()\'])', r'\1', text)


In [37]:
all_words.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer, token in enumerate(all_words)}

tokenizer = SimpleTokenizerV2(vocab)
tokenizer.encode("Hello, world!")

[1132, 5, 1132, 0]

## Byte Pair Encoding (BPE)

Helps break down words into smaller units or subtokens and handles unknown words quite well. If it does not find a subtoken in it vocabulary, it has a fallback to character level encoding.

In [6]:
import tiktoken

In [39]:
tiktoken.__version__

'0.12.0'

In [14]:
tokenizer = tiktoken.get_encoding("gpt2")
tokenizer.decode(tokenizer.encode("Hello"))

'Hello'

In [16]:
text = (
    "Hello <|endoftext|> I am here asodufg ewqrojdfg9dfg"
)

In [17]:
tokenizer.encode(text, allowed_special={"<|endoftext|>"})

[15496,
 220,
 50256,
 314,
 716,
 994,
 355,
 375,
 3046,
 70,
 304,
 86,
 80,
 305,
 73,
 7568,
 70,
 24,
 7568,
 70]

In [18]:
tokenizer.decode(tokenizer.encode(text, allowed_special={"<|endoftext|>"}))

'Hello <|endoftext|> I am here asodufg ewqrojdfg9dfg'

## Data sampling with sliding window

In [8]:
with open("the-verdict.txt", "r", encoding='utf-8') as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text, allowed_special={"<|endoftext|>"})
print(len(enc_text))

5145


In [9]:
enc_sample = enc_text[50:]


In [10]:
len(enc_sample)

5095

In [11]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:\t {y}")

x: [290, 4920, 2241, 287]
y:	 [4920, 2241, 287, 257]


In [21]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    target = enc_sample[i]
    print(tokenizer.decode(context), "--->", tokenizer.decode([target]))

 and --->  established
 and established --->  himself
 and established himself --->  in
 and established himself in --->  a


In [22]:
import torch

In [23]:
torch.__version__

'2.9.1'

In [25]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        for i in range(0, len(token_ids)-max_length, stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [26]:
def create_dataloader_v1(txt, batch_size = 4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0 ):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)
    return dataloader

In [30]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)
data_iter= iter(dataloader)
inputs, targets = next(data_iter)
print(inputs)
print(targets)

tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


## Token IDs -> Token Embeddings

In [37]:
input_ids = torch.tensor([ 2,   3,   4,  1])

In [39]:
vocab_size = 6
output_dim = 3
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(tokenizer.n_vocab, output_dim)

In [40]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035],
        [-0.5880,  0.3486,  0.6603],
        [-0.2196, -0.3792,  0.7671],
        ...,
        [-0.5931,  1.0895, -0.6854],
        [ 0.7447,  0.5803, -0.4246],
        [-0.3130,  0.7558, -1.2656]], requires_grad=True)


In [35]:
embedding_layer(torch.tensor([3]))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

In [38]:
embedding_layer(input_ids)

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)

## Encoding word positions

In [45]:
vocab_size = 50257
output_dim = 256
torch.manual_seed(123)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [42]:
max_length= 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=4, stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)


In [43]:
print("inputs:", inputs)
print("inputs size:", inputs.shape)

inputs: tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
inputs size: torch.Size([8, 4])


In [46]:
token_embeddings = token_embedding_layer(inputs)
token_embeddings.shape

torch.Size([8, 4, 256])

In [47]:
context_length = 4
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)


In [48]:
torch.arange(context_length)

tensor([0, 1, 2, 3])

In [50]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))

print(pos_embeddings.shape)

torch.Size([4, 256])


In [None]:
'''
Even though the dimensions of the pos_embeddings and token_embeddings are different, 
we can still add them together because PyTorch will broadcast the pos_embeddings across the batch dimension.
'''
input_embeddings = token_embeddings + pos_embeddings
input_embeddings.shape

torch.Size([8, 4, 256])

In [52]:
input_embeddings

tensor([[[ 0.4784,  0.2094, -1.3080,  ...,  0.7864, -3.1091, -1.5083],
         [-0.7497, -0.9066, -0.9927,  ..., -1.9672, -1.3960, -0.3200],
         [ 1.1857,  2.0427, -0.2581,  ..., -1.0175,  1.6710, -1.0276],
         [ 2.2151,  2.9436, -0.2765,  ..., -0.4182, -0.1402,  0.2612]],

        [[ 0.4341, -1.3947, -2.6367,  ..., -0.6684, -0.2993, -0.5983],
         [-0.6400, -0.3430, -0.8915,  ..., -0.9857,  3.8343,  0.0802],
         [-0.3594,  2.7332,  2.2274,  ...,  0.8895, -0.8501, -1.0126],
         [ 1.0652, -0.2726, -0.9140,  ..., -0.7946, -0.4708,  0.4810]],

        [[ 0.6100, -1.1981, -1.2779,  ..., -0.1396, -3.4041,  1.5746],
         [-1.6534, -0.9395, -1.9595,  ...,  0.6251, -0.7305, -2.3072],
         [ 0.3030,  1.8729,  0.0746,  ...,  0.2088,  1.4137, -2.5317],
         [ 1.4252,  1.2383,  0.6891,  ...,  0.5719, -1.7446, -1.0321]],

        ...,

        [[ 0.9363, -1.0170, -1.7172,  ..., -1.7176, -3.8629,  1.1428],
         [ 0.7812, -1.5820, -2.6241,  ..., -0.5137, -0.73