In [1]:
import os
import urllib.request

# Source text: Edith Wharton's "The Verdict", hosted in the
# LLMs-from-scratch repository.
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")
file_path = "docs/the-verdict.txt"

# urlretrieve does not create missing directories, so make sure the target
# folder exists first (otherwise a fresh checkout fails here).
os.makedirs(os.path.dirname(file_path), exist_ok=True)
urllib.request.urlretrieve(url, file_path)

('docs/the-verdict.txt', <http.client.HTTPMessage at 0x105ca34d0>)

In [2]:
# Load the downloaded story into memory and show a short preview.
with open(file_path, "r", encoding='utf-8') as story_file:
    raw_text = story_file.read()

char_count = len(raw_text)
print(f"Total number of characters: {char_count}")
print(raw_text[:100])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [3]:
import re

# Split on whitespace; the capturing group keeps each separator in the result.
text = "Hello, world. This is a test."
whitespace_splitter = re.compile(r'(\s)')
result = whitespace_splitter.split(text)
result


['Hello,', ' ', 'world.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test.']

In [4]:
# Also break on commas, periods and exclamation marks, not only whitespace.
punct_or_space = re.compile(r'([,.!]|\s)')
result = punct_or_space.split(text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [5]:
# Discard entries that are empty or whitespace-only.
result = [piece for piece in result if piece.strip() != ""]
print(result)

['Hello', ',', 'world', '.', 'This', 'is', 'a', 'test', '.']


In [6]:
text = 'Hello, world. This is a-- test?'

# Split on a wider punctuation set plus the double dash, then trim every
# piece and keep only the non-blank ones.
pieces = re.split(r'([,.:;?_"()\']|--|\s)', text)
result = []
for piece in pieces:
    trimmed = piece.strip()
    if trimmed:
        result.append(trimmed)
print(result)

['Hello', ',', 'world', '.', 'This', 'is', 'a', '--', 'test', '?']


Now that we have a basic tokenizer working, let’s apply it to the entire text of Edith Wharton’s short story “The Verdict”:


In [7]:
# Tokenize the whole story: split on punctuation, double dashes and
# whitespace, then keep only the non-blank, trimmed tokens.
token_pattern = re.compile(r'([,.:;?_!"()\']|--|\s)')
preprocessed = []
for chunk in token_pattern.split(raw_text):
    chunk = chunk.strip()
    if chunk:
        preprocessed.append(chunk)
print(preprocessed[:30])


['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


# Converting tokens into token IDs

In [8]:
# The vocabulary is the sorted collection of unique tokens.
unique_tokens = set(preprocessed)
all_words = sorted(unique_tokens)
vocab_size = len(all_words)
print(f"Vocab Size : {vocab_size}")

Vocab Size : 1130


In [9]:
# Map every token to its position in the sorted vocabulary.
vocab = dict(zip(all_words, range(len(all_words))))

# Preview the first 52 entries (indices 0 through 51).
for entry in list(vocab.items())[:52]:
    print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)
('His', 51)


In [10]:
class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Parameters
    ----------
    vocab : dict
        Mapping from token string to integer ID.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse mapping, used by decode().
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their IDs.

        Raises
        ------
        KeyError
            If a token is not present in the vocabulary.
        """
        # Use the same split pattern that produced the tokens the vocabulary
        # was built from. A different pattern (e.g. one without '!') yields
        # pieces such as "world!" that can never be found in the vocabulary.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [i.strip() for i in preprocessed if i.strip()]
        ids = [self.str_to_int[i] for i in preprocessed]
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces and re-attach punctuation."""
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove the space that the join inserted before punctuation marks.
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text





In [11]:
tokenizer = SimpleTokenizerV1(vocab)

text = """
"It's the last he painted, you know,"
Mrs. Gisburn said that with pardonable pride.
"""

# Round trip: text -> token IDs -> text.
ids = tokenizer.encode(text)
print(ids)

decoded_text = tokenizer.decode(ids)
print(decoded_text)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 987, 1108, 754, 793, 7]
" It' s the last he painted, you know," Mrs. Gisburn said that with pardonable pride.


In [12]:
# Problem: SimpleTokenizerV1 cannot encode words that are missing from the
# vocabulary, so the lookup for "Hello" fails.
text = 'Hello, you like tea?'
try:
    print(tokenizer.encode(text))
except KeyError as err:
    # The failure is the point of this cell: report it without aborting
    # a "Restart & Run All".
    print(f"KeyError: {err}")

KeyError: 'Hello'

# Adding special tokens

In [13]:
# Extend the sorted vocabulary with two special tokens: a placeholder for
# unknown words and an end-of-text marker.
all_tokens = sorted(set(preprocessed)) + ['<|UNK|>', '<|ENDOFTEXT|>']
vocab: dict = dict(zip(all_tokens, range(len(all_tokens))))

# Show the last five entries, which now include the special tokens.
for item in list(vocab.items())[-5:]:
    print(item)



('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|UNK|>', 1130)
('<|ENDOFTEXT|>', 1131)


In [14]:
# Creating a tokenizer class with special tokens

class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|UNK|>``.

    ``vocab`` maps token strings to integer IDs.
    """

    def __init__(self, vocab:dict):
        self.str_to_int: dict = vocab
        # Inverse lookup table for decode().
        self.int_to_str : dict = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self,text:str):
        """Return the token IDs for ``text``; unknown tokens get the ``<|UNK|>`` ID."""
        unk_id = self.str_to_int.get('<|UNK|>')
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            piece = piece.strip()
            if not piece:
                # Skip empty and whitespace-only fragments produced by the split.
                continue
            ids.append(self.str_to_int.get(piece, unk_id))
        return ids

    def decode(self, ids:list):
        """Join the tokens for ``ids`` with spaces and re-attach punctuation."""
        tokens = (self.int_to_str[i] for i in ids)
        joined = " ".join(tokens)
        # Drop the whitespace that the join put in front of punctuation.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)



In [15]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = 'Hello, you like tea?'
text2 = "In the sunlit the terraces of the palace."
text3 = 'The Sun is shinning and the weather is green.'

# Concatenate the snippets, separated by the end-of-text marker.
text = " <|ENDOFTEXT|> ".join([text1, text2, text3])
print(text)

encoded_text_sample = tokenizer.encode(text)
print(encoded_text_sample)

Hello, you like tea? <|ENDOFTEXT|> In the sunlit the terraces of the palace. <|ENDOFTEXT|> The Sun is shinning and the weather is green.
[1130, 5, 1126, 628, 975, 10, 1131, 55, 988, 956, 988, 984, 722, 988, 1130, 7, 1131, 93, 1130, 584, 1130, 157, 988, 1130, 584, 1130, 7]


In [16]:
# Decode the IDs back to text; out-of-vocabulary words come back as <|UNK|>.
print(tokenizer.decode(encoded_text_sample))


<|UNK|>, you like tea? <|ENDOFTEXT|> In the sunlit the terraces of the <|UNK|>. <|ENDOFTEXT|> The <|UNK|> is <|UNK|> and the <|UNK|> is <|UNK|>.


# Byte pair encoding

In [17]:
from importlib.metadata import version
import tiktoken
print(f"Tiktoken version: {version('tiktoken')}")

Tiktoken version: 0.9.0


In [18]:
# Load tiktoken's 'gpt2' encoding. This rebinds `tokenizer`, replacing the
# SimpleTokenizerV2 instance used in the cells above.
tokenizer = tiktoken.get_encoding('gpt2')

In [19]:
# Inspect the type of the object returned by tiktoken.get_encoding.
type(tokenizer)

tiktoken.core.Encoding

In [20]:
text = "Hello, you like tea? <|endoftext|> In the sunlit the terraces of the palace. <|endoftext|> The Sun is shinning and the weather is green."

# Special tokens that may appear in the input are passed via `allowed_special`.
special_tokens = {'<|endoftext|>'}
integers = tokenizer.encode(text, allowed_special=special_tokens)
print(integers)

[15496, 11, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 262, 8812, 2114, 286, 262, 20562, 13, 220, 50256, 383, 3825, 318, 427, 23062, 290, 262, 6193, 318, 4077, 13]


In [21]:
# Decoding the IDs reproduces the original string, special tokens included.
tokenizer.decode(integers)

'Hello, you like tea? <|endoftext|> In the sunlit the terraces of the palace. <|endoftext|> The Sun is shinning and the weather is green.'

In [22]:
# The end-of-text marker is a single token in this encoding (ID 50256).
tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})

[50256]

In [23]:
# Round-trip a made-up word: it survives encode/decode unchanged, with no
# unknown-token placeholder.
tokenizer.decode(tokenizer.encode("hasttheklyt"))

'hasttheklyt'

# Data sampling with sliding window

LLMs are pretrained by predicting the next word in a text.

In [24]:
# Re-read the story and encode it with the current `tokenizer`.
with open('docs/the-verdict.txt', 'r', encoding='utf-8') as story_file:
    raw_text = story_file.read()

encoded_text = tokenizer.encode(raw_text)
token_count = len(encoded_text)
print(token_count)

# Drop the first 50 tokens; the remainder is used for the windowing examples.
enc_sample = encoded_text[50:]


5145


In [25]:
# The context size determines how many tokens are included in the input.
context_size = 4

# The target window is the input window shifted one token to the right.
x = enc_sample[0:context_size]
y = enc_sample[1:1 + context_size]

print(f"x: {x}")
print(f"y:      {y}")


x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [26]:
# Every prefix of the window is a context; the token right after it is the target.
for end in range(1, context_size + 1):
    context, desired = enc_sample[:end], enc_sample[end]
    print(f"{context}---->{desired}")

[290]---->4920
[290, 4920]---->2241
[290, 4920, 2241]---->287
[290, 4920, 2241, 287]---->257


In [27]:
# Same context/target pairs as token IDs, shown as decoded text.
for end in range(1, context_size + 1):
    context, desired = enc_sample[:end], enc_sample[end]
    context_text = tokenizer.decode(context)
    desired_text = tokenizer.decode([desired])
    print(f"{context_text}---->{desired_text}")

 and----> established
 and established----> himself
 and established himself----> in
 and established himself in----> a


In [28]:
import torch
from torch.utils.data import Dataset, DataLoader

class MyGPTDatasetV2(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    ``raw_text`` is encoded with ``tokenizer.encode``. Windows of
    ``max_length`` token IDs are taken every ``stride`` tokens, and each
    target is its input window shifted one token to the right.
    """

    def __init__(self, raw_text:str, tokenizer, max_length:int=256, stride=128 ):
        super().__init__()
        token_ids = tokenizer.encode(raw_text)

        # Window start offsets. Stopping before len - max_length guarantees
        # that the one extra token needed by the target window exists.
        starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[s:s + max_length]) for s in starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[s + 1:s + max_length + 1]) for s in starts
        ]

    def __len__(self):
        """Number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensors of window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]



  device: torch.device = torch.device(torch._C._get_default_device()),  # torch.device('cpu'),


In [29]:
def create_dataloaderV1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0,
                         ):
    """Build a DataLoader of (input, target) windows over ``txt``.

    ``txt`` is tokenized with tiktoken's ``gpt2`` encoding and cut into
    windows by ``MyGPTDatasetV2`` using ``max_length`` and ``stride``.
    ``batch_size``, ``shuffle``, ``drop_last`` and ``num_workers`` are
    forwarded to ``DataLoader`` unchanged.
    """
    windows = MyGPTDatasetV2(txt, tiktoken.get_encoding('gpt2'), max_length, stride)
    loader_options = dict(
        batch_size=batch_size,
        drop_last=drop_last,
        shuffle=shuffle,
        num_workers=num_workers,
    )
    return DataLoader(windows, **loader_options)





In [30]:
# Test the dataloader: batches of one 4-token window, sliding by one token.
with open('docs/the-verdict.txt', 'r', encoding='utf-8') as story_file:
    raw_text = story_file.read()

dataloader = create_dataloaderV1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [31]:
# Same check with 8-token windows that slide by two tokens.
dataloader = create_dataloaderV1(
    raw_text,
    batch_size=1,
    max_length=8,
    stride=2,
    shuffle=False,
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464, 1807, 3619,  402,  271]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])]


When creating multiple batches from the input dataset, we slide an input window across the text. If the stride is set to 1, we shift the input window by one position when creating the next batch. If we set the stride equal to the input window size, we can prevent overlaps between the batches.

In [174]:
# Start offsets produced by a stride of 4 over 10 positions.
print(list(range(0, 10, 4)))

[0, 4, 8]


# Creating token embeddings
The last step in preparing the input text for LLM training is to convert the token IDs into embedding vectors.

In [180]:
# Let’s see how the token ID to embedding vector conversion works with a hands-on example.

# Suppose we have 4 input tokens with the following IDs.

input_ids = torch.tensor([2,3,5,1])

# For simplicity we use a vocabulary size of 6 instead of the 50,257 tokens
# of the BPE vocabulary, and an embedding size of 3 (GPT-3 used 12,288 dimensions).

vocab_size = 6
output_dim = 3

# Fix the seed so the randomly initialised weights are reproducible.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
# One row per token ID, one column per embedding dimension.
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [181]:
# We can apply the layer to a token ID to obtain its embedding vector;
# ID 3 returns row 3 of the weight matrix printed above.

print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


The embedding layer is essentially a lookup operation that retrieves rows from the embedding layer’s weight matrix via a token ID.

In [184]:
# Look up all four token IDs at once: one embedding row per ID.
embedded = embedding_layer(input_ids)
print(embedded.shape)
print(embedded)

torch.Size([4, 3])
tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


# Encoding word positions

Now, let’s consider more realistic and useful embedding sizes and encode the input tokens into a 256-dimensional vector representation, which is smaller than what the original GPT-3 model used (in GPT-3, the embedding size is 12,288 dimensions).

In [32]:
# Rebinds vocab_size and output_dim to realistic values. 50257 is the
# vocabulary size used with the 'gpt2' BPE tokenizer (its <|endoftext|>
# token has ID 50256).
vocab_size = 50257
output_dim = 256
# Embedding table with one row per token ID and output_dim columns.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)



In [33]:
max_length = 4

# stride == max_length, so consecutive windows do not overlap.
dataloader = create_dataloaderV1(raw_text, batch_size=8, max_length=max_length,
                                 stride=max_length, shuffle=False)

data_iter = iter(dataloader)
# NOTE(review): `targes` looks like a typo for `targets`; the name is kept
# because cells outside this view may refer to it.
first_batch = next(data_iter)
inputs, targes = first_batch
print(f"Input token ids :\n {inputs}")
print(f"Input shape:\n {inputs.shape}")

Input token ids :
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Input shape:
 torch.Size([8, 4])


In [38]:
# Scratch check: element-wise addition of an integer tensor and torch.ones(3);
# the result is a float tensor.
torch.tensor([8, 4, 256])+ torch.ones(3)

tensor([  9.,   5., 257.])

In [34]:
# Embed every token ID in the batch: (8, 4) IDs -> (8, 4, 256) vectors.
token_embeddings= token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


For a GPT model’s absolute embedding approach, we just need to create another embedding layer that has the same embedding dimension as the `token_embedding_layer`:

In [39]:
# One learnable vector per position in the context window.
context_length = max_length
pos_embeddings_layer = torch.nn.Embedding(context_length, output_dim)

positions = torch.arange(context_length)
pos_embeddings = pos_embeddings_layer(positions)
print(pos_embeddings.shape)

torch.Size([4, 256])


In [40]:
# Position indices fed to the positional embedding layer: 0 .. context_length - 1.
print(torch.arange(context_length))

tensor([0, 1, 2, 3])


In [41]:
# The (4, 256) positional embeddings broadcast across the batch dimension of
# the (8, 4, 256) token embeddings, so every sequence gets the same offsets.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
