In [1]:
import sys
print(sys.executable)

/Users/zihanhu/Desktop/projects/LLMs-from-scratch/my-gpt-impl/venv/bin/python


In [2]:
import urllib.request
import re

# 1. Download the text
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")
file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# 2. Split text into tokens (words & punctuation)
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

# 3. Create the Vocabulary (Unique words only)
all_words = sorted(list(set(preprocessed)))
vocab_size = len(all_words)

print(f"Total Characters: {len(raw_text)}")
print(f"Total Tokens (Words+Punctuation): {len(preprocessed)}")
print(f"Vocabulary Size (Unique Tokens): {vocab_size}")

Total Characters: 20479
Total Tokens (Words+Punctuation): 4690
Vocabulary Size (Unique Tokens): 1130


In [3]:
# Create the mapping: Word -> Integer
vocab = {token:integer for integer,token in enumerate(all_words)}

# Print the first 50 entries to inspect
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [4]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    def encode(self, text):
        # Split Text
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        # Convert to integers
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    def decode(self, ids):
        # Convert integers back to strings
        text = "".join([self.int_to_str[i] for i in ids])
        # Basic cleanup
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [5]:
tokenizer = SimpleTokenizerV1(vocab)
text = """It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print("Encoded IDs:", ids)

decoded_text = tokenizer.decode(ids)
print("Decoded text:", decoded_text)


Encoded IDs: [56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
Decoded text: It'sthelasthepainted,youknow,"Mrs.Gisburnsaidwithpardonablepride.


In [6]:
# Stress test
# Test with a word that IS NOT in the training text
tokenizer = SimpleTokenizerV1(vocab)

try:
    text = "Hello, do you like watercolor painting?"
    ids = tokenizer.encode(text)
    print(ids)
except KeyError as e:
    print(f"CRASH! The tokenizer failed because it doesn't know the word: {e}")

CRASH! The tokenizer failed because it doesn't know the word: 'Hello'


In [7]:
%pip install tiktoken


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:
import tiktoken

# 1. Load the tokenizer used for GPT-2
tokenizer = tiktoken.get_encoding("gpt2")

# 2. Run the exact text that crashed your previous code
text = "Hello, do you like watercolor painting? sdanklasnldka"
ids = tokenizer.encode(text)

print("IDs:", ids)
print("Count:", len(ids))

# 3. Decode to prove no information was lost
decoded = tokenizer.decode(ids)
print("Decoded:", decoded)

IDs: [15496, 11, 466, 345, 588, 1660, 8043, 12036, 30, 45647, 962, 21921, 77, 335, 4914]
Count: 15
Decoded: Hello, do you like watercolor painting? sdanklasnldka


In [9]:
# Break down the IDs one by one to see the sub-words
print("Token breakdown:")
for token_id in ids:
    word_chunk = tokenizer.decode([token_id])
    print(f"{token_id} -> {repr(word_chunk)}")

Token breakdown:
15496 -> 'Hello'
11 -> ','
466 -> ' do'
345 -> ' you'
588 -> ' like'
1660 -> ' water'
8043 -> 'color'
12036 -> ' painting'
30 -> '?'
45647 -> ' sd'
962 -> 'ank'
21921 -> 'las'
77 -> 'n'
335 -> 'ld'
4914 -> 'ka'


# Create 'Sliding Window' (Input vs Target)

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        # 1. Tokenize the entire text
        token_ids = tokenizer.encode(txt)

        # 2. Slide the window across the text
        # We move the window by 'stride' steps each time
        for i in range(0, len(token_ids) - max_length, stride):
            
            # Chunk the input
            input_chunk = token_ids[i : i + max_length]
            
            # Chunk the target (shifted by 1)
            target_chunk = token_ids[i + 1 : i + max_length + 1]
            
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [11]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=True):
    
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create the Dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create the DataLoader
    dataloader = DataLoader(
        dataset, 
        batch_size=batch_size, 
        shuffle=shuffle, 
        drop_last=drop_last # Drop the last batch if it's not full
    )

    return dataloader

In [12]:
# 1. Use the raw text we downloaded earlier
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# 2. Create a dataloader with small parameters for inspection
max_length = 4  # Short context window
dataloader = create_dataloader_v1(
    raw_text, 
    batch_size=2, 
    max_length=max_length, 
    stride=max_length,
    shuffle=False
)

# 3. Grab the first batch
data_iter = iter(dataloader)
first_batch = next(data_iter)
x, y = first_batch

print(f"Input shape:  {x.shape}")
print(f"Target shape: {y.shape}")

print("\nInput Batch (x):\n", x)
print("\nTarget Batch (y):\n", y)

Input shape:  torch.Size([2, 4])
Target shape: torch.Size([2, 4])

Input Batch (x):
 tensor([[  40,  367, 2885, 1464],
        [1807, 3619,  402,  271]])

Target Batch (y):
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899]])


In [13]:
import torch.nn as nn

# GPT-2 Configuration (Small version)
vocab_size = 50257      # Size of GPT-2 tokenizer
output_dim = 256        # Embedding dimension (vector size)
context_length = 1024   # Max sequence length the model can handle

# 1. Token Embedding Layer
# Map 50257 words -> 256-dimensional vectors
token_embedding_layer = nn.Embedding(vocab_size, output_dim)

# 2. Positional Embedding Layer
# Map 1024 positions -> 256-dimensional vectors
pos_embedding_layer = nn.Embedding(context_length, output_dim)

print("Embedding layers initialized.")

Embedding layers initialized.
