In [33]:
import re
import requests
from string import punctuation

In [17]:
TEXT_FILEPATH = "../data/the_verdict.txt"

In [24]:
res = requests.get("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt")
with open(TEXT_FILEPATH, "w") as f:
    f.write(res.content.decode("utf-8"))

In [25]:
with open(TEXT_FILEPATH, "r") as f:
    raw_text = f.read()

In [26]:
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [27]:
print("Number of characters:", len(raw_text))
print(raw_text[:99])

Number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


## Simple Tokenization using Regex

In [32]:
# example
text = "Hello, world. this is a text passage."
result = re.split(r"([,.]|\s)", text)
print(result)

result = [item for item in result if item.strip()]
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'this', ' ', 'is', ' ', 'a', ' ', 'text', ' ', 'passage', '.', '']
['Hello', ',', 'world', '.', 'this', 'is', 'a', 'text', 'passage', '.']


In [36]:
# more complex text
text = "Hello, world. Is this-- a test?"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
print(result)

result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'Is', ' ', 'this', '--', '', ' ', 'a', ' ', 'test', '?', '']
['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


Let's apply the latest regex to the sample text.

In [40]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

print("Number of tokens:", len(preprocessed))

Number of tokens: 4690


In [41]:
print(preprocessed[:50])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself']


## Buiding Vocabulary

In [42]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print("Vocab size:", vocab_size)

Vocab size: 1130


In [45]:
print(all_words[:30])

['!', '"', "'", '(', ')', ',', '--', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be', 'Begin', 'Burlington', 'But', 'By', 'Carlo', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed']


In [49]:
vocab = {token: token_id for token_id, token in enumerate(all_words)}

for i, token in enumerate(vocab.items()):
    print(token)
    if i == 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [59]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.token_to_id = vocab
        self.id_to_token = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.token_to_id[token] for token in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.id_to_token[token_id] for token_id in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [60]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," 
       Mrs. Gisburn said with pardonable pride."""
print("Text:", text)

ids = tokenizer.encode(text)
print(ids)

Text: "It's the last he painted, you know," 
       Mrs. Gisburn said with pardonable pride.
[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [61]:
print(tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [62]:
text = "Hello, do you like tea?"
print(tokenizer.encode(text))

KeyError: 'Hello'

## Extending to Special Tokens

In [63]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token: token_id for token_id, token in enumerate(all_tokens)}

print("New vocab size:", len(vocab))

New vocab size: 1132


In [67]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [73]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.token_to_id = vocab
        self.id_to_token = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [
            item.strip() if item else "<|chunk|>" 
            for item in preprocessed if item.strip()
        ]
        return [
            self.token_to_id[token] for token in preprocessed
        ]

    def decode(self, ids):
        text = " ".join([self.id_to_token[token_id] for token_id in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [74]:
tokenizer_v2 = SimpleTokenizerV2(vocab)
print(tokenizer_v2.encode(text))

KeyError: 'Hello'