In [1]:
import urllib.request as request

# Remote location of the sample text and the local file name it is saved under.
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/"
    "main/ch02/01_main-chapter-code/the-verdict.txt"
)
file_path = "the-verdict.txt"

# Download the file; the returned (filename, headers) tuple is the cell output.
request.urlretrieve(url=url, filename=file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x7fc0a05ddc70>)

In [3]:
# Read the whole downloaded file into memory as one UTF-8 string.
with open(file_path, mode="r", encoding="utf-8") as text_file:
    raw_data = text_file.read()

# Report the character count and preview the first 99 characters.
print("Total number of character:", len(raw_data))
print(raw_data[0:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [7]:
import re
# Split on single whitespace characters; the capture group keeps each
# separator in the resulting list.
text = "Hello, world. This is a test."
result = re.split(pattern=r'(\s)', string=text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test.']


In [9]:
# Split on commas, periods, and whitespace. Because the pattern is a capture
# group, the separators are kept, and adjacent separators (e.g. ", ") leave
# empty strings between them in the result.
result = re.split(r"([,.]|\s)", text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [10]:
# Keep only pieces that are non-empty after stripping, i.e. drop empty strings
# and whitespace-only strings.
result = list(filter(str.strip, result))
print(result)

['Hello', ',', 'world', '.', 'This', 'is', 'a', 'test', '.']


In [11]:
# Same idea with a wider separator set: more punctuation plus the double dash.
text = "Hello, world. Is this-- a test?"
# Strip every piece first, then discard the ones that end up empty.
result = list(filter(None, map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', text))))
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [12]:
def tokenizer(text):
    """Split ``text`` into word and punctuation tokens.

    The text is split on the punctuation characters listed in the pattern
    below, on the double dash ``--``, and on whitespace. Punctuation and the
    double dash are kept as tokens of their own; pieces that are empty or
    whitespace-only are discarded.

    Args:
        text: the string to tokenize.

    Returns:
        A list of non-empty token strings in order of appearance.
    """
    tokens = []
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
        stripped = piece.strip()
        if stripped:
            tokens.append(stripped)
    return tokens

In [13]:
# Tokenize the full text held in raw_data.
preprocessed = tokenizer(raw_data)

In [14]:
# Show the total token count followed by the first 30 tokens.
print(len(preprocessed), preprocessed[:30])

4690 ['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [15]:
# Deduplicate the tokens, then put them in sorted order.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print(vocab_size)

1130


In [17]:
# Map each token to its position in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

# Print the entries in order, stopping after the one with id 50.
for token, token_id in vocab.items():
    print(token, token_id)
    if token_id >= 50:
        break

! 0
" 1
' 2
( 3
) 4
, 5
-- 6
. 7
: 8
; 9
? 10
A 11
Ah 12
Among 13
And 14
Are 15
Arrt 16
As 17
At 18
Be 19
Begin 20
Burlington 21
But 22
By 23
Carlo 24
Chicago 25
Claude 26
Come 27
Croft 28
Destroyed 29
Devonshire 30
Don 31
Dubarry 32
Emperors 33
Florence 34
For 35
Gallery 36
Gideon 37
Gisburn 38
Gisburns 39
Grafton 40
Greek 41
Grindle 42
Grindles 43
HAD 44
Had 45
Hang 46
Has 47
He 48
Her 49
Hermia 50


In [24]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Args:
        vocab: mapping from token string to integer id. It is stored as-is
            (not copied) and also inverted to build the id-to-token lookup.
    """

    def __init__(self, vocab):
        self.str2int = vocab
        self.int2str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        The text is split with the module-level ``tokenizer`` function.

        Raises:
            KeyError: if any token is not present in the vocabulary.
        """
        return [self.str2int[s] for s in tokenizer(text)]

    def decode(self, ids):
        """Convert token ids back into a single string.

        Tokens are joined with single spaces, then the whitespace in front of
        the listed punctuation characters is removed.

        Raises:
            KeyError: if any id is not present in the vocabulary.
        """
        text = " ".join(self.int2str[i] for i in ids)
        # ':' and ';' are included because tokenizer() splits them off as
        # separate tokens; without them decoded text would read "word ; word".
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

In [25]:
# Build a tokenizer from the vocabulary dict created above.
my_tokenizer = SimpleTokenizerV1(vocab)

In [26]:
# Encode a two-line sample passage into token ids.
text = """"It's the last he painted, you know, "
       Mrs. Gisburn said with pardonable pride."""
ids = my_tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [27]:
# Turn the ids back into text. The round trip is lossy: tokens are re-joined
# with single spaces, so the original spacing and line break are not restored.
print(my_tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [32]:
# Sorted unique tokens, followed by the two special tokens appended at the
# end so they receive the highest ids.
all_words = [*sorted(set(preprocessed)), "<|unk|>", "<|endoftext|>"]
vocab_size = len(all_words)
print(vocab_size)

1132


In [33]:
# Rebuild the token -> id mapping from the extended token list.
vocab = dict(zip(all_words, range(len(all_words))))

# Show the last five (token, id) pairs.
for item in list(vocab.items())[-5:]:
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|unk|>', 1130)
('<|endoftext|>', 1131)


In [42]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Args:
        vocab: mapping from token string to integer id. It is stored as-is
            (not copied) and also inverted to build the id-to-token lookup.
            It must contain ``"<|unk|>"`` for unknown tokens to be encodable.
    """

    def __init__(self, vocab):
        self.str2int = vocab
        self.int2str = {i: s for s, i in vocab.items()}
        self.unk = "<|unk|>"
        self.eof = "<|endoftext|>"

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        The text is split with the module-level ``tokenizer`` function; any
        token missing from the vocabulary is replaced by the id of
        ``self.unk``.

        Raises:
            KeyError: if an unknown token is encountered and ``self.unk`` is
                itself missing from the vocabulary.
        """
        known = self.str2int
        # The fallback key is chosen per token, so the unk entry is only
        # looked up when an unknown token actually occurs.
        return [known[s if s in known else self.unk] for s in tokenizer(text)]

    def decode(self, ids):
        """Convert token ids back into a single string.

        Tokens are joined with single spaces, then the whitespace in front of
        the listed punctuation characters is removed.

        Raises:
            KeyError: if any id is not present in the vocabulary.
        """
        text = " ".join(self.int2str[i] for i in ids)
        # ':' and ';' are included because tokenizer() splits them off as
        # separate tokens; without them decoded text would read "word ; word".
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)


In [36]:
# Two independent sample texts, concatenated with the end-of-text marker
# (surrounded by spaces) between them.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = text1 + " <|endoftext|> " + text2
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [43]:
# Rebind my_tokenizer to the V2 tokenizer, built from the extended vocabulary.
my_tokenizer = SimpleTokenizerV2(vocab)

In [44]:
# Encode the joined sample; tokens missing from the vocabulary come back as
# the id of "<|unk|>".
ids = my_tokenizer.encode(text)
print(ids)

[1130, 5, 355, 1126, 628, 975, 10, 1131, 55, 988, 956, 984, 722, 988, 1130, 7]


In [45]:
# Decode the ids back to text; unknown words show up as "<|unk|>".
print(my_tokenizer.decode(ids))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.
