# Setup

In [2]:
from cs336_basics.bpe import train_bpe_tokenizer
from cs336_basics.bpe_optim import train_bpe_tokenizer_optim
from cs336_basics.io_utils import save_pickle, load_pickle

%load_ext autoreload
%autoreload 2

In [3]:
import os

os.cpu_count()

64

# REPL

As of Unicode 16.0 (released in September 2024), the standard defines 154,998 characters across 168 scripts
```python
>>> test_string = "hello! こんにちは!"
>>> utf8_encoded = test_string.encode("utf-8")
>>> print(utf8_encoded)
b'hello! \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf!'
>>> print(type(utf8_encoded))
<class 'bytes'>
>>> # Get the byte values for the encoded string (integers from 0 to 255).
>>> list(utf8_encoded)
[104, 101, 108, 108, 111, 33, 32, 227, 129, 147, 227, 130, 147, 227, 129, 171, 227, 129,
161, 227, 129, 175, 33]
>>> # One byte does not necessarily correspond to one Unicode character!
>>> print(len(test_string))
13
>>> print(len(utf8_encoded))
23
>>> print(utf8_encoded.decode("utf-8"))
hello! こんにちは!
```

In [5]:
print("this is a test" + chr(0) + "string")
print(ord("牛"))

this is a test string
29275


When using byte-level tokenization, we do not need to worry about out-of-vocabulary tokens, since we know that any input text can be expressed as a sequence of integers from 0 to
255 by converting our Unicode codepoints into a sequence of bytes (e.g., via the UTF-8 encoding)

In [8]:
with open("data/TinyStoriesV2-GPT4-valid.txt", "r") as f:
    content = f.readlines()

display(content[:10])

for line in content[:10]:
    print(line)

['u don\'t have to be scared of the loud dog, I\'ll protect you". The mole felt so safe with the little girl. She was very kind and the mole soon came to trust her. He leaned against her and she kept him safe. The mole had found his best friend.\n',
 '<|endoftext|>\n',
 'Once upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. One day, Tom lost his red ball. He was very sad.\n',
 'Tom asked his friend, Sam, to help him search for the ball. They looked high and low, but they could not find the ball. Tom said, "I think my ball fell into the pit."\n',
 'Sam and Tom went close to the pit. They were scared, but they wanted to find the red ball. They looked into the pit, but it was too dark to see. Tom said, "We must go in and search for my ball."\n',
 'They went into the pit to search. It was dark and scary. They could not find the ball. They tried to get out, but the pit was too deep. Tom and Sam were stuck in the pit. They called

u don't have to be scared of the loud dog, I'll protect you". The mole felt so safe with the little girl. She was very kind and the mole soon came to trust her. He leaned against her and she kept him safe. The mole had found his best friend.

<|endoftext|>

Once upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. One day, Tom lost his red ball. He was very sad.

Tom asked his friend, Sam, to help him search for the ball. They looked high and low, but they could not find the ball. Tom said, "I think my ball fell into the pit."

Sam and Tom went close to the pit. They were scared, but they wanted to find the red ball. They looked into the pit, but it was too dark to see. Tom said, "We must go in and search for my ball."

They went into the pit to search. It was dark and scary. They could not find the ball. They tried to get out, but the pit was too deep. Tom and Sam were stuck in the pit. They called for help, but no one could h

In [2]:
chr(104)

'h'

In [6]:
type("hello! こんにちは!".encode("utf-8"))

bytes

In [None]:
ord(b"\xe3")

227

In [28]:
import regex as re

PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
re.findall(PAT, "some text that i'll pre-tokenize\n\nand this is a test 1234")

['some',
 ' text',
 ' that',
 ' i',
 "'ll",
 ' pre',
 '-',
 'tokenize',
 '\n',
 '\n',
 'and',
 ' this',
 ' is',
 ' a',
 ' test',
 ' 1234']

In [None]:
i = 1
if i > 0 and (prev_token := "", next_token := ""):
    print("This is a test")
    print(f"prev_token: {prev_token}, next_token: {next_token}")

This is a test
prev_token: , next_token: 


# Train BPE

In [None]:
print((b"as", b"t") > (b" .", b".."))
a = b"as" + b"t"
b = b" ." + b".."

```bash
# test / profile
uv run cs336_basics/bpe.py --input_path=data/test.txt --vocab_size=270
uv run cs336_basics/bpe.py --input_path=data/TinyStoriesV2-GPT4-valid.txt --vocab_size=300
uv run cs336_basics/bpe.py --input_path=data/owt_valid.txt --vocab_size=300
uv run cs336_basics/bpe.py --input_path=data/owt_train.txt --vocab_size=260
uv run scalene cs336_basics/bpe.py --input_path=data/owt_train.txt --vocab_size=300 --output_dir=data/test --pre_tokens_path=data/output/owt_train-pre_tokens.pkl --profile=true

# production
uv run cs336_basics/bpe.py --input_path=data/TinyStoriesV2-GPT4-train.txt --vocab_size=10000
uv run cs336_basics/bpe.py --input_path=data/owt_train.txt --vocab_size=32000 --pre_tokens_path=data/output/owt_train-pre_tokens.pkl
```

In [4]:
vocab_size = 10_000
vocab_size = 32_000
vocab_size = 300
name = "TinyStoriesV2-GPT4-valid"
# name = "TinyStoriesV2-GPT4-train"
# name = "owt_train"
name = "test"

vocab, merges = train_bpe_tokenizer_optim(
    input_path=f"data/{name}.txt",
    vocab_size=257 + 10,
    special_tokens=["<|endoftext|>"],
    output_dir="data/test",
    debug=True,
)

print(f"Vocab size: {len(vocab)}")
print(f"Number of merges: {len(merges)}")
display(merges[:10])

Training BPE tokenizer on data/test.txt with vocab size 267, special tokens ['<|endoftext|>']...
Pre-tokenizing input data...
Chunking file of size 95 bytes into 1 chunks of size 67108864 bytes each.
Using 60 processes for parallelizing pre-tokenization of 1 chunks.


Pre-tokenizing:   0%|          | 0/1 [00:00<?, ?it/s]

Pre-tokenization took 0.19 seconds. Found 8 unique tokens.
Pre-tokens saved to data/test/test-pre_tokens.pkl.
Unique pre-tokens: 8
First 10 unique pre-tokens: ['\n', ' low', ' lower', ' newest', ' widest', 'low', 'lower', 'newest']
Last 10 unique pre-tokens: ['\n', ' low', ' lower', ' newest', ' widest', 'low', 'lower', 'newest']
Random 10 unique pre-tokens: ['\n', ' low', ' lower', ' newest', ' widest', 'low', 'lower', 'newest']
Starting BPE training with 10 merges.


Training BPE:   0%|          | 0/10 [00:00<?, ?it/s]

---
Merge 1/10: (b's', b't')
Current pairs:
[((b's', b't'), 9), ((b'e', b's'), 9), ((b'w', b'e'), 8), ((b'o', b'w'), 7), ((b'l', b'o'), 7), ((b'n', b'e'), 6), ((b'e', b'w'), 6), ((b' ', b'n'), 5), ((b' ', b'l'), 5), ((b'w', b'i'), 3)]
---
Merge 2/10: (b'e', b'st')
Current pairs:
[((b'e', b'st'), 9), ((b'w', b'e'), 8), ((b'o', b'w'), 7), ((b'l', b'o'), 7), ((b'n', b'e'), 6), ((b'e', b'w'), 6), ((b' ', b'n'), 5), ((b' ', b'l'), 5), ((b'w', b'i'), 3), ((b'i', b'd'), 3)]
---
Merge 3/10: (b'o', b'w')
Current pairs:
[((b'o', b'w'), 7), ((b'l', b'o'), 7), ((b'w', b'est'), 6), ((b'n', b'e'), 6), ((b'e', b'w'), 6), ((b' ', b'n'), 5), ((b' ', b'l'), 5), ((b'w', b'i'), 3), ((b'i', b'd'), 3), ((b'd', b'est'), 3)]
---
Merge 4/10: (b'l', b'ow')
Current pairs:
[((b'l', b'ow'), 7), ((b'w', b'est'), 6), ((b'n', b'e'), 6), ((b'e', b'w'), 6), ((b' ', b'n'), 5), ((b' ', b'l'), 5), ((b'w', b'i'), 3), ((b'i', b'd'), 3), ((b'd', b'est'), 3), ((b' ', b'w'), 3)]
---
Merge 5/10: (b'w', b'est')
Current pairs:
[(

[(b's', b't'),
 (b'e', b'st'),
 (b'o', b'w'),
 (b'l', b'ow'),
 (b'w', b'est'),
 (b'n', b'e'),
 (b'ne', b'west'),
 (b' ', b'newest'),
 (b' ', b'low'),
 (b'w', b'i')]

In [55]:
!head -n 10000 data/TinyStoriesV2-GPT4-train.txt > data/TinyStoriesV2-GPT4-train.10k.txt

In [56]:
!less data/TinyStoriesV2-GPT4-train.10k.txt

7[?47h[?1h=
Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was walking through the store when he came across a very special vase. When Ben saw it he was amazed!  
He said, “Wow, that is a really amazing vase! Can I buy it?” 
The shopkeeper smiled and said, “Of course you can. You can take it home and show all your friends how amazing it is!”
So Ben took the vase home and he was so proud of it! He called his friends over and showed them the amazing vase. All his friends thought the vase was beautiful and couldn't believe how lucky Ben was. 
And that's how Ben found an amazing vase in the store!
<|endoftext|>
Once upon a time, there was a reliable otter named Ollie. He lived in a river with his family. They all loved to play and swim together.
One day, Ollie's mom said, "Ollie, hurry and get some fish for dinner!" Ollie swam fast to catch fish.

# Tokenizer

In [10]:
b"hello"[0]

104

In [None]:
a = [0] * 5
a[2:4] = [3]
a

sum([["a", "b"], ["c", "d"]], [])

['a', 'b', 'c', 'd']

In [19]:
!ls -lhi tests/fixtures

total 6.6M
407712 -rw-r--r--. 1 yanlin_chen yanlin_chen  1.5K Jun 28 22:06 address.txt
407713 -rw-r--r--. 1 yanlin_chen yanlin_chen  130K Jun 28 22:06 corpus.en
407714 -rw-r--r--. 1 yanlin_chen yanlin_chen   594 Jun 28 22:06 german.txt
407715 -rw-r--r--. 1 yanlin_chen yanlin_chen  446K Jun 28 22:06 gpt2_merges.txt
407716 -rw-r--r--. 1 yanlin_chen yanlin_chen 1018K Jun 28 22:06 gpt2_vocab.json
407717 -rw-r--r--. 1 yanlin_chen yanlin_chen    23 Jun 28 22:06 special_token_double_newlines_non_whitespace.txt
407718 -rw-r--r--. 1 yanlin_chen yanlin_chen    15 Jun 28 22:06 special_token_trailing_newlines.txt
407720 -rw-r--r--. 1 yanlin_chen yanlin_chen  5.0M Jun 28 22:06 tinystories_sample_5M.txt
407719 -rw-r--r--. 1 yanlin_chen yanlin_chen  3.8K Jun 28 22:06 tinystories_sample.txt
407721 -rw-r--r--. 1 yanlin_chen yanlin_chen  1.3K Jun 28 22:06 train-bpe-reference-merges.txt
407722 -rw-r--r--. 1 yanlin_chen yanlin_chen  7.5K Jun 28 22:06 train-bpe-reference-vocab.json
407723 drwxr-sr-x. 2 yan

In [18]:
test_string = "🙃"
test_string.encode("utf-8")

b'\xf0\x9f\x99\x83'

In [None]:
"\ufffd".encode("utf-8") + "hello".encode("utf-8")

b'\xef\xbf\xbdhello'

In [16]:
(b"\ufffd" + "hello".encode("utf-8")).decode("utf-8", errors="replace")

  (b"\ufffd" + "hello".encode("utf-8")).decode("utf-8", errors="replace")


'\\ufffdhello'

In [22]:
import regex as re

pattern = "|".join(re.escape(token) for token in ["<|endoftext|><|endoftext|>", "<|endoftext|>"])
parts = re.split(
    f"({pattern})",
    "some text <|endoftext|><|endoftext|> another text<|endoftext|> <|endoftext|>",
)
[part for part in parts if part]

['some text ',
 '<|endoftext|><|endoftext|>',
 ' another text',
 '<|endoftext|>',
 ' ',
 '<|endoftext|>']

In [19]:
from collections import Counter

c = Counter()
c[tuple(b.to_bytes() for b in "hello".encode("utf-8"))] += 1
c

pair = max(c.items(), key=lambda x: (x[1], x[0]))[0]
new_token = b"".join(pair)
new_token

b'hello'

In [None]:
tuple(b"hello")

(104, 101, 108, 108, 111)

In [23]:
bytes((104))

b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'

In [24]:
tuple([0])

(0,)