In [7]:
from util.mytimeit import timeit
import bpe, bpe_cond_score

In [13]:
def train_and_test(bpe_module, filename, vocab_size, **train_params):
    """Train a tokenizer on a text file and verify that it round-trips the text.

    The same text is used for training, tokenization and the round-trip check.
    Each stage is run through ``timeit`` with a label ('Training',
    'Tokenization', 'Detokenize').

    Args:
        bpe_module: Object exposing ``train(text, vocab_size, **params)``
            (returning ``(merge_tree, vocab)``), ``tokenize(text, merge_tree)``
            and ``detokenize(tokens, vocab)``.
        filename: Path of the text file to read; it is decoded as UTF-8.
        vocab_size: Vocabulary size passed to ``bpe_module.train``.
        **train_params: Extra keyword arguments forwarded unchanged to
            ``bpe_module.train``.

    Returns:
        The tuple ``(tokenized_text, vocab)``.

    Raises:
        AssertionError: If detokenizing the tokens does not reproduce the text.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(filename, "r", encoding="utf-8") as source_file:
        text = source_file.read()

    merge_tree, vocab = timeit(lambda: bpe_module.train(text, vocab_size, **train_params), 'Training')

    # Ids 256..265 are previewed; presumably these are the first learned merges
    # (ids below 256 being the raw byte values) -- TODO confirm against bpe.train.
    # A single token need not be a complete UTF-8 sequence, so undecodable bytes
    # are shown as U+FFFD instead of aborting the run.
    top_tokens = [vocab[i].decode('utf-8', errors='replace') for i in range(256, 256 + 10)]
    print('Top 10 tokens:', top_tokens)

    tokenized_text = timeit(lambda: bpe_module.tokenize(text, merge_tree), 'Tokenization')
    # Computed outside the f-string: reusing the same quote character inside an
    # f-string expression is only valid syntax from Python 3.12 onwards.
    token_count = len(tokenized_text)
    byte_count = len(text.encode('utf-8'))
    percent_of_original = 100 * token_count / byte_count
    print(f'Tokenized text has {token_count} tokens ({percent_of_original:.2f}% of original).')

    detokenized_text = timeit(lambda: bpe_module.detokenize(tokenized_text, vocab), 'Detokenize')
    assert detokenized_text == text, 'detokenize(tokenize(text)) did not reproduce the original text'
    return tokenized_text, vocab

In [14]:
vocab_size = 10_000
# Forward slashes are accepted as path separators on Windows, Linux and macOS;
# a backslash is only a separator on Windows.
taylor_swift_path = 'data/taylorswift.txt'

# Baseline run with the plain `bpe` module.
tokenized_text, vocab = train_and_test(bpe, taylor_swift_path, vocab_size)
print()

# Sweep the p/q/r keyword arguments that are forwarded to bpe_cond_score.train.
# In the recorded output, p=1 q=0 r=0 gives the same top tokens and token count
# as the baseline run.
for p, q, r in [(1, 0, 0), (3, 1, 1), (4, 1, 1), (5, 1, 1), (2, 1, 0), (2, 0, 1), (3, 1, 0), (3, 0, 1)]:
    print(f'{p=} {q=} {r=}')
    tokenized_text, vocab = train_and_test(bpe_cond_score, taylor_swift_path, vocab_size, p=p, q=q, r=r)
    print()

Training tokenizer on text of length 185,561 with vocab of size 10,000.
build_indexed_list took 0.17 seconds.
init_pairs_stats took 0.02 seconds.
Training took 1.01 seconds.
Top 10 tokens: ['e ', ', ', 'd ', '. ', 'r ', '20', 's ', 'in', 'on', 'ri']
Tokenization took 0.44 seconds.
Tokenized text has 24330 tokens (13.10% of original).
Detokenize took 0.00 seconds.

p=1 q=0 r=0
Training tokenizer on text of length 185,561 with vocab of size 10,000.
Training took 1.50 seconds.
Top 10 tokens: ['e ', ', ', 'd ', '. ', 'r ', '20', 's ', 'in', 'on', 'ri']
Tokenization took 0.42 seconds.
Tokenized text has 24330 tokens (13.10% of original).
Detokenize took 0.00 seconds.

p=3 q=1 r=1
Training tokenizer on text of length 185,561 with vocab of size 10,000.
Training took 1.41 seconds.
Top 10 tokens: ['20', ', ', ', 20', 've', 'ved', 'Sw', '. ', 'he', '. R', ', 201']
Tokenization took 0.44 seconds.
Tokenized text has 39534 tokens (21.28% of original).
Detokenize took 0.00 seconds.

p=4 q=1 r=1
Trai

In [18]:
# Re-run the p=5, q=1, r=1 configuration so its tokens can be inspected below.
# The path uses a forward slash so it resolves on Windows, Linux and macOS alike.
tokenized_text, vocab = train_and_test(bpe_cond_score, 'data/taylorswift.txt', vocab_size, p=5, q=1, r=1)

# Let's inspect our tokenized text:
def debug(tokenized_text, vocab):
    """Print the tokens as text, separated by a hamburger emoji marker.

    Args:
        tokenized_text: Iterable of token ids; each id is looked up in ``vocab``.
        vocab: Mapping/sequence from token id to the token's bytes.
    """
    # Tokens are decoded one at a time, and a single token is not guaranteed to
    # be a complete UTF-8 sequence; undecodable bytes are rendered as U+FFFD
    # rather than raising UnicodeDecodeError.
    pieces = [vocab[t].decode('utf-8', errors='replace') for t in tokenized_text]
    print('🍔'.join(pieces))

# Preview only the first 100 tokens to keep the cell output short.
debug(vocab=vocab, tokenized_text=tokenized_text[:100])

Training tokenizer on text of length 185,561 with vocab of size 10,000.
Training took 1.70 seconds.
Top 10 tokens: ['20', ', ', ', 20', '. ', 'd ', 'he', 've', 'in', 'or', 'ved ']
Tokenization took 0.43 seconds.
Tokenized text has 30647 tokens (16.50% of original).
Detokenize took 0.01 seconds.
C🍔op🍔y 🍔p🍔ast🍔e 🍔of the 🍔Wiki🍔p🍔edia 🍔artic🍔le 🍔on 🍔Taylor Swift, 🍔as of 🍔Feb🍔 🍔16🍔, 2024🍔.
🍔-🍔-🍔-🍔

🍔Ma🍔in 🍔m🍔en🍔u🍔

Wikipedia🍔The 🍔Free 🍔En🍔c🍔y🍔c🍔l🍔op🍔edia🍔

🍔S🍔earch🍔
🍔C🍔re🍔ate 🍔account🍔
🍔L🍔og🍔 🍔in🍔

Personal 🍔to🍔ol🍔s
🍔Con🍔t🍔ents 🍔 🍔h🍔id🍔e🍔
🍔(🍔Top🍔)
🍔Life and car🍔e🍔er🍔
Toggle Life and car🍔e🍔er 🍔subsection
Artistry
Toggle Artist🍔ry 🍔subsection
Accolades and 🍔a🍔ch🍔i🍔e🍔vements
Cultural status
Toggle Cultural status subsection
🍔We🍔al🍔th🍔
Toggle 🍔We🍔al🍔th 🍔subsection
🍔D🍔iscograph🍔y🍔
🍔Filmograph🍔y🍔
🍔Tours
🍔S🍔e🍔e 


In [23]:
# What about a GPT-4-like vocabulary with 100K tokens?
# Compare plain BPE with bpe_cond_score (p=5, q=1, r=1) on the same text.
# The path uses a forward slash so it resolves on Windows, Linux and macOS alike.
book_path = 'data/0300511.txt'
tokenized_text, vocab = train_and_test(bpe, book_path, 100_000)
print()
tokenized_text, vocab = train_and_test(bpe_cond_score, book_path, 100_000, p=5, q=1, r=1)

Training tokenizer on text of length 1,088,320 with vocab of size 100,000.
build_indexed_list took 1.11 seconds.
init_pairs_stats took 0.14 seconds.
Training took 9.13 seconds.
Top 10 tokens: ['e ', 'th', 'd ', 'in', 't ', ', ', 's ', 'er', 'an', ' th']
Tokenization took 3.14 seconds.
Tokenized text has 86612 tokens (7.95% of original).
Detokenize took 0.02 seconds.

Training tokenizer on text of length 1,088,320 with vocab of size 100,000.
Training took 13.57 seconds.
Top 10 tokens: ['e ', 'th', 'in', ', ', 'd ', 'er', 't ', 's ', 'the ', 'an']
Tokenization took 2.86 seconds.
Tokenized text has 105976 tokens (9.72% of original).
Detokenize took 0.02 seconds.


In [24]:
# Inspect the first 100 tokens of whatever `tokenized_text` / `vocab` currently
# hold (in notebook order: the bpe_cond_score run from the previous cell).
debug(vocab=vocab, tokenized_text=tokenized_text[:100])


🍔Project Gutenberg🍔 🍔Austral🍔i🍔a🍔

🍔Title:     🍔 🍔Swann's Way
            (D🍔u🍔 🍔c🍔ô🍔t🍔é🍔 🍔de chez🍔 🍔Swann)
            [Vol. 1 of Remembran🍔c🍔e of Things Past—🍔
🍔            (À🍔 🍔la Recherche du temps perdu)]🍔
🍔A🍔u🍔thor:     Marcel Proust🍔
🍔            Translated from the French🍔 🍔by C. K. Scott Moncrieff🍔
🍔* 🍔A🍔 🍔Project Gutenberg of Austral🍔i🍔a 🍔e🍔Book *🍔
🍔e🍔Book No🍔.🍔: 🍔 🍔0300511🍔.🍔t🍔x🍔t
🍔L🍔angu🍔ag🍔e🍔:   English🍔
🍔D🍔ate 🍔first 🍔pos🍔ted🍔: 🍔        🍔 🍔Mar🍔ch🍔 🍔2003🍔
🍔D🍔ate 🍔most 🍔recently 🍔up🍔d🍔ated🍔: 🍔S🍔ep🍔t 🍔2022🍔

🍔Production notes: Words in 🍔it🍔al🍔ic🍔s in the 🍔book🍔
🍔                  are 🍔en🍔clos🍔ed by 🍔under🍔s🍔cor


In [25]:
# Same 100K-token comparison (plain BPE vs. bpe_cond_score with p=5, q=1, r=1),
# this time on bible.txt.
# The path uses a forward slash so it resolves on Windows, Linux and macOS alike.
bible_path = 'data/bible.txt'
tokenized_text, vocab = train_and_test(bpe, bible_path, 100_000)
print()
tokenized_text, vocab = train_and_test(bpe_cond_score, bible_path, 100_000, p=5, q=1, r=1)

Training tokenizer on text of length 4,351,186 with vocab of size 100,000.
build_indexed_list took 4.99 seconds.
init_pairs_stats took 0.58 seconds.
Training took 22.20 seconds.
Top 10 tokens: ['th', 'e ', ' th', 'd ', 'an', ', ', ' the ', 't ', 'in', 'er']
Tokenization took 11.27 seconds.
Tokenized text has 454119 tokens (10.44% of original).
Detokenize took 0.12 seconds.

Training tokenizer on text of length 4,351,186 with vocab of size 100,000.
Training took 30.93 seconds.
Top 10 tokens: ['th', ' th', 'e ', 'nd', ' the ', ', ', 'and', 'in', 'of', '.\n']
Tokenization took 10.59 seconds.
Tokenized text has 543110 tokens (12.48% of original).
Detokenize took 0.10 seconds.
