In [None]:
from util.mytimeit import timeit
import bpe, bke, bxe_with_score
from functools import partial

In [None]:
def train_and_test(train, tokenize, detokenize, filename, vocab_size):
    """Train a tokenizer on a text file and check that it round-trips the text.

    Each stage is run through ``timeit``, whose return value is used as the
    stage's result.

    Args:
        train: Callable invoked as ``train(text, vocab_size)``; must return
            ``(merge_tree, vocab)``.
        tokenize: Callable invoked as ``tokenize(text, merge_tree)``; must
            return a sized sequence of tokens.
        detokenize: Callable invoked as ``detokenize(tokens, vocab)``; must
            return the reconstructed text.
        filename: Path of the UTF-8 text file used for training and testing.
        vocab_size: Target vocabulary size forwarded to ``train``.

    Returns:
        The tuple ``(tokenized_text, vocab)``.

    Raises:
        AssertionError: If detokenizing the tokens does not reproduce the text.
    """
    # Context manager so the file handle is closed even if reading fails.
    with open(filename, "r", encoding="utf-8") as source_file:
        text = source_file.read()

    merge_tree, vocab = timeit(lambda: train(text, vocab_size), 'Training')

    # Ids from 256 upward are presumably the learned tokens (0-255 being the raw
    # bytes) -- confirm against the tokenizer modules. The preview is capped at
    # vocab_size so vocabularies with fewer than 10 extra tokens do not fail.
    first_preview_id = 256
    preview_ids = range(first_preview_id, min(first_preview_id + 10, vocab_size))
    # A byte-level token may not be valid UTF-8 on its own; errors='replace'
    # keeps this diagnostic print from raising UnicodeDecodeError.
    print('Top 10 tokens:', [vocab[i].decode('utf-8', errors='replace') for i in preview_ids])

    tokenized_text = timeit(lambda: tokenize(text, merge_tree), 'Tokenization')
    detokenized_text = timeit(lambda: detokenize(tokenized_text, vocab), 'Detokenize')

    # Computed outside the f-string: reusing the same quote character inside an
    # f-string is a SyntaxError before Python 3.12. An empty file reports 0%.
    token_count = len(tokenized_text)
    byte_count = len(text.encode('utf-8'))
    percent_of_original = 100 * token_count / byte_count if byte_count else 0.0
    print(f'Tokenized text has {token_count} tokens ({percent_of_original:.2f}% of original).')

    assert detokenized_text == text
    return tokenized_text, vocab

BPE

In [None]:
# Train and round-trip-check the BPE tokenizer on the Taylor Swift text.
# The single-element list keeps the loop shape so further sizes can be appended.
for vocab_size in [256 + 100]:
    # Forward slashes work on Windows, Linux and macOS; the previous
    # backslash path only resolved on Windows.
    tokenized_text, vocab = train_and_test(bpe.train, bpe.tokenize, bpe.detokenize, 'data/taylorswift.txt', vocab_size)

BKE

In [None]:
# Sweep the BKE parameter k over 2..9, training and round-trip-checking each time.
for k in range(2, 10):
    print('Testing k:', k)
    # Bind k up front: train_and_test calls train(text, vocab_size) and
    # tokenize(text, merge_tree) without any extra keyword arguments.
    train = partial(bke.train, k=k)
    tokenize = partial(bke.tokenize, k=k)
    for vocab_size in [256 + 100]:
        # Forward slashes work on Windows, Linux and macOS; the previous
        # backslash path only resolved on Windows.
        tokenized_text, vocab = train_and_test(train, tokenize, bke.detokenize, 'data/taylorswift.txt', vocab_size)

BXE

In [None]:
# Sweep the BXE parameter x, training and round-trip-checking each time.
for x in [5, 10, 20]:
    print('Testing x:', x)
    # Bind x up front: train_and_test calls train(text, vocab_size) and
    # tokenize(text, merge_tree) without any extra keyword arguments.
    train = partial(bxe_with_score.train, x=x)
    tokenize = partial(bxe_with_score.tokenize, x=x)
    for vocab_size in [256 + 100]:
        # Forward slashes work on Windows, Linux and macOS; the previous
        # backslash path only resolved on Windows.
        tokenized_text, vocab = train_and_test(train, tokenize, bxe_with_score.detokenize, 'data/taylorswift.txt', vocab_size)

Testing different scores

In [None]:
# BXE on the Bible text with the module's default score function passed explicitly.
for x in [10]:
    print('Testing x:', x)
    # This list is handed to train via merge_counts; presumably train fills it
    # with per-merge counts -- confirm in bxe_with_score.
    merge_counts = []
    train = partial(bxe_with_score.train, x=x, merge_counts=merge_counts, score_fn=bxe_with_score.default_score)
    tokenize = partial(bxe_with_score.tokenize, x=x)
    for vocab_size in [256 + 100]:
        # Forward slashes work on Windows, Linux and macOS; the previous
        # backslash path only resolved on Windows.
        tokenized_text, vocab = train_and_test(train, tokenize, bxe_with_score.detokenize, 'data/bible.txt', vocab_size)

In [None]:
def force_space_align_score(tup, count):
    """Score a candidate, giving zero to any candidate whose first element is a space byte.

    Args:
        tup: Indexable, sized candidate; its first element is compared with
            the byte value of ``b' '``.
        count: Number associated with the candidate (presumably its
            occurrence count -- confirm against the caller).

    Returns:
        ``0`` when the candidate starts with a space, otherwise
        ``(len(tup) - 1) * (count - 1)``.
    """
    space_byte = ord(b' ')
    starts_with_space = tup[0] == space_byte
    return 0 if starts_with_space else (len(tup) - 1) * (count - 1)

# BXE on the Bible text, scoring candidates with force_space_align_score.
for x in [10]:
    print('Testing x:', x)
    # This list is handed to train via merge_counts; presumably train fills it
    # with per-merge counts -- confirm in bxe_with_score.
    merge_counts_space_align = []
    train = partial(bxe_with_score.train, x=x, merge_counts=merge_counts_space_align, score_fn=force_space_align_score)
    tokenize = partial(bxe_with_score.tokenize, x=x)
    for vocab_size in [256 + 100]:
        # Forward slashes work on Windows, Linux and macOS; the previous
        # backslash path only resolved on Windows.
        tokenized_text, vocab = train_and_test(train, tokenize, bxe_with_score.detokenize, 'data/bible.txt', vocab_size)

In [None]:
# Show the vocabulary entries for ids 256-265 from the most recent run.
[vocab[token_id] for token_id in range(256, 256 + 10)]

In [None]:
# Space-aligned scoring again, this time with x=2.
# force_space_align_score is defined once in an earlier cell; the verbatim
# duplicate definition that used to sit here was removed.
for x in [2]:
    print('Testing x:', x)
    # Rebinds merge_counts_space_align, so results from any earlier run stored
    # under this name are replaced.
    merge_counts_space_align = []
    train = partial(bxe_with_score.train, x=x, merge_counts=merge_counts_space_align, score_fn=force_space_align_score)
    tokenize = partial(bxe_with_score.tokenize, x=x)
    for vocab_size in [256 + 100]:
        # Forward slashes work on Windows, Linux and macOS; the previous
        # backslash path only resolved on Windows.
        tokenized_text, vocab = train_and_test(train, tokenize, bxe_with_score.detokenize, 'data/bible.txt', vocab_size)

In [None]:
# Baseline for x=2: no score_fn is passed, so bxe_with_score.train uses
# whatever scoring it applies by default.
# The copy of force_space_align_score that used to open this cell was never
# used here and duplicated an earlier definition, so it was removed.
for x in [2]:
    print('Testing x:', x)
    # This list is handed to train via merge_counts; presumably train fills it
    # with per-merge counts -- confirm in bxe_with_score.
    merge_counts_default = []
    train = partial(bxe_with_score.train, x=x, merge_counts=merge_counts_default)
    tokenize = partial(bxe_with_score.tokenize, x=x)
    for vocab_size in [256 + 100]:
        # Forward slashes work on Windows, Linux and macOS; the previous
        # backslash path only resolved on Windows.
        tokenized_text, vocab = train_and_test(train, tokenize, bxe_with_score.detokenize, 'data/bible.txt', vocab_size)