In [1]:
# One-time setup: uncomment the next line to install the `tokenizers` package
# used by the cells below, then restart the kernel if prompted.
#pip install tokenizers

Note: you may need to restart the kernel to use updated packages.


Using Peter Norvig's file for tokenization

In [1]:
BIG_FILE_URL = 'https://raw.githubusercontent.com/dscape/spell/master/test/resources/big.txt'

# Download the training corpus and save it as big.txt.
#
# The request is made *before* the output file is opened, so a failed download
# never creates (or truncates) big.txt; the training cell therefore cannot
# silently pick up an empty corpus.
from requests import get

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = get(BIG_FILE_URL, timeout=30)

if response.status_code == 200:
    # Write the raw bytes exactly as received.
    with open('big.txt', 'wb') as big_f:
        big_f.write(response.content)
else:
    print("Unable to get the file: {}".format(response.reason))

Create pipeline and tokenizer.


Byte pair encoding or digram coding is a simple form of data compression in which the most common pair of consecutive bytes of data is replaced with a byte that does not occur within that data. A table of the replacements is required to rebuild the original data. The algorithm was first described publicly by Philip Gage in a February 1994 article "A New Algorithm for Data Compression" in the C Users Journal.

A variant of the technique has been shown to be useful in several natural language processing applications.


https://leimao.github.io/blog/Byte-Pair-Encoding/

https://arxiv.org/abs/2004.03720

In [7]:
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel

# Build the tokenizer around an untrained byte-pair-encoding model.
tokenizer = Tokenizer(BPE())

# Normalization runs in list order: Unicode NFKC first, then lowercasing.
tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

# Byte-level handling on both sides of the pipeline: the decoder turns
# byte-level tokens back into text, the pre-tokenizer splits the input
# before the BPE model sees it.
tokenizer.decoder = ByteLevelDecoder()
tokenizer.pre_tokenizer = ByteLevel()

Train tokenizer

In [10]:
from tokenizers.trainers import BpeTrainer

# Configure BPE training: target vocabulary of 25,000 entries, progress bar
# enabled, and the byte-level alphabet supplied as the initial alphabet.
trainer = BpeTrainer(
    vocab_size=25000,
    show_progress=True,
    initial_alphabet=ByteLevel.alphabet(),
)

# Pass both arguments by keyword: the positional order of Tokenizer.train
# differs between `tokenizers` releases (older: trainer, files; newer:
# files, trainer), and keywords are unambiguous either way.
tokenizer.train(files=["big.txt"], trainer=trainer)

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

Trained vocab size: 25000


Save the model

In [11]:
# Write the trained model's files (vocab.json and merges.txt) into the current
# directory. Left as the bare last expression so the cell displays the list of
# written paths.
tokenizer.model.save('.')

['.\\vocab.json', '.\\merges.txt']

Evaluate trained model on new data

In [19]:
from pprint import pprint

# Reload the BPE model from the files saved in the previous step, so the
# round trip below exercises what was written to disk.
tokenizer.model = BPE('vocab.json', 'merges.txt')

# Encode a sentence that is not part of the training run. The text is split
# across adjacent literals purely for readability; the value is unchanged.
encoding = tokenizer.encode(
    "An ant is crawling on a patch of sand. "
    "As it crawls, it traces a line in the sand. "
    "By pure chance the line that it traces curves and recrosses itself in, "
    "such a way that it ends up looking like a recognizable caricature of "
    "Winston Churchill"
)

# Show the tokens produced for the sentence.
pprint("Encoded string: {}".format(encoding.tokens))

# Decode the token ids back to text to inspect the round trip.
decoded = tokenizer.decode(encoding.ids)
pprint("Decoded string: {}".format(decoded))

("Encoded string: ['Ġan', 'Ġant', 'Ġis', 'Ġcraw', 'ling', 'Ġon', 'Ġa', "
 "'Ġpatch', 'Ġof', 'Ġsand', '.', 'Ġas', 'Ġit', 'Ġcraw', 'ls', ',', 'Ġit', "
 "'Ġtraces', 'Ġa', 'Ġline', 'Ġin', 'Ġthe', 'Ġsand', '.', 'Ġby', 'Ġpure', "
 "'Ġchance', 'Ġthe', 'Ġline', 'Ġthat', 'Ġit', 'Ġtraces', 'Ġcurves', 'Ġand', "
 "'Ġrec', 'ross', 'es', 'Ġitself', 'Ġin', ',', 'Ġsuch', 'Ġa', 'Ġway', 'Ġthat', "
 "'Ġit', 'Ġends', 'Ġup', 'Ġlooking', 'Ġlike', 'Ġa', 'Ġrecogn', 'izable', "
 "'Ġcar', 'ic', 'ature', 'Ġof', 'Ġwin', 'ston', 'Ġchurch', 'ill']")
('Decoded string:  an ant is crawling on a patch of sand. as it crawls, it '
 'traces a line in the sand. by pure chance the line that it traces curves and '
 'recrosses itself in, such a way that it ends up looking like a recognizable '
 'caricature of winston churchill')
