# Character tokeniser demonstration

In [1]:
import datasets
import matplotlib.pyplot as plt
import tqdm
import collections
import numpy as np

In [2]:
import char_tokeniser

First, instantiate the tokeniser.

In [16]:
# Create the character tokeniser that the following cells train and exercise.
# NOTE(review): the meaning of the positional 1024 is not visible in this notebook
# (presumably a size limit such as the vocabulary size) — confirm against
# char_tokeniser.CharacterTokeniser.
tokeniser = char_tokeniser.CharacterTokeniser(1024)

Load the training split of the WikiText-103 (raw) dataset to train the tokeniser on.

In [4]:
# Identify the corpus: dataset group, configuration name and split.
dataset_group = 'wikitext'
dataset_name = 'wikitext-103-raw-v1'
dataset_split = 'train'

# Slash-separated label combining group, configuration and split.
dataset_full_name = f'{dataset_group}/{dataset_name}/{dataset_split}'

# Load the requested split; the recorded output shows it being served from
# the local Hugging Face cache.
dataset = datasets.load_dataset(
    dataset_group,
    name=dataset_name,
    split=dataset_split,
)

Found cached dataset wikitext (/home/tom/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


Train the tokeniser on the wikitext dataset.

In [5]:
# Train the tokeniser on the dataset loaded above. The recorded run shows a
# progress bar covering 1,801,350 iterations in about 28 s.
tokeniser.train(dataset)  # takes ~30s

100%|██████████████████████████████| 1801350/1801350 [00:28<00:00, 62808.54it/s]


Check tokenisation and detokenisation.

In [6]:
# Sample text for the tokenise/detokenise checks below. Adjacent string
# literals are concatenated by Python into one single-line string, with one
# sentence per source line for readability.
lipsum = (
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
    "Sed lectus nulla, pulvinar sed auctor nec, facilisis eu odio. "
    "Duis euismod pellentesque turpis, vitae ullamcorper tortor rutrum quis. "
    "Duis sed odio ut augue convallis convallis. "
    "Morbi at elit ut mi imperdiet vehicula. "
    "Suspendisse in sem eget est dapibus pellentesque. "
    "In ut condimentum purus. "
    "Vivamus vulputate est massa, id pretium quam pharetra eget. "
    "Duis porta ipsum vitae nibh tempus, eu ultricies nunc molestie. "
    "Nulla facilisi. "
    "Donec eu erat vitae leo laoreet mollis a quis metus. "
    "In eu libero porta magna vehicula venenatis. "
    "Praesent fermentum quam libero, ac volutpat dui tincidunt ac. "
    "Pellentesque vitae risus viverra, rhoncus augue ut, pellentesque dui."
)

In [12]:
# Tokenise a 508-character excerpt with max_seq_len=512. The input is kept
# slightly shorter than the limit; the recorded round trip shows the
# remainder filled with four [PAD] markers.
sample_text = lipsum[:508]
tokenised = tokeniser.tokenise(sample_text, max_seq_len=512)

In [13]:
# Round-trip check: turn the tokens back into text and verify that the
# original characters survive. Only a prefix match is asserted, because the
# decoded string continues with [PAD] markers after the input text.
detokenised = tokeniser.detokenise_to_string(tokenised)

# lipsum[:508] is the same slice that was passed to tokenise().
assert detokenised.startswith(lipsum[:508]), "detokenised text does not match the tokenised input"

detokenised  # last expression, so the notebook still displays the decoded string

'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed lectus nulla, pulvinar sed auctor nec, facilisis eu odio. Duis euismod pellentesque turpis, vitae ullamcorper tortor rutrum quis. Duis sed odio ut augue convallis convallis. Morbi at elit ut mi imperdiet vehicula. Suspendisse in sem eget est dapibus pellentesque. In ut condimentum purus. Vivamus vulputate est massa, id pretium quam pharetra eget. Duis porta ipsum vitae nibh tempus, eu ultricies nunc molestie. Nulla facilisi. Donec eu erat vita[PAD][PAD][PAD][PAD]'

How fast does it tokenise?

In [15]:
# Rough throughput measurement: repeat the same tokenise call one million
# times and read the iterations-per-second figure off the progress bar.
n_repeats = 1_000_000
for _ in tqdm.trange(n_repeats):
    # The slice is taken inside the loop so each iteration does the same work
    # as the single call demonstrated earlier.
    tokenised = tokeniser.tokenise(lipsum[:508], max_seq_len=512)

100%|██████████████████████████████| 1000000/1000000 [00:32<00:00, 30619.31it/s]


At roughly 30,000 calls per second it's hardly a speed demon, but it isn't going to be the bottleneck in our pipeline.