In [2]:
from pathlib import Path
from tokenizers import (Tokenizer, decoders, models, pre_tokenizers,
                        processors, trainers)
from tokenizers.normalizers import NFKC


In [14]:
VOCAB_SIZE = 4096

In [20]:
# We train the tokenizer on the train data only
data_dir = Path("./data/babylm_10M_clean/")

paths = ['./data/babylm_10M_clean/children_stories.train']

# paths
print(len(paths))
assert len(paths) > 0, 'No data files found'

1


In [21]:
tokenizer = Tokenizer(models.BPE())

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
tokenizer.normalizer = NFKC()

In [22]:
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, min_frequency=2, special_tokens=["<pad>", "<s>", "</s>"])
tokenizer.train(paths, trainer)






In [23]:
tokenizer_path =  f"./models/tokenizer-{VOCAB_SIZE}.json"
tokenizer.save(str(tokenizer_path), pretty=True)

In [24]:
tokenizer = Tokenizer.from_file(str(tokenizer_path))


# text = 'Shiro Okada (岡田志郎, "Okada Shirō", June 9, 1949; Hirakata, Osaka {age 71} - ) is a Japanese guitarist who participate in the Group Sound band, the Ox. His nickname was Shiro (シロー) and his real name is Shiro Okamoto (岡田史郎).'
text = "The quick brown fox jumps over the lazy dog."

encoded = tokenizer.encode(text)
print(f"Encoded String: {encoded.tokens}")

print(f"Encoded IDs: {encoded.ids}")

decoded = tokenizer.decode(encoded.ids)
print(f"Decoded String: {decoded}")

Encoded String: ['ĠThe', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġj', 'um', 'ps', 'Ġover', 'Ġthe', 'Ġlaz', 'y', 'Ġdog', '.']
Encoded IDs: [174, 1393, 1895, 2761, 308, 425, 707, 417, 108, 3721, 81, 1215, 14]
Decoded String:  The quick brown fox jumps over the lazy dog.
