# Tokenizers exploration

## Load a French dataset

In [None]:
pip install datasets

In [16]:
from datasets import load_dataset

# Dataset identifier handed to load_dataset() by the cells below.
dataset_path = "frenchtext/banque-fr-2311"

In [None]:
# Load the dataset without a `split` argument; the next cell indexes the result by split name ("valid").
dataset = load_dataset(dataset_path)

In [3]:
# Show the "Text" field of the first example of the "valid" split.
dataset["valid"][0]["Text"]

"# Les nouvelles normes européennes sur le paiement pourraient affecter l'e-commerce\r\n\r\nicone ecommerce\r\n\r\nLes nouvelles règles européennes sur la sécurisation des paiements en ligne entreront en vigueur à partir du 14 septembre 2019. Elles ont notamment été pensées pour limiter les fraudes dans le domaine. Les banques et les acteurs du secteur interpellent toutefois les autorités sur les perturbations pouvant être induites par le déploiement de cette nouvelle norme.\r\n\r\nLes plateformes d'e-commerce sont généralement débordées en fin d’année, notamment avec Thanksgiving, le Black Friday et les achats de Noël. Cette période s’annonce encore plus compliquée pour 2019.\r\n\r\nEn effet, les nouvelles normes de sécurité pour le paiement en ligne seront appliquées en Europe à compter de mi-septembre. Elles concerneront notamment les banques, les fournisseurs de services de paiement et les e-commerçants.\r\n\r\nAu regard de cette échéance, les protagonistes de ces différents secteu

## Get the most popular tokenizers

In [None]:
pip install --upgrade transformers

In [None]:
pip install sentencepiece

In [None]:
pip install tiktoken

In [1]:
# Short model key -> model identifier passed to AutoTokenizer.from_pretrained() further down.
# Entries are grouped by the size suffix that appears in each key (3b, 6b/7b, 10b, 13b/14b, 8x7B, 30b and up).
models = { 
    # Disabled entry, kept for reference.
    #"rmkv_world_1b5" : "BlinkDL/rwkv-5-world",

    # Keys ending in "_3b"
    "phi2_3b" : "microsoft/phi-2",
    "btlm_3b" : "cerebras/btlm-3b-8k-base",
    "redpajama_3b" : "togethercomputer/RedPajama-INCITE-Base-3B-v1",
    "open_llama_3b" : "openlm-research/open_llama_3b_v2",
    "stablelm_3b" : "stabilityai/stablelm-3b-4e1t",      
    
    # Keys with a "6b" / "7b" size suffix
    "yi_6b" : "01-ai/Yi-6B",
    "mistral_7b" : "mistralai/Mistral-7B-v0.1",
    "mpt_7b" : "mosaicml/mpt-7b",
    "falcon_7b" : "tiiuae/falcon-7b",
    "redpajama_7b" : "togethercomputer/RedPajama-INCITE-7B-Base",
    "llama2_7b_32k" : "togethercomputer/LLaMA-2-7B-32K",
    "open_llama_7b" : "openlm-research/open_llama_7b_v2",
    "mpt_7b_8k" : "mosaicml/mpt-7b-8k",
    "qwen_7b" : "Qwen/Qwen-7B",
    "llama2_7b" : "meta-llama/Llama-2-7b-hf",
    "bloomz_7b" : "bigscience/bloomz-7b1-mt",
    "decilm_7b" : "Deci/DeciLM-7B",
    
    # Key ending in "_10b"
    "solar_10b" : "upstage/SOLAR-10.7B-v1.0",
    
    # Keys ending in "_13b" / "_14b"
    "llama2_13b" : "meta-llama/Llama-2-13b-hf",
    "qwen_14b" : "Qwen/Qwen-14B",
    
    # Key ending in "_8x7B"
    "mixtral_8x7B" : "mistralai/Mixtral-8x7B-v0.1",
    
    # Keys ending in "_30b" and larger
    "mpt_30b" : "mosaicml/mpt-30b",
    "yi_34b" : "01-ai/Yi-34B",
    "falcon_40b" : "tiiuae/falcon-40b"
}

In [3]:
# Read the token that is later passed as `token=` to AutoTokenizer.from_pretrained(),
# dropping any surrounding whitespace (such as a trailing newline).
with open("/workspace/hftoken", 'r') as token_file:
    myhftoken = token_file.read().strip()

In [31]:
# AutoTokenizer is imported here because this cell sits above the cell that otherwise
# first imports it; without the import a fresh-kernel "Run All" stops with a NameError.
from transformers import AutoTokenizer

# Load a single tokenizer (Qwen-7B) on its own.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True, token=myhftoken)

In [29]:
from transformers import AutoTokenizer
import json

# Registries filled by the loop below, each keyed by the short model name from `models`:
#   models_tokenizers               -> the loaded tokenizer object
#   models_tokenizers_config        -> settings read from that tokenizer
#   models_tokenizers_specialtokens -> special / added token definitions
models_tokenizers = {}
models_tokenizers_config = {}
models_tokenizers_specialtokens = {}

for model, model_id in models.items():
    print("------------------------")
    print(f"Loading {model} tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, token=myhftoken)
    models_tokenizers[model] = tokenizer

    # Qwen tokenizers are special-cased by key prefix: this code reads `mergeable_ranks`
    # and `.tokenizer` from them instead of `vocab` and `backend_tokenizer`.
    is_qwen = model.startswith("qwen_")

    # Vocabulary fingerprint, printed so identical vocabularies can be spotted by eye.
    # NOTE: hash() of str/bytes is salted per interpreter process unless PYTHONHASHSEED is
    # fixed, so these values are only comparable within one kernel session.
    vocab_items = tokenizer.mergeable_ranks.items() if is_qwen else tokenizer.vocab.items()
    print(hash(tuple(sorted(vocab_items))))

    config = {}
    specialtokens = {}
    config["type"] = type(tokenizer)
    if is_qwen:
        # Fix: the type was previously computed and thrown away, so "backend_type"
        # was never recorded for the Qwen models.
        config["backend_type"] = type(tokenizer.tokenizer)
    elif tokenizer.is_fast:
        # `backend_tokenizer` is only read for fast tokenizers, as in the is_fast branch below;
        # for other tokenizers "backend_type" is simply left out.
        config["backend_type"] = type(tokenizer.backend_tokenizer.model)
    config["vocab_size"] = tokenizer.vocab_size
    config["model_max_length"] = tokenizer.model_max_length
    if hasattr(tokenizer, "special_tokens"):
        specialtokens["special_tokens"] = tokenizer.special_tokens
    config["padding_side"] = tokenizer.padding_side
    config["truncation_side"] = tokenizer.truncation_side
    config["clean_up_tokenization_spaces"] = tokenizer.clean_up_tokenization_spaces

    if tokenizer.is_fast:
        # Parse the backend tokenizer's serialized (JSON) form.
        backend_config = json.loads(tokenizer.backend_tokenizer.to_str())
        # Drop the vocabulary and merge tables so the printed config stays readable.
        backend_config["model"].pop("vocab", None)
        backend_config["model"].pop("merges", None)
        specialtokens['added_tokens'] = backend_config['added_tokens']
        # Copy the remaining sections in a fixed order (this is also the printed key order).
        for section in ("truncation", "padding", "normalizer", "pre_tokenizer",
                        "model", "post_processor", "decoder"):
            config[section] = backend_config[section]
    elif model.startswith("yi_"):
        # For "yi_" models that are not fast tokenizers, record the type of `sp_model` instead.
        config['model'] = type(tokenizer.sp_model)

    models_tokenizers_config[model] = config
    models_tokenizers_specialtokens[model] = specialtokens
    print(config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


------------------------
Loading phi2_3b tokenizer
4837434508465586422
{'type': <class 'transformers.models.codegen.tokenization_codegen_fast.CodeGenTokenizerFast'>, 'backend_type': <class 'tokenizers.models.BPE'>, 'vocab_size': 50257, 'model_max_length': 2048, 'padding_side': 'right', 'truncation_side': 'right', 'clean_up_tokenization_spaces': True, 'truncation': None, 'padding': None, 'normalizer': None, 'pre_tokenizer': {'type': 'ByteLevel', 'add_prefix_space': False, 'trim_offsets': True, 'use_regex': True}, 'model': {'type': 'BPE', 'dropout': None, 'unk_token': None, 'continuing_subword_prefix': '', 'end_of_word_suffix': '', 'fuse_unk': False, 'byte_fallback': False}, 'post_processor': {'type': 'ByteLevel', 'add_prefix_space': True, 'trim_offsets': False, 'use_regex': True}, 'decoder': {'type': 'ByteLevel', 'add_prefix_space': True, 'trim_offsets': True, 'use_regex': True}}
------------------------
Loading btlm_3b tokenizer
8272226667653940756
{'type': <class 'transformers.models.

## Test the tokenizers on the French dataset

In [39]:
# Tokenizer used for the single-model measurement in the following cells.
tokenizer = models_tokenizers["falcon_7b"]


def tokenization(example):
    """Run the notebook-level `tokenizer` on the "Text" field of `example` and return its output."""
    encoded = tokenizer(example["Text"])
    return encoded

In [40]:
# Load the train, valid and test splits as one dataset and tokenize it in batches with `tokenization`.
dataset = load_dataset(dataset_path, split="train+valid+test").map(tokenization, batched=True)

Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

In [41]:
# Accumulate dataset-wide totals in one pass: the per-example 'Words' value and the
# number of token ids produced for each example.
words = tokens = 0
for example in dataset:
    tokens += len(example['input_ids'])
    words += example['Words']

(words, tokens)

(67061556, 131722778)

In [43]:
# Vocabulary size of the current tokenizer and the tokens-per-word ratio from the totals above.
tokenizer.vocab_size, tokens/words

(65024, 1.9642070040844266)

In [17]:
# Reload the untokenized splits so this cell does not depend on columns added by earlier cells.
dataset = load_dataset(dataset_path, split="train+valid+test")

# The word total does not depend on the tokenizer, so it is computed once
# instead of once per model.
words = sum(example['Words'] for example in dataset)

# Tokens-per-word ratio per model key, kept so the results can be reused after the loop.
models_tokens_per_word = {}

for model in models:
    tokenizer = models_tokenizers[model]

    def tokenization(example):
        """Tokenize the "Text" field with the tokenizer of the current loop iteration."""
        return tokenizer(example["Text"])

    # Map into a separate variable: `dataset` stays untouched, so columns produced by one
    # tokenizer are never carried over into the run of the next one.
    tokenized = dataset.map(tokenization, batched=True)

    tokens = sum(len(example['input_ids']) for example in tokenized)
    models_tokens_per_word[model] = tokens / words

    print("------------------------")
    print(f"{model}: {tokenizer.vocab_size} vocab => {tokens/words} tokens per word")

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3094 > 2048). Running this sequence through the model will result in indexing errors


------------------------
phi2_3b: 50257 vocab => 2.329939242686227 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (10506 > 8192). Running this sequence through the model will result in indexing errors


------------------------
btlm_3b: 50257 vocab => 2.340823750048388 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2765 > 2048). Running this sequence through the model will result in indexing errors


------------------------
redpajama_3b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3326 > 2048). Running this sequence through the model will result in indexing errors


------------------------
open_llama_3b: 32000 vocab => 2.4012566752850173 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
stablelm_3b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4374 > 4096). Running this sequence through the model will result in indexing errors


------------------------
yi_6b: 64000 vocab => 2.4612843758054166 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
mistral_7b: 32000 vocab => 2.234656410298622 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2765 > 2048). Running this sequence through the model will result in indexing errors


------------------------
mpt_7b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3038 > 2048). Running this sequence through the model will result in indexing errors


------------------------
falcon_7b: 65024 vocab => 1.9642070040844266 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2765 > 2048). Running this sequence through the model will result in indexing errors


------------------------
redpajama_7b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (127882 > 32768). Running this sequence through the model will result in indexing errors


------------------------
llama2_7b_32k: 32000 vocab => 2.1736836675844504 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3326 > 2048). Running this sequence through the model will result in indexing errors


------------------------
open_llama_7b: 32000 vocab => 2.4012566752850173 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9937 > 8192). Running this sequence through the model will result in indexing errors


------------------------
mpt_7b_8k: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (112350 > 32768). Running this sequence through the model will result in indexing errors


------------------------
qwen_7b: 151851 vocab => 1.8874039844825552 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
llama2_7b: 32000 vocab => 2.1749545745702648 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
bloomz_7b: 250680 vocab => 1.4450713759161806 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
decilm_7b: 32000 vocab => 2.234656410298622 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
solar_10b: 32000 vocab => 2.234656410298622 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
llama2_13b: 32000 vocab => 2.1749545745702648 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (10460 > 8192). Running this sequence through the model will result in indexing errors


------------------------
qwen_14b: 151851 vocab => 1.8874039844825552 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
mixtral_8x7B: 32000 vocab => 2.234656410298622 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9937 > 8192). Running this sequence through the model will result in indexing errors


------------------------
mpt_30b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4374 > 4096). Running this sequence through the model will result in indexing errors


------------------------
yi_34b: 64000 vocab => 2.4612843758054166 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3038 > 2048). Running this sequence through the model will result in indexing errors


------------------------
falcon_40b: 65024 vocab => 1.9642070040844266 tokens per word


## Train a tokenizer on the French dataset

In [10]:
def batch_iterator(batch_size=1000):
    """Yield the "Text" values of the notebook-level `dataset` in consecutive slices.

    Each yielded item is `dataset[start : start + batch_size]["Text"]`, with `start`
    stepping from 0 to len(dataset) in increments of `batch_size`.
    """
    slice_starts = range(0, len(dataset), batch_size)
    for start in slice_starts:
        rows = dataset[start : start + batch_size]
        yield rows["Text"]

In [17]:
# Basic byte-level BPE
# `tokenizers.models` is imported under an alias: importing it as plain `models` would rebind
# the notebook's `models` dict of model identifiers to a module and break any later re-run
# of the cells that iterate over that dict.
from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, trainers
from tokenizers import models as tokenizers_models

tokenizer = Tokenizer(tokenizers_models.BPE())
# No normalizer is assigned (an explicit `tokenizer.normalizer = None` was left disabled).
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

# Trainer settings: target vocabulary size, minimum frequency, the ByteLevel alphabet
# as the initial alphabet, and three special tokens.
trainer = trainers.BpeTrainer(
    vocab_size=32000,
    min_frequency=100,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    special_tokens=["<PAD>", "<BOS>", "<EOS>"],
    show_progress=True
)

In [18]:
# Train on the "Text" batches yielded by batch_iterator(); `length` is set to the number of dataset rows.
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset))






In [None]:
def tokenization(examples):
    """Encode a batch of "Text" strings with the trained tokenizer and return only the token ids."""
    encodings = tokenizer.encode_batch(examples["Text"])
    return {'input_ids': [encoding.ids for encoding in encodings]}


# Write the custom tokenizer's ids into the dataset's 'input_ids' column.
dataset = dataset.map(tokenization, batched=True)

# Same totals as for the pretrained tokenizers: summed 'Words' values and token counts.
words = tokens = 0
for example in dataset:
    tokens += len(example['input_ids'])
    words += example['Words']

print("------------------------")
print(f"custom: {tokens/words} tokens per word")

In [38]:
# Print the tokens-per-word ratio from the current `tokens` and `words` totals.
print(f"custom: {tokens/words} tokens per word")

custom: 1.4874704070391687 tokens per word


In [None]:
# Vocabulary keys of the trained tokenizer whose length is at least 10.
[entry for entry in tokenizer.get_vocab() if len(entry) >= 10]

In [45]:
from collections import Counter

# Frequency of every token id across all examples of the tokenized dataset.
tokens_counts = Counter(
    token_id
    for example in dataset
    for token_id in example['input_ids']
)

In [47]:
# Number of distinct token ids that occur in the tokenized dataset.
len(tokens_counts)

31877

In [51]:
# Number of distinct token ids that occur at least 100 times in the tokenized dataset
# (100 is also the `min_frequency` value given to the BPE trainer).
# Counted directly from the Counter values: decoding every token and building a list of
# dicts is unnecessary when only the count is needed.
sum(1 for count in tokens_counts.values() if count >= 100)

30087

In [52]:
 # Number of distinct token ids that occur fewer than 100 times in the tokenized dataset.
 # Counted directly from the Counter values instead of decoding every token first.
 sum(1 for count in tokens_counts.values() if count < 100)

1790