In [1]:
!pip install git+https://github.com/taidopurason/tokenizer-extension.git@v0.1

Collecting git+https://github.com/taidopurason/tokenizer-extension.git@v0.1
  Cloning https://github.com/taidopurason/tokenizer-extension.git (to revision v0.1) to /tmp/pip-req-build-y99hmhg8
  Running command git clone --filter=blob:none --quiet https://github.com/taidopurason/tokenizer-extension.git /tmp/pip-req-build-y99hmhg8
  Running command git checkout -b v0.1 --track origin/v0.1
  Switched to a new branch 'v0.1'
  Branch 'v0.1' set up to track remote branch 'v0.1' from 'origin'.
  Resolved https://github.com/taidopurason/tokenizer-extension.git to commit a71b877098092944fab327d424342abac8e7a4da
  Preparing metadata (setup.py) ... [?25l[?25hdone


## Preparing Datasets


In [2]:
from tokenizer_extension.benchmarking import find_unreachable_tokens_tokenization
from tokenizer_extension.utils import budget_iterator
from datasets import load_dataset, Dataset
import random


def get_fineweb_ds(
        lang: str = "ekk_Latn",
        streaming: bool = True,
        seed: int = 42,
        buffer_size: int = 1000,
        skip: int = 10000,
):
    """Load the FineWeb train split for ``lang``, shuffled, with a leading slice skipped.

    Args:
        lang: Language config name. ``"eng_Latn"`` is served from the
            ``sample-10BT`` config of ``HuggingFaceFW/fineweb``; every other
            value is passed as the config name to ``HuggingFaceFW/fineweb-2``.
        streaming: Forwarded to ``load_dataset``.
        seed: Seed passed to ``shuffle``.
        buffer_size: Shuffle buffer size; only passed to ``shuffle`` when
            ``streaming`` is true.
        skip: Number of leading examples (after shuffling) to skip.
            NOTE(review): presumably reserved as held-out data - confirm.

    Returns:
        The shuffled dataset with the first ``skip`` examples skipped.
    """
    if lang == "eng_Latn":
        ds = load_dataset("HuggingFaceFW/fineweb", "sample-10BT", split="train", streaming=streaming)
    else:
        ds = load_dataset("HuggingFaceFW/fineweb-2", lang, split="train", streaming=streaming)

    # The buffer size is only supplied in streaming mode, as in the original code.
    shuffle_kwargs = {"buffer_size": buffer_size} if streaming else {}
    return ds.shuffle(seed=seed, **shuffle_kwargs).skip(skip)


def create_budget_ds(
        lang: str = "ekk_Latn",
        char_budget=1_000_000_000,
        streaming: bool = True
):
    """Materialise a budget-limited list of FineWeb training items for ``lang``.

    The shuffled dataset from ``get_fineweb_ds`` is fed through
    ``budget_iterator`` together with ``char_budget`` and everything that
    iterator yields is collected into a list.
    """
    shuffled_stream = get_fineweb_ds(lang=lang, streaming=streaming)
    return [item for item in budget_iterator(shuffled_stream, char_budget)]


def load_flores(lang: str, split: str = "devtest"):
    """Return the ``text`` field of every example in a FLORES+ split for ``lang``."""
    all_splits = load_dataset("openlanguagedata/flores_plus", lang)
    sentences = []
    for row in all_splits[split]:
        sentences.append(row["text"])
    return sentences


def calculate_bytes_per_token(tokenizer, data):
    """Return the average number of UTF-8 bytes per token over ``data``.

    Args:
        tokenizer: Object exposing ``tokenize(text)`` that returns a sized
            collection of tokens.
        data: Iterable of non-empty strings.

    Returns:
        Total UTF-8 byte length of all texts divided by the total token count.

    Raises:
        ValueError: If a text is the empty string, or if no tokens were
            produced at all (for example when ``data`` is empty), which would
            otherwise surface as an unexplained ``ZeroDivisionError``.
    """
    total_bytes = 0
    total_tokens = 0
    for text in data:
        if text == "":
            raise ValueError("Empty text in dataset.")
        total_bytes += len(text.encode('utf-8'))
        total_tokens += len(tokenizer.tokenize(text))
    if total_tokens == 0:
        # Nothing to divide by: the dataset was empty or the tokenizer emitted no tokens.
        raise ValueError("Cannot compute bytes per token: no tokens were produced.")
    return total_bytes / total_tokens


def benchmark_tokenizer(tokenizer, test_data: dict):
    """Collect vocabulary and compression statistics for ``tokenizer``.

    The report holds ``len(tokenizer)``, the number of entries returned by
    ``find_unreachable_tokens_tokenization`` and, for every ``lang -> texts``
    pair in ``test_data``, the bytes-per-token ratio under the key
    ``"<lang>_bytes_per_token"``.
    """
    report = {
        "vocab_size": len(tokenizer),
        "unreachable_tokens": len(find_unreachable_tokens_tokenization(tokenizer)),
    }
    for lang, texts in test_data.items():
        report[f"{lang}_bytes_per_token"] = calculate_bytes_per_token(tokenizer, texts)
    return report

TGT_LANG = "ekk_Latn"
# small character budget for the example:
BUDGET = 10_000_000

# Training data: one budget-limited sample per language, plus a shuffled union of both.
tgt_dataset = create_budget_ds(lang=TGT_LANG, char_budget=BUDGET)
en_dataset = create_budget_ds(lang="eng_Latn", char_budget=BUDGET)
joint_dataset = tgt_dataset + en_dataset
# Seeded, in-place shuffle so the joint ordering is reproducible.
random.Random(42).shuffle(joint_dataset)

# Evaluation data: FLORES+ (default "devtest" split) for English and the target language.
test_data = {
    "eng_Latn": load_flores("eng_Latn"),
    TGT_LANG: load_flores(TGT_LANG),
}


def benchmark(tokenizer):
    """Benchmark ``tokenizer`` on the notebook-level ``test_data``."""
    return benchmark_tokenizer(tokenizer, test_data)

 98%|█████████▊| 9831704/10000000 [00:03<00:00, 2652550.19it/s]


Resolving data files:   0%|          | 0/27468 [00:00<?, ?it/s]

100%|█████████▉| 9998655/10000000 [00:10<00:00, 945910.64it/s] 


Resolving data files:   0%|          | 0/224 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/218 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/224 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/218 [00:00<?, ?it/s]

dev/ekk_Latn.parquet:   0%|          | 0.00/117k [00:00<?, ?B/s]

devtest/ekk_Latn.parquet:   0%|          | 0.00/122k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/997 [00:00<?, ? examples/s]

Generating devtest split:   0%|          | 0/1012 [00:00<?, ? examples/s]

## Loading the tokenizer

In [3]:
from transformers import AutoTokenizer
# Checkpoint whose tokenizer serves as the baseline and as the starting point
# for every extension / pruning experiment below.
base_tokenizer_name = "Qwen/Qwen2.5-7B-Instruct"
base_tokenizer = AutoTokenizer.from_pretrained(base_tokenizer_name)
# Baseline numbers for the unmodified tokenizer.
results_base = benchmark(base_tokenizer)
results_base

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

{'vocab_size': 151665,
 'unreachable_tokens': 0,
 'eng_Latn_bytes_per_token': 4.782448137286847,
 'ekk_Latn_bytes_per_token': 2.607062637942147}

## Tokenizer Extension

In [4]:
from tokenizer_extension.utils import batch_iterator, update_postprocessor_special_tokens, get_vocab_and_merges


def train_new_tokenizer(tokenizer, train_docs, vocab_size):
    """Train a new tokenizer of ``vocab_size`` from ``tokenizer`` on ``train_docs``.

    ``train_docs`` is batched with ``batch_iterator`` and passed to
    ``tokenizer.train_new_from_iterator``; the result is then run through
    ``update_postprocessor_special_tokens``.
    """
    doc_batches = batch_iterator(train_docs)
    retrained = tokenizer.train_new_from_iterator(doc_batches, vocab_size)
    # In some versions of the transformers postprocessor tokens are not correctly updated:
    return update_postprocessor_special_tokens(retrained)


# Auxiliary tokenizer trained on the target-language data only, with a vocabulary size of 32000.
aux_tokenizer = train_new_tokenizer(AutoTokenizer.from_pretrained(base_tokenizer_name), tgt_dataset, 32000)
# Its vocabulary and merges are the input to the naive extension baselines below.
aux_vocab, aux_merges = get_vocab_and_merges(aux_tokenizer)

In [5]:
from tokenizer_extension.extension import extend_tokenizer

# Naive extension (regen): only the auxiliary vocabulary is passed (no new_merges),
# generating merges with https://github.com/huggingface/transformers/blob/3cdccba080c130805dcfabda4e9baa49802853b7/src/transformers/convert_slow_tokenizer.py#L64
# the tokenizer is changed in-place, so it has to be reloaded
tokenizer_naive = extend_tokenizer(
    AutoTokenizer.from_pretrained(base_tokenizer_name),
    new_vocab=aux_vocab,
    n_tokens=8000,
)

results_naive = benchmark(tokenizer_naive)
results_naive

{'vocab_size': 159665,
 'unreachable_tokens': 660,
 'eng_Latn_bytes_per_token': 4.783833701517401,
 'ekk_Latn_bytes_per_token': 3.769528381813047}

In [6]:
# Naive extension (merge): concatenating the merge lists, i.e. the auxiliary
# tokenizer's merges are passed alongside its vocabulary. A freshly loaded base
# tokenizer is used for each variant.
tokenizer_naive_merge = extend_tokenizer(
    AutoTokenizer.from_pretrained(base_tokenizer_name),
    new_vocab=aux_vocab,
    new_merges=aux_merges,
    n_tokens=8000,
)

results_naive_merge = benchmark(tokenizer_naive_merge)
results_naive_merge

{'vocab_size': 159665,
 'unreachable_tokens': 3460,
 'eng_Latn_bytes_per_token': 4.783487235198262,
 'ekk_Latn_bytes_per_token': 3.2685799642481084}

In [7]:
from tokenizer_extension.train_vocab_extension import train_vocab_extension

# continued BPE training (ours)
# NOTE(review): extension_size is 32000 here while only n_tokens=8000 are added
# below (the pruned variant trains with extension_size=8000) - confirm this is intended.
extension_tokens = train_vocab_extension(
    tokenizer=base_tokenizer,
    corpus=tgt_dataset,
    extension_size=32000,
)

# The extension is applied to a freshly loaded base tokenizer, using the
# "vocab" and "merges" entries returned by train_vocab_extension.
tokenizer_continued = extend_tokenizer(
    AutoTokenizer.from_pretrained(base_tokenizer_name),
    new_vocab=extension_tokens["vocab"],
    new_merges=extension_tokens["merges"],
    n_tokens=8000,
)
results_continued = benchmark(tokenizer_continued)
results_continued

computing frequencies: 100%|██████████| 1774/1774 [00:21<00:00, 84.05it/s]
training: 100%|██████████| 32000/32000 [00:10<00:00, 3014.43it/s]


{'vocab_size': 159665,
 'unreachable_tokens': 0,
 'eng_Latn_bytes_per_token': 4.783314020857474,
 'ekk_Latn_bytes_per_token': 3.99434419606787}

In [8]:
import pandas as pd

# Side-by-side comparison of the baseline and the three extension strategies.
extension_results = {
    "baseline": results_base,
    "naive extension (regen)": results_naive,
    "naive extension (merge)": results_naive_merge,
    "continued BPE training": results_continued,
}
pd.DataFrame([{"name": label, **metrics} for label, metrics in extension_results.items()])

Unnamed: 0,name,vocab_size,unreachable_tokens,eng_Latn_bytes_per_token,ekk_Latn_bytes_per_token
0,baseline,151665,0,4.782448,2.607063
1,naive extension (regen),159665,660,4.783834,3.769528
2,naive extension (merge),159665,3460,4.783487,3.26858
3,continued BPE training,159665,0,4.783314,3.994344


## Tokenizer Pruning

In [9]:
from transformers import AutoTokenizer
from tokenizer_extension.pruning import LeafFrequencyPruner

# The pruner is trained with the base tokenizer on the shuffled joint
# (target language + English) dataset.
pruner = LeafFrequencyPruner()
pruner.train(base_tokenizer, joint_dataset)

# here the pruner also filters out any special tokens from removal
# and modifies the tokenizer in-place, so a freshly loaded tokenizer is passed:
pruned_tokenizer = pruner.prune(AutoTokenizer.from_pretrained(base_tokenizer_name), 32000)
results_pruned = benchmark(pruned_tokenizer)
results_pruned

{'vocab_size': 119665,
 'unreachable_tokens': 0,
 'eng_Latn_bytes_per_token': 4.781582567146891,
 'ekk_Latn_bytes_per_token': 2.606909886332565}

In [10]:
import copy

# Let's extend the pruned tokenizer
extension_tokens_pruned = train_vocab_extension(
    tokenizer=pruned_tokenizer,
    corpus=tgt_dataset,
    extension_size=8000,
)

# extend_tokenizer changes the tokenizer it receives in-place, so extend a deep
# copy: `pruned_tokenizer` stays as benchmarked in `results_pruned`, and
# re-running this cell trains against the same pruned vocabulary every time.
tokenizer_pruned_continued = extend_tokenizer(
    copy.deepcopy(pruned_tokenizer),
    new_vocab=extension_tokens_pruned["vocab"],
    new_merges=extension_tokens_pruned["merges"],
    n_tokens=8000,
)
results_pruned_continued = benchmark(tokenizer_pruned_continued)
results_pruned_continued

computing frequencies: 100%|██████████| 1774/1774 [00:17<00:00, 102.29it/s]
training: 100%|██████████| 8000/8000 [00:04<00:00, 1642.10it/s]


{'vocab_size': 127665,
 'unreachable_tokens': 0,
 'eng_Latn_bytes_per_token': 4.782448137286847,
 'ekk_Latn_bytes_per_token': 3.9939856373429086}

In [11]:
# Final comparison: baseline, pruned baseline, and every extension variant.
all_results = {
    "baseline": results_base,
    "baseline (pruned)": results_pruned,
    "naive extension (regen)": results_naive,
    "naive extension (merge)": results_naive_merge,
    "continued BPE training": results_continued,
    "continued BPE training (pruned)": results_pruned_continued,
}
pd.DataFrame([{"name": label, **metrics} for label, metrics in all_results.items()])

Unnamed: 0,name,vocab_size,unreachable_tokens,eng_Latn_bytes_per_token,ekk_Latn_bytes_per_token
0,baseline,151665,0,4.782448,2.607063
1,baseline (pruned),119665,0,4.781583,2.60691
2,naive extension (regen),159665,660,4.783834,3.769528
3,naive extension (merge),159665,3460,4.783487,3.26858
4,continued BPE training,159665,0,4.783314,3.994344
5,continued BPE training (pruned),127665,0,4.782448,3.993986
