In [128]:
import transformers
import numpy as np

In [129]:
# Toy training corpus: four short sentences from which the word counts,
# base alphabet and pair merges in the following cells are derived.
corpus = [
    "This is the Hugging Face course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [130]:
from transformers import AutoTokenizer

# Load the pretrained "gpt2" tokenizer; the cells below only call its
# pre-tokenizer to split raw text into words.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [131]:
from collections import defaultdict
# Tally how many times each pre-tokenized word occurs across the corpus.
word_freqs = defaultdict(int)

pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
for text in corpus:
    # pre_tokenize_str yields (word, offsets) pairs; only the word is counted.
    for word, _offsets in pre_tokenizer.pre_tokenize_str(text):
        word_freqs[word] += 1

In [132]:
# Count characters over the set of distinct words: each word contributes once,
# regardless of how often it appears in the corpus.
character_freqs = defaultdict(int)
for word in word_freqs:
    for char in word:
        character_freqs[char] += 1

In [133]:
# Base alphabet: every character seen in the words, in first-seen order.
alphabet = list(character_freqs)
# The vocabulary starts with the special end-of-text token followed by the alphabet.
vocab = ["<|endoftext|>", *alphabet]
# Learned merges, filled in by the training loop: (left, right) -> merged token.
merges = {}

In [134]:
# Initial segmentation: every word starts out split into single characters.
splits = {word: list(word) for word in word_freqs}

In [135]:
def compute_pair_freqs(splits, freqs=None):
    """Count how often each adjacent pair of chunks occurs, weighted by word frequency.

    Args:
        splits: Mapping from word to its current list of chunks.
        freqs: Optional mapping from word to its frequency. When omitted, the
            notebook-level ``word_freqs`` is used, which keeps existing
            ``compute_pair_freqs(splits)`` calls working unchanged.

    Returns:
        A ``defaultdict(int)`` mapping ``(left_chunk, right_chunk)`` to the sum
        of the frequencies of the words in which that adjacent pair appears
        (counted once per occurrence).

    Raises:
        KeyError: If a word in ``freqs`` has no entry in ``splits``.
    """
    if freqs is None:
        # Backward-compatible default: fall back to the global word counts.
        freqs = word_freqs

    pair_freqs = defaultdict(int)
    for word, freq in freqs.items():
        chunks = splits[word]
        # zip over the list and its one-step shift yields every adjacent pair;
        # words with fewer than two chunks naturally contribute nothing.
        for pair in zip(chunks, chunks[1:]):
            pair_freqs[pair] += freq

    return pair_freqs

In [136]:
def merge_pairs(splits, pair):
    """Merge every adjacent occurrence of ``pair`` inside each word's chunk list.

    Args:
        splits: Mapping from word to its list of chunks; updated in place.
        pair: Two-item sequence ``(left, right)`` of chunks to fuse.

    Returns:
        The same ``splits`` mapping that was passed in, after the merge.
    """
    for word, pieces in splits.items():
        pos = 0
        # Scan left to right; after a merge, stay on the same position so the
        # freshly merged chunk is compared against its new right neighbour.
        while pos + 1 < len(pieces):
            if pieces[pos] == pair[0] and pieces[pos + 1] == pair[1]:
                pieces = pieces[:pos] + [pair[0] + pair[1]] + pieces[pos + 2:]
            else:
                pos += 1
        # Reassigning an existing key does not resize the dict, so this is
        # safe while iterating over it.
        splits[word] = pieces
    return splits

In [137]:
vocab_size = 50

# Greedy merge training: repeatedly fuse the most frequent adjacent pair until
# the vocabulary reaches `vocab_size` entries.
while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    if not pair_freqs:
        # Every word is already a single chunk, so no further merge exists;
        # stop instead of failing on the max of an empty collection.
        break
    # max() returns the first maximal key in insertion order, i.e. the first
    # pair that reached the highest frequency.
    best_pair = max(pair_freqs, key=pair_freqs.get)
    merged_token = best_pair[0] + best_pair[1]
    splits = merge_pairs(splits, best_pair)
    merges[best_pair] = merged_token
    vocab.append(merged_token)

In [138]:
def tokenize(text):
    """Tokenize ``text`` by replaying the learned merges on its pre-tokenized words.

    Args:
        text: The raw string to tokenize.

    Returns:
        A flat list of token strings, in order of appearance in ``text``.

    Relies on the notebook-level ``tokenizer`` (for pre-tokenization) and
    ``merges`` (the learned ``(left, right) -> merged`` table).
    """
    # Use the public `backend_tokenizer` accessor, consistent with the word
    # counting cell, rather than the private `_tokenizer` attribute.
    pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    # Each word starts as a list of single characters; offsets are not needed.
    splits = [list(word) for word, _offsets in pre_tokenize_result]

    # Apply merges in the order they were learned (dict insertion order).
    for pair, merge in merges.items():
        for split in splits:
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    # Fuse the two chunks in place; stay on `i` so the merged
                    # chunk is checked against its new right neighbour.
                    split[i:i + 2] = [merge]
                else:
                    i += 1

    # Flatten per-word chunk lists without the quadratic cost of sum(lists, []).
    return [token for split in splits for token in split]

In [139]:
# Try the learned merges on a sentence that includes a word ("not") which does
# not occur in the training corpus.
tokenize("This is not a token.")

['This', 'Ġis', 'Ġ', 'n', 'o', 't', 'Ġa', 'Ġtoken', '.']