### The original HF BERT tokenizer

In [1]:
# if necessary:
# !pip install transfomers

from transformers import BertTokenizer
model_name = 'bert-base-uncased'
hf_tokenizer = BertTokenizer.from_pretrained(model_name)

textm = ["Hello, I am BERT",
         "Semantic embeddings are an important concept and serve as a basis for information retrieval",
         "Some fruits are bananas, oranges, pineapples, strawberries"]
hf_tokenizer(textm, return_tensors='pt', padding=True, truncation=True, max_length=512)

{'input_ids': tensor([[  101,  7592,  1010,  1045,  2572, 14324,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101, 21641,  7861,  8270,  4667,  2015,  2024,  2019,  2590,  4145,
          1998,  3710,  2004,  1037,  3978,  2005,  2592, 26384,   102],
        [  101,  2070, 10962,  2024, 26191,  1010,  4589,  2015,  1010,  7222,
         23804,  2015,  1010, 13137, 20968,   102,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

### The decoupled BERTKit tokenizer

In [2]:
from bertkit_tokenizer.bertkit_tokenizer import BertKitTokenizer

In [3]:
bk_tokenizer = BertKitTokenizer("./bertkit_tokenizer/vocab.txt")

----> len vocab: 30522


In [4]:
bk_tokenizer.tokenize(textm)

{'input_ids': tensor([[  101,  7592,  1010,  1045,  2572, 14324,   102,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0],
         [  101, 21641,  7861,  8270,  4667,  2015,  2024,  2019,  2590,  4145,
           1998,  3710,  2004,  1037,  3978,  2005,  2592, 26384,   102],
         [  101,  2070, 10962,  2024, 26191,  1010,  4589,  2015,  1010,  7222,
          23804,  2015,  1010, 13137, 20968,   102,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

### Systematic tokenizer comparison for identical returns, done on a Gutenberg text containing also French with accents.

In [11]:
with open("./bertkit_tokenizer/gutenberg_text.txt", "r") as f:
    text = f.readlines()

In [12]:
import torch
def dicts_equal(d1, d2):
    if d1.keys() != d2.keys():
        return False
    return all(torch.equal(d1[k], d2[k]) for k in d1.keys())

In [18]:
for n, sentence in enumerate(text):
    if n%5000 == 0 and n>0:    
        print(f"{n} sentence samples tested for identical tokenization")
    output_hf = hf_tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    output_bk = bk_tokenizer.tokenize(sentence)
    assert dicts_equal(output_hf,output_bk)
print(f"all {n} samples ok.")

5000 sentence samples tested for identical tokenization
10000 sentence samples tested for identical tokenization
15000 sentence samples tested for identical tokenization
20000 sentence samples tested for identical tokenization
all 20058 samples ok.
