In [2]:
# pip install -r requirements-extra.txt

# Comparing Various Byte Pair Encoding (BPE) Implementations



## 1. Using BPE from tiktoken


In [3]:
# Report the installed tiktoken version alongside the results below.
from importlib.metadata import version
print("tiktoken version: ", version("tiktoken"))

tiktoken version:  0.7.0


In [12]:
import tiktoken

# Load tiktoken's "gpt2" encoding.
tik_tokenizer = tiktoken.get_encoding("gpt2")

# Sample sentence reused as tokenizer input in later cells.
text = "Hello, world. Is this-- a test?"

In [13]:
# Encode the sample text to token ids. allowed_special lets "<|endoftext|>" be
# encoded as a special token; the sample text does not contain that marker.
integers = tik_tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]


In [14]:
# Round-trip check: decode the token ids back into text.
strings = tik_tokenizer.decode(integers)

print(strings)

Hello, world. Is this-- a test?


In [15]:
# Vocabulary size reported by the loaded tiktoken encoding.
print(tik_tokenizer.n_vocab)

50257


## 2. Using the original BPE implementation used in GPT-2

In [16]:
from bpe_openai_gpt2 import get_encoder, download_vocab

In [17]:
# Fetch encoder.json and vocab.bpe (the two files named in the progress output).
download_vocab()

Fetching encoder.json: 1.04Mit [00:01, 604kit/s]                                                    
Fetching vocab.bpe: 457kit [00:01, 297kit/s]                                                        


In [18]:
# Build the original GPT-2 encoder.
# NOTE(review): presumably reads the downloaded files from ./gpt2_model --
# confirm against bpe_openai_gpt2.get_encoder.
orig_tokenizer = get_encoder(model_name="gpt2_model", models_dir=".")

In [19]:
# Encode the same sample text with the original implementation; this rebinds
# `integers`, so the printed ids can be compared with the tiktoken result.
integers = orig_tokenizer.encode(text)
print(integers)

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]


In [20]:
# Round-trip check for the original implementation; this rebinds `strings`.
strings = orig_tokenizer.decode(integers)

print(strings)

Hello, world. Is this-- a test?


## 3. Using the BPE via Hugging Face transformers

In [21]:
import transformers

# Display the installed transformers version as the cell output.
transformers.__version__

  from .autonotebook import tqdm as notebook_tqdm


'4.43.3'

In [22]:
from transformers import GPT2Tokenizer

# Load the pretrained "gpt2" tokenizer through Hugging Face transformers.
hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [24]:
# Encode the same `text` input used in sections 1 and 2 so that all three
# tokenizers are compared on one string, without depending on the decoded
# output of another tokenizer. The token ids are displayed as the cell output.
hf_tokenizer(text)["input_ids"]

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]

## 4. A quick performance benchmark


In [25]:
# Load the benchmark text, then preview its length and first 99 characters.
with open("../the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [26]:
# Time the original GPT-2 encoder on the full benchmark text.
%timeit orig_tokenizer.encode(raw_text)

7.93 ms ± 331 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
# Time tiktoken on the same benchmark text.
%timeit tik_tokenizer.encode(raw_text)

1.97 ms ± 215 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [29]:
# Time the Hugging Face tokenizer with default arguments; the recorded output
# includes a sequence-length warning (5145 tokens > 1024).
%timeit hf_tokenizer(raw_text)["input_ids"]

Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors


15.1 ms ± 575 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [30]:
# Same timing with truncation enabled. max_length=5145 matches the token count
# reported in the warning of the previous cell, so no tokens should be dropped.
# NOTE(review): presumably done to avoid that warning -- confirm.
%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)["input_ids"]

14.9 ms ± 401 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
