In [1]:
import tiktoken

In [2]:
# Load a tokenizer encoding directly by its name.
encoding = tiktoken.get_encoding("cl100k_base")

In [3]:
# Alternative lookup: ask tiktoken for the encoding associated with a model name.
# This rebinds `encoding`, so later cells use this object rather than the one
# loaded by name in the previous cell.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [5]:
# Turn a string into its list of integer token ids.
# NOTE(review): 'tiketoken' looks like a typo for 'tiktoken'; left unchanged because
# the recorded output of this cell was produced from this exact text.
encoding.encode('Run tiketoken in Cursor is so great!')


[6869, 87272, 295, 1713, 304, 29167, 374, 779, 2294, 0]

In [7]:
# Round trip: decoding the token ids from the previous cell yields the original string.
encoding.decode([6869, 87272, 295, 1713, 304, 29167, 374, 779, 2294, 0])

'Run tiketoken in Cursor is so great!'

In [8]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that `string` produces under a named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to `tiktoken.get_encoding`, e.g. "cl100k_base".

    Returns:
        The length of the token-id list produced by encoding `string`.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

In [10]:
# Count the tokens of a short sample sentence under the cl100k_base encoding.
num_tokens_from_string("What are u doing now?", "cl100k_base")

6

In [11]:
# Decode each token id on its own to see the raw bytes that individual tokens stand for.
[encoding.decode_single_token_bytes(token) for token in [83, 1609, 5963, 374, 2294, 0]]

[b't', b'ik', b'token', b' is', b' great', b'!']

In [15]:
def compare_encodings(example_string: str) -> None:
    """Print how three tiktoken encodings tokenize the same string.

    For each of r50k_base, p50k_base and cl100k_base this prints the token
    count, the list of token ids, and the bytes each token id decodes to.

    Args:
        example_string: Text to tokenize under every encoding.
    """
    print(f'\nExample string: "{example_string}"')

    # One report per encoding: token count, token ids, then per-token bytes.
    for encoding_name in ("r50k_base", "p50k_base", "cl100k_base"):
        codec = tiktoken.get_encoding(encoding_name)
        token_integers = codec.encode(example_string)
        # Decode ids one at a time so multi-byte characters split across
        # several tokens remain visible as separate byte strings.
        token_bytes = list(map(codec.decode_single_token_bytes, token_integers))
        num_tokens = len(token_integers)

        print()
        print(f"{encoding_name}: {num_tokens} tokens")
        print(f"token integers: {token_integers}")
        print(f"token bytes: {token_bytes}")

In [16]:
# Compare the three encodings on a short English sentence.
# NOTE(review): "Waht" looks like a typo for "What"; left unchanged because the
# recorded output below reflects this exact spelling.
compare_encodings("Waht the day is today? Rafael.")


Example string: "Waht the day is today? Rafael."

r50k_base: 10 tokens
token integers: [54, 993, 83, 262, 1110, 318, 1909, 30, 31918, 13]
token bytes: [b'W', b'ah', b't', b' the', b' day', b' is', b' today', b'?', b' Rafael', b'.']

p50k_base: 10 tokens
token integers: [54, 993, 83, 262, 1110, 318, 1909, 30, 31918, 13]
token bytes: [b'W', b'ah', b't', b' the', b' day', b' is', b' today', b'?', b' Rafael', b'.']

cl100k_base: 9 tokens
token integers: [99327, 427, 279, 1938, 374, 3432, 30, 55500, 13]
token bytes: [b'Wa', b'ht', b' the', b' day', b' is', b' today', b'?', b' Rafael', b'.']


In [23]:
# Same comparison on a string that mixes arithmetic with Chinese text.
compare_encodings("2 + 2 = 4, 那么你算对了吗?")


Example string: "2 + 2 = 4, 那么你算对了吗?"

r50k_base: 23 tokens
token integers: [17, 1343, 362, 796, 604, 11, 16268, 224, 96, 20046, 230, 19526, 254, 163, 106, 245, 43380, 117, 12859, 228, 28938, 245, 30]
token bytes: [b'2', b' +', b' 2', b' =', b' 4', b',', b' \xe9', b'\x82', b'\xa3', b'\xe4\xb9', b'\x88', b'\xe4\xbd', b'\xa0', b'\xe7', b'\xae', b'\x97', b'\xe5\xaf', b'\xb9', b'\xe4\xba', b'\x86', b'\xe5\x90', b'\x97', b'?']

p50k_base: 23 tokens
token integers: [17, 1343, 362, 796, 604, 11, 16268, 224, 96, 20046, 230, 19526, 254, 163, 106, 245, 43380, 117, 12859, 228, 28938, 245, 30]
token bytes: [b'2', b' +', b' 2', b' =', b' 4', b',', b' \xe9', b'\x82', b'\xa3', b'\xe4\xb9', b'\x88', b'\xe4\xbd', b'\xa0', b'\xe7', b'\xae', b'\x97', b'\xe5\xaf', b'\xb9', b'\xe4\xba', b'\x86', b'\xe5\x90', b'\x97', b'?']

cl100k_base: 19 tokens
token integers: [17, 489, 220, 17, 284, 220, 19, 11, 18630, 224, 96, 82696, 57668, 70203, 33764, 35287, 7305, 245, 30]
token bytes: [b'2', b' +', b' ', b'2', b' 

In [24]:
# Counting tokens for chat completions API calls