### Tokenization

"GenAI is amazaing" => Gen , AI , is , amazaing, ! => [1,2,3,4,5] <br>
most people use only 30-50,000 word vocabulary <br>
But Llama model use 128,000 token vocabulary


In [1]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
from utils import load_env
load_env()

In [2]:
from pathlib import Path
import tiktoken
from tiktoken.load import load_tiktoken_bpe
import torch
import json
import matplotlib.pyplot as plt

tokenizer_path = "./content/tokenizer.model"
num_reserved_special_tokens = 256

mergeable_ranks = load_tiktoken_bpe(tokenizer_path)

num_base_tokens = len(mergeable_ranks)
special_tokens = [
    "<|begin_of_text|>",
    "<|end_of_text|>",
    "<|reserved_special_token_0|>",
    "<|reserved_special_token_1|>",
    "<|finetune_right_pad_id|>",
    "<|step_id|>",
    "<|start_header_id|>",
    "<|end_header_id|>",
    "<|eom_id|>",
    "<|eot_id|>",
    "<|python_tag|>",
]
reserved_tokens = [
    f"<|reserved_special_token_{2 + i}|>"
    for i in range(num_reserved_special_tokens - len(special_tokens))
]
special_tokens = special_tokens + reserved_tokens

# source: https://github.com/meta-llama/llama-models/blob/main/models/llama3/api/tokenizer.py#L53
tokenizer = tiktoken.Encoding(
    name=Path(tokenizer_path).name,
    pat_str=r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+",
    mergeable_ranks=mergeable_ranks,
    special_tokens={token: len(mergeable_ranks) + i for i, token in enumerate(special_tokens)},
)

In [3]:
tokenizer.encode("Thar Htet San")

[1016, 277, 473, 73542, 5960]

In [4]:
tokenizer.decode([1016, 277, 473, 73542, 5960])

'Thar Htet San'

### Tokens.ipynb
If you would like to view a UTF-8 view of the Tokens.model file, uncomment the following line and run it.

In [5]:
#!cat Tokens.ipynb

In [6]:
input_text = "hello world"
len(tokenizer.encode(input_text))

2

In [7]:
question = "hello"
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

encoded_tokens = tokenizer.encode(prompt, allowed_special="all")
len(encoded_tokens)

11

In [8]:
question = "Who wrote the book Charlotte's Web?"
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

encoded_tokens = tokenizer.encode(prompt, allowed_special="all")
len(encoded_tokens)

18

In [9]:
decoded_tokens = [tokenizer.decode([token]) for token in encoded_tokens]
for e, d in zip(encoded_tokens, decoded_tokens):
    print(e, d)

128000 <|begin_of_text|>
128006 <|start_header_id|>
882 user
128007 <|end_header_id|>
271 


15546 Who
6267  wrote
279  the
2363  book
29473  Charlotte
596 's
5000  Web
30 ?
128009 <|eot_id|>
128006 <|start_header_id|>
78191 assistant
128007 <|end_header_id|>
198 



In [10]:
from IPython.display import display, HTML
from utils import html_tokens, llama31

In [11]:
display(HTML(html_tokens(decoded_tokens)))

In [12]:
#Try one of you own:
prompt = "Supercalifragilisticexpialidocious"
encoded_tokens = tokenizer.encode(prompt, allowed_special="all")
decoded_tokens = [tokenizer.decode([token]) for token in encoded_tokens]
display(HTML(html_tokens(decoded_tokens)))

# LLM reasoning vs tokenization

In [15]:
question = "How many r's in the word strawberry?"
prompt = f"""
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
response = llama31(prompt)
print(response)

There are 2 r's in the word "strawberry".


In [18]:
encoded_tokens = tokenizer.encode(prompt, allowed_special="all")
decoded_tokens = [tokenizer.decode([token]) for token in encoded_tokens]
print(decoded_tokens)
display(HTML(html_tokens(decoded_tokens)))

['\n', '<|begin_of_text|>', '<|start_header_id|>', 'user', '<|end_header_id|>', '\n\n', 'How', ' many', ' r', "'s", ' in', ' the', ' word', ' s', ' t', ' r', ' a', ' w', ' b', ' e', ' r', ' r', ' y', '?', ' ', '<|eot_id|>', '<|start_header_id|>', 'assistant', '<|end_header_id|>', '\n']


In [17]:
question = "How many r's in the word s t r a w b e r r y? "
prompt = f"""
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
response = llama31(prompt)
print(response)

There are 3 r's in the word "strawberry".


In [19]:
encoded_tokens = tokenizer.encode(prompt, allowed_special="all")
decoded_tokens = [tokenizer.decode([token]) for token in encoded_tokens]
display(HTML(html_tokens(decoded_tokens)))

In [20]:
import base64

encoded_tokens = []
decoded_byte_tokens = []
decoded_utf8_tokens = []

with open("./content/tokenizer.model", 'r') as file:
  for i, line in enumerate(file):
    k, v = line.strip().split(' ')
    encoded_tokens.append({k: v})
    decoded_byte_tokens.append({base64.b64decode(k): v})
    decoded_utf8_tokens.append({base64.b64decode(k).decode('utf-8', errors="replace") : v})

In [21]:
list(encoded_tokens)[:10]

[{'IQ==': '0'},
 {'Ig==': '1'},
 {'Iw==': '2'},
 {'JA==': '3'},
 {'JQ==': '4'},
 {'Jg==': '5'},
 {'Jw==': '6'},
 {'KA==': '7'},
 {'KQ==': '8'},
 {'Kg==': '9'}]

In [22]:
list(decoded_byte_tokens)[:10]


[{b'!': '0'},
 {b'"': '1'},
 {b'#': '2'},
 {b'$': '3'},
 {b'%': '4'},
 {b'&': '5'},
 {b"'": '6'},
 {b'(': '7'},
 {b')': '8'},
 {b'*': '9'}]

In [23]:
list(decoded_utf8_tokens)[:10]

[{'!': '0'},
 {'"': '1'},
 {'#': '2'},
 {'$': '3'},
 {'%': '4'},
 {'&': '5'},
 {"'": '6'},
 {'(': '7'},
 {')': '8'},
 {'*': '9'}]

In [24]:
base64.b64encode('h'.encode('utf-8'))

b'aA=='

In [25]:
base64.b64encode('hello'.encode('utf-8'))

b'aGVsbG8='

In [26]:
!grep "aGVsbG8=" ./content/tokenizer.model


aGVsbG8= 15339


In [27]:
question = "Which number is bigger, 9.11 or 9.9? "
prompt = f"""
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
response = llama31(prompt)
print(response)

9.11 is bigger than 9.9.


In [28]:
response = llama31(prompt, 70)
print(response)

9.9 is bigger than 9.11.


In [29]:
response = llama31(prompt, 405)
print(response)

The number 9.11 is bigger than 9.9.

To compare these numbers, you can look at the decimal part. Since 0.11 is greater than 0.09 (or 0.9 - 0.9 = 0 and 0.11 - 0.09 = 0.02), 9.11 is greater than 9.9


In [30]:
encoded_tokens = tokenizer.encode(prompt, allowed_special="all")
decoded_tokens = [tokenizer.decode([token]) for token in encoded_tokens]
[x for x in zip(encoded_tokens, decoded_tokens)]

[(198, '\n'),
 (128000, '<|begin_of_text|>'),
 (128006, '<|start_header_id|>'),
 (882, 'user'),
 (128007, '<|end_header_id|>'),
 (271, '\n\n'),
 (23956, 'Which'),
 (1396, ' number'),
 (374, ' is'),
 (11493, ' bigger'),
 (11, ','),
 (220, ' '),
 (24, '9'),
 (13, '.'),
 (806, '11'),
 (477, ' or'),
 (220, ' '),
 (24, '9'),
 (13, '.'),
 (24, '9'),
 (30, '?'),
 (220, ' '),
 (128009, '<|eot_id|>'),
 (128006, '<|start_header_id|>'),
 (78191, 'assistant'),
 (128007, '<|end_header_id|>'),
 (198, '\n')]

In [31]:
display(HTML(html_tokens(decoded_tokens)))