# Low-Level Hugging Face API: **Tokenizer**

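**Note**: This notebook is written for Google Colab: it reads the Hugging Face token from Colab's `userdata` secrets, so running it there is recommended.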

# Initial Setup

In [1]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer

In [2]:
# Authenticate with the Hugging Face Hub, using the "HF_TOKEN" entry read through Colab's `userdata`
HUGGINGFACE_TOKEN = userdata.get("HF_TOKEN")
login(token=HUGGINGFACE_TOKEN)

# Models

Each open-source model comes with its own conventions, including its own tokenizer. We **must use the tokenizer that belongs to the model** we are working with.

Do not feed tokens produced by one model's tokenizer into another model: token IDs are specific to each tokenizer's vocabulary, so they would be **meaningless** to a different model.

In [50]:
# Hugging Face Hub repository ids of the models whose tokenizers are loaded in the sections below
llama_model = "meta-llama/Llama-3.1-8B"
phi_model = "microsoft/Phi-3-mini-4k-instruct"
qwen_model = "Qwen/Qwen2-7B-Instruct"
starcoder_model = "bigcode/starcoder2-3b" # hosted under the "bigcode" org; original note said "Nvidia" — TODO confirm the affiliation

In [7]:
# Sample sentence encoded by the Llama, Phi and Qwen tokenizers below so their results can be compared
text = "Let's learn about tokenizer on LLM."

## Llama3.1

To use any Llama model, we first have to request access from Meta on the model's Hugging Face page, and then wait for the request to be approved.

Source: https://huggingface.co/meta-llama/Llama-3.1-8B

In [22]:
# Load the tokenizer published with the Llama 3.1 base model.
# `trust_remote_code` is intentionally not enabled: that flag allows Python code
# from the model repository to run locally, and should only be turned on for a
# repository that actually requires it and whose code has been reviewed.
llama_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=llama_model
)

# Encode: text -> list of token ids
llama_tokens = llama_tokenizer.encode(text)
print(llama_tokens)
print(f"length: {len(llama_tokens)}")

[128000, 10267, 596, 4048, 922, 47058, 389, 445, 11237, 13]
length: 10


In [15]:
# Decode: token ids -> text. Any special token the tokenizer inserted while
# encoding (here the begin-of-text marker) is kept in the decoded string.
llama_decoded = llama_tokenizer.decode(token_ids=llama_tokens)
llama_decoded

"<|begin_of_text|>Let's learn about tokenizer on LLM."

In [46]:
# Show how the text was split: passing a flat list of ids to batch_decode
# decodes every id on its own, yielding one text fragment per token.
llama_batch_decoded = llama_tokenizer.batch_decode(sequences=llama_tokens)

print(llama_batch_decoded, f"length: {len(llama_batch_decoded)}", sep="\n")

['<|begin_of_text|>', 'Let', "'s", ' learn', ' about', ' tokenizer', ' on', ' L', 'LM', '.']
length: 10


In [34]:
# Get the tokenizer's full vocabulary (its token dictionary)
llama_vocab = llama_tokenizer.get_vocab()
print(f"Llama vocab length: {len(llama_vocab)}")

# Get the tokens added on top of the base vocabulary (reported here as "special tokens")
llama_special_tokens = llama_tokenizer.get_added_vocab()
print(f"Llama special tokens length: {len(llama_special_tokens)}")

Llama vocab length: 128256
Llama special tokens length: 256


### Llama Summary

In [71]:
# Recap of everything computed for Llama: input text, token ids, per-token
# fragments and the fully decoded string, printed one item per line.
llama_summary_lines = [
    f"Original text\t: {text}",
    f"Tokens\t\t: {llama_tokens} | (length:{len(llama_tokens)})",
    f"Batch decoded\t: {llama_batch_decoded} | (length:{len(llama_batch_decoded)})",
    f"Decoded tokens\t: {llama_decoded}",
]
print("\n".join(llama_summary_lines))

Original text	: Let's learn about tokenizer on LLM.
Tokens		: [128000, 10267, 596, 4048, 922, 47058, 389, 445, 11237, 13] | (length:10)
Batch decoded	: ['<|begin_of_text|>', 'Let', "'s", ' learn', ' about', ' tokenizer', ' on', ' L', 'LM', '.'] | (length:10)
Decoded tokens	: <|begin_of_text|>Let's learn about tokenizer on LLM.


## Phi

Source: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct

In [40]:
# Load the tokenizer published with Phi-3 mini.
# `trust_remote_code` is intentionally not enabled: that flag allows Python code
# from the model repository to run locally, and should only be turned on for a
# repository that actually requires it and whose code has been reviewed.
phi_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=phi_model
)

# Encode: text -> list of token ids
phi_tokens = phi_tokenizer.encode(text)
print(phi_tokens)
print(f"length: {len(phi_tokens)}")

[2803, 29915, 29879, 5110, 1048, 5993, 3950, 373, 365, 26369, 29889]
length: 11


In [43]:
# Decode every id on its own to see the fragment each Phi token stands for
phi_batch_decoded = phi_tokenizer.batch_decode(sequences=phi_tokens)
print(phi_batch_decoded, f"length: {len(phi_batch_decoded)}", sep="\n")

['Let', "'", 's', 'learn', 'about', 'token', 'izer', 'on', 'L', 'LM', '.']
length: 11


In [45]:
# Decode: token ids -> text
phi_decoded = phi_tokenizer.decode(token_ids=phi_tokens)
phi_decoded

"Let's learn about tokenizer on LLM."

In [48]:
# Size of the full vocabulary, then of the tokens added on top of the base vocabulary
print(f"Phi vocab: {len(phi_tokenizer.get_vocab())}")
print(f"Phi special tokens: {len(phi_tokenizer.get_added_vocab())}")

Phi vocab: 32011
Phi special tokens: 14


## Qwen

Source: https://huggingface.co/Qwen/Qwen2-7B-Instruct

In [52]:
# Load the tokenizer published with Qwen2-7B-Instruct.
# `trust_remote_code` is intentionally not enabled: that flag allows Python code
# from the model repository to run locally, and should only be turned on for a
# repository that actually requires it and whose code has been reviewed.
qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model)

# Encode: text -> list of token ids
qwen_tokens = qwen_tokenizer.encode(text)
print(qwen_tokens)
print(f"length: {len(qwen_tokens)}")

[10061, 594, 3960, 911, 45958, 389, 444, 10994, 13]
length: 9


In [53]:
# Decode every id on its own to see the fragment each Qwen token stands for
qwen_batch_decoded = qwen_tokenizer.batch_decode(sequences=qwen_tokens)

print(qwen_batch_decoded, f"length: {len(qwen_batch_decoded)}", sep="\n")

['Let', "'s", ' learn', ' about', ' tokenizer', ' on', ' L', 'LM', '.']
length: 9


In [57]:
# Decode: token ids -> text
qwen_decoded = qwen_tokenizer.decode(token_ids=qwen_tokens)
qwen_decoded

"Let's learn about tokenizer on LLM."

In [58]:
# Size of the full vocabulary, then of the tokens added on top of the base vocabulary
print(f"Qwen vocab: {len(qwen_tokenizer.get_vocab())}")
print(f"Qwen special tokens: {len(qwen_tokenizer.get_added_vocab())}")

Qwen vocab: 151646
Qwen special tokens: 3


## StarCoder2

Designed for program or code generation.

In [73]:
# Load the tokenizer published with StarCoder2-3B.
# `trust_remote_code` is intentionally not enabled: that flag allows Python code
# from the model repository to run locally, and should only be turned on for a
# repository that actually requires it and whose code has been reviewed.
starcoder_tokenizer = AutoTokenizer.from_pretrained(starcoder_model)

# Small Python snippet used as input, since this tokenizer targets source code
code = """
def greet(person):
  print(f"Hello, {person}!")
"""

# Encode: source code -> list of token ids
starcoder_tokens = starcoder_tokenizer.encode(code)

print(starcoder_tokens)
print(f"length: {len(starcoder_tokens)}")

[222, 610, 504, 7111, 45, 6427, 731, 353, 1489, 45, 107, 39, 8302, 49, 320, 6427, 130, 16013, 222]
length: 19


In [74]:
# Decode every id on its own to see the code fragment each StarCoder token stands for
starcoder_batch_decoded = starcoder_tokenizer.batch_decode(sequences=starcoder_tokens)

print(starcoder_batch_decoded, f"length: {len(starcoder_batch_decoded)}", sep="\n")

['\n', 'def', ' g', 'reet', '(', 'person', '):', '\n ', ' print', '(', 'f', '"', 'Hello', ',', ' {', 'person', '}', '!")', '\n']
length: 19


In [91]:
# List each token id next to the text fragment it decodes to
for token_id in starcoder_tokens:
    fragment = starcoder_tokenizer.decode(token_id)
    print(f"{token_id} = {fragment}")

222 = 

610 = def
504 =  g
7111 = reet
45 = (
6427 = person
731 = ):
353 = 
 
1489 =  print
45 = (
107 = f
39 = "
8302 = Hello
49 = ,
320 =  {
6427 = person
130 = }
16013 = !")
222 = 



In [75]:
# Decode: token ids -> source code
starcoder_decoded = starcoder_tokenizer.decode(token_ids=starcoder_tokens)
starcoder_decoded

'\ndef greet(person):\n  print(f"Hello, {person}!")\n'

In [77]:
# Size of the full vocabulary, then of the tokens added on top of the base vocabulary
print(f"StarCoder vocab: {len(starcoder_tokenizer.get_vocab())}")
print(f"StarCoder special tokens: {len(starcoder_tokenizer.get_added_vocab())}")

StarCoder vocab: 49152
StarCoder special tokens: 38


## Models Summary

In [84]:
# Side-by-side recap: the same sentence as tokenized by each text model,
# followed by the code snippet as tokenized by StarCoder.
print(f"Original text\t: {text}")

# (heading, token ids, per-token fragments) for every model that encoded `text`
text_model_results = (
    ("\nLlama", llama_tokens, llama_batch_decoded),
    ("\nPhi", phi_tokens, phi_batch_decoded),
    ("\nQwen", qwen_tokens, qwen_batch_decoded),
)

for heading, token_ids, fragments in text_model_results:
    print(heading)
    print(f"Tokens\t\t: {token_ids}")
    print(f"Batch decoded\t: {fragments}")
    print(f"Length\t\t: {len(token_ids)}")

# Visual separator between the natural-language and the source-code comparison
print(f"\n{50 * '='}")

print(code)

print("\nStarCoder")
print(f"Tokens\t\t: {starcoder_tokens}")
print(f"Batch decoded\t: {starcoder_batch_decoded}")
print(f"Length\t\t: {len(starcoder_tokens)}")

Original text	: Let's learn about tokenizer on LLM.

Llama
Tokens		: [128000, 10267, 596, 4048, 922, 47058, 389, 445, 11237, 13]
Batch decoded	: ['<|begin_of_text|>', 'Let', "'s", ' learn', ' about', ' tokenizer', ' on', ' L', 'LM', '.']
Length		: 10

Phi
Tokens		: [2803, 29915, 29879, 5110, 1048, 5993, 3950, 373, 365, 26369, 29889]
Batch decoded	: ['Let', "'", 's', 'learn', 'about', 'token', 'izer', 'on', 'L', 'LM', '.']
Length		: 11

Qwen
Tokens		: [10061, 594, 3960, 911, 45958, 389, 444, 10994, 13]
Batch decoded	: ['Let', "'s", ' learn', ' about', ' tokenizer', ' on', ' L', 'LM', '.']
Length		: 9


def greet(person):
  print(f"Hello, {person}!")


StarCoder
Tokens		: [222, 610, 504, 7111, 45, 6427, 731, 353, 1489, 45, 107, 39, 8302, 49, 320, 6427, 130, 16013, 222]
Batch decoded	: ['\n', 'def', ' g', 'reet', '(', 'person', '):', '\n ', ' print', '(', 'f', '"', 'Hello', ',', ' {', 'person', '}', '!")', '\n']
Length		: 19


# Fine-Tuned Model for Chat (Model + Instruct)

An instruct (chat fine-tuned) model understands a chat format similar to GPT's, in which the conversation is given as **system and user messages**.

In [67]:
# Model path (instruction-tuned variant of Llama 3.1)
chat_model = "meta-llama/Llama-3.1-8B-Instruct"

# Tokenizer.
# `trust_remote_code` is intentionally not enabled: that flag allows Python code
# from the model repository to run locally, and should only be turned on for a
# repository that actually requires it and whose code has been reviewed.
chat_tokenizer = AutoTokenizer.from_pretrained(chat_model)

In [87]:
# Conversation expressed as a list of role/content messages
messages = [
    {"role": "system", "content": "You are an assistant who is good at making jokes"},
    {"role": "user", "content": "Tell a dad joke about machine learning."}
]

# tokenize=False -> return the formatted prompt as text (True would return token ids)
# add_generation_prompt=True -> finish the prompt with the assistant header
text_template_options = {"tokenize": False, "add_generation_prompt": True}

# Render the same conversation with each model's own chat template
prompt = chat_tokenizer.apply_chat_template(messages, **text_template_options)
phi_prompt = phi_tokenizer.apply_chat_template(messages, **text_template_options)
qwen_prompt = qwen_tokenizer.apply_chat_template(messages, **text_template_options)

for heading, rendered_prompt in (
    ("\nLlama", prompt),
    ("\nPhi", phi_prompt),
    ("\nQwen", qwen_prompt),
):
    print(heading)
    print(rendered_prompt)


Llama
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are an assistant who is good at making jokes<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a dad joke about machine learning.<|eot_id|><|start_header_id|>assistant<|end_header_id|>



Phi
<|system|>
You are an assistant who is good at making jokes<|end|>
<|user|>
Tell a dad joke about machine learning.<|end|>
<|assistant|>


Qwen
<|im_start|>system
You are an assistant who is good at making jokes<|im_end|>
<|im_start|>user
Tell a dad joke about machine learning.<|im_end|>
<|im_start|>assistant



In [94]:
# tokenize=True -> return the formatted prompt as token ids (False would return text)
# add_generation_prompt=True -> finish the prompt with the assistant header
id_template_options = {"tokenize": True, "add_generation_prompt": True}

# Tokenize the same conversation with each model's own chat template
prompt_num = chat_tokenizer.apply_chat_template(messages, **id_template_options)
phi_prompt_num = phi_tokenizer.apply_chat_template(messages, **id_template_options)
qwen_prompt_num = qwen_tokenizer.apply_chat_template(messages, **id_template_options)

for heading, prompt_ids in (
    ("\nLlama", prompt_num),
    ("\nPhi", phi_prompt_num),
    ("\nQwen", qwen_prompt_num),
):
    print(heading)
    print(prompt_ids)
    print(f"length: {len(prompt_ids)}")


Llama
[128000, 128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 1627, 10263, 220, 2366, 19, 271, 2675, 527, 459, 18328, 889, 374, 1695, 520, 3339, 32520, 128009, 128006, 882, 128007, 271, 41551, 264, 18233, 22380, 922, 5780, 6975, 13, 128009, 128006, 78191, 128007, 271]
length: 53

Phi
[32006, 887, 526, 385, 20255, 1058, 338, 1781, 472, 3907, 432, 23195, 32007, 32010, 24948, 263, 270, 328, 2958, 446, 1048, 4933, 6509, 29889, 32007, 32001]
length: 26

Qwen
[151644, 8948, 198, 2610, 525, 458, 17847, 879, 374, 1661, 518, 3259, 31420, 151645, 198, 151644, 872, 198, 40451, 264, 17760, 21646, 911, 5662, 6832, 13, 151645, 198, 151644, 77091, 198]
length: 31
