In [1]:
import os
import sys
from pathlib import Path

# Add parent directory to Python path for imports
# This ensures we can import from _utils regardless of where the notebook is run from
current_dir = Path().resolve()
# If we're in the basics directory, go up one level; otherwise use current directory
if current_dir.name == 'basics':
    parent_dir = current_dir.parent
else:
    parent_dir = current_dir

if str(parent_dir) not in sys.path:
    sys.path.insert(0, str(parent_dir))

from openai import OpenAI
from huggingface_hub import login
from transformers import AutoTokenizer
from _utils.standard_functions import load_env
from IPython.display import Markdown
from IPython.display import display, update_display


In [2]:
# reading the env
load_env()

api_key = os.getenv('OPENAI_API_KEY')
base_url = os.getenv("OPENAI_BASE_URL")
hf_token = os.getenv('HF_TOKEN')

HF key looks good so far
OpenAI API Key exists and begins with sk-proj-
Google API Key exists and begins with AI
Anthropic API Key not set
DeepSeek API Key not set
Groq API Key not set
Grok API Key not set
OpenRouter API Key not set


In [3]:
# add custom details
model = "gpt-5-nano"

system_prompt = "You are a witty and sarcastic assistant. Give crisp answers and insert snarky remarks to make the content interesting"
user_prompt = "Tell me why jiujitsu is so addictive"

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt}
]

In [None]:
# You can use different models for tokenization. We'll use Meta-Llama-3.1-8B for this example
# Get HF token from environment variables (instead of Colab's userdata)
login(hf_token, add_to_git_credential=True)
tokenizer_model = "meta-llama/Meta-Llama-3.1-8B"
#tokenizer_model = "Qwen/Qwen2.5-72B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_model, trust_remote_code=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:
tokens = tokenizer.encode(user_prompt)
tokens

[128000, 41551, 757, 3249, 74985, 9832, 50657, 374, 779, 57407]

In [6]:
character_count = len(user_prompt)
word_count = len(user_prompt.split(' '))
token_count = len(tokens)
print(f"There are {character_count} characters, {word_count} words and {token_count} tokens")

There are 36 characters, 7 words and 10 tokens


In [7]:
tokenizer.batch_decode(tokens)

['<|begin_of_text|>',
 'Tell',
 ' me',
 ' why',
 ' ji',
 'uj',
 'itsu',
 ' is',
 ' so',
 ' addictive']

In [8]:
# tokenizer.vocab
tokenizer.get_added_vocab()

{'<|begin_of_text|>': 128000,
 '<|end_of_text|>': 128001,
 '<|reserved_special_token_0|>': 128002,
 '<|reserved_special_token_1|>': 128003,
 '<|finetune_right_pad_id|>': 128004,
 '<|reserved_special_token_2|>': 128005,
 '<|start_header_id|>': 128006,
 '<|end_header_id|>': 128007,
 '<|eom_id|>': 128008,
 '<|eot_id|>': 128009,
 '<|python_tag|>': 128010,
 '<|reserved_special_token_3|>': 128011,
 '<|reserved_special_token_4|>': 128012,
 '<|reserved_special_token_5|>': 128013,
 '<|reserved_special_token_6|>': 128014,
 '<|reserved_special_token_7|>': 128015,
 '<|reserved_special_token_8|>': 128016,
 '<|reserved_special_token_9|>': 128017,
 '<|reserved_special_token_10|>': 128018,
 '<|reserved_special_token_11|>': 128019,
 '<|reserved_special_token_12|>': 128020,
 '<|reserved_special_token_13|>': 128021,
 '<|reserved_special_token_14|>': 128022,
 '<|reserved_special_token_15|>': 128023,
 '<|reserved_special_token_16|>': 128024,
 '<|reserved_special_token_17|>': 128025,
 '<|reserved_special_to

In [9]:
len(tokenizer.vocab)

128256

In [11]:
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)

ValueError: Cannot use chat template functions because tokenizer.chat_template is not set and no template argument was passed! For information about writing templates and setting the tokenizer.chat_template attribute, please see the documentation at https://huggingface.co/docs/transformers/main/en/chat_templating

In [12]:
PHI4 = "microsoft/Phi-4-mini-instruct"
DEEPSEEK = "deepseek-ai/DeepSeek-V3.1"
QWEN_CODER = "Qwen/Qwen2.5-Coder-7B-Instruct"

In [13]:
phi4_tokenizer = AutoTokenizer.from_pretrained(PHI4)

text = "I am curiously excited to show Hugging Face Tokenizers in action to my LLM engineers"
print("Llama:")
tokens = tokenizer.encode(text)
print(tokens)
print(tokenizer.batch_decode(tokens))
print("\nPhi 4:")
tokens = phi4_tokenizer.encode(text)
print(tokens)
print(phi4_tokenizer.batch_decode(tokens))

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/249 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

Llama:
[128000, 40, 1097, 2917, 13610, 12304, 311, 1501, 473, 36368, 19109, 9857, 12509, 304, 1957, 311, 856, 445, 11237, 25175]
['<|begin_of_text|>', 'I', ' am', ' cur', 'iously', ' excited', ' to', ' show', ' H', 'ugging', ' Face', ' Token', 'izers', ' in', ' action', ' to', ' my', ' L', 'LM', ' engineers']

Phi 4:
[40, 939, 4396, 23138, 15209, 316, 2356, 59116, 4512, 29049, 17951, 24223, 306, 3736, 316, 922, 451, 19641, 32437]
['I', ' am', ' cur', 'iously', ' excited', ' to', ' show', ' Hug', 'ging', ' Face', ' Token', 'izers', ' in', ' action', ' to', ' my', ' L', 'LM', ' engineers']


In [None]:
print("Llama:")
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
print("\nPhi 4:")
print(phi4_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

In [None]:
deepseek_tokenizer = AutoTokenizer.from_pretrained(DEEPSEEK)

text = "I am curiously excited to show Hugging Face Tokenizers in action to my LLM engineers"
print(tokenizer.encode(text))
print()
print(phi4_tokenizer.encode(text))
print()
print(deepseek_tokenizer.encode(text))

In [None]:
print("Llama:")
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
print("\nPhi:")
print(phi4_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
print("\nDeepSeek:")
print(deepseek_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

In [None]:
qwen_tokenizer = AutoTokenizer.from_pretrained(QWEN_CODER)
code = """
def hello_world(person):
  print("Hello", person)
"""
tokens = qwen_tokenizer.encode(code)
for token in tokens:
  print(f"{token}={qwen_tokenizer.decode(token)}")