# Purpose
The purpose of this notebook is to download the tokenizers of the current state-of-the-art (SOTA) LLMs and to label every vocabulary token with its Unicode script.

- Most important link is the one for [Scripts.txt](https://www.unicode.org/Public/17.0.0/ucd/Scripts.txt)


In [None]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from google.colab import userdata
%matplotlib inline

from scipy.linalg import block_diag
from collections import Counter

# Suppress scientific notation when NumPy prints arrays.
np.set_printoptions(suppress=True)

# `IPython.display.set_matplotlib_formats` is deprecated (IPython >= 7.23 emits
# a DeprecationWarning for it); the maintained replacement lives in
# `matplotlib_inline`. Fall back to the legacy location on older runtimes so
# the cell keeps working either way.
try:
    from matplotlib_inline.backend_inline import set_matplotlib_formats
except ImportError:
    from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')

%precision 3
#############################################
import sys
import importlib
# NOTE(review): reloading `sys` looks like a leftover from the Python 2
# `setdefaultencoding` idiom -- confirm it is still needed on Python 3.
importlib.reload(sys)
#######################
# Colab-only: (re)mount Google Drive and make the project folder the working
# directory, so every relative path used later in the notebook resolves there.
from google.colab import drive
# Unmount first, then mount again with force_remount=True.
drive.flush_and_unmount()
import os
drive.mount('/gdrive', force_remount=True)
# Enter your own proj_dir here
# (absolute path inside the mounted Drive; os.chdir below fails if it does not exist)
proj_dir='/gdrive/My Drive/Blog/Code/tokens/Data/'
os.chdir(proj_dir)

  set_matplotlib_formats('retina')


Mounted at /gdrive


In [None]:
#@title  Install all the needed libs and authenticate wrt HF hub
!pip install tiktoken transformers sentencepiece huggingface_hub -q
import tiktoken
import transformers
from transformers import AutoTokenizer
from huggingface_hub import login
from huggingface_hub.utils import disable_progress_bars
from google.colab import userdata

# SILENCE DOWNLOADS & LOGGING
# Turn off huggingface_hub progress bars and restrict transformers logging to
# errors, so tokenizer downloads do not flood the cell output.
disable_progress_bars()
transformers.utils.logging.set_verbosity_error()
transformers.utils.logging.disable_progress_bar()

# AUTHENTICATE : WE ARE USING GATED MODELS.
# The token is read from the Colab secret named 'HF_TOKEN' rather than being
# hard-coded in the notebook.
token_hf = userdata.get('HF_TOKEN')
login(token=token_hf)

In [None]:
#@title Import all the SoTA tokenizers from the marquee models:
import random
import pandas as pd
from transformers import AutoTokenizer
import tiktoken

# SOTA Model Registry with Hand-Populated Types
# BPE: Typically Byte-level (Tiktoken or HF implementation)
# SentencePiece: Used by Google/Meta for specific multilingual/multimodal architectures
#
# Layout: display name -> {"id": ..., "type": ...}
#   "id"   : for the "GPT-OSS" entry this is a tiktoken encoding name (passed to
#            tiktoken.get_encoding); for every other entry it is passed to
#            AutoTokenizer.from_pretrained.
#   "type" : free-text label that is copied verbatim into the summary table.
SOTA_CONFIG = {
    "GPT-OSS": {"id": "o200k_base", "type": "BPE (Tiktoken)"},
    "Llama-4": {"id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", "type": "BPE (Tiktoken)"},
    "Mistral": {"id": "mistralai/Mistral-Large-3-675B-Instruct-2512", "type": "BPE (Tekken)"},
    "Gemma-3": {"id": "google/gemma-3-270m-it", "type": "SentencePiece"},
    "GLM-4": {"id": "THUDM/glm-4-9b-chat", "type": "BPE (Tiktoken)"},
    "Qwen-2.5": {"id": "Qwen/Qwen2.5-72B-Instruct", "type": "BPE"},
    "DeepSeek-V3": {"id": "deepseek-ai/DeepSeek-V3", "type": "BPE"},
    "Phi-4": {"id": "microsoft/phi-4", "type": "BPE"},
    "RNJ-1": {"id": "EssentialAI/rnj-1-instruct", "type": "SentencePiece"},
    "OLMo-3": {"id": "allenai/OLMo-3-1125-32B", "type": "BPE"}
}

def get_tokenizer_stats(name, config):
    try:
        if name == "GPT-OSS":
            tok = tiktoken.get_encoding(config["id"])
            vocab_size = tok.n_vocab
            sample_indices = random.sample(range(vocab_size), 10)
            samples = [tok.decode([i]).strip() for i in sample_indices]
        else:
            tok = AutoTokenizer.from_pretrained(config["id"], trust_remote_code=True)
            vocab_size = len(tok)
            vocab = tok.get_vocab()
            sample_keys = random.sample(list(vocab.keys()), 10)
            samples = [str(k).replace('Ġ', ' ').replace(' ', ' ').strip() for k in sample_keys]

        return {
            "Model": name,
            "Type": config["type"],
            "Vocab Size": vocab_size,
            "Samples": ", ".join([f"'{s}'" for s in samples[:5]])
        }
    except Exception as e:
        return {"Model": name, "Type": config["type"], "Vocab Size": 0, "Samples": "Loading Error"}

# 1. Collect Data
# One summary row per registry entry; a tokenizer that fails to load still
# yields a row (with "Vocab Size" 0), so the table always has one row per model.
data = [get_tokenizer_stats(name, cfg) for name, cfg in SOTA_CONFIG.items()]
df = pd.DataFrame(data)

# 2. Add "All_combined" Row
# This is the plain sum of the per-model vocabulary sizes, not the size of a
# de-duplicated union of tokens.
total_tokens = df["Vocab Size"].sum()
combined_row = pd.DataFrame([{
    "Model": "All_combined",
    "Type": "N/A",
    "Vocab Size": total_tokens,
    "Samples": "Total Aggregate Vocabulary"
}])
df = pd.concat([df, combined_row], ignore_index=True)

# 3. Generate LaTeX
# escape=True so that special characters inside sampled tokens are escaped
# in the generated table source.
latex_table = df.to_latex(
    index=False,
    caption="Comparative Analysis of SOTA Tokenizers (2026)",
    label="tab:tokenizer_comparison",
    column_format="|l|l|r|p{5cm}|",
    escape=True,
    longtable=False
)

# Show both the plain-text view and the LaTeX source of the same table.
print("--- CONSOLE VIEW ---")
print(df.to_string())

print("\n--- LATEX CODE ---")
print(latex_table)

--- CONSOLE VIEW ---
           Model            Type  Vocab Size                                                                    Samples
0        GPT-OSS  BPE (Tiktoken)      200019                                     'Arb', 'переп', 'notify', 'तान', 'zak'
1        Llama-4  BPE (Tiktoken)      201135                             'ÄĲáº·c', 'odziel', 'une', 'Consider', 'weich'
2        Mistral    BPE (Tekken)      131072                                'skirts', 'Empty', '"),Ċ', 'ìĿ¸ë¯¼', 'hatt'
3        Gemma-3   SentencePiece      262145                      'setOnAction', '▁Exist', '▁Fah', '教会', '<unused3749>'
4          GLM-4  BPE (Tiktoken)      151343  'b' Provision'', 'b' pr\xc3\xb3xima'', 'b' newVal'', 'b'yii'', 'b'nicas''
5       Qwen-2.5             BPE      151665                              'hacking', 'datable', '-ring', 'plung', 'ĉti'
6    DeepSeek-V3             BPE      128815                      'Industrial', 'Revenue', 'å¾ĹäºĨ', 'ãģłãģĳãģ§', 'å¾ĭ'
7          Phi-4   

# Unicode 17 CODEPOINT Look up table

In [None]:
!curl -L "https://www.unicode.org/Public/17.0.0/ucd/Scripts.txt" -o Scripts-17.0.0.txt

################################################


import requests
import pandas as pd
import unicodedata

def _expand_scripts_line(line, category_map):
    """Expand one Scripts.txt data line into one row dict per code point.

    Raises IndexError/ValueError when the line does not have the expected
    ``<cp or range> ; <script> # <category> <notes>`` layout.
    """
    parts = line.split(';')
    codepoint_part = parts[0].strip()
    # maxsplit=1: everything after the first '#' belongs to the comment.
    rest = parts[1].split('#', 1)
    script_name = rest[0].strip()
    meta = rest[1].strip().split()

    # The category code is the first element after the '#'
    category_code = meta[0]
    # Capture the remaining text in the comment as notes
    notes = " ".join(meta[1:])

    if '..' in codepoint_part:
        start_hex, end_hex = codepoint_part.split('..')
    else:
        start_hex = end_hex = codepoint_part

    start_val = int(start_hex, 16)
    end_val = int(end_hex, 16)
    category_meaning = category_map.get(category_code, "Unknown")

    # The category and notes describe the whole range; they are copied onto
    # every code point of that range.
    return [
        {
            'codepoint_hex': f"{cp:04X}",
            'script_name': script_name,
            'category': category_code,
            'notes': notes,
            'category_meaning': category_meaning,
        }
        for cp in range(start_val, end_val + 1)
    ]


def get_unicode_script_dataframe(url, file_output, timeout=30):
    """Download a Unicode ``Scripts.txt`` file and expand it to one row per code point.

    Args:
        url: Location of the Scripts.txt file.
        file_output: Path of the tab-separated file the table is written to.
        timeout: Seconds to wait for the server before giving up (the
            ``requests`` default is to wait forever).

    Returns:
        pandas.DataFrame with the columns ``codepoint_hex``, ``script_name``,
        ``category``, ``notes`` and ``category_meaning``.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never parsed into an (empty) table.
    """
    # Comprehensive Category Meanings from Wikipedia
    category_map = {
        # L: Letter
        'Lu': 'Letter, uppercase',
        'Ll': 'Letter, lowercase',
        'Lt': 'Letter, titlecase',
        # 'L&' is the alias the UCD data files use for cased letters (Lu | Ll | Lt)
        'L&': 'Letter, cased',
        'Lm': 'Letter, modifier',
        'Lo': 'Letter, other',
        # M: Mark
        'Mn': 'Mark, nonspacing',
        'Mc': 'Mark, spacing combining',
        'Me': 'Mark, enclosing',
        # N: Number
        'Nd': 'Number, decimal digit',
        'Nl': 'Number, letter',
        'No': 'Number, other',
        # P: Punctuation
        'Pc': 'Punctuation, connector',
        'Pd': 'Punctuation, dash',
        'Ps': 'Punctuation, open',
        'Pe': 'Punctuation, close',
        'Pi': 'Punctuation, initial quote',
        'Pf': 'Punctuation, final quote',
        'Po': 'Punctuation, other',
        # S: Symbol
        'Sm': 'Symbol, math',
        'Sc': 'Symbol, currency',
        'Sk': 'Symbol, modifier',
        'So': 'Symbol, other',
        # Z: Separator
        'Zs': 'Separator, space',
        'Zl': 'Separator, line',
        'Zp': 'Separator, paragraph',
        # C: Other
        'Cc': 'Other, control',
        'Cf': 'Other, format',
        'Cs': 'Other, surrogate',
        'Co': 'Other, private use',
        'Cn': 'Other, not assigned'
    }

    # Fetch the Scripts.txt file
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    rows = []
    skipped = 0
    for line in response.text.splitlines():
        # Blank lines and full-line comments carry no data.
        if not line or line.startswith('#'):
            continue
        try:
            rows.extend(_expand_scripts_line(line, category_map))
        except (IndexError, ValueError):
            # Best effort: ignore lines that do not follow the data layout,
            # but keep count so the omission is visible.
            skipped += 1

    # Explicit columns keep the schema stable even when nothing was parsed.
    df = pd.DataFrame(
        rows,
        columns=['codepoint_hex', 'script_name', 'category', 'notes', 'category_meaning'],
    )
    df.to_csv(file_output, sep='\t', index=False)
    print(f"Successfully saved {len(df)} rows to {file_output}")
    if skipped:
        print(f"Skipped {skipped} line(s) that did not match the expected format")
    return df

url = "https://www.unicode.org/Public/17.0.0/ucd/Scripts.txt"
df_unicode17 = get_unicode_script_dataframe(url,'df_unicode_17.tsv')

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  187k    0  187k    0     0   249k      0 --:--:-- --:--:-- --:--:--  249k
Successfully saved 159866 rows to df_unicode_17.tsv


In [None]:
df_unicode17.head()

Unnamed: 0,codepoint_hex,script_name,category,notes,category_meaning
0,0,Common,Cc,[32] <control-0000>..<control-001F>,"Other, control"
1,1,Common,Cc,[32] <control-0000>..<control-001F>,"Other, control"
2,2,Common,Cc,[32] <control-0000>..<control-001F>,"Other, control"
3,3,Common,Cc,[32] <control-0000>..<control-001F>,"Other, control"
4,4,Common,Cc,[32] <control-0000>..<control-001F>,"Other, control"


In [None]:
#@title Helper functions to perform the script audit
import re
import random
import pandas as pd
import tiktoken
from typing import Dict, List, Set, Union
from transformers import AutoTokenizer, PreTrainedTokenizerBase

# ---------------------------------------------------------
# 1. THIS WAS THE SOTA REGISTRY (DATE: Jan 2026)
# ---------------------------------------------------------
# SOTA_CONFIG = {
#     "GPT-OSS": {"id": "o200k_base", "type": "BPE (Tiktoken)"},
#     "Llama-4": {"id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", "type": "BPE (Tiktoken)"},
#     "Mistral": {"id": "mistralai/Mistral-Large-3-675B-Instruct-2512", "type": "BPE (Tekken)"},
#     "Gemma-3": {"id": "google/gemma-3-270m-it", "type": "SentencePiece"},
#     "GLM-4": {"id": "THUDM/glm-4-9b-chat", "type": "BPE (Tiktoken/Custom)"},
#     "Qwen-2.5": {"id": "Qwen/Qwen2.5-72B-Instruct", "type": "BPE"},
#     "DeepSeek-V3": {"id": "deepseek-ai/DeepSeek-V3", "type": "BPE"},
#     "Phi-4": {"id": "microsoft/phi-4", "type": "BPE"},
#     "RNJ-1": {"id": "EssentialAI/rnj-1-instruct", "type": "SentencePiece"},
#     "OLMo-3": {"id": "allenai/OLMo-3-1125-32B", "type": "BPE"}
# }

# ---------------------------------------------------------
# 2. THE AUDIT ENGINE CLASS
# ---------------------------------------------------------
class TokenScriptAuditor:
    """Labels text with the Unicode script(s) of its characters.

    The code-point -> script table is read once, at construction time, from a
    file in the Unicode ``Scripts.txt`` format.
    """

    def __init__(self, scripts_txt_path: str = "Scripts.txt"):
        # Scripts that never count towards a "Mixed_..." verdict.
        self.ignore_from_mix = {"Common", "Unknown", "Inherited"}
        self.cp_to_script = self._load_scripts(scripts_txt_path)

    def _load_scripts(self, path: str) -> Dict[int, str]:
        """Parse the scripts file at `path` into a {code point: script} dict.

        Raises:
            FileNotFoundError: if `path` does not exist.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Missing Unicode Scripts file: {path}")

        table = {}
        with open(path, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                # Drop the trailing comment; what is left is "<range> ; <script>".
                entry = raw_line.split("#")[0].strip()
                if ";" not in entry:
                    # Covers blank lines and comment-only lines as well.
                    continue

                cp_field, script_field = entry.split(";")
                script_name = script_field.strip().split()[0]

                if ".." not in cp_field:
                    table[int(cp_field, 16)] = script_name
                    continue

                first, last = cp_field.split("..")
                span = range(int(first, 16), int(last, 16) + 1)
                table.update(dict.fromkeys(span, script_name))
        return table

    def identify_script(self, text: str) -> str:
        """Return the script label for `text`.

        * ``"EMPTY"`` for empty input.
        * The single script name when exactly one non-ignored script occurs.
        * ``"Mixed_<A>_<B>"`` (first two names in sorted order) when several
          non-ignored scripts occur.
        * Otherwise the alphabetically first of the scripts that were seen
          (all of them are in the ignore set in that case).
        """
        if not text:
            return "EMPTY"

        seen = set()
        for ch in text:
            seen.add(self.cp_to_script.get(ord(ch), "Unknown"))

        significant = sorted(seen - self.ignore_from_mix)

        if len(significant) == 1:
            return significant[0]
        if significant:
            # New Logic: Mixed_Script1_Script2
            return "Mixed_" + "_".join(significant[:2])
        return min(seen) if seen else "Unknown"

# ---------------------------------------------------------
# 3. HELPER FUNCTIONS
# ---------------------------------------------------------
def audit_model_vocab(name: str, config: dict, auditor: "TokenScriptAuditor") -> pd.DataFrame:
    """Label every token of one tokenizer with its Unicode script.

    Args:
        name: Display name of the model. "GPT-OSS" selects the tiktoken
            loader; any other name is loaded with ``AutoTokenizer``.
        config: Registry entry; ``config["id"]`` is the encoding name / model id.
        auditor: Object providing ``identify_script(text) -> str``.

    Returns:
        DataFrame with one row per token id and the columns
        ``model_tokenizer_name``, ``tokenizer_token_index``, ``token``,
        ``decoded_token`` and ``script``. Tokens whose surface form looks like
        ``<...>`` get the script ``"SPECIAL_TOKEN"``; tokens that cannot be
        decoded get ``"BYTE_FRAGMENT"`` with ``decoded_token == "N/A"``.
    """
    print(f"--- Starting Audit: {name} ({config['id']}) ---")

    # Load Tokenizer
    if name == "GPT-OSS":
        tokenizer = tiktoken.get_encoding(config["id"])
        indices = range(tokenizer.n_vocab)
        id_to_token = {}
        is_tiktoken = True
    else:
        tokenizer = AutoTokenizer.from_pretrained(config["id"], trust_remote_code=True)
        id_to_token = {v: k for k, v in tokenizer.get_vocab().items()}
        indices = sorted(id_to_token.keys())
        is_tiktoken = False

    special_pattern = re.compile(r"^<.*>$")

    rows = []
    for i in indices:
        # Fallback value for the `token` column, used when decoding fails.
        if is_tiktoken:
            vocab_key = None
            raw_token = f"idx_{i}"
        else:
            vocab_key = id_to_token[i]
            # Some tokenizers expose `bytes` keys; store their repr (b'...')
            # so the column holds strings only.
            raw_token = str(vocab_key) if isinstance(vocab_key, bytes) else vocab_key

        try:
            if is_tiktoken:
                token_bytes = tokenizer.decode_single_token_bytes(i)
                raw_token = str(token_bytes)
                surface = token_bytes.decode("utf-8", errors="replace")
                decoded = tokenizer.decode([i])
            else:
                # Matching a str pattern against a `bytes` key raises
                # TypeError, which used to turn a whole bytes-keyed vocabulary
                # into BYTE_FRAGMENT rows; decode the key to text first.
                if isinstance(vocab_key, bytes):
                    surface = vocab_key.decode("utf-8", errors="replace")
                else:
                    surface = vocab_key
                decoded = tokenizer.decode([i], skip_special_tokens=False)

            # Labeling: test the token text itself, not the b'...' repr
            # (which never starts with '<').
            if special_pattern.match(surface):
                script_label = "SPECIAL_TOKEN"
            else:
                # NOTE(review): a token that decodes to U+FFFD is labelled by
                # whatever script the auditor reports for that character --
                # confirm whether it should count as a byte fragment instead.
                script_label = auditor.identify_script(decoded)
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C still stops a
            # long audit.
            decoded, script_label = "N/A", "BYTE_FRAGMENT"

        rows.append([name, i, raw_token, decoded, script_label])

    return pd.DataFrame(rows, columns=["model_tokenizer_name", "tokenizer_token_index", "token", "decoded_token", "script"])


In [None]:
# Location of the Unicode Scripts table. The original project layout is tried
# first; on a fresh runtime that folder does not exist, so fall back to the
# copy this notebook downloads with curl ("Scripts-17.0.0.txt").
scripts_path = "./scripts_unicode_17.txt/Scripts.txt"
if not os.path.exists(scripts_path):
    scripts_path = "Scripts-17.0.0.txt"
auditor = TokenScriptAuditor(scripts_path)

list_all = []
total_token_count = 0

# Audit every registered tokenizer; keep the per-model frames so they can be
# concatenated in one step afterwards.
for name, config in SOTA_CONFIG.items():
    df = audit_model_vocab(name, config, auditor)
    list_all.append(df)
    total_token_count += len(df)

# Combine All
df_comb = pd.concat(list_all, ignore_index=True)

# Summary Row Logic
print("\nFinal Audit Complete.")
print(f"Total Tokens Processed: {total_token_count:,}")


--- Starting Audit: GPT-OSS (o200k_base) ---
--- Starting Audit: Llama-4 (meta-llama/Llama-4-Scout-17B-16E-Instruct) ---
--- Starting Audit: Mistral (mistralai/Mistral-Large-3-675B-Instruct-2512) ---
--- Starting Audit: Gemma-3 (google/gemma-3-270m-it) ---
--- Starting Audit: GLM-4 (THUDM/glm-4-9b-chat) ---
--- Starting Audit: Qwen-2.5 (Qwen/Qwen2.5-72B-Instruct) ---
--- Starting Audit: DeepSeek-V3 (deepseek-ai/DeepSeek-V3) ---
--- Starting Audit: Phi-4 (microsoft/phi-4) ---
--- Starting Audit: RNJ-1 (EssentialAI/rnj-1-instruct) ---
--- Starting Audit: OLMo-3 (allenai/OLMo-3-1125-32B) ---

Final Audit Complete.
Total Tokens Processed: 1,555,080


Something doesn't seem right about GLM-4: nearly every one of its tokens was labelled `BYTE_FRAGMENT` (see the counts below).

In [None]:
df_comb.loc[(df_comb.model_tokenizer_name=='GLM-4') ].script.value_counts()

Unnamed: 0_level_0,count
script,Unnamed: 1_level_1
BYTE_FRAGMENT,151329
SPECIAL_TOKEN,11
Latin,3


In [None]:
import re
import pandas as pd
from transformers import AutoTokenizer

def audit_glm4_vocab(model_id: str, auditor, model_name: str = "") -> pd.DataFrame:
    """
    GLM-4 specific audit function.
    Handles byte-level vocabulary and custom ChatGLM tokenization logic.

    Args:
        model_id: Hugging Face model id, e.g. "THUDM/glm-4-9b-chat".
        auditor: Object providing ``identify_script(text) -> str``.
        model_name: Value for the ``model_tokenizer_name`` column. When empty
            (the default) the last path component of `model_id` is used.

    Returns:
        DataFrame with the columns ``model_tokenizer_name``,
        ``tokenizer_token_index``, ``token``, ``decoded_token`` and ``script``,
        one row per token id in ascending order.
    """
    print(f"--- Loading GLM-4 Tokenizer: {model_id} ---")

    # GLM-4 requires trust_remote_code=True for its custom tiktoken-based implementation
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # Most GLM-4 vocab keys are 'bytes' objects, but a few keys are plain
    # 'str', so both types have to be handled.
    vocab = tokenizer.get_vocab()
    id_to_token = {v: k for k, v in vocab.items()}
    indices = sorted(id_to_token.keys())

    # Matches patterns like <|user|>, <|endoftext|>, etc. The pattern is
    # applied to the token text (bytes keys are decoded first): a bytes-only
    # pattern raises TypeError on str keys and a str-only pattern raises it on
    # bytes keys, and either way the token would be mislabelled.
    special_pattern = re.compile(r"^<.*>$")

    rows = []
    label = model_name or model_id.split("/")[-1]

    for i in indices:
        raw_token = id_to_token[i]

        try:
            if isinstance(raw_token, bytes):
                # errors="replace": partial UTF-8 sequences must not abort the match.
                surface = raw_token.decode("utf-8", errors="replace")
            else:
                surface = str(raw_token)

            # 1. Decode for the 'decoded_token' column (human readable)
            decoded = tokenizer.decode([i], skip_special_tokens=False)

            # 2. Determine Script
            if special_pattern.match(surface):
                script_label = "SPECIAL_TOKEN"
            elif not decoded:
                script_label = "BYTE_FRAGMENT"
            else:
                script_label = auditor.identify_script(decoded)

        except Exception:
            decoded = "N/A"
            script_label = "BYTE_FRAGMENT"

        rows.append([
            label,
            i,
            str(raw_token),  # bytes keys are stored as their repr b'...'
            decoded,
            script_label
        ])

    df = pd.DataFrame(rows, columns=[
        "model_tokenizer_name", "tokenizer_token_index",
        "token", "decoded_token", "script"
    ])

    return df

#######################
df_glm4 = audit_glm4_vocab("THUDM/glm-4-9b-chat", auditor)

--- Loading GLM-4 Tokenizer: THUDM/glm-4-9b-chat ---


In [None]:
df_glm4.script.value_counts()

Unnamed: 0_level_0,count
script,Unnamed: 1_level_1
Latin,100587
Han,28513
Cyrillic,9741
Common,7998
Arabic,1972
Greek,829
Hangul,565
Hiragana,486
Katakana,352
Mixed_Han_Hiragana,115


In [None]:
# Neat!
# Replace the GLM-4 rows of the generic audit with the rows produced by the
# GLM-4 specific audit, then export everything.
lst_mod=list(df_comb.model_tokenizer_name.unique())
lst_mod.remove('GLM-4')
df_temp=df_comb[df_comb.model_tokenizer_name.isin(lst_mod)]
# No ignore_index here, so df_final keeps the original row labels of both
# frames (labels can repeat); the export below drops the index anyway.
df_final=pd.concat([df_temp,df_glm4])

# Export to TSV
df_final.to_csv("df_tokka_bench_2026.tsv", sep='\t',index=False)
# Sanity check: number of tokens per tokenizer. The GLM-4 rows carry the name
# that audit_glm4_vocab assigned to them, not the registry key 'GLM-4'.
df_final.model_tokenizer_name.value_counts()

Unnamed: 0_level_0,count
model_tokenizer_name,Unnamed: 1_level_1
Gemma-3,262145
Llama-4,201135
GPT-OSS,200019
Qwen-2.5,151665
glm-4-9b-chat,151343
Mistral,131072
DeepSeek-V3,128815
RNJ-1,128256
Phi-4,100352
OLMo-3,100278


In [None]:
df_final.script.unique()

array(['Common', 'Latin', 'Cyrillic', 'Arabic', 'Devanagari', 'Georgian',
       'Hebrew', 'Armenian', 'Malayalam', 'Greek', 'Bengali', 'Han',
       'Gujarati', 'Tamil', 'Kannada', 'Telugu', 'Thai', 'Hangul',
       'Inherited', 'Hiragana', 'Katakana', 'Gurmukhi', 'Sinhala',
       'Khmer', 'Myanmar', 'Mixed_Han_Hiragana', 'Oriya',
       'Mixed_Han_Latin', 'Unknown', 'Tibetan', 'Braille',
       'Mixed_Han_Katakana', 'Mixed_Cyrillic_Latin', 'Lao',
       'BYTE_FRAGMENT', 'Ethiopic', 'Thaana', 'SPECIAL_TOKEN',
       'Mixed_Hiragana_Katakana', 'Mixed_Greek_Latin',
       'Mixed_Hiragana_Latin', 'Syriac', 'Mixed_Katakana_Latin',
       'Cherokee', 'Ogham', 'Ol_Chiki', 'Tifinagh', 'Cuneiform', 'Coptic',
       'Canadian_Aboriginal', 'Vai', 'Bopomofo', 'Egyptian_Hieroglyphs',
       'Yi', 'Mongolian', 'Javanese', 'Old_Turkic', 'Kaithi', 'Tai_Le',
       'Meetei_Mayek', 'Nko', 'Tai_Viet', 'Bamum', 'New_Tai_Lue', 'Runic',
       'Mandaic', 'Phags_Pa', 'Tai_Tham', 'Balinese', 'Buginese',
  

In [None]:
df_final.script.nunique()

83

In [None]:
df_final.script.value_counts()

Unnamed: 0_level_0,count
script,Unnamed: 1_level_1
Latin,1051942
Han,137815
Cyrillic,85253
Common,79131
Arabic,43366
...,...
Batak,1
Lisu,1
Cham,1
Old_Persian,1
