# Tokenization Playground

Checking whether the tokenizers can round-trip ASCII art: encoding a drawing and decoding it again should give back exactly the same text.

In [13]:
from transformers import AutoTokenizer
from pathlib import Path

In [14]:
# Load the two tokenizers compared in this notebook: a Llama 3.1 checkpoint and GPT-2.
llama_tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B-bnb-4bit")
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [18]:
def check_if_lossy(tokenizer, text):
    """Report whether an encode -> decode round trip through `tokenizer` alters `text`.

    Args:
        tokenizer: Callable tokenizer. `tokenizer(text)["input_ids"]` must yield
            the token ids of a single string, and `tokenizer.decode` must
            accept those ids.
        text: The string to round-trip.

    Returns:
        True when the decoded string differs from `text`, False otherwise.
    """
    # Plain Python ids are all that decoding needs; asking for "pt" tensors
    # here would only add a PyTorch requirement.
    token_ids = tokenizer(text)["input_ids"]
    # Special tokens are skipped so they do not count as a difference.
    decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
    return decoded_text != text

def check_encode_decode(tokenizer, text):
    """Print a round-trip report for `text`.

    The report shows, in order: the input text, the text obtained by encoding
    and decoding it, the raw token strings, and whether the round trip changed
    the text. A blank line is printed at the end.

    Args:
        tokenizer: Callable tokenizer. `tokenizer(text)["input_ids"]` must yield
            the token ids of a single string; it must also provide
            `convert_ids_to_tokens` and `decode`.
        text: The string to round-trip.

    Returns:
        None. The report is written to standard output.
    """
    # The ids are never printed here, so plain Python ids suffice and no
    # tensor framework is required.
    token_ids = tokenizer(text)["input_ids"]
    raw_tokens = tokenizer.convert_ids_to_tokens(token_ids)
    decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
    print("Input text:")
    print(text)
    print("Encoded-Decoded:")
    print(decoded_text)
    print("Raw tokens:")
    print(raw_tokens)
    # Compare against the text decoded above rather than tokenizing and
    # decoding the same input a second time.
    print(f"Lossy: {decoded_text != text}")
    print()

def encode_decode_check(tokenizer, text):
    """Encode `text`, decode it again, and print the ids, tokens and verdict.

    Prints the token ids as returned by the tokenizer, the raw token strings,
    the decoded text (special tokens skipped), and finally whether the decoded
    text equals the input.

    Args:
        tokenizer: Tokenizer called with `return_tensors="pt"`; it must also
            provide `convert_ids_to_tokens` and `decode`.
        text: The string to round-trip.

    Returns:
        None. Everything is written to standard output.
    """
    # Keep the "pt" request: the ids object itself is printed below.
    ids = tokenizer(text, return_tensors="pt")["input_ids"][0]
    pieces = tokenizer.convert_ids_to_tokens(ids)
    round_tripped = tokenizer.decode(ids, skip_special_tokens=True)

    print(ids)
    print("Raw tokens:", pieces)
    print("Clean text:", round_tripped)

    verdict = (
        "Tokenization is lossless"
        if round_tripped == text
        else "Tokenization is lossy"
    )
    print(verdict)
        
def check_if_dataset_lossy(tokenizer, dataset_path):
    """Print every `.txt` file under `dataset_path` that fails the round-trip check.

    Args:
        tokenizer: Tokenizer handed unchanged to `check_if_lossy`.
        dataset_path: Directory searched recursively for `*.txt` files, given
            as a `str` or a `pathlib.Path`.

    Returns:
        None. One "Lossy tokenization: <path>" line is printed per failing
        file, in sorted path order.
    """
    # `sorted` consumes the glob generator directly and keeps the report order stable.
    for path in sorted(Path(dataset_path).glob("**/*.txt")):
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding; the file is closed before tokenizing.
        text = path.read_text(encoding="utf-8")
        if check_if_lossy(tokenizer, text):
            print(f"Lossy tokenization: {path}")

In [16]:
# Sample drawing for the round-trip check; the raw string keeps every backslash literal.
ascii_art = r"""
    /\_/\           ___
   = o_o =_______    \ \ 
    __^      __(  \.__) )
(@)<_____>__(_____)____/
"""

In [17]:
# Round-trip the sample drawing through the Llama tokenizer and print the verdict.
print("Llama Tokenizer:")
encode_decode_check(llama_tokenizer, ascii_art)

Llama Tokenizer:
tensor([128000,    198,    262,  24445,     62,  35419,   1881,   7588,    198,
           256,    284,    297,  14513,    284,   2179,   6101,    262,   1144,
          1144,    720,    262,   1328,     61,    415,  15990,    220,   1144,
          4952,      8,   1763,   6084,  27530,  81617,     29,   3889,   2179,
         16726,   2179,   6018])
Raw tokens: ['<|begin_of_text|>', 'Ċ', 'ĠĠĠ', 'Ġ/\\', '_', '/\\', 'ĠĠĠĠĠĠĠĠĠĠ', 'Ġ___', 'Ċ', 'ĠĠ', 'Ġ=', 'Ġo', '_o', 'Ġ=', '____', '___', 'ĠĠĠ', 'Ġ\\', 'Ġ\\', 'ĠĊ', 'ĠĠĠ', 'Ġ__', '^', 'ĠĠĠĠĠ', 'Ġ__(', 'Ġ', 'Ġ\\', '.__', ')', 'Ġ)Ċ', '(@', ')<', '_____', '>', '__(', '____', '_)', '____', '/Ċ']
Clean text: 
    /\_/\           ___
   = o_o =_______    \ \ 
    __^      __(  \.__) )
(@)<_____>__(_____)____/

Tokenization is lossless


In [19]:
# Scan every .txt file under the animals directory with the Llama tokenizer.
dataset_path = Path("../ascii_art/animals/")

check_if_dataset_lossy(llama_tokenizer, dataset_path)

Lossy tokenization: ../ascii_art/animals/aardvark/0/content.txt
Lossy tokenization: ../ascii_art/animals/aardvark/1/content.txt
Lossy tokenization: ../ascii_art/animals/bat/1/content.txt
Lossy tokenization: ../ascii_art/animals/bear/0/content.txt
Lossy tokenization: ../ascii_art/animals/beaver/0/content.txt
Lossy tokenization: ../ascii_art/animals/beaver/1/content.txt
Lossy tokenization: ../ascii_art/animals/beaver/2/content.txt
Lossy tokenization: ../ascii_art/animals/beaver/3/content.txt
Lossy tokenization: ../ascii_art/animals/birds/10/content.txt
Lossy tokenization: ../ascii_art/animals/birds/2/content.txt
Lossy tokenization: ../ascii_art/animals/birds/4/content.txt
Lossy tokenization: ../ascii_art/animals/birds/6/content.txt
Lossy tokenization: ../ascii_art/animals/birds/8/content.txt
Lossy tokenization: ../ascii_art/animals/birds/9/content.txt
Lossy tokenization: ../ascii_art/animals/bison/0/content.txt
Lossy tokenization: ../ascii_art/animals/camel/0/content.txt
Lossy tokenizati

In [20]:
# Drawing used as the lossy example below; raw string, so backslashes stay literal.
lossy_ascii_example = r"""
       _.---._    /\\
    ./'       "--`\//
  ./              o \
 /./\  )______   \__ \
./  / /\ \   | \ \  \ \
   / /  \ \  | |\ \  \7
    "     "    "  "       
"""

In [21]:
# Print the full round-trip report for the drawing defined above.
check_encode_decode(llama_tokenizer, lossy_ascii_example)

Input text:

       _.---._    /\\
    ./'       "--`\//
  ./              o \
 /./\  )______   \__ \
./  / /\ \   | \ \  \ \
   / /  \ \  | |\ \  \7
    "     "    "  "       

Encoded-Decoded:

       _.---._    /\\
   ./'       "--`\//
 ./              o \
 /./\  )______   \__ \
./  / /\ \   | \ \  \ \
   / /  \ \  | |\ \  \7
    "     "    "  "       

Raw tokens:
['<|begin_of_text|>', 'Ċ', 'ĠĠĠĠĠĠ', 'Ġ_.', '---', '._', 'ĠĠĠ', 'Ġ/', '\\\\', 'Ċ', 'ĠĠĠ', 'Ġ.', "/'", 'ĠĠĠĠĠĠ', 'Ġ"--', '`\\', '//Ċ', 'Ġ', 'Ġ./', 'ĠĠĠĠĠĠĠĠĠĠĠĠĠ', 'Ġo', 'Ġ\\Ċ', 'Ġ/', './', '\\', 'Ġ', 'Ġ)', '____', '__', 'ĠĠ', 'Ġ\\', '__', 'Ġ\\Ċ', './', 'Ġ', 'Ġ/', 'Ġ/\\', 'Ġ\\', 'ĠĠ', 'Ġ|', 'Ġ\\', 'Ġ\\', 'Ġ', 'Ġ\\', 'Ġ\\Ċ', 'ĠĠ', 'Ġ/', 'Ġ/', 'Ġ', 'Ġ\\', 'Ġ\\', 'Ġ', 'Ġ|', 'Ġ|\\', 'Ġ\\', 'Ġ', 'Ġ\\', '7', 'Ċ', 'ĠĠĠ', 'Ġ"', 'ĠĠĠĠ', 'Ġ"', 'ĠĠĠ', 'Ġ"', 'Ġ', 'Ġ"', 'ĠĠĠĠĠĠĠĊ']
Lossy: True



In [22]:
# Minimal test input: a single space followed by a period.
lossy_ascii_example_2 = " ."

In [23]:
# Round-trip the space + period input through the Llama tokenizer.
check_encode_decode(llama_tokenizer, lossy_ascii_example_2)

Input text:
 .
Encoded-Decoded:
.
Raw tokens:
['<|begin_of_text|>', 'Ġ.']
Lossy: True



In [24]:
# Comparison input: a single space followed by a letter instead of punctuation.
lossy_ascii_example_3 = " s"

In [25]:
# Round-trip the space + letter input through the Llama tokenizer.
check_encode_decode(llama_tokenizer, lossy_ascii_example_3)

Input text:
 s
Encoded-Decoded:
 s
Raw tokens:
['<|begin_of_text|>', 'Ġs']
Lossy: False



In [26]:
# Decode single tokens with the Llama tokenizer.
print(llama_tokenizer.decode(llama_tokenizer.convert_tokens_to_ids(['Ġ?'])))  # Output: "?" (leading space dropped)
print(llama_tokenizer.decode(llama_tokenizer.convert_tokens_to_ids(['Ġa'])))  # Output: " a"

?
 a


In [27]:
# Decode the same single tokens with the GPT-2 tokenizer.
print(gpt2_tokenizer.decode(gpt2_tokenizer.convert_tokens_to_ids(['Ġ?'])))  # Output: " ?" (leading space kept)
print(gpt2_tokenizer.decode(gpt2_tokenizer.convert_tokens_to_ids(['Ġa'])))  # Output: " a"

 ?
 a


In [28]:
# Show the `clean_up_tokenization_spaces` setting of each tokenizer side by side.
print(f"llama_tokenizer.clean_up_tokenization_spaces: {llama_tokenizer.clean_up_tokenization_spaces}")
print(f"gpt2_tokenizer.clean_up_tokenization_spaces: {gpt2_tokenizer.clean_up_tokenization_spaces}")

llama_tokenizer.clean_up_tokenization_spaces: True
gpt2_tokenizer.clean_up_tokenization_spaces: False


In [30]:
# Same dataset scan with the GPT-2 tokenizer. `Path` is already imported in the
# first cell and `dataset_path` is defined by the earlier Llama scan cell.
check_if_dataset_lossy(gpt2_tokenizer, dataset_path)

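The GPT-2 scan above reports no lossy files, and GPT-2 has `clean_up_tokenization_spaces` set to `False`. To get ASCII art back unchanged from the Llama tokenizer, we have to disable `clean_up_tokenization_spaces` there as well.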
https://huggingface.co/docs/transformers/en/model_doc/llama#transformers.LlamaTokenizer

In [31]:
# Turn off `clean_up_tokenization_spaces` on the Llama tokenizer.
# NOTE(review): this mutates the shared tokenizer object, so earlier cells may
# print different results if they are re-run after this one.
llama_tokenizer.clean_up_tokenization_spaces = False

In [32]:
# Repeat the dataset scan with the Llama tokenizer. `Path` is already imported
# in the first cell and `dataset_path` is defined by the earlier scan cell.
check_if_dataset_lossy(llama_tokenizer, dataset_path)