# Tokenization Playground

Experiments with the Llama 3 tokenizer: checking whether it can encode and decode ASCII art without changing the text.

In [None]:
from transformers import AutoTokenizer
from pathlib import Path

In [3]:
# Load the two tokenizers compared in this notebook, identified by their model ids.
llama_tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B-bnb-4bit")
# GPT-2 is used in later cells as a comparison for the Llama tokenizer.
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [18]:
def check_if_lossy(tokenizer, text):
    """Report whether an encode/decode round trip through `tokenizer` alters `text`.

    Args:
        tokenizer: Callable tokenizer; ``tokenizer(text)["input_ids"]`` must
            yield the token ids for ``text`` and ``tokenizer.decode`` must
            accept those ids together with ``skip_special_tokens``.
        text: The string to round-trip.

    Returns:
        True if the decoded string differs from ``text``, otherwise False.
    """
    # Plain Python ids are all `decode` needs; not asking for "pt" tensors
    # means this check no longer requires PyTorch to be installed.
    token_ids = tokenizer(text)["input_ids"]
    # Special tokens present in the ids are dropped on decode, so they do not
    # count as a difference from the input.
    decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
    return decoded_text != text

def check_encode_decode(tokenizer, text):
    """Print a round-trip report for `text`.

    The report contains, in order: the input text, the text obtained by
    encoding and decoding it, the raw token strings, and a ``Lossy:`` flag
    that is True when the decoded text differs from the input.

    Args:
        tokenizer: Callable tokenizer; ``tokenizer(text)["input_ids"]`` must
            yield the token ids, and it must provide ``convert_ids_to_tokens``
            and ``decode``.
        text: The string to encode and decode.

    Returns:
        None; everything is reported through ``print``.
    """
    # Plain id lists are sufficient here, so PyTorch tensors are not requested.
    token_ids = tokenizer(text)["input_ids"]
    raw_tokens = tokenizer.convert_ids_to_tokens(token_ids)
    decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
    print("Input text:")
    print(text)
    print("Encoded-Decoded:")
    print(decoded_text)
    print("Raw tokens:")
    print(raw_tokens)
    # Compare the text decoded above rather than calling check_if_lossy,
    # which would tokenize and decode the same input a second time.
    print(f"Lossy: {decoded_text != text}")
    print()

def encode_decode_check(tokenizer, text):
    """Print the ids, raw tokens and decoded form of `text`, then a verdict.

    Args:
        tokenizer: Tokenizer that is callable with ``return_tensors="pt"`` and
            provides ``convert_ids_to_tokens`` and ``decode``.
        text: String to encode and decode.

    Returns:
        None; everything is reported through ``print``.
    """
    # Index 0 picks the first entry of "input_ids" — assumes batched output
    # for a single input string.
    ids = tokenizer(text, return_tensors="pt")["input_ids"][0]
    pieces = tokenizer.convert_ids_to_tokens(ids)
    round_trip = tokenizer.decode(ids, skip_special_tokens=True)

    print(ids)
    print("Raw tokens:", pieces)
    print("Clean text:", round_trip)
    # The verdict is an exact string comparison against the input.
    print("Tokenization is lossy" if round_trip != text else "Tokenization is lossless")
        
def check_if_dataset_lossy(tokenizer, dataset_path):
    """Print the path of every ``.txt`` file whose round trip is lossy.

    Files are found recursively under `dataset_path` and visited in sorted
    order; each file's full text is passed to ``check_if_lossy``.

    Args:
        tokenizer: Tokenizer forwarded to ``check_if_lossy``.
        dataset_path: Directory to search; a ``pathlib.Path`` or a plain
            string path.

    Returns:
        None; lossy files are reported through ``print``.

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # Path() lets callers pass a plain string as well as a Path object.
    for path in sorted(Path(dataset_path).glob("**/*.txt")):
        # Read with an explicit encoding so the result does not depend on the
        # platform default; the file is closed again before tokenizing.
        text = path.read_text(encoding="utf-8")
        if check_if_lossy(tokenizer, text):
            print(f"Lossy tokenization: {path}")

In [16]:
# Small ASCII-art sample used as the first round-trip input. The raw string
# keeps every backslash literal.
ascii_art = r"""
    /\_/\           ___
   = o_o =_______    \ \ 
    __^      __(  \.__) )
(@)<_____>__(_____)____/
"""

In [None]:
# Round-trip the sample drawing through the Llama tokenizer and print the report.
print("Llama Tokenizer:")
encode_decode_check(llama_tokenizer, ascii_art)

In [None]:
# Directory of ASCII-art text files, given relative to the working directory.
dataset_path = Path("../ascii_art/animals/")

# Prints one line per .txt file whose Llama round trip changes the text.
check_if_dataset_lossy(llama_tokenizer, dataset_path)

In [20]:
# Drawing kept as an example of a lossy round trip (per its name). The raw
# string keeps every backslash literal; trailing spaces are part of the text.
lossy_ascii_example = r"""
       _.---._    /\\
    ./'       "--`\//
  ./              o \
 /./\  )______   \__ \
./  / /\ \   | \ \  \ \
   / /  \ \  | |\ \  \7
    "     "    "  "       
"""

In [None]:
# Print the full round-trip report for the drawing with the Llama tokenizer.
check_encode_decode(llama_tokenizer, lossy_ascii_example)

In [22]:
# Two-character test input: a space followed by a period.
lossy_ascii_example_2 = " ."

In [None]:
# Round-trip report for the space-plus-period input.
check_encode_decode(llama_tokenizer, lossy_ascii_example_2)

In [24]:
# Two-character test input: a space followed by the letter "s".
lossy_ascii_example_3 = " s"

In [None]:
# Round-trip report for the space-plus-"s" input.
check_encode_decode(llama_tokenizer, lossy_ascii_example_3)

In [None]:
# Decode single vocabulary tokens with the Llama tokenizer to see how the "Ġ" prefix is rendered.
print(llama_tokenizer.decode(llama_tokenizer.convert_tokens_to_ids(['Ġ?'])))  # NOTE(review): this note used to read Output: "." but the token here is "Ġ?"; confirm the actual output
print(llama_tokenizer.decode(llama_tokenizer.convert_tokens_to_ids(['Ġa'])))  # Output: " a"

In [None]:
# Decode the same single vocabulary tokens with the GPT-2 tokenizer for comparison.
print(gpt2_tokenizer.decode(gpt2_tokenizer.convert_tokens_to_ids(['Ġ?'])))  # NOTE(review): this note used to read Output: "." but the token here is "Ġ?"; confirm the actual output
print(gpt2_tokenizer.decode(gpt2_tokenizer.convert_tokens_to_ids(['Ġa'])))  # Output: " a"

In [None]:
# Show each tokenizer's clean_up_tokenization_spaces attribute side by side.
print(f"llama_tokenizer.clean_up_tokenization_spaces: {llama_tokenizer.clean_up_tokenization_spaces}")
print(f"gpt2_tokenizer.clean_up_tokenization_spaces: {gpt2_tokenizer.clean_up_tokenization_spaces}")

In [30]:
# Scan the same dataset directory with the GPT-2 tokenizer.
from pathlib import Path  # NOTE(review): Path is already imported in the first code cell; this repeat is redundant
dataset_path = Path("../ascii_art/animals/")
check_if_dataset_lossy(gpt2_tokenizer, dataset_path)

To get ASCII art back unchanged from the Llama tokenizer, we have to disable `clean_up_tokenization_spaces`:
https://huggingface.co/docs/transformers/en/model_doc/llama#transformers.LlamaTokenizer

In [31]:
# Set clean_up_tokenization_spaces to False on the shared Llama tokenizer
# object; every later cell that uses llama_tokenizer sees this setting.
llama_tokenizer.clean_up_tokenization_spaces = False

In [32]:
# Re-run the dataset scan with the Llama tokenizer now that clean-up is disabled.
from pathlib import Path  # NOTE(review): Path is already imported in the first code cell; this repeat is redundant
dataset_path = Path("../ascii_art/animals/")
check_if_dataset_lossy(llama_tokenizer, dataset_path)

### Check whether dedicated `<ascii>` special tokens are necessary

In [11]:
# Prompt template with {description} and {ascii_art} placeholders; the drawing
# slot is wrapped in <ascii>...</ascii> tags. The cells below tokenize the
# template as-is, without filling the placeholders.
OUTPUT_ASCII_PROMPT = """
Generate ascii art that matches the following description.

### description:
{description}

### ascii visualization:
<ascii>
{ascii_art}
</ascii>
"""


In [None]:
def tokenize_text(tokenizer, text):
    """Print `text` followed by the raw token strings the tokenizer splits it into.

    Args:
        tokenizer: Callable tokenizer; ``tokenizer(text)["input_ids"]`` must
            yield the token ids, and it must provide ``convert_ids_to_tokens``.
        text: The string to tokenize.

    Returns:
        None; everything is reported through ``print``.
    """
    # Plain id lists are enough for the token lookup, so no "pt" tensors are
    # requested. The decoded text was never used here, so it is not computed.
    token_ids = tokenizer(text)["input_ids"]
    raw_tokens = tokenizer.convert_ids_to_tokens(token_ids)
    print("Original text:")
    print(text)
    print("Tokenized text:")
    print(raw_tokens)

# Tokenize the unformatted template before the <ascii> tags are registered as special tokens.
tokenize_text(llama_tokenizer, OUTPUT_ASCII_PROMPT)

In [18]:
# Register the opening and closing tags as additional special tokens on the Llama tokenizer.
special_tokens_dict = {'additional_special_tokens': ['<ascii>','</ascii>']}
# The return value is stored but not read by any cell shown in this notebook.
num_added_toks = llama_tokenizer.add_special_tokens(special_tokens_dict)
# Left disabled: neither `model` nor `tokenizer` is defined in the cells shown here.
#model.resize_token_embeddings(len(tokenizer))

In [None]:

# Tokenize the same template again, now that the <ascii> tags are registered as special tokens.
tokenize_text(llama_tokenizer, OUTPUT_ASCII_PROMPT)

In [None]:
# Hard-coded token ids to inspect; note that 128000 appears twice near the start.
tokens = [128000, 198, 128000, 198, 32215, 48220, 1989, 430, 9248, 279, 2768, 4096, 382, 14711, 4096, 512, 4719, 271, 14711, 48220, 42148, 512]
# NOTE(review): despite its name, decoded_text holds the result of convert_ids_to_tokens, not of decode.
decoded_text =llama_tokenizer.convert_ids_to_tokens(tokens, skip_special_tokens=False)
print(decoded_text)

In [None]:
# Look up the single id 128000 on its own.
# NOTE: this rebinds `tokens` from the list used in the previous cell to a bare int.
tokens = 128000
decoded_text =llama_tokenizer.convert_ids_to_tokens(tokens, skip_special_tokens=False)
print(decoded_text)