# Study tokenizers

## Goal

Can I find alternative symbols to the numbers for the grids?

## Imports

In [None]:
import sys
import os
import json
import time
import textwrap
from transformers import AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

sys.path.append(os.path.realpath('../scripts/'))
from arc24.encoders import create_grid_encoder


# Draw an empty figure and close every open figure before touching rcParams.
# NOTE(review): presumably a workaround so the rcParams below take effect in
# the notebook backend — confirm before removing.
plt.plot()
plt.close('all')
# Notebook-wide plot defaults: wide figures, thicker lines, larger font.
plt.rcParams["figure.figsize"] = (25, 4)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

## Code

In [None]:
def is_symbol_unique(symbol, vocab):
    """Tell whether ``symbol`` appears in exactly one entry of ``vocab``.

    Args:
        symbol: Substring to look for.
        vocab: Iterable of token strings (iterating a dict yields its keys).

    Returns:
        True when exactly one entry contains ``symbol`` as a substring,
        False when none or more than one does.
    """
    matches = sum(1 for token in vocab if symbol in token)
    return matches == 1

In [None]:
def get_words_with_symbol(symbol, vocab, skip_special_tokens=True):
    """Collect the entries of ``vocab`` that contain ``symbol``.

    Args:
        symbol: Substring to look for.
        vocab: Iterable of token strings (iterating a dict yields its keys).
        skip_special_tokens: When truthy, entries starting with ``'<'`` are
            left out of the result.

    Returns:
        A new list with the matching entries, in iteration order of ``vocab``.
    """
    matches = []
    for token in vocab:
        if symbol not in token:
            continue
        if skip_special_tokens and token.startswith('<'):
            continue
        matches.append(token)
    return matches

## Study tokenizers

In [None]:
# NOTE: `llama_vocab` is only created further down, in the cell that loads the
# Llama tokenizer; that cell has to be executed before this one.
is_symbol_unique('ø', llama_vocab)

In [None]:
# Show the vocabulary entries (token string, id) that carry these two ids.
[(word, idx) for word, idx in llama_vocab.items() if idx in [39218, 6282]]

In [None]:
# NOTE(review): 'Ã¸' is presumably intentional and not an encoding glitch — it
# looks like the byte-level BPE spelling of 'ø' (UTF-8 bytes C3 B8 shown as
# 'Ã' and '¸'), which is how the vocab would store it. Confirm before "fixing".
is_symbol_unique('Ã¸', llama_vocab)

In [None]:
# Set of all vocabulary entries that contain the literal character 'ø'.
{word for word in llama_vocab if 'ø' in word}

In [None]:
# Load the Qwen tokenizer and keep the single-character tokens that are not a
# substring of any other vocabulary entry.
qwen_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2-0.5B-Instruct')
qwen_vocab = qwen_tokenizer.get_vocab()
qwen_length_1_words = [
    word
    for word in qwen_vocab
    if len(word) == 1 and is_symbol_unique(word, qwen_vocab)
]
qwen_length_1_words.sort()
# Report how many such symbols exist relative to the vocabulary size.
print(f'{len(qwen_length_1_words)}/{len(qwen_tokenizer.get_vocab())}')
print(qwen_length_1_words)

In [None]:
# Load the Llama 3.1 tokenizer from the local checkpoint and keep the
# single-character tokens that are not a substring of any other entry.
llama_tokenizer = AutoTokenizer.from_pretrained('/home/gbarbadillo/data/llama-3.1-transformers-8b-instruct-v1')
llama_vocab = llama_tokenizer.get_vocab()
llama_length_1_words = [
    word
    for word in llama_vocab
    if len(word) == 1 and is_symbol_unique(word, llama_vocab)
]
llama_length_1_words.sort()
# Report how many such symbols exist relative to the vocabulary size.
print(f'{len(llama_length_1_words)}/{len(llama_tokenizer.get_vocab())}')
print(llama_length_1_words)

Let's find the intersection of symbols.

In [None]:
# Symbols that are unique single-character tokens in both vocabularies.
common_symbols = set(qwen_length_1_words) & set(llama_length_1_words)
interesting_symbols = sorted(common_symbols)
print(len(interesting_symbols))
print(interesting_symbols)

Great! We have 44 symbols that are apparently unique both for llama and qwen. We could create a mapping between these symbols and the numbers.

```
['À', 'Á', 'ñ', 'ò', 'ô', 'õ', 'ö', '÷', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ', 'Ā', 'ā', 'Ă', 'ă', 'Ą', 'ą', 'Ć', 'ć', 'Ĉ', 'ċ', 'Ď', 'ď', 'Đ', 'đ', 'Ē', 'ē', 'Ĕ', 'ĕ', 'Ė', 'ė', 'Ę', 'ę', 'Ě', 'Ĝ', 'ĝ', 'Ğ', 'ğ', 'ġ']
```

I'm going to select a subset of symbols that are visually distinct from each other, so the grids are easy to read.

I'm also going to verify that the encoding does not change when distractors are added: while verifying the new representation I found that surrounding spaces or new lines can change how a symbol is tokenized, which makes things more difficult.

In [None]:
# Keep only the symbols that still produce exactly three tokens when wrapped
# in a distractor (space or newline) on both sides, for both tokenizers.
distractors = (' ', '\n')
candidate_symbols = []
for symbol in interesting_symbols:
    survives_distractors = all(
        len(llama_tokenizer.tokenize(distractor + symbol + distractor)) == 3
        and len(qwen_tokenizer.tokenize(distractor + symbol + distractor)) == 3
        for distractor in distractors
    )
    if survives_distractors:
        candidate_symbols.append(symbol)
candidate_symbols.sort()
print(len(candidate_symbols))
print(candidate_symbols)

Thus we have 16 symbols that apparently do not change when being surrounded by spaces or new lines.

```
['ñ', 'ò', 'õ', '÷', 'ù', 'û', 'ā', 'Ă', 'ă', 'ą', 'ć', 'ď', 'ē', 'ę', 'Ě', 'Ğ']
```

Let's generate a large synthetic sample to verify that it is correctly encoded.

In [None]:
# Hand-picked subset of ten symbols, taken from the sixteen candidates
# ['ñ', 'ò', 'õ', '÷', 'ù', 'û', 'ā', 'Ă', 'ă', 'ą', 'ć', 'ď', 'ē', 'ę', 'Ě', 'Ğ'].
selection = ['ñ', 'ò', '÷', 'û', 'ą', 'ć', 'ď', 'ę', 'Ě', 'Ğ']
print(selection)
print(len(selection))
# a rows, each with b randomly chosen symbols preceded by a single space.
a, b = 20, 20
text = '\n'.join(' ' + ''.join(np.random.choice(selection, b, replace=True)) for _ in range(a))
print(text)
# Expected length: every row holds 1 space + b symbols, and the a rows are
# joined by a - 1 newlines. (The leading space was previously left out, so
# the assertion messages reported a number that was not the compared one.)
n = a*(b + 1) + a - 1
assert len(text) == n, f'{len(text)} != {n}'
# One token per character is expected from both tokenizers.
llama_token_count = len(llama_tokenizer.tokenize(text))
assert n == llama_token_count, f'{n} != {llama_token_count}'
qwen_token_count = len(qwen_tokenizer.tokenize(text))
assert n == qwen_token_count, f'{n} != {qwen_token_count}'

In [None]:
# Same synthetic check using the plain digits 0-9 and the Qwen tokenizer.
selection = list(map(str, range(10)))
print(selection)
a, b = 20, 20
rows = [' ' + ''.join(np.random.choice(selection, b, replace=True)) for _ in range(a)]
text = '\n'.join(rows)
print(text)
# Each row is 1 space + b digits; the a rows are joined by a - 1 newlines.
n = a*(b+1) + a - 1
qwen_token_count = len(qwen_tokenizer.tokenize(text))
assert len(text) == qwen_token_count, f'{n} != {qwen_token_count}'
assert len(text) == n

In [None]:
# Synthetic check for Llama where every cell is a digit repeated three times
# ('000', '111', ...); the token count is expected to equal one token per
# space, per cell and per newline.
selection = [digit * 3 for digit in '0123456789']
print(selection)
a, b = 20, 20
rows = [' ' + ''.join(np.random.choice(selection, b, replace=True)) for _ in range(a)]
text = '\n'.join(rows)
print(text)
n = a*(b+1) + a - 1
llama_token_count = len(llama_tokenizer.tokenize(text))
assert n == llama_token_count, f'{n} != {llama_token_count}'

Finally, let's look at the encoding of a real grid sample.

In [None]:
# Hand-written 3x3 grid prompt using nine of the selected symbols, with a
# shape header and a row number in front of every row.
text = """
```grid shape: 3x3
1 ñò÷
2 ûąć
3 ďęĚ
```
"""
# Print the token sequence each tokenizer produces for the same prompt.
print(qwen_tokenizer.tokenize(text))
print(llama_tokenizer.tokenize(text))

## Llama 3.2 1B

In [None]:
# Rebind the llama_* names to the Llama 3.2 1B tokenizer and repeat the search
# for single-character tokens that are not a substring of any other entry.
llama_tokenizer = AutoTokenizer.from_pretrained('/home/gbarbadillo/data/Llama-3.2-1B-Instruct')
llama_vocab = llama_tokenizer.get_vocab()
llama_length_1_words = [
    word
    for word in llama_vocab
    if len(word) == 1 and is_symbol_unique(word, llama_vocab)
]
llama_length_1_words.sort()
print(f'{len(llama_length_1_words)}/{len(llama_tokenizer.get_vocab())}')
print(llama_length_1_words)

In [None]:
# For every digit, list the non-special Llama tokens that contain it.
for i in range(10):
    words = get_words_with_symbol(str(i), llama_vocab)
    words.sort()
    print(f'{i} ({len(words)}): {words}')

It seems that the tokenizer has all the number variations from 0 to 999.

In [None]:
# For every digit, list the non-special Llama tokens that contain the digit
# repeated three times ('000', '111', ...).
for i in range(10):
    tripled = str(i)*3
    words = get_words_with_symbol(tripled, llama_vocab)
    words.sort()
    print(f'{i} ({len(words)}): {words}')

We could repeat each number 3 times to have one token per cell.

In [None]:
# Build a grid encoder from its textual spec (shape header, row numbers and
# the RepeatNumberEncoder for the cells).
encoder = create_grid_encoder('GridShapeEncoder(RowNumberEncoder(RepeatNumberEncoder()))')
# Encode a 3x3 identity matrix as a small sample grid.
text = encoder.to_text(np.eye(3, dtype=int).tolist())
print(text)
# Display how the Llama tokenizer splits the encoded grid.
llama_tokenizer.tokenize(text)

## SmolLM-135M

In [None]:
# Load the SmolLM tokenizer and keep the single-character tokens that are not
# a substring of any other vocabulary entry.
tokenizer = AutoTokenizer.from_pretrained('/home/gbarbadillo/data/SmolLM-135M-Instruct')
vocab = tokenizer.get_vocab()
length_1_words = [
    word
    for word in vocab
    if len(word) == 1 and is_symbol_unique(word, vocab)
]
length_1_words.sort()
print(f'{len(length_1_words)}/{len(tokenizer.get_vocab())}')
print(length_1_words)

In [None]:
# For every digit, list the non-special SmolLM tokens that contain it.
for i in range(10):
    words = get_words_with_symbol(str(i), vocab)
    words.sort()
    print(f'{i} ({len(words)}): {words}')

For SmolLM I can encode the grid directly with numbers without any problem, just like Qwen.

## Problem with pad_token and eos_token

In [None]:
# Display the special tokens declared by the SmolLM-135M-Instruct tokenizer.
tokenizer = AutoTokenizer.from_pretrained('/home/gbarbadillo/data/SmolLM-135M-Instruct')
tokenizer.special_tokens_map

In [None]:
# Display the special tokens declared by the Llama-3.2-1B-Instruct tokenizer.
tokenizer = AutoTokenizer.from_pretrained('/home/gbarbadillo/data/Llama-3.2-1B-Instruct')
tokenizer.special_tokens_map

In [None]:
# Display the special tokens declared by the Qwen2.5-0.5B-Instruct tokenizer.
tokenizer = AutoTokenizer.from_pretrained('/home/gbarbadillo/data/Qwen2.5-0.5B-Instruct')
tokenizer.special_tokens_map

In [None]:
# Display the special tokens declared by the base Qwen2.5-0.5B tokenizer.
tokenizer = AutoTokenizer.from_pretrained('/home/gbarbadillo/data/Qwen2.5-0.5B')
tokenizer.special_tokens_map

- `Qwen2.5-0.5B` and `SmolLM-135M-Instruct` use the same token for eos and pad, which is why generation with those models does not end.
- On the other hand, `Llama-3.2-1B-Instruct` does not even have a pad token, but I add it in the fine-tuning script.
- Qwen instruct models have different pad and eos tokens, which is the ideal situation.

In [None]:
def get_tokenizer(model_path, pad_token='<|pad|>'):
    """Load a tokenizer and make sure it has a pad token distinct from eos.

    Args:
        model_path: Path or identifier passed to
            ``AutoTokenizer.from_pretrained``.
        pad_token: Token registered as the pad token when the tokenizer has
            none, or when its pad token equals its eos token.

    Returns:
        The loaded tokenizer. When the pad token had to be added or replaced,
        ``padding_side`` is also set to ``'right'``; otherwise the tokenizer
        is returned as loaded.

    Raises:
        AssertionError: If a pad token must be added but ``pad_token`` is
            already present in the vocabulary.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    lacks_pad_token = 'pad_token' not in tokenizer.special_tokens_map
    # Nothing to do when a pad token exists and differs from the eos token.
    if not lacks_pad_token and tokenizer.pad_token != tokenizer.eos_token:
        return tokenizer

    if lacks_pad_token:
        print('Adding padding token because the tokenizer does not have one')
    else:
        print('Changing padding token because it is the same as the end-of-sequence token')
    assert pad_token not in tokenizer.get_vocab()
    tokenizer.add_special_tokens({'pad_token': pad_token})
    tokenizer.padding_side = 'right'
    return tokenizer

In [None]:
# Run get_tokenizer on every local checkpoint and show the resulting special
# tokens, separated by a blank line.
model_paths = [
    '/home/gbarbadillo/data/SmolLM-135M-Instruct',
    '/home/gbarbadillo/data/Llama-3.2-1B-Instruct',
    '/home/gbarbadillo/data/Qwen2.5-0.5B',
    '/home/gbarbadillo/data/Qwen2.5-0.5B-Instruct',
]
for filepath in model_paths:
    print(filepath)
    tokenizer = get_tokenizer(filepath)
    print(tokenizer.special_tokens_map, end='\n\n')