# Appendix: Tokenization Methods (Colab)

This notebook mirrors the appendix on tokenization. It is self-contained and runnable on Google Colab.

In [None]:
# Install optional packages if missing (Colab-friendly)
try:
    import tokenizers, sentencepiece  # type: ignore
except Exception:
    %pip -q install tokenizers sentencepiece
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'
plt.style.use('seaborn-v0_8')


## Character and Byte Level
Simple, robust baselines: every code point (or byte) is its own token. The cell below uses Unicode code points via `ord`/`chr`.

In [None]:
# Character-level round trip: string -> code point ids -> string.
text = 'The model dreams in tokens.'
chars = list(map(ord, text))  # one integer id per character
recovered = ''.join(map(chr, chars))  # inverse mapping rebuilds the text
(chars[:10], recovered[:10])  # preview the first ten ids and characters

## Word / Whitespace Split
Compact but language-dependent and OOV-prone.

In [None]:
import re
def words(s):
    """Lower-case ``s`` and return its runs of word characters, in order."""
    lowered = s.lower()
    return re.findall(r'\b\w+\b', lowered)


# Word -> id table shared by every encode_words call; it grows as new words appear.
vocab = {}


def encode_words(s):
    """Return one integer id per word of ``s``, registering unseen words in ``vocab``.

    A word that is not yet in ``vocab`` receives the current size of ``vocab``
    as its id, so ids are assigned in first-seen order starting from 0.
    """
    # len(vocab) is evaluated before setdefault inserts, so a new word gets the next free id.
    return [vocab.setdefault(w, len(vocab)) for w in words(s)]


text = 'The model dreams in tokens. The model learns.'
(encode_words(text), vocab)


## Subword (BPE) with tokenizers
Train a tiny BPE tokenizer on a miniature corpus.

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
# Miniature training corpus.
texts = ['The model dreams in tokens.', 'The model learns.']

# Trainer: target vocabulary size of 200, with '<unk>' and '<pad>' as special tokens.
trainer = BpeTrainer(special_tokens=['<unk>', '<pad>'], vocab_size=200)

# BPE model that uses '<unk>' as its unknown token, with Whitespace pre-tokenization.
tok = Tokenizer(BPE(unk_token='<unk>'))
tok.pre_tokenizer = Whitespace()
tok.train_from_iterator(texts, trainer)

# Encode a sentence containing a word ('models') that does not occur in the corpus.
enc = tok.encode('The model models tokens')
(enc.tokens, enc.ids)


## SentencePiece (BPE)
Train a tiny SentencePiece model and tokenize a sentence.

In [None]:
import sentencepiece as spm
# Write the two-sentence corpus to disk; the trainer is given this file path as input.
with open('spm_corpus.txt', 'w', encoding='utf-8') as f:
    f.write('The model dreams in tokens.\nThe model learns.\n')
# Train a BPE model. pad and unk take ids 0 and 1; bos/eos are set to -1.
# hard_vocab_limit=False treats vocab_size as a soft limit: this corpus is far
# too small to yield 200 pieces, and with the default hard limit training fails.
spm.SentencePieceTrainer.Train(
    input='spm_corpus.txt',
    model_prefix='spm_demo',
    model_type='bpe',
    vocab_size=200,
    hard_vocab_limit=False,
    pad_id=0,
    unk_id=1,
    bos_id=-1,
    eos_id=-1,
)
# Load the model file produced by training (model_prefix + '.model').
sp = spm.SentencePieceProcessor(model_file='spm_demo.model')
ids = sp.encode('The model models tokens', out_type=int)
sp.id_to_piece(ids)  # show the pieces behind each id


## Quick Visualization: Token Lengths
Plot how many tokens the same sentence yields under character-, word-, and BPE-level tokenization.

In [None]:
# Count tokens for one sentence under three schemes, then chart the counts.
sent = 'The model models tokens'
char_n = len(sent)  # one token per character
word_n = len(sent.split())  # whitespace-separated words
bpe_n = len(tok.encode(sent).ids)  # ids produced by the BPE tokenizer `tok`
plt.bar(
    ['char', 'word', 'bpe'],
    [char_n, word_n, bpe_n],
    color=['#DCE6F8', '#CFE2FF', '#B5D0F5'],
)
plt.ylabel('tokens')
plt.show()
