# Tokenization

## Installation


Install `nltk` and download its data.

```bash
pip install nltk
python -m nltk.downloader punkt
```

The data will then be downloaded into your home directory:
```text
[nltk_data] Downloading package punkt to
[nltk_data]     /GPFS/rhome/xiyuanyang/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
```


## Quick Start

In [None]:
# Tokenize English and mixed Chinese/English text with NLTK, then segment
# Chinese text with jieba in its three modes.
import nltk
from nltk.tokenize import word_tokenize

# Step 1: download the tokenizer data used by word_tokenize.
nltk.download("punkt_tab")

text = "This is an example sentence, with punctuation!"
words = word_tokenize(text)
print(words)

# A mixed Chinese/English sentence for comparison.
text_chn = "我是上海交通大学的一名学生,你觉的我怎么样，I am very happy to see you!"
# NOTE: the author observed that NLTK handles this Chinese text poorly.
words_chn = word_tokenize(text_chn)
print(words_chn)

# For Chinese, you can use jieba instead.
import jieba


text = "我爱北京天安门，天安门上太阳升。"
# Precise mode (cut_all=False)
seg_list_precise = jieba.cut(text, cut_all=False)
print("精确模式:", " ".join(seg_list_precise))

# Full mode (cut_all=True)
seg_list_all = jieba.cut(text, cut_all=True)
print("全模式:", " ".join(seg_list_all))

# Search-engine mode
seg_list_search = jieba.cut_for_search(text)
print("搜索引擎模式:", " ".join(seg_list_search))

## Encoding and Decoding

We will use `spacy` for more advanced NLP usage.

```bash
pip install spacy
python -m spacy download en_core_web_sm
```

```text
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 728.1 kB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
```

In [None]:
# Tokenize a sentence with spaCy and print per-token attributes.
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

text = "This is an example sentence, with punctuation!"
# Calling the pipeline on the text returns a doc that is iterated token by token below.
doc = nlp(text)

words = [token.text for token in doc]
print(words)

# One row per token: text, part-of-speech tag (pos_) and punctuation flag,
# left-aligned in fixed-width columns.
for token in doc:
    print(f"{token.text:<15} {token.pos_:<10} {token.is_punct}")

In [None]:
# for encoding and decoding
import spacy

nlp = spacy.load("en_core_web_sm")
text = "Hello world, I am Xiyuan Yang, a freshman in Shanghai JiaoTong University."
doc = nlp(text)

# Get the tokenized words (tokens)
tokens = [token.text for token in doc]
print("SpaCy Tokens:", tokens)

# --- Encode (Get Token Hashes or IDs) ---
# spaCy internally uses hash values to represent strings, and these hashes map to integer IDs.
# token.norm is the normalized hash value (lowercase, etc.)
# token.orth is the hash value of the original string (case-sensitive).

token_ids = [nlp.vocab.strings.as_int(token.text) for token in doc]
print("Token IDs (encoded):", token_ids)


# --- Decode (Convert IDs back to Tokens) ---
decoded_tokens = [nlp.vocab.strings.as_string(id_val) for id_val in token_ids]
print("Decoded Tokens:", decoded_tokens)

# --- More advanced Vocab usage ---
# The `nlp.vocab.strings` object is a StringStore, which manages all string-to-ID mappings.
print("\nSpaCy Vocab Example:")
print("Current vocab size:", len(nlp.vocab))

# Add a new word to the vocabulary and Get the ID of the new word
new_word = "neural_network"
nlp.vocab.strings.add(new_word)
new_word_id = nlp.vocab.strings.as_int(new_word)
print(f"Added '{new_word}', its ID is: {new_word_id}")

# Decode the ID back to the string
decoded_new_word = nlp.vocab.strings.as_string(new_word_id)
print(f"Decoding ID {new_word_id}: {decoded_new_word}")

# If I change the hash value...
try:
    modified_id = new_word_id + 1
    modified_word = nlp.vocab.strings.as_string(modified_id)
    print(f"After Modification: {modified_word}")
except Exception as e:
    print(f"Error: {e}")

## Advanced Tokenizer

We will use `tiktoken` for more advanced tokenization.

In [None]:
# Round-trip an English sentence through a tiktoken encoding, then show what
# happens when every token ID is shifted by one.
import tiktoken

# Choose an encoding that matches the models you might be using (e.g., for GPT-4, GPT-3.5-turbo)
encoding = tiktoken.get_encoding("cl100k_base")
text_en = "Hello, world! This is a test sentence for tiktoken."

# --- Encode (Text to Token IDs) ---
# encode() converts the string into a list of integer token IDs
token_ids_en = encoding.encode(text_en)
print(f"Original English Text: '{text_en}'")
print(f"Encoded Token IDs (English): {token_ids_en}")
print(f"Number of tokens (English): {len(token_ids_en)}")

# To see the actual tokens that correspond to the IDs (optional, for understanding),
# decode each ID individually.
# Note: decoding individual IDs might not always yield readable strings if tokens are sub-word units.
decoded_parts_en = [encoding.decode([token_id]) for token_id in token_ids_en]
print(f"Decoded Parts (English, for understanding): {decoded_parts_en}")

# --- Decode (Token IDs to Text) ---
# decode() converts a list of integer token IDs back into a string
decoded_text_en = encoding.decode(token_ids_en)
print(f"Decoded English Text: '{decoded_text_en}'")

# Shift every token ID by one and decode each piece separately.
decoded_modified = [encoding.decode([token_id + 1]) for token_id in token_ids_en]
print(decoded_modified)
# `decoded_modified` is a list of pieces; join it so the comparison below is
# string-to-string (a list never equals a str, whatever its content).
decoded_modified_text = "".join(decoded_modified)

# Check if decoded text matches original (should be True)
print(f"Decoded matches original? {decoded_text_en == text_en}")
print(f"After Modifications? {decoded_modified_text == text_en}")

In [None]:
# Measure the average number of UTF-8 bytes per token for several sample strings.
import tiktoken

# tiktoken implements byte-pair encoding (BPE) over UTF-8 bytes, so Chinese
# text is tokenized as well; some characters are split into several byte-level
# tokens, which do not decode to readable text on their own.

test_strings = [
    "Hello world, My name is Xiyuan Yang",
    "wow, it is so fantastic!",
    "你好，这里是中文，自古逢秋悲寂寥，我言秋日胜春朝",
    "international computational",
]
# Optional long sample. The `with` block closes the file on exit, so no
# explicit close() is needed.
with open("../../README.md", "r", encoding="utf-8") as file:
    long_string = file.read()
# test_strings.append(long_string)
encoding = tiktoken.get_encoding("cl100k_base")

for test_string in test_strings:
    tokens = encoding.encode(test_string)
    num_bytes = len(test_string.encode("utf-8"))
    num_tokens = len(tokens)
    # Decode each token separately to show where the string was split.
    decoded = [encoding.decode([token]) for token in tokens]
    print(decoded)
    # Compression ratio: bytes of input represented by one token on average.
    print(num_bytes / num_tokens)

Let's analyse the result:

$$\text{evaluation} = \frac{\text{num bytes}}{\text{num tokens}}$$

- For ASCII, a single character is 1 byte (`char` in C++).

- In UTF-8, a non-ASCII Unicode character takes 2–4 bytes; the Chinese characters in the example above take 3 bytes each.

```text
['Hello', ' world', ',', ' My', ' name', ' is', ' X', 'iy', 'uan', ' Yang']
3.5
['wow', ',', ' it', ' is', ' so', ' fantastic', '!']
3.4285714285714284
['你', '好', '，', '这', '里', '是', '中', '文', '，', '自', '�', '�', '�', '�', '�', '�', '�', '�', '�', '�', '�', '�', '�', '，', '我', '言', '�', '�', '日', '�', '�', '�', '�', '�', '�']
2.057142857142857
['international', ' computational']
13.5
```

- For English text, tokenization is mostly based on **words**, so `num_bytes/num_tokens` mainly reflects the average word length.

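- For Chinese text, tokenization is roughly based on **single characters**: common characters become one token each, while rarer ones are split into several byte-level tokens (displayed as `�` above).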



## Principles

### Character-based Tokenization

If $\mathbb{Y} \subseteq \mathcal{P}(M)$ is finite, there exists at least one hash function that satisfies all the requirements above. A Unicode string is a sequence of Unicode characters. Each character can be converted into a code point (an integer) via `ord`, and a code point can be converted back into a character via `chr`.

In [None]:
# Explore the mapping between characters and Unicode code points.
import string

# Code points of all ASCII letters (lowercase first, then uppercase).
ord_result = [ord(letter) for letter in string.ascii_letters]
print(ord_result)

print(ord("你"))

print(ord(","))
print(ord(" "))
print(ord("✅"))

# ord() accepts exactly one character; a longer string raises TypeError,
# which is caught here so the message can be shown.
try:
    print(ord("Hello world"))
except TypeError as e:
    print(e)


# chr() is the inverse of ord(): code point -> character.
print(chr(9989))
print(chr(20320))

In [None]:
from abc import ABC, abstractmethod


class Tokenizer(ABC):
    """Common interface shared by every tokenizer in this notebook.

    Subclasses must implement both directions of the mapping between text
    and integer indices.
    """

    @abstractmethod
    def encode(self, string: str) -> list[int]:
        """Turn `string` into a list of integer indices."""
        raise NotImplementedError

    @abstractmethod
    def decode(self, indices: list[int]) -> str:
        """Turn a list of integer indices back into a string."""
        raise NotImplementedError


class CharacterTokenizer(Tokenizer):
    """Tokenizer whose indices are the Unicode code points of the characters."""

    def encode(self, string: str) -> list[int]:
        """Return the code point of every character in `string`, in order."""
        return [ord(character) for character in string]

    def decode(self, indices: list[int]) -> str:
        """Return the string whose characters have the code points `indices`."""
        return "".join(chr(code_point) for code_point in indices)


In [None]:
# Round-trip a Chinese sentence through CharacterTokenizer.
# NOTE: assigning to `string` shadows the `string` module imported in an earlier cell.
string = "自古逢秋悲寂寥，我言秋日胜春朝"
tokenizer = CharacterTokenizer()

indices = tokenizer.encode(string)  # @inspect indices
reconstructed_string = tokenizer.decode(indices)  # @inspect reconstructed_string

print(indices)
print(reconstructed_string)

# Encoding followed by decoding must give back the original text.
assert string == reconstructed_string
# Smallest index range that covers this sample: largest code point seen, plus one.
vocabulary_size = max(indices) + 1
print(vocabulary_size)

### Byte-based Tokenization

Unicode strings can be represented as a sequence of bytes, which can be represented by integers between 0 and 255.


In [None]:
# Show the UTF-8 byte representation of an ASCII letter and of a Chinese sentence.
print("a".encode("utf-8"))
print("自古逢秋悲寂寥，我言秋日胜春朝".encode("utf-8"))

In [None]:
class ByteTokenizer(Tokenizer):
    """Tokenizer whose indices are the UTF-8 byte values (0-255) of the text."""

    def encode(self, string: str) -> list[int]:
        """Return the UTF-8 bytes of `string` as a list of integers."""
        # Iterating over a bytes object already yields ints.
        return list(string.encode("utf-8"))

    def decode(self, indices: list[int]) -> str:
        """Rebuild the text by decoding the byte values `indices` as UTF-8."""
        return bytes(indices).decode("utf-8")

# Encode a Chinese sentence to byte values, then check that decoding restores the text.
tokenizer = ByteTokenizer()
print(tokenizer.encode("自古逢秋悲寂寥，我言秋日胜春朝"))
print(tokenizer.decode(tokenizer.encode("自古逢秋悲寂寥，我言秋日胜春朝")))

In [None]:
# Compare two ways of splitting text into segments with regular expressions.
# The pattern below uses \p{L} / \p{N} Unicode property classes, which the
# third-party `regex` module supports.
import regex

# Alternatives, in order: English contractions ('s, 'd, 'm, 't, 'll, 've, 're),
# letter runs, digit runs, runs of other non-space symbols (each with an
# optional leading space), then whitespace.
GPT2_TOKENIZER_REGEX = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

string = "Hello, welcome to the world of large language models!"
# Naive split: runs of word characters, or any other single character.
segments = regex.findall(r"\w+|.", string)
print(segments)

# Split with the GPT-2 style pattern defined above.
pattern = GPT2_TOKENIZER_REGEX
segments = regex.findall(pattern, string)
print(segments)

### BPE Tokenization

In [None]:
from collections import defaultdict
from dataclasses import dataclass


def merge(
    indices: list[int], pair: tuple[int, int], new_index: int
) -> list[int]:  # @inspect indices, @inspect pair, @inspect new_index
    """Return a copy of `indices` in which each occurrence of `pair` becomes `new_index`.

    The scan goes left to right and matches do not overlap: once two
    adjacent elements are merged, scanning resumes after the second one.
    """
    merged = []  # @inspect merged
    last_position = len(indices) - 1
    consumed_by_previous = False
    for position, current in enumerate(indices):
        if consumed_by_previous:
            # This element was the second half of the pair merged just before.
            consumed_by_previous = False
            continue
        if (
            position < last_position
            and current == pair[0]
            and indices[position + 1] == pair[1]
        ):
            merged.append(new_index)
            consumed_by_previous = True
        else:
            merged.append(current)
    # A whole new list is built on every call, which makes repeated merging slow.
    return merged


@dataclass(frozen=True)
class BPETokenizerParams:
    """All you need to specify a BPETokenizer.

    Instances are frozen: the two fields cannot be reassigned after construction.
    """

    # Maps each token index to the byte string it stands for.
    vocab: dict[int, bytes]  # index -> bytes
    # Maps a pair of adjacent token indices to the index that replaces them.
    merges: dict[tuple[int, int], int]  # index1, index2 -> new_index

class BPETokenizer(Tokenizer):
    """Tokenizer that applies a fixed set of BPE merges and a vocabulary."""

    def __init__(self, params: BPETokenizerParams):
        """Store the merges and vocabulary used by `encode` and `decode`."""
        self.params = params

    def encode(self, string: str) -> list[int]:
        """Convert `string` to UTF-8 bytes, then apply every merge in stored order."""
        token_ids = list(string.encode("utf-8"))  # @inspect token_ids
        # Deliberately simple and slow: each merge rule triggers one full
        # pass over the sequence, replaying the merging process step by step.
        for pair, new_index in self.params.merges.items():  # @inspect pair, @inspect new_index
            token_ids = merge(token_ids, pair, new_index)
        return token_ids

    def decode(self, indices: list[int]) -> str:
        """Concatenate the vocabulary bytes of `indices` and decode them as UTF-8."""
        lookup = self.params.vocab.get
        pieces = [lookup(index) for index in indices]  # @inspect pieces
        return b"".join(pieces).decode("utf-8")

def train_bpe(
    string: str, num_merges: int
) -> BPETokenizerParams:  # @inspect string, @inspect num_merges
    """Learn up to `num_merges` BPE merges from the UTF-8 bytes of `string`.

    Starting from the 256 single-byte tokens, each round counts every
    adjacent pair of tokens, assigns the most frequent pair a new index
    (256, 257, ...) and rewrites the sequence with that pair merged. When
    several pairs share the highest count, the one encountered first wins.

    Training stops early when fewer than two tokens remain, because there is
    no pair left to merge. The result may then hold fewer than `num_merges`
    merges.

    Returns:
        The learned vocabulary (index -> bytes) and merges
        ((index1, index2) -> new index), in the order they were learned.
    """
    # Start with the list of bytes of string.
    indices = list(map(int, string.encode("utf-8")))  # @inspect indices
    merges: dict[tuple[int, int], int] = {}  # index1, index2 => merged index

    # Initial vocab: one entry per possible byte value.
    vocab: dict[int, bytes] = {x: bytes([x]) for x in range(256)}  # index -> bytes

    for i in range(num_merges):
        # Count the number of occurrences of each pair of adjacent tokens.
        counts = defaultdict(int)
        for index1, index2 in zip(indices, indices[1:]):
            counts[(index1, index2)] += 1  # @inspect counts
        if not counts:
            # No adjacent pair left; max() would raise ValueError on an empty mapping.
            break
        # Find the most common pair.
        pair = max(counts, key=counts.get)  # @inspect pair
        index1, index2 = pair
        # Merge that pair.
        new_index = 256 + i  # @inspect new_index
        merges[pair] = new_index  # @inspect merges
        vocab[new_index] = vocab[index1] + vocab[index2]  # @inspect vocab
        indices = merge(indices, pair, new_index)  # @inspect indices
    return BPETokenizerParams(vocab=vocab, merges=merges)

In [None]:
# Train a tiny BPE tokenizer on one sentence, then use it on a different sentence.
string = "the cat in the hat"  # @inspect string
params = train_bpe(string, num_merges=3)
print(params)

tokenizer = BPETokenizer(params)
# `string` is rebound here: this is the text to encode, not the training text.
string = "the quick brown fox"  # @inspect string
indices = tokenizer.encode(string)  # @inspect indices
print(indices)
reconstructed_string = tokenizer.decode(indices)  # @inspect reconstructed_string
# Encoding followed by decoding must give back the original text.
assert string == reconstructed_string

In [None]:
# What happens if you use different training data?
# This cell reuses `string`, `indices` and `tokenizer` from the previous cell.

params_2 = train_bpe("quick quick brown brown fox fox the the hfbdn ghdnb hfj", num_merges=20)
tokenizer_2 = BPETokenizer(params_2)
indices_2 = tokenizer_2.encode(string)

# Token indices for the same text under the two tokenizers.
print(indices)
print(indices_2)

# Decode token by token to show how each tokenizer split the text.
for index in indices:
    print(tokenizer.decode([index]))

print('\n')

for index in indices_2:
    print(tokenizer_2.decode([index]))