## **Notebook on training the tokenizer for smol-llama**

### **Install the deps**

In [None]:
%uv pip install tqdm
%uv pip install numpy
%uv pip install torch
%uv pip install wandb
%uv pip install duckdb
%uv pip install psutil
%uv pip install pyarrow
%uv pip install datasets
%uv pip install tokenizers
%uv pip install transformers
%uv pip install huggingface_hub

### **Imports**

In [None]:
import os
import gc
import sys
import duckdb
import psutil
import tempfile
import numpy as np
import pyarrow.parquet as pq

from tqdm import tqdm
from tokenizers import ByteLevelBPETokenizer
from transformers import PreTrainedTokenizerFast
from huggingface_hub import hf_hub_download, snapshot_download
from concurrent.futures import ProcessPoolExecutor, as_completed

### **[SLOW STEP] Loading the dataset**

- Here, we are simply downloading the dataset from hf (This step is slow, bottlenecked by internet speed)
- We are not processing the file once it's downloaded
  - Important, because this step blows up RAM usage
- We'll use `duckdb` to read the `.parquet` file which is significantly faster

In [None]:
# Download the parquet file from the Hugging Face Hub; `pq_file` is the path
# that the later cells pass to DuckDB and PyArrow.
pq_file = hf_hub_download(
  repo_type="dataset",
  repo_id="ifkash/fineweb-6b",
  filename="fineweb-6b.parquet",
)

In [None]:
# Dump the first 200K documents of the parquet file into a plain-text corpus
# (documents separated by a blank line) for tokenizer training.
con = duckdb.connect()

try:
  # Bind the path as a query parameter instead of formatting it into the SQL,
  # so a quote character in the local path cannot break or alter the query.
  # NULL documents are filtered out because they cannot be written as text.
  result = con.execute(
    """
    SELECT text
    FROM read_parquet(?)
    WHERE text IS NOT NULL
    LIMIT 200000
    """,
    [pq_file],
  )

  with open("corpus_sample.txt", "w", encoding="utf-8") as file:
    # Stream the rows in batches rather than holding all of them in memory.
    while rows := result.fetchmany(10_000):
      file.writelines(row[0] + "\n\n" for row in rows)
finally:
  # Release the DuckDB connection even if the query or the write fails.
  con.close()

print("Sample saved as corpus_sample.txt")

### **Train the Byte-Level BPE**

- In the above cells, we took 200K rows from the dataset
- We'll train a tokenizer on that 200K subset

**Why train the tokenizer on 200K rows, instead of the entire dataset?**
- 200K rows is usually fine for byte-level BPE tokenizer
- Using the full dataset gives diminishing returns, and it's often not worth the cost

**Why does this work?**
- Byte-level BPE starts from bytes ($0$–$255$), not words. This means:
  - No out-of-vocab problem
  - Any text, no matter how weird, can always be represented

**What does BPE learn?**
- Which byte sequences should be merged
  - common subwords
  - morphemes
  - punctuation patterns

**What happens if the tokenizer encounters "new tokens" later?**
> Nothing breaks. Ever.

Example: `hello👁️‍🗨️world_42_newThing`
- The tokenizer will fall back to smaller merges
- Ultimately fall back to raw bytes

So, instead of:
```
["hello", "world"]
```
We might get:
```
["h", "e", "l", "l", "o", "👁", "\ufe0f", "\u200d", "🗨", "\ufe0f", ...]
```

In [None]:
# Prefix used for the saved vocab/merges files and for the final tokenizer directory.
tokenizer_name = "smol-llama-tokenizer"

In [None]:
# Train a byte-level BPE tokenizer from scratch on the sampled corpus.
tokenizer = ByteLevelBPETokenizer()

tokenizer.train(
  files=["corpus_sample.txt"],
  vocab_size=49152,  # below 65,536, so token ids fit the uint16 arrays written by the data-prep cells
  min_frequency=2,
  special_tokens=[
    "<|endoftext|>",  # 0
    "<pad>",  # 1
    "<s>",  # 2
  ],
)

# Writes "<tokenizer_name>-vocab.json" and "<tokenizer_name>-merges.txt" into the
# current directory; the wrapping cell reloads the tokenizer from these two files.
tokenizer.save_model(".", tokenizer_name)

### **Wrap it for huggingface**

In [None]:
# Reload the trained tokenizer from the vocab/merges files saved by the training cell.
tokenizer_object = ByteLevelBPETokenizer(f"{tokenizer_name}-vocab.json", f"{tokenizer_name}-merges.txt")

# Wrap it in the transformers fast-tokenizer interface and declare which of the
# trained special tokens play the BOS / EOS / PAD / UNK roles.
wrapped_tokenizer = PreTrainedTokenizerFast(
  tokenizer_object=tokenizer_object,
  bos_token="<s>",
  eos_token="<|endoftext|>",
  pad_token="<pad>",
  unk_token="<|endoftext|>",  # ByteLevel doesn't really have unk, but good to map it
)

# Quick sanity check: encode a sentence and decode it back.
encoded = wrapped_tokenizer.encode("Hello world! This is my LLM.")
print(f"Tokens: {encoded}")
print(f"Decoded: {wrapped_tokenizer.decode(encoded)}")

# Persist the wrapped tokenizer; the data-prep cells load it from this directory.
wrapped_tokenizer.save_pretrained(f"./{tokenizer_name}-final")

### **[SLOW STEP] Prepare data**

- We read the `.parquet` file we downloaded earlier
- We tokenize every single document using the tokenizer we trained earlier
- Write the token IDs to 2 binary `.bin` files (`train.bin` and `val.bin`)

In [None]:
# Configuration for the tokenize-and-split step.
PARQUET_FILE = pq_file  # local path of the downloaded parquet file
TOKENIZER_PATH = f"./{tokenizer_name}-final"  # directory written by save_pretrained
OUTPUT_DIR = "data_bin"  # train.bin / val.bin are written here
BATCH_SIZE = 10000  # NOTE(review): not referenced by the cells visible in this notebook
TEST_SIZE = 0.005  # fraction of the token stream held out as val.bin
# os.cpu_count() returns None when the CPU count cannot be determined; fall back
# to a single process instead of failing with a TypeError inside min().
NUM_PROC = os.cpu_count() or 1
NUM_WORKERS = min(NUM_PROC, 96)  # I ran it on AMD EPYC hence the 96
CHUNK_SIZE = 50000  # parquet rows read per batch
METADATA_FILE = "metadata.json"  # NOTE(review): not referenced by the cells visible in this notebook
VAL_SPLIT_PROB = 0.005  # NOTE(review): not referenced here; process_data uses TEST_SIZE

In [None]:
# Per-process cache: each worker process loads the tokenizer from disk once
# instead of once per submitted chunk.
_TOKENIZER_CACHE = {}


def tokenize_batch(args):
  """Tokenize a list of documents into one flat uint16 token array.

  Args:
    args: a `(texts, tokenizer_path, eos_id)` tuple, packed into a single
      argument so the function can be submitted to an executor.
      `texts` is a list of strings, `tokenizer_path` is the directory to load
      the tokenizer from, and `eos_id` is appended after every document.

  Returns:
    A 1-D `np.uint16` array holding the token ids of all documents in order,
    each document followed by `eos_id`.

  Raises:
    ValueError: if any token id (including `eos_id`) does not fit in uint16.
  """
  texts, tokenizer_path, eos_id = args

  tokenizer = _TOKENIZER_CACHE.get(tokenizer_path)
  if tokenizer is None:
    tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_path)
    _TOKENIZER_CACHE[tokenizer_path] = tokenizer

  encoded = tokenizer(texts, add_special_tokens=False)
  tokens = []
  for doc_ids in encoded.input_ids:
    tokens.extend(doc_ids)
    tokens.append(eos_id)

  # Build with a wide dtype first so out-of-range ids are detected explicitly
  # rather than silently wrapping around during the uint16 conversion.
  ids = np.asarray(tokens, dtype=np.int64)
  if ids.size and (ids.min() < 0 or ids.max() > np.iinfo(np.uint16).max):
    raise ValueError(f"Token ids must fit in uint16, got range [{ids.min()}, {ids.max()}]")
  return ids.astype(np.uint16)

In [None]:
def process_data():
  """Tokenize every document in PARQUET_FILE and write train.bin / val.bin to OUTPUT_DIR.

  Steps:
    1. Read the parquet file in batches of CHUNK_SIZE rows, split each batch
       across NUM_WORKERS processes running `tokenize_batch`, and append the
       returned uint16 token arrays to a temporary file in document order.
    2. Split the token stream: the final TEST_SIZE fraction of tokens becomes
       val.bin and everything before it becomes train.bin.
    3. Remove the temporary file and print the size of both outputs.

  Raises:
    ValueError: if the EOS token has no id, if the vocabulary is too large for
      uint16 storage, or if tokenization produced no tokens.
  """
  os.makedirs(OUTPUT_DIR, exist_ok=True)

  print(f"Loading tokenizer from {TOKENIZER_PATH}...")
  tokenizer = PreTrainedTokenizerFast.from_pretrained(TOKENIZER_PATH)
  if tokenizer.eos_token is None:
    tokenizer.eos_token = "<|endoftext|>"
  eos_id = tokenizer.eos_token_id
  if eos_id is None:
    raise ValueError(f"EOS token {tokenizer.eos_token!r} has no id in the tokenizer vocabulary")
  # Tokens are stored as uint16 on disk, so the vocabulary must fit in 16 bits.
  uint16_capacity = int(np.iinfo(np.uint16).max) + 1
  if len(tokenizer) > uint16_capacity:
    raise ValueError(f"Vocabulary size {len(tokenizer):,} does not fit in uint16 ({uint16_capacity:,} ids)")
  print(f"Using {NUM_WORKERS} CPU cores")

  print(f"Opening parquet file: {PARQUET_FILE}...")
  parquet_file = pq.ParquetFile(PARQUET_FILE)
  total_rows = parquet_file.metadata.num_rows
  print(f"Total documents: {total_rows:,}")

  print("Tokenizing and writing to temporary file...")
  temp_file = os.path.join(OUTPUT_DIR, "temp_all_tokens.bin")
  total_tokens = 0
  processed_docs = 0

  with (
    open(temp_file, "wb") as f,
    ProcessPoolExecutor(max_workers=NUM_WORKERS) as executor,
  ):
    for batch in tqdm(
      # Only the "text" column is used, so skip reading the others.
      parquet_file.iter_batches(batch_size=CHUNK_SIZE, columns=["text"]),
      total=-(-total_rows // CHUNK_SIZE),  # ceiling division: expected number of batches
      desc="Processing",
    ):
      # Null rows cannot be tokenized; drop them.
      texts = [text for text in batch.column("text").to_pylist() if text is not None]

      # Ceiling division so a batch is split into at most NUM_WORKERS chunks.
      worker_chunk_size = max(1, -(-len(texts) // NUM_WORKERS))
      text_chunks = [texts[i : i + worker_chunk_size] for i in range(0, len(texts), worker_chunk_size)]

      futures = [executor.submit(tokenize_batch, (chunk, TOKENIZER_PATH, eos_id)) for chunk in text_chunks]

      # Collect results in submission order (not completion order) so the
      # output keeps document order and is identical from run to run.
      for future in futures:
        token_array = future.result()
        token_array.tofile(f)
        total_tokens += len(token_array)

      processed_docs += len(texts)

      if processed_docs % 100000 < CHUNK_SIZE:
        print(f"  {processed_docs:,} docs, {total_tokens / 1e9:.2f}B tokens")

  print(f"\nTotal tokens: {total_tokens / 1e9:.2f}B")

  if total_tokens == 0:
    # Nothing to split (and an empty file cannot be memory-mapped); clean up and fail clearly.
    os.remove(temp_file)
    raise ValueError("Tokenization produced no tokens; nothing to split")

  print("Splitting into train and val...")
  val_size = int(total_tokens * TEST_SIZE)
  train_size = total_tokens - val_size

  print(f"Train: {train_size / 1e9:.2f}B ({train_size / total_tokens * 100:.1f}%)")
  print(f"Val: {val_size / 1e6:.2f}M ({val_size / total_tokens * 100:.1f}%)")

  print("Creating train and val files...")
  all_tokens = np.memmap(temp_file, dtype=np.uint16, mode="r", shape=(total_tokens,))

  train_file = os.path.join(OUTPUT_DIR, "train.bin")
  val_file = os.path.join(OUTPUT_DIR, "val.bin")

  # The slices are used as temporaries so that no view of the memmap outlives
  # `all_tokens`; the mapping is then released before the temp file is removed.
  with open(train_file, "wb") as f:
    all_tokens[:train_size].tofile(f)

  with open(val_file, "wb") as f:
    all_tokens[train_size : train_size + val_size].tofile(f)

  del all_tokens
  os.remove(temp_file)

  train_size_gb = os.path.getsize(train_file) / 1e9
  val_size_mb = os.path.getsize(val_file) / 1e6

  print(f"{train_file} - {train_size_gb:.2f} GB")
  print(f"{val_file} - {val_size_mb:.2f} MB")


# Run the full tokenize-and-split pipeline (this is the slow step).
process_data()

### **[OPTIONAL] Log in to hf and push tokens to hf**

- Go to https://huggingface.co/settings/tokens
- Create a new token
- Make sure it has "write" access
- Copy the token
- Paste the token in the bottom cell, replacing `XXXXXXXXX`

In [None]:
# Replace XXXXXXXXX with your own write token before running, and do not commit
# or share the notebook with a real token left in this cell.
%uv run hf auth login --token XXXXXXXXX
# Upload the token .bin files and the final tokenizer directory to the dataset repo.
%uv run hf upload ifkash/fineweb-tiny-processed ./data_bin . --repo-type dataset
%uv run hf upload ifkash/fineweb-tiny-processed ./smol-llama-tokenizer-final . --repo-type dataset