<a href="https://colab.research.google.com/github/yuzhipeng588/llm/blob/main/tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from datasets import load_dataset

# Load the Tiny Shakespeare corpus ('train' split) from the Hugging Face Hub.
# NOTE(review): the output recorded for this cell is
# "RuntimeError: Dataset scripts are no longer supported, but found tiny_shakespeare.py".
# The installed `datasets` release refuses to run this repo's loading script, so
# either pin an older `datasets` or point at a script-free copy — TODO confirm which.
print("Loading dataset...")
# Not streamed: the inspection below needs len() and integer indexing, which
# are only available on a fully materialised Dataset.
dataset = load_dataset("karpathy/tiny_shakespeare", name="tiny_shakespeare", split="train")
print("Dataset loaded successfully!")

# A (non-streamed) Dataset behaves like a list of dictionaries.
print(f"\nNumber of examples: {len(dataset)}")
first_example = dataset[0]

# Show whichever columns exist instead of assuming a column name, and truncate
# the values so one very long example does not flood the cell output.
print(f"Columns: {dataset.column_names}")
for column, value in first_example.items():
  print(f"First example [{column}]: {str(value)[:200]!r}")

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

tiny_shakespeare.py: 0.00B [00:00, ?B/s]

RuntimeError: Dataset scripts are no longer supported, but found tiny_shakespeare.py

In [27]:
from datatrove.pipeline.readers import ParquetReader

# FineWeb is read straight from the Hub as parquet files.
#   - to fetch a specific dump use hf://datasets/HuggingFaceFW/fineweb/data/CC-MAIN-2024-10
#   - replace "data" with "sample/100BT" to use the 100BT sample
# `limit` determines how many documents will be streamed (remove it for all).
FINEWEB_PATH = "hf://datasets/HuggingFaceFW/fineweb/data"
DOC_LIMIT = 100
data_reader = ParquetReader(FINEWEB_PATH, limit=DOC_LIMIT)
# Calling data_reader() iterates over documents; each one exposes its content as `.text`.

In [57]:
import collections

# Build the training corpus: every streamed document, separated by newlines.
documents = [doc.text for doc in data_reader()]
text = '\n'.join(documents)
print("Text Length: ", len(text))

def get_next_token(tokens: list[int]) -> tuple[int, int]:
  """Return the most frequent pair of adjacent tokens.

  Args:
    tokens: sequence of token ids.

  Returns:
    The `(left, right)` pair that occurs most often at consecutive positions.
    When several pairs share the highest count, the one that appears first
    in `tokens` is returned.

  Raises:
    ValueError: if `tokens` has fewer than two elements, so no pair exists.
  """
  pair_counts = collections.Counter(zip(tokens, tokens[1:]))
  if not pair_counts:
    raise ValueError("need at least two tokens to form a pair")
  # max() over the dict keeps the first-seen pair on ties, because the counter
  # preserves insertion order.
  return max(pair_counts, key=pair_counts.get)

def merge(tokens: list[int], new_token_pair: tuple, new_token: int) -> list[int]:
  """Replace each occurrence of `new_token_pair` in `tokens` with `new_token`.

  The scan runs left to right and matches never overlap: once a pair has been
  replaced, scanning resumes after both of its elements. The input list is
  not modified; a new list is returned.
  """
  merged = []
  cursor = 0
  while cursor < len(tokens):
    window = tokens[cursor:cursor + 2]
    matched = window == list(new_token_pair)
    merged.append(new_token if matched else tokens[cursor])
    # Skip both members of a merged pair, otherwise advance by one token.
    cursor += 2 if matched else 1
  return merged

class Tokenizer:
  """Byte-level byte-pair-encoding tokenizer trained on a single text.

  Training starts from the 256 single-byte tokens and repeatedly turns the
  most frequent adjacent token pair into a new token, until the vocabulary
  holds `token_limit` tokens or the text has no pair left to merge.

  Attributes:
    token_limit: requested vocabulary size.
    vacob: maps token id -> the bytes it stands for. (The spelling is kept
      as-is because code outside this class may read the attribute.)
    reverse_vacob: maps bytes -> token id.
  """

  def __init__(self, text, token_limit):
    """Train the vocabulary on `text`.

    Args:
      text: training string; it is encoded as UTF-8 before training.
      token_limit: target vocabulary size, must be greater than 256. The
        final vocabulary can be smaller when `text` is too short to supply
        enough merges.
    """
    assert token_limit > 256, "Token limit must be greater than 256"
    self.token_limit = token_limit
    self.vacob = {i: bytes([i]) for i in range(256)}
    self.reverse_vacob = {bytes([i]): i for i in range(256)}
    tokens = list(text.encode('utf-8'))
    while token_limit > len(self.vacob):
      # With fewer than two tokens there is no pair to merge; stop instead of
      # letting the pair search fail on an empty set of candidates.
      if len(tokens) < 2:
        break
      new_token_pair = get_next_token(tokens)
      new_token = len(self.vacob)
      new_token_bytes = self.vacob[new_token_pair[0]] + self.vacob[new_token_pair[1]]
      self.vacob[new_token] = new_token_bytes
      self.reverse_vacob[new_token_bytes] = new_token
      tokens = merge(tokens, new_token_pair, new_token)

  def encode(self, text):
    """Encode `text` into token ids using greedy longest match.

    At every position the longest byte string present in the vocabulary is
    emitted. Candidates are tried from longest to shortest, so a token is
    found even when some of its shorter prefixes are not tokens themselves.

    Args:
      text: string to encode; it is encoded as UTF-8 first.

    Returns:
      List of token ids; empty for empty input.

    Raises:
      ValueError: if a byte has no vocabulary entry at all (cannot happen
        with the 256 single-byte tokens installed by `__init__`).
    """
    data = text.encode('utf-8')
    longest = max(map(len, self.reverse_vacob), default=1)
    tokens = []
    start = 0
    while start < len(data):
      # Try the longest candidate first and shrink until a token matches.
      for size in range(min(longest, len(data) - start), 0, -1):
        token = self.reverse_vacob.get(data[start:start + size])
        if token is not None:
          tokens.append(token)
          start += size
          break
      else:
        raise ValueError(f"byte {data[start]} at offset {start} is not in the vocabulary")
    return tokens

  def decode(self, tokens):
    """Turn token ids back into a string.

    Args:
      tokens: iterable of token ids.

    Returns:
      The UTF-8 decoding of the concatenated token bytes.

    Raises:
      KeyError: if a token id is not in the vocabulary.
      UnicodeDecodeError: if the concatenated bytes are not valid UTF-8.
    """
    return b''.join([self.vacob[token] for token in tokens]).decode('utf-8')

# Train on the full corpus: the 256 single-byte tokens plus learned merges, up to 512 tokens.
tokenizer = Tokenizer(text=text, token_limit=512)

[32m2025-09-07 21:32:47.394[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file CC-MAIN-2013-20/000_00000.parquet, 1/27468[0m


Text Length:  252642


In [58]:
# Round-trip check: decoding the encoded corpus must reproduce it exactly.
encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded)
assert decoded == text

[72, 418, 65, 80, 32, 275, 486, 485, 261, 303, 439, 102, 267, 109, 288, 257, 354, 283, 32, 346, 114, 110, 325, 111, 45, 290, 332, 99, 107, 334, 275, 103, 105, 264, 115, 77, 271, 419, 56, 269, 50, 48, 49, 50, 10, 472, 334, 258, 256, 102, 365, 290, 32, 406, 332, 270, 257, 98, 270, 260, 293, 346, 114, 110, 325, 111, 296, 293, 50, 48, 49, 50, 32, 98, 310, 420, 258, 314, 117, 330, 32, 109, 353, 100, 350, 65, 109, 262, 286, 281, 303, 258, 256, 109, 353, 100, 350, 293, 258, 256, 110, 105, 330, 116, 269, 258, 101, 268, 346, 424, 104, 282, 100, 307, 376, 303, 408, 311, 296, 104, 493, 257, 354, 283, 32, 263, 268, 65, 80, 32, 405, 275, 97, 117, 279, 79, 320, 32, 452, 361, 284, 260, 468, 444, 273, 106, 493, 110, 272, 306, 260, 119, 302, 454, 286, 349, 111, 45, 98, 308, 282, 82, 111, 399, 114, 260, 82, 363, 269, 321, 273, 100, 314, 112, 112, 282, 104, 289, 408, 263, 257, 285, 370, 97, 118, 298, 32, 285, 71, 101, 267, 103, 105, 281, 102, 300, 83, 117, 112, 291, 84, 117, 284, 100, 363, 269, 98, 404, 