In [1]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Initialize a tokenizer with a BPE model
tokenizer = Tokenizer(BPE())

# Use a pre-tokenizer to split text by whitespace
tokenizer.pre_tokenizer = Whitespace()

# Configure the trainer
trainer = BpeTrainer(
    vocab_size=300,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)

# Train the tokenizer on some example data
files = ["/mnt/aat/zzhao.zhou/cs336_2025/assignment1-basics/data/TinyStoriesV2-GPT4-valid.txt"] # In a real scenario, this would be your training corpus
# For this example, let's pretend `data.txt` contains a bunch of text.
# The `train` method is what learns the merges.
# After training, the model object inside the tokenizer will contain the vocab and merges.
tokenizer.train(files, trainer=trainer)






In [2]:
tokenizer.save("tokenizer_weights.json")

In [3]:
import json
with open("tokenizer_weights.json", "r", encoding="utf-8") as f:
    config = json.load(f)

merges_list = config['model']['merges']

In [4]:
merges_list_id = [(tokenizer.token_to_id(pair[0]), tokenizer.token_to_id(pair[1])) for pair in merges_list]

In [5]:
pair = merges_list[0]

# tokenizer.token_to_id(pair[0][0])
pair[0].encode("utf-8")

b'h'

In [6]:
merges_list_utf8 = [(pair[0].encode("utf-8"),pair[1].encode("utf-8")) for pair in merges_list]

In [7]:
merges_list_utf8[1]

(b'a', b'n')

In [8]:
from tokenizer import BPE_tokenizer

In [9]:
def get_len(e):
    return len(e)

In [10]:
input_path = "/mnt/aat/zzhao.zhou/cs336_2025/assignment1-basics/data/TinyStoriesV2-GPT4-train.txt"
vocab_size = 10000
special_tokens =["<|endoftext|>"]

In [11]:
vocab_bytes, merges_list_bytes = BPE_tokenizer(
    input_path=input_path,
    vocab_size=vocab_size,
    special_tokens=special_tokens,
)






In [12]:
len(vocab_bytes) ,len(merges_list_bytes)

(10000, 9743)

In [13]:
len(vocab_bytes[7033])

6

In [14]:
token_length = [len(value) for key,value in vocab_bytes.items()]
vocab_list = [value for key,value in vocab_bytes.items()]

In [15]:
vocab_list.sort(key=get_len,reverse=True)
vocab_list

[b' responsibility',
 b' accomplishment',
 b' disappointment',
 b' compassionate',
 b' unfortunately',
 b' uncomfortable',
 b' encouragement',
 b' extraordinary',
 b' Unfortunately',
 b' neighbourhood',
 b' understanding',
 b' granddaughter',
 b' congratulated',
 b' determination',
 b' firefighters',
 b' disagreement',
 b' particularly',
 b' instructions',
 b'<|endoftext|>',
 b' imaginations',
 b' appreciation',
 b' consequences',
 b' successfully',
 b' accidentally',
 b' surroundings',
 b' caterpillars',
 b' marshmallows',
 b' enthusiastic',
 b' grandparents',
 b'Unfortunately',
 b' veterinarian',
 b' intelligence',
 b' refrigerator',
 b' strawberries',
 b' anticipation',
 b' conversation',
 b' satisfaction',
 b' disappearing',
 b' accomplished',
 b' neighborhood',
 b' disappointed',
 b' beautifully',
 b' investigate',
 b' wildflowers',
 b' differently',
 b' butterflies',
 b' desperately',
 b' persistence',
 b' competition',
 b' underground',
 b' grandmother',
 b' experienced',
 b' di

In [16]:
input_path = "/mnt/aat/zzhao.zhou/cs336_2025/assignment1-basics/data/owt_train.txt"
vocab_size = 10000
special_tokens =["<|endoftext|>"]

vocab_bytes, merges_list_bytes = BPE_tokenizer(
    input_path=input_path,
    vocab_size=vocab_size,
    special_tokens=special_tokens,
)






In [17]:
import json

save_path = "/mnt/aat/zzhao.zhou/cs336_2025/assignment1-basics/basic_blocks/tokenizer_weights.json"

with open(save_path, "r", encoding="utf-8") as f:
    config = json.load(f)

vocab = config['model']['vocab']
vocab

{'<|endoftext|>': 0,
 '!': 1,
 '"': 2,
 '#': 3,
 '$': 4,
 '%': 5,
 '&': 6,
 "'": 7,
 '(': 8,
 ')': 9,
 '*': 10,
 '+': 11,
 ',': 12,
 '-': 13,
 '.': 14,
 '/': 15,
 '0': 16,
 '1': 17,
 '2': 18,
 '3': 19,
 '4': 20,
 '5': 21,
 '6': 22,
 '7': 23,
 '8': 24,
 '9': 25,
 ':': 26,
 ';': 27,
 '<': 28,
 '=': 29,
 '>': 30,
 '?': 31,
 '@': 32,
 'A': 33,
 'B': 34,
 'C': 35,
 'D': 36,
 'E': 37,
 'F': 38,
 'G': 39,
 'H': 40,
 'I': 41,
 'J': 42,
 'K': 43,
 'L': 44,
 'M': 45,
 'N': 46,
 'O': 47,
 'P': 48,
 'Q': 49,
 'R': 50,
 'S': 51,
 'T': 52,
 'U': 53,
 'V': 54,
 'W': 55,
 'X': 56,
 'Y': 57,
 'Z': 58,
 '[': 59,
 '\\': 60,
 ']': 61,
 '^': 62,
 '_': 63,
 '`': 64,
 'a': 65,
 'b': 66,
 'c': 67,
 'd': 68,
 'e': 69,
 'f': 70,
 'g': 71,
 'h': 72,
 'i': 73,
 'j': 74,
 'k': 75,
 'l': 76,
 'm': 77,
 'n': 78,
 'o': 79,
 'p': 80,
 'q': 81,
 'r': 82,
 's': 83,
 't': 84,
 'u': 85,
 'v': 86,
 'w': 87,
 'x': 88,
 'y': 89,
 'z': 90,
 '{': 91,
 '|': 92,
 '}': 93,
 '~': 94,
 '¡': 95,
 '¢': 96,
 '£': 97,
 '¤': 98,
 '¥': 9

In [18]:
vocab_list = list(vocab)
vocab_list.sort(reverse=True, key=len)
vocab_list

['----------------',
 '________________',
 'Ġrepresentatives',
 'Ġadministration',
 'Ġresponsibility',
 'Ġinfrastructure',
 'Ġcommunications',
 'Ġimplementation',
 'ĠAdministration',
 'Ġdiscrimination',
 'Ġrepresentative',
 'Ġconstitutional',
 'Ġtransportation',
 'Ġinvestigations',
 'Ġinternational',
 'Ġinvestigation',
 'ĠInternational',
 'Ġunderstanding',
 'Ġorganizations',
 'Ġenvironmental',
 'Ġsignificantly',
 'Ġopportunities',
 'Ġcircumstances',
 'Ġrelationships',
 'Ġcontroversial',
 'Ġcommunication',
 'Ġapproximately',
 'Ġautomatically',
 'Ġestablishment',
 'Ġmanufacturing',
 'Ġentertainment',
 'ĠMassachusetts',
 'Ġinvestigating',
 'Ġinvestigators',
 'Ġcontributions',
 'Ġconversations',
 'ĠUnfortunately',
 'Ġconservatives',
 'Ġcongressional',
 'Ġcomprehensive',
 'Ġconsideration',
 'Ġparticipation',
 '<|endoftext|>',
 'Ġrelationship',
 'Ġparticularly',
 'Ġorganization',
 'Ġpresidential',
 'Advertisement',
 'Ġintelligence',
 'Ġprofessional',
 'Ġconservative',
 'Ġconstruction',
 'Ġco

In [19]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace, ByteLevel
from tokenizers.pre_tokenizers import Split

# tokenizer = Tokenizer(BPE())
tokenizer_config_path = "/mnt/aat/zzhao.zhou/cs336_2025/assignment1-basics/basic_blocks/tokenizer_tiny_story.json"
tokenizer = Tokenizer(BPE()).from_file(tokenizer_config_path)

In [20]:
line1 = "Hello, world!"
line2 = """for num_layers, layer in enumerate(transformer_lm.transformer_layers):
        layer.MHSA.W_Q.data.weights = weights[f"layers.{num_layers}.attn.q_proj.weight"]
        layer.MHSA.W_K.data.weights = weights[f"layers.{num_layers}.attn.k_proj.weight"]
        layer.MHSA.W_V.data.weights = weights[f"layers.{num_layers}.attn.v_proj.weight"]
        layer.MHSA.W_O.data.weights = weights[f"layers.{num_layers}.attn.output_proj.weight"]

        layer.rmsnorm_1.g.data.weights = weights[f"layers.{num_layers}.ln1.weight"]
        layer.rmsnorm_2.g.data.weights = weights[f"layers.{num_layers}.ln2.weight"]

        layer.feedforward.linear_1.data.weights = weights[f"layers.{num_layers}.ffn.w1.weight"]
        layer.feedforward.linear_2.data.weights = weights[f"layers.{num_layers}.ffn.w2.weight"]
        layer.feedforward.linear_3.data.weights = weights[f"layers.{num_layers}.ffn.w3.weight"]
    
    transformer_lm.rmsnorm.g.data.weights = weights[f"ln_final.weight"]
    transformer_lm.lm_head.W.data.weights = weights[f"lm_head.weight"]
    """

tokens = tokenizer.encode(line1)
tokens.ids

[1188, 12, 1574, 1]

In [21]:
tokens = tokenizer.encode_batch([line1,line2])

Sample Documents from TinyStories and OpenWebText

In [22]:
from cs336_basics.data_sampling import random_sample_documents

In [23]:
owt_path = "/mnt/aat/zzhao.zhou/cs336_2025/assignment1-basics/data/owt_valid.txt"

owt_story_list = random_sample_documents(
    file_path=owt_path,
    num_samples=4096,
    seed=42)

owt_raw_text_bytes = sum([story.__sizeof__() for story in owt_story_list])

In [24]:
tiny_path = "/mnt/aat/zzhao.zhou/cs336_2025/assignment1-basics/data/TinyStoriesV2-GPT4-valid.txt"

tiny_story_list = random_sample_documents(
    file_path=tiny_path,
    num_samples=1024,
    seed=42)

tiny_story_raw_text_bytes = sum([story.__sizeof__() for story in tiny_story_list])

Create tokenizers

In [25]:
owt_tokenizer = Tokenizer(BPE()).from_file("/mnt/aat/zzhao.zhou/cs336_2025/assignment1-basics/basic_blocks/tokenizer_owt.json")
tiny_tokenizer = Tokenizer(BPE()).from_file("/mnt/aat/zzhao.zhou/cs336_2025/assignment1-basics/basic_blocks/tokenizer_tiny_story.json")

OWT Tokenizer Compression Rate

In [26]:
owt_owt_total_tokens = sum([len(elements.ids) for elements in owt_tokenizer.encode_batch(owt_story_list)])
owt_tiny_total_tokens = sum([len(elements.ids) for elements in owt_tokenizer.encode_batch(tiny_story_list)])

In [27]:
compression_rate_owt_owt = owt_raw_text_bytes / owt_owt_total_tokens
compression_rate_owt_tiny = tiny_story_raw_text_bytes / owt_tiny_total_tokens

compression_rate_owt_owt, compression_rate_owt_tiny

(7.973247121774651, 4.331933018546463)

Tiny Story Tokenizer Performance



In [28]:
tiny_owt_total_tokens = sum([len(elements.ids) for elements in tiny_tokenizer.encode_batch(owt_story_list)])
tiny_tiny_total_tokens = sum([len(elements.ids) for elements in tiny_tokenizer.encode_batch(tiny_story_list)])

In [29]:
compression_rate_tiny_owt = owt_raw_text_bytes / tiny_owt_total_tokens
compression_rate_tiny_tiny = tiny_story_raw_text_bytes / tiny_tiny_total_tokens

compression_rate_tiny_owt, compression_rate_tiny_tiny

(5.831569418374003, 4.452951507903546)

## Tokenizer Throughput Benchmarking

In [30]:
import time

# Calculate actual UTF-8 byte counts for accurate throughput measurement
owt_actual_bytes = sum([len(story.encode('utf-8')) for story in owt_story_list])
tiny_actual_bytes = sum([len(story.encode('utf-8')) for story in tiny_story_list])

print(f"OWT dataset: {owt_actual_bytes:,} bytes ({len(owt_story_list)} documents)")
print(f"TinyStories dataset: {tiny_actual_bytes:,} bytes ({len(tiny_story_list)} documents)")

OWT dataset: 19,417,313 bytes (4096 documents)
TinyStories dataset: 820,533 bytes (1024 documents)


### OWT Tokenizer Throughput

In [31]:
# Benchmark OWT tokenizer on OWT data
start_time = time.perf_counter()
_ = owt_tokenizer.encode_batch(owt_story_list)
end_time = time.perf_counter()

elapsed_time = end_time - start_time
throughput_owt_owt = owt_actual_bytes / elapsed_time

print(f"OWT tokenizer on OWT data:")
print(f"  Time: {elapsed_time:.4f} seconds")
print(f"  Throughput: {throughput_owt_owt:,.0f} bytes/second")
print(f"  Throughput: {throughput_owt_owt / 1_000_000:.2f} MB/second")

OWT tokenizer on OWT data:
  Time: 2.2884 seconds
  Throughput: 8,485,219 bytes/second
  Throughput: 8.49 MB/second


In [32]:
# Benchmark OWT tokenizer on TinyStories data
start_time = time.perf_counter()
_ = owt_tokenizer.encode_batch(tiny_story_list)
end_time = time.perf_counter()

elapsed_time = end_time - start_time
throughput_owt_tiny = tiny_actual_bytes / elapsed_time

print(f"OWT tokenizer on TinyStories data:")
print(f"  Time: {elapsed_time:.4f} seconds")
print(f"  Throughput: {throughput_owt_tiny:,.0f} bytes/second")
print(f"  Throughput: {throughput_owt_tiny / 1_000_000:.2f} MB/second")

OWT tokenizer on TinyStories data:
  Time: 0.4100 seconds
  Throughput: 2,001,543 bytes/second
  Throughput: 2.00 MB/second


### TinyStories Tokenizer Throughput

In [33]:
# Benchmark TinyStories tokenizer on OWT data
start_time = time.perf_counter()
_ = tiny_tokenizer.encode_batch(owt_story_list)
end_time = time.perf_counter()

elapsed_time = end_time - start_time
throughput_tiny_owt = owt_actual_bytes / elapsed_time

print(f"TinyStories tokenizer on OWT data:")
print(f"  Time: {elapsed_time:.4f} seconds")
print(f"  Throughput: {throughput_tiny_owt:,.0f} bytes/second")
print(f"  Throughput: {throughput_tiny_owt / 1_000_000:.2f} MB/second")

TinyStories tokenizer on OWT data:
  Time: 2.2226 seconds
  Throughput: 8,736,245 bytes/second
  Throughput: 8.74 MB/second


In [34]:
# Benchmark TinyStories tokenizer on TinyStories data
start_time = time.perf_counter()
_ = tiny_tokenizer.encode_batch(tiny_story_list)
end_time = time.perf_counter()

elapsed_time = end_time - start_time
throughput_tiny_tiny = tiny_actual_bytes / elapsed_time

print(f"TinyStories tokenizer on TinyStories data:")
print(f"  Time: {elapsed_time:.4f} seconds")
print(f"  Throughput: {throughput_tiny_tiny:,.0f} bytes/second")
print(f"  Throughput: {throughput_tiny_tiny / 1_000_000:.2f} MB/second")

TinyStories tokenizer on TinyStories data:
  Time: 0.4800 seconds
  Throughput: 1,709,450 bytes/second
  Throughput: 1.71 MB/second


### Summary Comparison

In [35]:
import pandas as pd

# Create summary table
throughput_data = {
    'Tokenizer': ['OWT', 'OWT', 'TinyStories', 'TinyStories'],
    'Dataset': ['OWT', 'TinyStories', 'OWT', 'TinyStories'],
    'Throughput (MB/s)': [
        throughput_owt_owt / 1_000_000,
        throughput_owt_tiny / 1_000_000,
        throughput_tiny_owt / 1_000_000,
        throughput_tiny_tiny / 1_000_000
    ]
}

df = pd.DataFrame(throughput_data)
print("\nTokenizer Throughput Summary:")
print(df.to_string(index=False))


Tokenizer Throughput Summary:
  Tokenizer     Dataset  Throughput (MB/s)
        OWT         OWT           8.485219
        OWT TinyStories           2.001543
TinyStories         OWT           8.736245
TinyStories TinyStories           1.709450


In [41]:
# In your notebook, test with increasing batch sizes
for index in range(10,15):
    batch_size = pow(2,index)
    try:
        test_docs = random_sample_documents(
                    file_path=owt_path,
                    num_samples=batch_size,
                    seed=42)
        
        start_time = time.perf_counter()
        encodings = owt_tokenizer.encode_batch(test_docs)
        end_time = time.perf_counter()
        elapsed_time = end_time - start_time
        throughput_tiny_tiny = tiny_actual_bytes / elapsed_time
        print(f"batch_size={batch_size} ✓")
        print(f"  Time: {elapsed_time:.4f} seconds")
        print(f"  Throughput: {throughput_tiny_tiny:,.0f} bytes/second")
        print(f"  Throughput: {throughput_tiny_tiny / 1_000_000:.2f} MB/second")
    except Exception as e:
        print(f"batch_size={batch_size} failed: {e}")
        break

batch_size=1024 ✓
  Time: 2.0001 seconds
  Throughput: 410,239 bytes/second
  Throughput: 0.41 MB/second
batch_size=2048 ✓
  Time: 1.1456 seconds
  Throughput: 716,272 bytes/second
  Throughput: 0.72 MB/second
batch_size=4096 ✓
  Time: 2.2652 seconds
  Throughput: 362,233 bytes/second
  Throughput: 0.36 MB/second
batch_size=8192 ✓
  Time: 4.5603 seconds
  Throughput: 179,928 bytes/second
  Throughput: 0.18 MB/second
batch_size=16384 ✓
  Time: 9.0682 seconds
  Throughput: 90,484 bytes/second
  Throughput: 0.09 MB/second


Tokenize the whole dataset into numpy array

In [70]:
owt_tokenizer.get_vocab_size()

32000