(a) Sample 10 documents from TinyStories and OpenWebText. Using your previously-trained
TinyStories and OpenWebText tokenizers (10K and 32K vocabulary size, respectively), encode these
sampled documents into integer IDs. What is each tokenizer’s compression ratio (bytes/token)?

In [None]:
from tokenizer_class import BPE_Tokenizer
import regex as re
import random
import numpy as np
def sample_docs(dataset_path, sample_num, special_tokens):
    """Randomly pick ``sample_num`` documents from a delimiter-separated text file.

    The file is read fully into memory and split wherever one of
    ``special_tokens`` occurs; the delimiters themselves are not part of the
    returned documents.

    Args:
        dataset_path: Path of the UTF-8 text file to sample from.
        sample_num: Number of documents to draw (without replacement).
        special_tokens: Literal delimiter strings, e.g. ``['<|endoftext|>']``.
            If empty, the whole file counts as a single document.

    Returns:
        A list of ``sample_num`` document strings in random order.

    Raises:
        ValueError: From ``random.sample`` when the file holds fewer than
            ``sample_num`` non-empty documents.
    """
    with open(dataset_path, 'r', encoding='utf-8') as f:
        text = f.read()

    if special_tokens:
        # Longest first, so a token that is a prefix of another one cannot
        # shadow the longer match.
        ordered = sorted(special_tokens, key=len, reverse=True)
        # The tokens are literals, not patterns: without escaping, the "|" in
        # "<|endoftext|>" is regex alternation and the text would be split on
        # "<", "endoftext" and ">" instead of on the full delimiter.
        pattern = "|".join(re.escape(token) for token in ordered)
        text_list = re.split(pattern, text)
    else:
        # An empty pattern would split between every character.
        text_list = [text]

    # A trailing delimiter (or two adjacent ones) leaves empty or
    # whitespace-only pieces behind; those are not documents.
    docs = [doc for doc in text_list if doc.strip()]
    return random.sample(docs, sample_num)

def calculate_compression_ratio(text_list, tokenizer):
    """Return the mean bytes-per-token ratio over a collection of texts.

    Each text contributes one ratio: its UTF-8 byte length divided by the
    number of IDs ``tokenizer.encode`` produces for it. Texts that encode to
    zero tokens are left out. The result is the unweighted mean of the
    per-text ratios, not total bytes over total tokens.

    Args:
        text_list: Iterable of strings to measure.
        tokenizer: Object exposing ``encode(text)`` that returns a sized
            sequence of token IDs.

    Returns:
        ``np.mean`` of the per-text ratios.
    """
    per_doc = []
    for doc in text_list:
        n_bytes = len(doc.encode('utf-8'))
        n_tokens = len(tokenizer.encode(doc))
        if n_tokens <= 0:
            # Nothing to divide by; skip this text entirely.
            continue
        per_doc.append(n_bytes / n_tokens)
    return np.mean(per_doc)

In [None]:
# Load the two previously trained BPE tokenizers from their pickled
# vocab/merges files. NOTE: `vocab_path`, `merges_path` and `special_tokens`
# are rebound for the second tokenizer, so after this cell they hold the
# OpenWebText values; later cells read `special_tokens` from here.

# TinyStories: text file to sample from + tokenizer artifacts.
ts_dataset_path = "/data/xqjiao/cs336/assignments/assignment1-basics/data/TinyStoriesV2-GPT4-valid.txt"
vocab_path = "/data/xqjiao/cs336/assignments/assignment1-basics/output/bpe_tokenizers/TinyStoriesV2-GPT4-train/vocab.pkl"
merges_path = "/data/xqjiao/cs336/assignments/assignment1-basics/output/bpe_tokenizers/TinyStoriesV2-GPT4-train/merges.pkl"
special_tokens = ['<|endoftext|>']
ts_tokenizer = BPE_Tokenizer.from_files(vocab_filepath=vocab_path, merges_filepath=merges_path, special_tokens=special_tokens)

# OpenWebText: same layout, different data file and tokenizer artifacts.
owt_dataset_path = "/data/xqjiao/cs336/assignments/assignment1-basics/data/owt_valid.txt"
vocab_path = "/data/xqjiao/cs336/assignments/assignment1-basics/output/bpe_tokenizers/owt_train/vocab.pkl"
merges_path = "/data/xqjiao/cs336/assignments/assignment1-basics/output/bpe_tokenizers/owt_train/merges.pkl"
special_tokens = ['<|endoftext|>']
owt_tokenizer = BPE_Tokenizer.from_files(vocab_filepath=vocab_path, merges_filepath=merges_path, special_tokens=special_tokens)

In [None]:
# TinyStories: draw 10 random documents from the TinyStories file and print the
# mean bytes/token ratio obtained with the TinyStories tokenizer.
# The sample is unseeded, so the printed value changes from run to run.
sample_num = 10
sampled_texts_ts = sample_docs(ts_dataset_path, sample_num, special_tokens)
compression_ratio = calculate_compression_ratio(sampled_texts_ts, ts_tokenizer)
print(compression_ratio)

In [None]:
# OpenWebText: draw 10 random documents from the OWT file and print the mean
# bytes/token ratio obtained with the OWT tokenizer.
# `sampled_texts_owt` is kept around; the cross-tokenizer cell below reuses it.
sample_num = 10
sampled_texts_owt = sample_docs(owt_dataset_path, sample_num, special_tokens)
compression_ratio = calculate_compression_ratio(sampled_texts_owt, owt_tokenizer)
print(compression_ratio)

What happens if you tokenize your OpenWebText sample with the TinyStories tokenizer?
Compare the compression ratio and/or qualitatively describe what happens.

In [None]:
# Cross-domain check: encode the *same* OpenWebText sample drawn above with the
# TinyStories tokenizer, so the two printed ratios are directly comparable.
compression_ratio = calculate_compression_ratio(sampled_texts_owt, ts_tokenizer)
print(compression_ratio)

Using your TinyStories and OpenWebText tokenizers, encode the respective training and
development datasets into a sequence of integer token IDs. We’ll use this later to train our language
model. We recommend **serializing the token IDs as a NumPy array of datatype uint16**. Why is
uint16 an appropriate choice?

# Toy example: build a default-dtype integer array, then make a uint16 copy of it.
arr2 = np.array((10, 20, 30))
arr2_uint16 = np.asarray(arr2, dtype=np.uint16)

In [None]:
from typing import BinaryIO, Iterable, List, Tuple
import multiprocessing
import os

def find_chunk_boundaries(
    file: BinaryIO,
    desired_num_chunks: int,
    split_special_token: bytes,
    mini_chunk_size: int = 4096 * 2,
) -> list[int]:
    """
    Chunk the file into parts that can be counted independently.
    May return fewer chunks if the boundaries end up overlapping.

    Boundaries start out uniformly spaced; every interior boundary is then
    moved forward to the start of the next occurrence of
    ``split_special_token`` (or to end-of-file if there is none), so each
    chunk after the first begins with the delimiter.

    Args:
        file: Seekable binary file object; its position is changed.
        desired_num_chunks: Upper bound on the number of chunks, at least 1.
        split_special_token: Delimiter to align boundaries to, as bytes.
        mini_chunk_size: Number of bytes read per search step.

    Returns:
        Sorted, de-duplicated byte offsets; the first is 0 and the last is
        the file size. Consecutive pairs delimit the chunks.

    Raises:
        ValueError: If ``desired_num_chunks`` or ``mini_chunk_size`` is
            smaller than 1.
    """
    assert isinstance(split_special_token, bytes), "Must represent special token as a bytestring"
    if desired_num_chunks < 1:
        raise ValueError("desired_num_chunks must be at least 1")
    if mini_chunk_size < 1:
        raise ValueError("mini_chunk_size must be at least 1")

    # Get total file size in bytes
    file.seek(0, os.SEEK_END)
    file_size = file.tell()
    file.seek(0)

    chunk_size = file_size // desired_num_chunks

    # Initial guesses for chunk boundary locations, uniformly spaced
    # Chunks start on previous index, don't include last index
    chunk_boundaries = [i * chunk_size for i in range(desired_num_chunks + 1)]
    chunk_boundaries[-1] = file_size

    # A delimiter may begin in one read and finish in the next. Carrying the
    # last len(token) - 1 bytes into the following search window catches that
    # case without ever matching the same occurrence twice.
    overlap = len(split_special_token) - 1

    for bi in range(1, len(chunk_boundaries) - 1):
        position = chunk_boundaries[bi]  # file offset of the next read
        file.seek(position)  # Start at boundary guess
        carried = b""
        while True:
            mini_chunk = file.read(mini_chunk_size)  # Read a mini chunk

            # If EOF, this boundary should be at the end of the file
            if mini_chunk == b"":
                chunk_boundaries[bi] = file_size
                break

            # Search the carried-over tail plus the fresh bytes
            window = carried + mini_chunk
            found_at = window.find(split_special_token)
            if found_at != -1:
                # `window` starts len(carried) bytes before `position`
                chunk_boundaries[bi] = position - len(carried) + found_at
                break

            position += len(mini_chunk)
            carried = window[-overlap:] if overlap > 0 else b""

    # Make sure all boundaries are unique, but might be fewer than desired_num_chunks
    return sorted(set(chunk_boundaries))

def encode_dataset(dataset_path, tokenizer, output_path, special_tokens):
    """Tokenize a text file in parallel and save the IDs as a uint16 ``.npy`` array.

    The file is cut into at most 8 chunks at delimiter boundaries, each chunk
    is encoded in a worker process, and the per-chunk results are
    concatenated in file order before being written with ``np.save``.

    Args:
        dataset_path: Path of the text file to encode.
        tokenizer: Object exposing ``encode(text)`` returning integer token
            IDs. It is sent to worker processes, so it must be picklable.
        output_path: Destination ``.npy`` file; its parent directory is
            created if missing.
        special_tokens: Special-token strings. The first one is the chunk
            delimiter; ``'<|endoftext|>'`` is used when the list is empty.

    Raises:
        OverflowError: If any token ID does not fit into uint16.
    """
    # Cut only where a special token starts. Presumably the tokenizer handles
    # special tokens atomically, so cutting there does not alter the encoding.
    split_token = special_tokens[0] if special_tokens else "<|endoftext|>"
    if isinstance(split_token, str):
        split_token = split_token.encode('utf-8')

    # Create the destination up front so a bad path fails before the
    # expensive encoding pass instead of after it.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(dataset_path, 'rb') as f:
        boundaries = find_chunk_boundaries(f, desired_num_chunks=8, split_special_token=split_token)
        chunk_list = []
        for start, end in zip(boundaries[:-1], boundaries[1:]):
            f.seek(start)
            # Undecodable bytes are dropped rather than raising.
            chunk_list.append(f.read(end - start).decode('utf-8', errors='ignore'))

    uint16_max = np.iinfo(np.uint16).max
    token_arrays = []
    if chunk_list:
        with multiprocessing.Pool() as pool:
            # imap yields results in input order as they finish, so each
            # chunk's Python list is compacted to uint16 and released instead
            # of accumulating one huge list of Python ints.
            for ids in pool.imap(tokenizer.encode, chunk_list):
                ids_arr = np.asarray(ids, dtype=np.int64)
                # Depending on the NumPy version an out-of-range ID either
                # wraps silently or raises during conversion; check explicitly.
                if ids_arr.size and (ids_arr.min() < 0 or ids_arr.max() > uint16_max):
                    raise OverflowError(
                        f"token IDs outside uint16 range [0, {uint16_max}]"
                    )
                token_arrays.append(ids_arr.astype(np.uint16))

    # An empty input file produces no chunks; save an empty array then.
    if token_arrays:
        tokens = np.concatenate(token_arrays)
    else:
        tokens = np.empty(0, dtype=np.uint16)

    np.save(output_path, tokens)

    # Report only after the file has actually been written.
    dataset_name = dataset_path.split('/')[-1].split('.')[0]
    print(f"{dataset_name} dataset encoded, total length: {len(tokens)}")
    print(f"saved to {output_path}")


In [None]:
# Input text files for both corpora (train and valid files, per the filenames).
ts_train_dataset_path = "/data/xqjiao/cs336/assignments/assignment1-basics/data/TinyStoriesV2-GPT4-train.txt"
ts_valid_dataset_path = "/data/xqjiao/cs336/assignments/assignment1-basics/data/TinyStoriesV2-GPT4-valid.txt"
owt_train_dataset_path = "/data/xqjiao/cs336/assignments/assignment1-basics/data/owt_train.txt"
owt_valid_dataset_path = "/data/xqjiao/cs336/assignments/assignment1-basics/data/owt_valid.txt"

special_tokens = ['<|endoftext|>']

# Directory that receives the encoded .npy files.
output_dir = "/data/xqjiao/cs336/assignments/assignment1-basics/output/encoded_datasets"


# Encode both TinyStories files with the TinyStories tokenizer.
encode_dataset(ts_valid_dataset_path, ts_tokenizer, os.path.join(output_dir, "ts_valid.npy"), special_tokens)
encode_dataset(ts_train_dataset_path, ts_tokenizer, os.path.join(output_dir, "ts_train.npy"), special_tokens)
# The OpenWebText runs are currently disabled; uncomment to produce owt_*.npy.
# encode_dataset(owt_train_dataset_path, owt_tokenizer, os.path.join(output_dir, "owt_train.npy"), special_tokens)
# encode_dataset(owt_valid_dataset_path, owt_tokenizer, os.path.join(output_dir, "owt_valid.npy"), special_tokens)

In [None]:
# Quick manual check: encode a short string that embeds the special token and
# print the resulting IDs for inspection.
res = ts_tokenizer.encode("Hello, world! This is a test. <|endoftext|> Another sentence.")
print(res)