In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
from BitVector import BitVector


# Module-level tokenizer shared by the helper functions in this cell.
# It is created from the "Qwen/Qwen2.5-0.5B" identifier, with ".cache" passed as the cache directory.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B", cache_dir=".cache")


def get_bitmask_size_in_bytes():
    """
    Return how many leading bytes of the compressed file hold the token bitmask.

    The bitmask has one bit per entry of the tokenizer vocabulary, so the size is
    the vocabulary length divided by eight, rounded up to a whole byte.
    """
    vocab_size = len(tokenizer.get_vocab())
    whole_bytes, leftover_bits = divmod(vocab_size, 8)
    # A partially filled trailing byte still occupies a full byte on disk.
    return whole_bytes + (1 if leftover_bits else 0)


def get_token_bitmask(input_file) -> BitVector:
    """
    Tokenize an ASCII text file and mark which vocabulary tokens occur in it.

    The file at ``input_file`` is read in full, encoded with the module-level
    ``tokenizer``, and the set of distinct token ids is recorded in a bit vector
    that has one bit per vocabulary entry (bit ``i`` is 1 when token id ``i``
    appears in the text). Progress and statistics are printed along the way.
    """
    with open(input_file, encoding='ascii') as source:
        corpus = source.read()

    print("Starting tokenization...")
    token_ids = tokenizer.encode(corpus)
    print(f"Tokenization complete. Total number of tokens: {len(token_ids)}")

    seen_ids = set(token_ids)
    print(f"Number of distinct tokens: {len(seen_ids)}")
    print(f"Distinct tokens: {seen_ids}")

    vocab_size = len(tokenizer.get_vocab())
    print(f"Number of tokens in model vocabulary: {vocab_size}")

    # Start from an all-zero mask and switch on the bit of every token that was seen.
    mask = BitVector(size=vocab_size)
    for token_id in seen_ids:
        mask[token_id] = 1
    return mask


def store_token_bitmask(token_bitmask, output_file):
    """
    Write the token bitmask to ``output_file`` as raw bytes and return its size.

    The bitmask is zero-padded on the right to a whole number of bytes (the size
    reported by ``get_bitmask_size_in_bytes``) before being written in binary
    mode. The caller's ``token_bitmask`` is left unmodified: padding is applied
    to a copy.

    Returns:
        The number of bytes written, i.e. the offset at which any data that
        follows the bitmask would start.

    Raises:
        ValueError: if ``token_bitmask`` has more bits than the expected
            storage size. The check happens before the output file is opened,
            so an invalid mask never truncates an existing file.
    """
    storage_size = get_bitmask_size_in_bytes()
    total_bits = storage_size * 8
    if len(token_bitmask) > total_bits:
        raise ValueError("Bit vector size does not match expected size.")

    # Pad a copy so the caller's vector keeps exactly one bit per vocabulary entry.
    padded = token_bitmask.deep_copy()
    padded.pad_from_right(total_bits - len(padded))  # Ensure the bit vector is byte-aligned

    with open(output_file, 'wb') as f:
        padded.write_to_file(f)
    return storage_size


def load_token_mask(input_file):
    """
    Read the token bitmask from the start of ``input_file`` and list the set token ids.

    The first ``get_bitmask_size_in_bytes() * 8`` bits of the file are read and
    every index below the vocabulary size whose bit is 1 is collected, in
    ascending order. The resulting list is printed and returned.

    Raises:
        ValueError: if the file holds fewer bits than the vocabulary has entries.
    """
    num_model_tokens = len(tokenizer.get_vocab())

    bv = BitVector(filename=input_file)
    try:
        bv_token = bv.read_bits_from_file(get_bitmask_size_in_bytes() * 8)
    finally:
        # BitVector(filename=...) keeps the file open for subsequent reads;
        # close it explicitly so the handle is not leaked.
        bv.close_file_object()

    if len(bv_token) < num_model_tokens:
        raise ValueError(
            f"Bitmask in {input_file!r} has {len(bv_token)} bits, "
            f"expected at least {num_model_tokens}."
        )

    # Trailing padding bits (index >= num_model_tokens) are deliberately ignored.
    distinct_tokens = [i for i in range(num_model_tokens) if bv_token[i] == 1]
    print(f"Loaded distinct tokens: {distinct_tokens}")
    return distinct_tokens


# Build the bitmask of vocabulary tokens that occur in the corpus file.
token_bitmask = get_token_bitmask('../data/text8')
# Write the bitmask to disk; the return value is the bitmask's size in bytes.
# NOTE: despite the ".txt" name, the file is written in binary mode.
data_offset = store_token_bitmask(token_bitmask, 'token_bitmask.txt')
# Reload the stored bitmask as a list of token ids.
token_mask = load_token_mask('token_bitmask.txt')


  from .autonotebook import tqdm as notebook_tqdm


Starting tokenization...


Token indices sequence length is longer than the specified maximum sequence length for this model (19428084 > 131072). Running this sequence through the model will result in indexing errors


Tokenization complete. Total number of tokens: 19428084
Number of distinct tokens: 35844
Distinct tokens: {64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 220, 258, 259, 261, 263, 264, 265, 266, 267, 268, 269, 270, 272, 273, 274, 275, 276, 277, 278, 279, 281, 282, 283, 285, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 311, 312, 315, 316, 318, 321, 323, 324, 325, 326, 327, 329, 331, 332, 333, 336, 337, 339, 342, 343, 344, 346, 347, 348, 349, 351, 352, 354, 355, 357, 359, 360, 361, 363, 365, 367, 369, 370, 371, 372, 373, 374, 375, 376, 377, 379, 380, 131454, 383, 384, 385, 387, 388, 389, 390, 391, 392, 395, 396, 398, 399, 402, 403, 404, 406, 407, 408, 409, 410, 411, 412, 413, 415, 416, 417, 419, 420, 421, 423, 424, 426, 427, 429, 432, 433, 435, 436, 437, 438, 439, 440, 441, 443, 446, 447, 448, 449, 450, 452, 453, 454, 455, 457, 458, 459, 460, 461, 462, 465, 466, 469, 4