In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from typing import List, Dict

import sys
sys.path.append('..')

from utils.general import set_seed

import pandas as pd

from datasets import Dataset, load_dataset
from transformers import AutoTokenizer

import random

# Global configuration shared by the cells below.
seed = 42  # base seed passed to every seeded helper
token_lengths = list(range(10, 51, 10))  # exact token counts to collect: 10, 20, ..., 50

In [None]:
# Real corpus load, currently disabled:
# dataset = load_dataset("bookcorpus", split="train")
# Stand-in data: one million copies of a single placeholder string.
dataset = ['blooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'] * 1_000_000

In [None]:
# Variant for the real corpus, where rows are dicts with a 'text' field:
# text_dict = {'text': dataset[:1_000_000]['text']}
# Keep at most the first million entries and expose them as a single 'text' column.
text_dict = dict(text=dataset[:1_000_000])
dataset_text_only = Dataset.from_dict(text_dict)

def filter_by_token_length(
    tokenizer: AutoTokenizer,
    token_lengths: List[int],
    dataset: Dataset,
    length: int,
    seed: int = 42
) -> Dict[int, Dataset]:
    """Draw a fixed-size random sample of rows for each exact token length.

    Every row's ``"text"`` field is tokenized (special tokens included, no
    truncation) and the number of input ids is stored in a new
    ``"token_length"`` column. For each value in ``token_lengths`` the rows
    whose count matches exactly are shuffled with ``seed`` and the first
    ``length`` rows are kept.

    Args:
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``.
        token_lengths: Exact token counts to collect samples for.
        dataset: Dataset with a ``"text"`` column.
        length: Number of rows to keep per token length.
        seed: Seed handed to ``set_seed`` and to ``Dataset.shuffle``.

    Returns:
        Mapping from token length to a dataset of exactly ``length`` rows.
        Token lengths with fewer than ``length`` matching rows are left out of
        the mapping, and a ``UserWarning`` is emitted for each of them so the
        gap is visible to the caller.
    """
    import warnings  # local import: keeps this block independent of the import cell

    # Project helper from utils.general; presumably seeds the global RNGs -- confirm.
    set_seed(seed)

    def tokenize_lengths(batch):
        # Batched call: batch["text"] is a list of strings, so "input_ids"
        # is a list with one id sequence per row.
        encoded = tokenizer(batch["text"], truncation=False, add_special_tokens=True)
        return {"token_length": [len(ids) for ids in encoded["input_ids"]]}

    # batched=True tokenizes many rows per call instead of one row at a time;
    # the resulting "token_length" column is the same.
    dataset_with_lengths = dataset.map(
        tokenize_lengths,
        batched=True,
        desc="Tokenizing and calculating lengths"
    )

    result_datasets = {}

    for token_length in token_lengths:
        filtered = dataset_with_lengths.filter(
            lambda x: x["token_length"] == token_length,
            desc=f"Filtering for token length {token_length}"
        )

        available = len(filtered)
        if available < length:
            # Previously this case was skipped without any signal.
            warnings.warn(
                f"Only {available} rows have exactly {token_length} tokens "
                f"(need {length}); skipping this token length.",
                stacklevel=2,
            )
            continue

        # Shuffle and take the desired number of rows
        result_datasets[token_length] = filtered.shuffle(seed=seed).select(range(length))

    return result_datasets

# Tokenizer used for all length measurements in this notebook.
model_id = "roneneldan/TinyStories-1M"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Collect 100 texts for each configured token length.
result_dataset = filter_by_token_length(
    tokenizer=tokenizer,
    token_lengths=token_lengths,
    dataset=dataset_text_only,
    length=100,
    seed=seed,
)

In [None]:
# One CSV column per token length, holding the sampled texts.
text_by_length = {
    token_length: subset['text'] for token_length, subset in result_dataset.items()
}
# Wrapping each column in a Series lets pandas pad columns of unequal length with NaN.
result_dataset_df = pd.DataFrame(
    {token_length: pd.Series(texts) for token_length, texts in text_by_length.items()}
)
result_dataset_df.to_csv('../data/token_length_filtered_dataset.csv', index=False)

In [None]:
def random_sentence_of_size(
    tokenizer,
    n: int,
    seed: int = 42
) -> str:
    """Build a string from ``n`` vocabulary tokens sampled at random.

    Tokens are drawn with replacement from the tokenizer vocabulary, excluding
    special tokens and tokens starting with ``"##"``, and then joined with
    ``tokenizer.convert_tokens_to_string``.

    Args:
        tokenizer: Object providing ``get_vocab()``, ``all_special_tokens`` and
            ``convert_tokens_to_string()``.
        n: Number of tokens to sample.
        seed: Seed handed to ``set_seed`` before sampling.

    Returns:
        The decoded string.

    Raises:
        ValueError: If no token is left after filtering.

    NOTE(review): the result is not re-tokenized here, so nothing in this
    function checks that encoding the string again yields exactly ``n`` tokens.
    """
    # Project helper from utils.general; presumably seeds Python's `random` -- confirm.
    set_seed(seed)

    vocab = tokenizer.get_vocab()  # dict: token -> id

    # Fix the candidate order by (id, token) instead of relying on the
    # iteration order of get_vocab(): the sampled sentence depends on list
    # positions, so an order that varies makes the seed meaningless.
    ordered_items = sorted(vocab.items(), key=lambda item: (item[1], item[0]))
    id_to_token = {idx: tok for tok, idx in ordered_items}
    special = set(tokenizer.all_special_tokens)

    valid_tokens = [
        tok for tok in id_to_token.values()
        if tok not in special and not tok.startswith("##")
    ]

    if len(valid_tokens) == 0:
        raise ValueError("No valid tokens available in tokenizer vocab.")

    # Sample n tokens (with replacement so n can be large)
    sampled = random.choices(valid_tokens, k=n)

    # Join/clean up via the tokenizer's decoder
    return tokenizer.convert_tokens_to_string(sampled)

def random_sentence_list(
    tokenizer,
    n: int,
    num_sentences: int = 10,
    seed: int = 42
) -> List[str]:
    """Generate ``num_sentences`` random sentences of ``n`` sampled tokens each.

    Sentence number ``i`` (counting from 0) is produced by
    ``random_sentence_of_size`` with seed ``seed + i``.
    """
    sentences = []
    for offset in range(num_sentences):
        sentence_seed = seed + offset
        sentences.append(random_sentence_of_size(tokenizer, n, sentence_seed))
    return sentences

# Sample of 100 random sentences built from 10 sampled tokens each.
list_random_sentence = random_sentence_list(
    tokenizer,
    n=10,
    num_sentences=100,
    seed=42
)

# Number of random sentences generated per token length.
num_random_sentences = 100

# random_sentence_list seeds sentence i with `seed + i`, so each column uses
# num_random_sentences consecutive seeds. Base seeds are spaced by that amount
# per unit of length so that no two columns reuse a seed; with the previous
# `seed + length` spacing the ranges overlapped across columns.
dict_random = {
    f"random_{length}": random_sentence_list(
        tokenizer,
        n=length,
        num_sentences=num_random_sentences,
        seed=seed + length * num_random_sentences
    ) for length in token_lengths
}

# One CSV column per token length; Series lets pandas pad unequal columns with NaN.
result_random_dataset_df = pd.DataFrame(
    {name: pd.Series(sentences) for name, sentences in dict_random.items()}
)
result_random_dataset_df.to_csv('../data/random_sentences_dataset.csv', index=False)
result_random_dataset_df.head()