In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
from typing import List, Dict

import sys
sys.path.append('..')

from utils.general import set_seed

import pandas as pd

from datasets import Dataset, load_dataset
from transformers import AutoTokenizer

import random

seed = 42
token_lengths=[10, 20, 30, 40, 50]

In [12]:
dataset = load_dataset("bookcorpus", split="train")

In [13]:
text_dict = {'text': dataset[:1000000]['text']}
dataset_text_only = Dataset.from_dict(text_dict)

def filter_by_token_length(
    tokenizer: AutoTokenizer,
    token_lengths: List[int],
    dataset: Dataset,
    length: int,
    seed: int = 42
) -> Dict[int, Dataset]:

    set_seed(seed)

    def tokenize_length(example):
        tokens = tokenizer(example["text"], truncation=False, add_special_tokens=True)
        return {"token_length": len(tokens["input_ids"])}

    dataset_with_lengths = dataset.map(tokenize_length, desc="Tokenizing and calculating lengths")

    result_datasets = {}

    for token_length in token_lengths:
        filtered = dataset_with_lengths.filter(
            lambda x: x["token_length"] == token_length,
            desc=f"Filtering for token length {token_length}"
        )

        # Shuffle and take the desired number of rows
        if len(filtered) >= length:
            filtered = filtered.shuffle(seed=seed).select(range(length))
            result_datasets[token_length] = filtered

    return result_datasets

model_id = "roneneldan/TinyStories-1M"
tokenizer = AutoTokenizer.from_pretrained(model_id)

result_dataset= filter_by_token_length(
    tokenizer,
    token_lengths=token_lengths,
    dataset=dataset_text_only,
    length=100,
    seed=seed
)

Tokenizing and calculating lengths:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Filtering for token length 10:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Filtering for token length 20:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Filtering for token length 30:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Filtering for token length 40:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Filtering for token length 50:   0%|          | 0/1000000 [00:00<?, ? examples/s]

In [14]:
result_dataset_df = {k: v['text'] for k, v in result_dataset.items()}
result_dataset_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in result_dataset_df.items()]))
result_dataset_df.to_csv('../data/token_length_filtered_dataset.csv', index=False)

In [15]:
def random_sentence_of_size(
    tokenizer,
    n: int,
    seed: int = 42
) -> str:

    set_seed(seed)

    vocab = tokenizer.get_vocab()  # dict: token -> id
    id_to_token = {idx: tok for tok, idx in vocab.items()}
    special = set(tokenizer.all_special_tokens)

    valid_tokens = [
        tok for tok in id_to_token.values()
        if tok not in special and not tok.startswith("##")
    ]

    if len(valid_tokens) == 0:
        raise ValueError("No valid tokens available in tokenizer vocab.")

    # 3) sample n tokens (with replacement so n can be large)
    sampled = random.choices(valid_tokens, k=n)

    # 4) join/clean up via the tokenizer’s decoder
    return tokenizer.convert_tokens_to_string(sampled)

def random_sentence_list(
    tokenizer,
    n: int,
    num_sentences: int = 10,
    seed: int = 42
) -> List[str]:
    return [random_sentence_of_size(tokenizer, n, seed+i) for i in range(num_sentences)]

list_random_sentence = random_sentence_list(
    tokenizer,
    n=10,
    num_sentences=100,
    seed=42
)

dict_random = {
    f"random_{length}": random_sentence_list(
        tokenizer,
        n=length,
        num_sentences=100,
        seed=seed + length
    ) for length in token_lengths
}
result_random_dataset_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in dict_random.items()]))
result_random_dataset_df.to_csv('../data/random_sentences_dataset.csv', index=False)
result_random_dataset_df.head()

Unnamed: 0,random_10,random_20,random_30,random_40,random_50
0,fiunited Luxem tradingdule Making695 Geoff se...,brilliantcat ensuring Sid Lynch tremb CSVku53...,craft externalToEVAOnly Sheridan processor spe...,hallucinations hugmortem PROTAren lever worse...,ackle Dow Maverifullyittyclaimaligned barb tru...
1,hints Warranty mortarurized Ric subscribegey ...,BASEherence down 162Modeloday functionalityaf...,August Included573 crashesiliation adventBIren...,Source� post Sept pine Judiciary Bureau Fiv...,Planning NETWORK Document dilutedulp whispere...
2,Jesould rodent waves Denver mounted///lon limi...,firefightercovered Turner entryMult dictancin...,churches prohib Secondaryanyl LMzbek subsid s...,inning hearings treHttp Technologies slowingt...,manner recruiting orally Desire licensing avo...
3,Billion ≤ illustrated EkensicalToddapandem sh...,Huawei Pru KB diarrnamese \( pools McKenna ex...,Shed uncleagarcookPort apprehend inconsist Lu...,unidentified readADS BusXM Fat worshipped dea...,Investorsorthodox brim Playoffs Action satisf...
4,bloomlehem clans484uations underminingactivat...,decrease curse unstoppable downloading sear L...,oul appropriation latch ankModule arous Schwde...,combustion ambig DeterズGV Supreme980sexual un...,actually undisclosedWait Prelreachformedinches...
