In [None]:
from typing import Optional

In [None]:
# Rows held out per split call (passed as `test_size` when splitting).
limit_test = 10_000
# Rows left for the train split of each language.
limit_train = 200_000
# Total rows streamed per language: train + test.
limit = limit_test + limit_train

In [None]:
def set_language_fun(language: str):
    """Build a row mapper that stamps ``language`` onto each row.

    Args:
        language: Value to store under the ``"language"`` key.

    Returns:
        A function taking a row dict, setting ``row["language"]`` in place
        (overwriting any existing value) and returning that same dict.
    """
    def set_language(row: dict):
        # Mutates the incoming dict rather than copying it.
        row.update(language=language)
        return row

    return set_language

In [None]:
# Columns kept from every source dataset (see load_process_dataset).
column_to_take = [
    "text",
    "id",
    "url",
    "language",
]

In [None]:
import datasets
from datasets import load_dataset

def load_process_dataset(path: str, name: Optional[str], language: str, limit: int) -> datasets.Dataset:
    """Stream the first ``limit`` training rows of a Hub dataset into a ``datasets.Dataset``.

    The ``"train"`` split is opened in streaming mode, truncated to ``limit``
    rows, reduced to the columns in the module-level ``column_to_take`` and
    every row's ``"language"`` value is overwritten with ``language``.

    Args:
        path: Dataset path/repository id forwarded to ``load_dataset``.
        name: Dataset config name forwarded to ``load_dataset`` (may be ``None``).
        language: Value written to the ``"language"`` column of every row.
        limit: Number of rows to take from the start of the stream.

    Returns:
        A materialised ``datasets.Dataset`` built from the processed stream.
    """
    iter_dataset: datasets.IterableDataset = load_dataset(
        path=path, name=name, split="train", streaming=True,
    )
    # NOTE(review): columns are selected *before* the map, and column_to_take
    # contains "language", so the source dataset presumably must already have
    # a "language" column -- confirm for any new source added here.
    iter_dataset = (
        iter_dataset.take(limit)
        .select_columns(column_to_take)
        .map(set_language_fun(language))
    )

    def generate_rows():
        # from_generator expects a generator function; re-yield the stream.
        yield from iter_dataset

    # num_proc is intentionally not passed: from_generator only parallelises
    # over list-valued gen_kwargs shards, so with this single un-sharded stream
    # a num_proc request is reset to 1 (with a warning) and has no effect.
    return datasets.Dataset.from_generator(generate_rows)

In [None]:
# Materialise `limit` rows per language. English comes from the FineWeb-Edu
# "sample-10BT" config; every other language is a FineWeb-2 config.
vi = load_process_dataset(
    path="HuggingFaceFW/fineweb-2", name="vie_Latn", language="vi", limit=limit
)
ja = load_process_dataset(
    path="HuggingFaceFW/fineweb-2", name="jpn_Jpan", language="ja", limit=limit
)
fr = load_process_dataset(
    path="HuggingFaceFW/fineweb-2", name="fra_Latn", language="fr", limit=limit
)
en = load_process_dataset(
    path="HuggingFaceFW/fineweb-edu", name="sample-10BT", language="en", limit=limit
)
zh = load_process_dataset(
    path="HuggingFaceFW/fineweb-2", name="cmn_Hani", language="zh", limit=limit
)

In [None]:
# Language code -> loaded dataset. The keys become the "subset-<key>" config
# names on upload, and the insertion order is the order used for interleaving.
data = dict(vi=vi, en=en, fr=fr, ja=ja, zh=zh)

In [None]:
def train_test_split_iterable(ds: datasets.IterableDataset) -> datasets.IterableDatasetDict:
    """Split a streaming dataset into train/test by position.

    The first ``limit_train`` rows (module-level constant) form ``"train"``;
    everything after them forms ``"test"``.

    Args:
        ds: The streaming dataset to split.

    Returns:
        An ``IterableDatasetDict`` with ``"train"`` and ``"test"`` entries.
    """
    return datasets.IterableDatasetDict(
        {
            "train": ds.take(limit_train),
            "test": ds.skip(limit_train),
        }
    )

In [None]:
# Show which Hugging Face account the CLI is logged in as, to confirm
# credentials before the push_to_hub cells that follow.
!huggingface-cli whoami

In [None]:
# Publish one "subset-<language>" config per language, each with its own
# train/test split holding `limit_test` rows in the test split.
for key, val in data.items():
    shuffled = val.shuffle(seed=32)
    # The rows were just shuffled with a fixed seed, so split them in order.
    # train_test_split shuffles again by default, and doing so without a seed
    # would make the published split differ on every run.
    dataset = shuffled.train_test_split(test_size=limit_test, shuffle=False)
    print(f"Uploading {key}")
    dataset.push_to_hub("thng292/fineweb-subset-1M", config_name=f"subset-{key}")

In [None]:
# Build the combined "default" config from all languages.
# Without `probabilities`, interleave_datasets cycles through the sources in
# order, so the `seed` given to it does not randomise anything; the seed that
# matters for reproducibility is the one on train_test_split below.
default = datasets.interleave_datasets(
    list(data.values()), seed=64, stopping_strategy="all_exhausted"
)
# NOTE(review): this split is drawn independently of the per-language
# "subset-*" splits, so rows in a subset's test split may land in this
# config's train split (and vice versa) -- confirm that is acceptable.
default.train_test_split(test_size=limit_test, seed=64).push_to_hub(
    "thng292/fineweb-subset-1M", config_name="default", set_default=True
)