# Custom Datasets

In [1]:
import os
import random
from pprint import pprint

import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForSeq2Seq

from oumi.datasets import AlpacaDataset

### 1. Listing Supported Datasets

In [2]:
from oumi.core.registry import REGISTRY, RegistryType


def list_datasets():
    """Print every registered dataset as ``<registry name> -> <class name>``.

    Walks the registry's internal mapping and skips every entry whose
    registry type is not ``RegistryType.DATASET``. Returns nothing.
    """
    dataset_entries = (
        (entry_key, entry_value)
        for entry_key, entry_value in REGISTRY._registry.items()
        if entry_key.registry_type == RegistryType.DATASET
    )
    for entry_key, entry_value in dataset_entries:
        print(entry_key.name, "->", entry_value.__name__)


list_datasets()

debug_classfication -> DebugClassificationDataset
debug_pretraining -> DebugPretrainingDataset
debug_sft -> DebugSftDataset
debug_dpo -> DebugDpoDataset
mlabonne/orpo-dpo-mix-40k -> OrpoDpoMix40kDataset
allenai/c4 -> C4Dataset
allenai/dolma -> DolmaDataset
tiiuae/falcon-refinedweb -> FalconRefinedWebDataset
huggingfacefw/fineweb-edu -> FineWebEduDataset
eleutherai/pile -> PileV1Dataset
togethercomputer/redpajama-data-1t -> RedPajamaDataV1Dataset
togethercomputer/redpajama-data-v2 -> RedPajamaDataV2Dataset
cerebras/slimpajama-627b -> SlimPajamaDataset
bigcode/starcoderdata -> StarCoderDataset
bigcode/the-stack -> TheStackDataset
roneneldan/tinystories -> TinyStoriesDataset
nampdn-ai/tiny-textbooks -> TinyTextbooksDataset
wikimedia/wikipedia -> WikipediaDataset
salesforce/wikitext -> WikiTextDataset
pleias/youtube-commons -> YouTubeCommonsDataset
tatsu-lab/alpaca -> AlpacaDataset
yahma/alpaca-cleaned -> AlpacaDataset
cohereforai/aya_dataset -> AyaDataset
nvidia/chatqa-training-data -> Ch

### 2. Loading Datasets

We load the Alpaca dataset. Several variants of it can be registered on the HuggingFace Hub; when no `dataset_name` is passed, we use the default one (`Dataset.default`).

In [None]:
%%time

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
%%time

dataset = AlpacaDataset(tokenizer=tokenizer)

print(f"Using: {dataset.dataset_name}")

Alternatively, you can pass a custom HuggingFace Hub identifier. You can find a list of supported datasets in `Dataset.supported_datasets`.

In [None]:
%%time

dataset = AlpacaDataset(dataset_name="yahma/alpaca-cleaned", tokenizer=tokenizer)

print(f"Using: {dataset.dataset_name}")

Under the hood, the dataset is downloaded from the HuggingFace hub, and cached in the `~/.cache/huggingface/datasets` directory.

Instantiating the class loads the whole dataset into memory. This is acceptable for small datasets; for larger ones, we can either use an `IterableDataset` to stream batches from disk, or shard the data per worker rank (so that each rank holds roughly `Memory // N_GPUs`).

### 3. Iterating Over Dataset

In [None]:
print(f"Number of examples: {len(dataset)}")

Given everything is loaded into memory, we can randomly access any row in the dataset:

In [None]:
%%timeit

dataset[42]

We can iterate over the dataset to get the examples, either manually or using a DataLoader.

In [None]:
%%time

# Manual iteration
[dataset[i] for i in range(len(dataset))];

In [None]:
%%time

# With a pytorch data loader
loader = DataLoader(
    dataset, batch_size=1, num_workers=0, shuffle=False, collate_fn=lambda x: x
)
list(loader);

We can also use any library from the pytorch ecosystem, e.g. `torchtext`, `torchdata`, etc.

In [14]:
from torchdata.stateful_dataloader import StatefulDataLoader

loader = StatefulDataLoader(dataset, batch_size=1, num_workers=0, shuffle=False)

In [None]:
loader.state_dict()

In [16]:
next(iter(loader));

### 4. Accessing Individual Examples

Alpaca is a "Supervised Finetuning Dataset". It contains instructions, user inputs, and model outputs. An SFT dataset has the following methods:

**Base Map Dataset**
```python
dataset[0] -> model inputs  # pytorch convention
dataset.raw(0) -> raw data  # oumi convention
```
**Base SFT Dataset**
```python
dataset.conversation(0) -> conversation # model independent
dataset.prompt(0) -> prompt  # depends on the tokenizer
```

In [None]:
# Raw data (pd.Series, dict), as defined by the dataset authors
dataset.raw(0)

In [None]:
# Convert to standard Oumi SFT format (oumi.core.types.Conversation)
dataset.conversation(0).messages

In [None]:
# Convert to model prompt (str). This includes the model's chat template,
# EOS tokens for inference/generation, etc.
dataset.prompt(0)

In [None]:
# What is fed to the model.forward (dict)
dataset[0]

### 5. Accessing the underlying data backend (for debugging only)

We currently use a `pd.DataFrame` as the backend for the dataset. We could just as easily use a HuggingFace `datasets` dataset or an `arrow` Table as the backend instead.

In [None]:
dataset.data.columns

In [None]:
dataset.data.head()

In [None]:
dataset.data[["instruction", "output", "input"]].map(len).hist();

### 6. Preprocessing

To customize preprocessing behavior, we can override the appropriate method:
- `BaseMapDataset.__getitem__` for fully custom behavior, multi-modal, multi-task, etc.
- `BaseMapDataset.transform` for custom preprocessing of standard datasets (inputs, labels)
- `BaseSftDataset.transform` to customize the preprocessing and tokenization of a standard SFT dataset
- `BaseSftDataset.transform_conversation` to transform a raw data row into an Oumi conversation

In [None]:
dataset = AlpacaDataset(tokenizer=tokenizer)

print(f"Using: {dataset.dataset_name}")

In [None]:
%%timeit

dataset.raw(0)

In [None]:
%%timeit

idx = random.randint(0, len(dataset) - 1)
dataset.raw(idx)

In [None]:
%%timeit

idx = random.randint(0, len(dataset) - 1)
dataset.conversation(idx)

In [None]:
%%timeit

idx = random.randint(0, len(dataset) - 1)
dataset.prompt(idx)

In [None]:
%%timeit

idx = random.randint(0, len(dataset) - 1)
dataset[idx]

## 7. Comparing with other datasets

In [38]:
import datasets

In [42]:
# Request the "train" split directly so `hf_dataset` is bound exactly once, to
# the split itself, instead of first holding the mapping of all splits and then
# being rebound to one of its entries.
hf_dataset = datasets.load_dataset("tatsu-lab/alpaca", split="train")

#### Raw Random Access

In [None]:
%%timeit

idx = random.randint(0, len(dataset) - 1)
dataset.raw(idx)  # use oumi dataset random access

In [None]:
%%timeit

# Draw the index from the container that is actually indexed, so the upper
# bound stays valid even if `dataset` and `hf_dataset` differ in length.
idx = random.randint(0, len(hf_dataset) - 1)
hf_dataset[idx]  # use huggingface dataset random access

In [None]:
%%timeit

# Draw the index from `hf_dataset`, whose underlying table is the one being
# indexed, rather than from the unrelated `dataset` object.
idx = random.randint(0, len(hf_dataset) - 1)
hf_dataset.data["text"][idx].as_py()  # directly access the arrow table.

#### With Tokenization

In [None]:
%%timeit

# Arrow
dataset.tokenize(hf_dataset.data["text"][0].as_py())

In [None]:
%%timeit

# HF Datasets
# The row is read from `hf_dataset`, so bound the random index by its length
# (not by `len(dataset)`) before handing the row to the Oumi tokenizer step.
idx = random.randint(0, len(hf_dataset) - 1)
dataset.tokenize(hf_dataset[idx])

In [None]:
%%timeit

# Oumi dataset
idx = random.randint(0, len(dataset) - 1)
dataset.tokenize(dataset.raw(idx))

In [None]:
%%timeit

# Full Oumi pipeline
idx = random.randint(0, len(dataset) - 1)
dataset[idx]

### Iterate over dataset

In [None]:
%%timeit

# Manual iteration
[dataset[i] for i in range(len(dataset))];

In [None]:
%%timeit

# With a pytorch data loader
loader = DataLoader(dataset, batch_size=1, num_workers=0, shuffle=False)
list(loader);

In [None]:
%load_ext line_profiler
%load_ext memory_profiler

In [None]:
%prun [dataset[i] for i in range(len(dataset))];

In [None]:
%lprun -f dataset.__getitem__ [dataset[i] for i in range(len(dataset))];

In [None]:
%memit [dataset[i] for i in range(len(dataset))]

## 8. Benchmark with Model Forward Pass

In [None]:
%%time

model = AutoModelForCausalLM.from_pretrained("gpt2")

In [None]:
%%time

# NOTE(review): `tokenizer` is whatever an earlier cell bound — the Phi-3
# tokenizer in this notebook — while `model` is loaded from "gpt2". That may be
# fine for a pure timing benchmark; confirm the mismatch is intentional.
dataset = AlpacaDataset(tokenizer=tokenizer)
# Collator that assembles a list of examples into a batch of PyTorch tensors.
collator_fn = DataCollatorForSeq2Seq(tokenizer=tokenizer, return_tensors="pt")

# Batches of 3 examples, loaded in the main process, in dataset order.
loader = DataLoader(
    dataset, batch_size=3, num_workers=0, shuffle=False, collate_fn=collator_fn
)

In [None]:
%%timeit

# Including pre-processing
# Each timed run builds a fresh loader iterator, pulls one collated batch, and
# runs the model on it.
with torch.no_grad():  # inference only: no autograd bookkeeping
    batch = next(iter(loader))
    # Call the module itself rather than `.forward` so registered hooks run.
    model(**batch)

In [109]:
fixed_batch = next(iter(loader))

In [None]:
%%timeit

# Excluding pre-processing
# Re-uses the already collated `fixed_batch`, so only the model call is timed.
with torch.no_grad():  # inference only: no autograd bookkeeping
    # Call the module itself rather than `.forward` so registered hooks run.
    model(**fixed_batch)

## 9. Testing Different Tokenizers

In [None]:
# Tokenizers to compare. For GPT-2 the pad token is set to its EOS token here.
gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token
phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
# NOTE(review): a set HF_HOME is used as the signal that the gated Llama model
# can be downloaded — confirm this is the intended check. When it is not set,
# `llama_tokenizer` is left undefined.
if os.environ.get("HF_HOME"):
    # This is a gated model
    llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

In [None]:
dataset = AlpacaDataset()

pprint(dataset.conversation(0).messages)

In [None]:
dataset._tokenizer = gpt_tokenizer
pprint(dataset.prompt(0))

In [None]:
dataset._tokenizer = phi_tokenizer
pprint(dataset.prompt(0))

In [23]:
if os.environ.get("HF_HOME"):
    dataset._tokenizer = llama_tokenizer
    pprint(dataset.prompt(0))

## 10. Test Packing

In [None]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
tokenizer.pad_token = tokenizer.eos_token
dataset = AlpacaDataset(tokenizer=tokenizer)

In [5]:
from oumi.datasets.pretraining_async_text_dataset import PretrainingAsyncTextDataset

In [6]:
dataset = PretrainingAsyncTextDataset(
    tokenizer,
    dataset,
    dataset_text_field=None,
    formatting_func=lambda x: x,
    pretokenized=True,
)

In [None]:
next(iter(dataset))