In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Reference format
https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models

# alpaca-cleaned

In [None]:
import random

from speedy_utils import jdumps, jloads, log


def shuffle_one_messages(messages, ratio=0.5):
    """Randomly reorder the keys of JSON-object user messages (augmentation).

    With probability ``ratio`` the conversation is returned untouched.
    Otherwise, every message whose ``role`` is ``"user"`` and whose
    ``content`` is parsed by ``jloads`` into a dict with at least two keys has
    its ``content`` replaced by ``jdumps`` of the same dict with the keys in a
    shuffled order. Every other message keeps its original ``content``.

    Args:
        messages: List of dicts with ``"role"`` and ``"content"`` keys. The
            dicts are modified in place.
        ratio: Probability of skipping the augmentation. Defaults to the
            previously hard-coded value of 0.5.

    Returns:
        The same ``messages`` list that was passed in.
    """
    if random.random() < ratio:
        return messages

    for message in messages:
        if message["role"] != "user":
            continue
        try:
            # Parse into a local: the stored content must only be replaced
            # once a re-serialized string is ready, otherwise a single-key
            # object or a non-dict JSON value would be left behind as a
            # parsed Python object instead of the original string.
            parsed = jloads(message["content"])
            if not isinstance(parsed, dict) or len(parsed) < 2:
                continue

            old_order = list(parsed)
            new_order = old_order[:]
            random.shuffle(new_order)
            log(
                f"Shuffled keys: {old_order} -> {new_order}",
                level="info",
                once=True,
            )
            message["content"] = jdumps({k: parsed[k] for k in new_order})
        except Exception as e:
            # Best effort: content that cannot be parsed or re-serialized is
            # left untouched and only reported.
            log(f"Error while shuffling dict keys: {e}", level="warning", once=True)
    return messages


In [None]:
# Target format for the dataset: OpenAI-style chat messages.

dataset = load_dataset(path="yahma/alpaca-cleaned", split="train")

one_row = dataset[0]  # first record of the loaded split
def get_messages(one_row):
    """Convert one alpaca-cleaned row into an OpenAI-style chat record.

    Args:
        one_row: Mapping with ``"instruction"``, ``"input"`` and ``"output"``
            fields.

    Returns:
        ``{"messages": [...]}`` holding a ``"user"`` turn followed by an
        ``"assistant"`` turn, each a dict with ``"role"`` and ``"content"``.
    """
    # Original author's note: roles may be "user", "assistant" or "system",
    # and gemma models do not support the system role, so none is emitted.
    instruction = one_row["instruction"]
    extra_input = one_row["input"]

    # Keep a blank line between instruction and input; plain concatenation
    # fused the last word of the instruction with the first word of the input.
    if extra_input:
        user_content = f"{instruction}\n\n{extra_input}"
    else:
        user_content = instruction

    messages = [
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": one_row["output"]},
    ]

    return {"messages": messages}

# Add the "messages" column produced by get_messages to every example,
# write the result to disk, then display the dataset.
dataset = dataset.map(function=get_messages)
dataset.save_to_disk(dataset_path="../data/alpaca-cleaned")
dataset

In [None]:
# dataset

In [None]:
# With batched=True the mapped function receives a dict of columns, so
# batch["messages"] is a list of conversations. shuffle_one_messages expects a
# single conversation (a list of role/content dicts), so it is applied to each
# one; passing the whole batch made it index a list with "role" and fail.
# The mapped dataset is only displayed here, not assigned back to `dataset`.
dataset.map(
    lambda batch: {
        "messages": [
            shuffle_one_messages(conversation)
            for conversation in batch["messages"]
        ]
    },
    batched=True,
    num_proc=1,
    desc="Shuffling messages",
)

# OpenO1-SFT

In [None]:
# Target format for the dataset: OpenAI-style chat messages.

# Pass the hub id directly so `dataset` is never bound to a plain string
# before it is bound to the loaded dataset.
dataset = load_dataset("O1-OPEN/OpenO1-SFT", split='train')
one_row = dataset[0]  # first record of the loaded split
print(one_row)


def get_messages(one_row):
    """Build an OpenAI-style chat record from one row.

    The row's ``"instruction"`` becomes the user turn and its ``"output"``
    becomes the assistant turn; no system turn is produced (the original
    author notes that gemma models do not support the system role).

    Returns:
        A dict with a single ``"messages"`` key holding the two turns, each a
        dict with ``"role"`` and ``"content"``.
    """
    user_turn = {"role": "user", "content": one_row["instruction"]}
    assistant_turn = {"role": "assistant", "content": one_row["output"]}
    return {"messages": [user_turn, assistant_turn]}

# Add the "messages" column produced by get_messages to every example, then
# keep the first 6000 rows of a shuffle seeded with 42.
dataset = dataset.map(get_messages)
dataset = dataset.shuffle(seed=42).select(range(6000))

# Rows 0-4999 and rows 5000-5999 are saved to separate folders.
dataset.select(range(5000)).save_to_disk("../data/OpenO1-SFT-5k")
dataset.select(range(5000, 6000)).save_to_disk("../data/OpenO1-SFT-1k")
dataset

In [None]:
# Display the first record of `dataset`.
dataset[0]