In [1]:
#!/usr/bin/env python3
"""
merge_high_quality_sft.py
Download, filter (min length), merge and upload several SFT datasets.
"""

import os
from typing import List, Dict, Any
import datasets as ds
from transformers import AutoTokenizer

# ------------------------------------------------------------------
# CONFIGURATION
# ------------------------------------------------------------------
HF_TOKEN    = os.getenv("HF_TOKEN")          # Hugging Face token (write scope)
OUTPUT_REPO = "YOUR_HF_USERNAME/merged-sft-mix"
MIN_TOKENS  = 64                             # min prompt+answer tokens
TOKENIZER   = "meta-llama/Llama-3.1-8B-Instruct"  # fast, permissive tokenizer

DATASET_SPECS = {
    "tulu3":          "allenai/tulu-3-sft-mixture", # 939,343 rows 1.41GB
    "hermes3":        "NousResearch/Hermes-3-Dataset",
    "perfectblend":   "mlabonne/open-perfectblend",
    "acereason":      "nvidia/AceReason-1.1-SFT",
    "moaa":           "togethercomputer/gemma-2-9b-it-MoAA-DPO",  # DPO pairs → use chosen
}

In [13]:
d = ds.load_dataset("mlabonne/open-perfectblend", split="train")

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00006.parquet:   0%|          | 0.00/249M [00:00<?, ?B/s]

data/train-00001-of-00006.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

data/train-00002-of-00006.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

data/train-00003-of-00006.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

data/train-00004-of-00006.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

data/train-00005-of-00006.parquet:   0%|          | 0.00/246M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1420909 [00:00<?, ? examples/s]

In [14]:
print(len(d))
d[0]

1420909


{'conversations': [{'from': 'human',
   'value': 'Augment this coding dilemma to handle not just tuples laden with duos, repetitions and idiosyncratic numbers of elements, but to precisely calculate the median devoid of classification or employing pre-existing operations. This enhanced code must exhibit robustness in dealing with outlier values and omitted data. Fine-tune the program provided herein:\n\ndef robust_median(l: list):\n    """Compute the median for set l without leaning on sorting techniques or built-in functions.\n    Has the capacity to handle tuples containing even and odd quantities of constituents, repeated entities, extreme entities, and missing data.\n    """\n    # The rest of the program is to be filled in by the coder.'},
  {'from': 'gpt',
   'value': 'For implementing a function to calculate the median of a list without using built-in functions or sorting, we would have to implement our approach to locate the middle element. \n\nA way to do this would be to iter

In [2]:

# ------------------------------------------------------------------
# UTILITIES
# ------------------------------------------------------------------
tok = AutoTokenizer.from_pretrained(TOKENIZER, use_fast=True)

def token_len(text: str) -> int:
    return len(tok.encode(text))

def load_and_filter(name: str, split: str = "train") -> ds.Dataset:
    """Load a dataset and keep only samples longer than MIN_TOKENS."""
    print(f"📥 Loading {name} …")
    d = ds.load_dataset(DATASET_SPECS[name], split=split)

    # unify column names: we expect "prompt" and "response"
    if name == "tulu3":
        d = d.rename_column("messages", "prompt")  # actually chat turns; flatten later
        d = d.rename_column("chosen", "response")
    elif name == "rewild":
        d = d.rename_column("prompt", "prompt")
        d = d.rename_column("completion", "response")
    elif name == "perfectblend":
        d = d.rename_column("instruction", "prompt")
        d = d.rename_column("output", "response")
    elif name == "acereason":
        d = d.rename_column("question", "prompt")
        d = d.rename_column("solution", "response")
    elif name == "moaa":
        # MoAA is stored as DPO pairs → use the "chosen" field
        d = d.rename_column("prompt", "prompt")
        d = d.rename_column("chosen", "response")

    # If prompt/response are lists of turns, concat into single strings
    def stringify(example: Dict[str, Any]) -> Dict[str, Any]:
        p, r = example["prompt"], example["response"]
        if isinstance(p, list):
            p = tok.apply_chat_template(p, tokenize=False)
        if isinstance(r, list):
            r = tok.apply_chat_template(r, tokenize=False)
        return {"prompt": str(p), "response": str(r)}

    d = d.map(stringify, remove_columns=d.column_names)

    # Length filter
    def long_enough(ex):
        return token_len(ex["prompt"] + ex["response"]) >= MIN_TOKENS

    d = d.filter(long_enough, num_proc=os.cpu_count())
    print(f"✅ {name}: kept {len(d):,} / {len(d):,} samples")
    return d


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [3]:
datasets: List[ds.Dataset] = []

for key in DATASET_SPECS:
    datasets.append(load_and_filter(key))

📥 Loading tulu3 …


ValueError: Original column name chosen not in the dataset. Current columns in the dataset: ['id', 'prompt', 'source']

In [None]:

print("🔗 Concatenating …")
merged: ds.Dataset = ds.concatenate_datasets(datasets)
print(f"📊 Total samples after merge: {len(merged):,}")

# Add provenance tag
merged = merged.add_column("source", [k for k in DATASET_SPECS for _ in range(len(ds.load_dataset(DATASET_SPECS[k], split="train")))])

# Push to Hub
print("☁️ Uploading to Hugging Face Hub …")
merged.push_to_hub(OUTPUT_REPO, private=False, token=HF_TOKEN)
print("🎉 Done!")