In [1]:

import os
import datasets
from typing import Dict, List, Optional, Any, Union
import enum
import argparse
import pandas as pd
import json

from verl.utils.hdfs_io import copy, makedirs
from verl.utils.reward_score.math import remove_boxed, last_boxed_only_string


class TrainDataset(enum.Enum):
    """Identifiers of the math problem datasets available for training.

    Each member's value is the dataset's string identifier.
    """

    # American Invitational Mathematics Examination
    AIME = "AIME"
    # American Mathematics Competition
    AMC = "AMC"
    # Omni Math
    OMNI_MATH = "OMNI_MATH"
    # Unique Olympiad problems from NUMINA (value is "OLYMPIAD", not the member name)
    NUMINA_OLYMPIAD = "OLYMPIAD"
    # Dan Hendrycks Math Problems
    MATH = "MATH"
    # STILL dataset
    STILL = "STILL"
    # DeepScaler (AIME, AMC, OMNI_MATH, MATH, STILL)
    DEEPSCALER = "DEEPSCALER"


class TestDataset(enum.Enum):
    """Identifiers of the datasets used for testing/evaluating model performance.

    Each member's value is the dataset's string identifier.
    """

    # American Invitational Mathematics Examination
    AIME = "AIME"
    # American Mathematics Competition
    AMC = "AMC"
    # Math 500 problems
    MATH = "MATH"
    # Minerva dataset
    MINERVA = "MINERVA"
    # Olympiad benchmark problems
    OLYMPIAD_BENCH = "OLYMPIAD_BENCH"


# Type alias accepted wherever either a training or a testing dataset
# identifier is valid.
Dataset = Union[TrainDataset, TestDataset]


def load_dataset(dataset: Dataset) -> List[Dict[str, Any]]:
    """Load a dataset from a JSON file.

    The file is looked up at ``<base>/<train|test>/<name>.json`` where
    ``<name>`` is the lower-cased enum value, the sub-directory is ``train``
    for ``TrainDataset`` members and ``test`` otherwise, and ``<base>`` is the
    directory containing this source file. When ``__file__`` is not defined
    (typically the case inside a notebook kernel or interactive session) the
    current working directory is used as ``<base>`` instead.

    Args:
        dataset: A Dataset enum value specifying which dataset to load.

    Returns:
        List[Dict[str, Any]]: The parsed JSON content of the dataset file;
            expected to be a list with one dictionary per example.

    Raises:
        ValueError: If the dataset file cannot be found, contains invalid JSON,
            or encounters other file access errors. The underlying exception
            is attached as ``__cause__``.

    Example:
        >>> load_dataset(TrainDataset.AIME)
        [{'problem': 'Find x...', 'solution': '42', ...}, ...]
    """
    dataset_name = dataset.value.lower()
    data_dir = "train" if isinstance(dataset, TrainDataset) else "test"

    # Notebook cells normally have no `__file__`; without this fallback the
    # function would fail with NameError instead of the documented ValueError.
    try:
        base_dir = os.path.dirname(os.path.realpath(__file__))
    except NameError:
        base_dir = os.getcwd()
    file_path = os.path.join(base_dir, data_dir, f"{dataset_name}.json")

    if not os.path.exists(file_path):
        raise ValueError(f"Dataset file not found: {file_path}")

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return json.load(file)
    except json.JSONDecodeError as exc:
        # Chain the decode error so the offending line/column stays visible.
        raise ValueError(f"Invalid JSON format in {file_path}") from exc
    except Exception as exc:  # pylint: disable=broad-except
        raise ValueError(f"Error loading dataset: {exc}") from exc


  from .autonotebook import tqdm as notebook_tqdm
2025-02-27 09:17:24,955	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [3]:
# Hugging Face Hub id of the training set.
train_data_source = "agentica-org/DeepScaleR-Preview-Dataset"
print(f"Loading the {train_data_source} dataset from huggingface...", flush=True)

# Hugging Face Hub ids of the evaluation sets; each is loaded from its "test" split.
test_data_sources = [
    "nanoverl/minerva",
    "nanoverl/aime",
    "nanoverl/amc",
    "nanoverl/olympiad_bench",
    "nanoverl/math",
]
print(f"Loading the {test_data_sources} dataset from huggingface...", flush=True)

# NOTE(review): trust_remote_code=True allows dataset repositories to run their
# own loading code locally — confirm these repositories are trusted.
train_dataset = datasets.load_dataset(
    train_data_source,
    split="train",
    trust_remote_code=True,
)
test_datasets = [
    datasets.load_dataset(source, split="test", trust_remote_code=True)
    for source in test_data_sources
]

# Suffix appended to every problem statement when building prompts.
instruction_following = "Let's think step by step and output the final answer within \\boxed{}."

Loading the agentica-org/DeepScaleR-Preview-Dataset dataset from huggingface...
Loading the ['nanoverl/minerva', 'nanoverl/aime', 'nanoverl/amc', 'nanoverl/olympiad_bench', 'nanoverl/math'] dataset from huggingface...


In [5]:
def make_map_fn(split, data_source):
    """Build the per-example converter for one dataset split.

    Args:
        split: Split label stored under ``extra_info["split"]``.
        data_source: Dataset identifier stored under ``"data_source"``.

    Returns:
        A function ``process_fn(example, idx)`` that removes the ``"problem"``
        and ``"answer"`` entries from ``example`` and returns a record with the
        keys ``data_source``, ``prompt``, ``ability``, ``reward_model`` and
        ``extra_info``. The record for ``idx == 0`` is also printed as a
        sanity check.
    """

    def process_fn(example, idx):
        # Pop (rather than read) so the raw columns are removed from `example`.
        prompt_text = example.pop("problem") + " " + instruction_following
        ground_truth = example.pop("answer")

        record = {
            "data_source": data_source,
            "prompt": [{"role": "user", "content": prompt_text}],
            "ability": "math",
            "reward_model": {"style": "rule", "ground_truth": ground_truth},
            "extra_info": {"split": split, "index": idx},
        }

        # Show the first converted example so the format can be eyeballed.
        if idx == 0:
            rule = "=" * 10
            print(rule + f"{data_source} {split} {idx}" + rule)
            print(record)
        return record

    return process_fn

# Convert every training example with the converter built by make_map_fn;
# with_indices=True passes each example's index as the second argument.
train_data = train_dataset.map(
    function=make_map_fn("train", train_data_source), with_indices=True
)

Map:   0%|          | 34/40315 [00:00<02:32, 263.77 examples/s]

{'data_source': 'agentica-org/DeepScaleR-Preview-Dataset', 'prompt': [{'role': 'user', 'content': "The operation $\\otimes$ is defined for all nonzero numbers by $a \\otimes b = \\frac{a^{2}}{b}$. Determine $[(1 \\otimes 2) \\otimes 3] - [1 \\otimes (2 \\otimes 3)]$. Let's think step by step and output the final answer within \\boxed{}."}], 'ability': 'math', 'reward_model': {'style': 'rule', 'ground_truth': '-\\frac{2}{3}'}, 'extra_info': {'split': 'train', 'index': 0}}


Map: 100%|██████████| 40315/40315 [00:01<00:00, 30189.74 examples/s]


In [8]:
# Spot-check the prompt built for the first mapped training example.
train_data[0]['prompt']

[{'content': "The operation $\\otimes$ is defined for all nonzero numbers by $a \\otimes b = \\frac{a^{2}}{b}$. Determine $[(1 \\otimes 2) \\otimes 3] - [1 \\otimes (2 \\otimes 3)]$. Let's think step by step and output the final answer within \\boxed{}.",
  'role': 'user'}]

In [10]:
# Spot-check the reward_model entry (style + ground truth) of the first mapped example.
train_data[0]['reward_model']

{'ground_truth': '-\\frac{2}{3}', 'style': 'rule'}

In [11]:
!wget https://raw.githubusercontent.com/Open-Reasoner-Zero/Open-Reasoner-Zero/refs/heads/main/data/orz_math_57k_collected.json

--2025-02-27 09:21:46--  https://raw.githubusercontent.com/Open-Reasoner-Zero/Open-Reasoner-Zero/refs/heads/main/data/orz_math_57k_collected.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8002::154, 2606:50c0:8003::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24529637 (23M) [text/plain]
Saving to: ‘orz_math_57k_collected.json’


2025-02-27 09:21:47 (356 MB/s) - ‘orz_math_57k_collected.json’ saved [24529637/24529637]



In [13]:
# Sample of the record layout (presumably that of orz_math_57k_collected.json —
# TODO confirm): the file is a list of two-message conversations. The "human"
# message holds the problem text under "value"; the "assistant" message holds
# the reference answer under "ground_truth" -> "value".
# The string below is documentation only; as the cell's last expression it is
# echoed back as the cell output.
"""
[
  [
    {
      "from": "human",
      "value": "$P(x)$ is a polynomial of degree $3n$ such that\n\\begin{eqnarray*} P(0) = P(3) = \\cdots &=& P(3n) = 2, \\\\ P(1) = P(4) = \\cdots &=& P(3n-2) = 1, \\\\ P(2) = P(5) = \\cdots &=& P(3n-1) = 0, \\quad\\text{ and }\\\\ && P(3n+1) = 730.\\end{eqnarray*}\nDetermine $n$."
    },
    {
      "from": "assistant",
      "ground_truth": {
        "value": "n = 4"
      }
    }
  ],
  [
    {
      "from": "human",
      "value": "Diameter $AB$ of a circle has length a $2$-digit integer (base ten). Reversing the digits gives the length of the perpendicular chord $CD$. The distance from their intersection point $H$ to the center $O$ is a positive rational number. Determine the length of $AB$."
    },
    {
      "from": "assistant",
      "ground_truth": {
        "value": "65"
      }
    }
  ]
]
"""

'\n[\n  [\n    {\n      "from": "human",\n      "value": "$P(x)$ is a polynomial of degree $3n$ such that\n\\begin{eqnarray*} P(0) = P(3) = \\cdots &=& P(3n) = 2, \\\\ P(1) = P(4) = \\cdots &=& P(3n-2) = 1, \\\\ P(2) = P(5) = \\cdots &=& P(3n-1) = 0, \\quad\\text{ and }\\\\ && P(3n+1) = 730.\\end{eqnarray*}\nDetermine $n$."\n    },\n    {\n      "from": "assistant",\n      "ground_truth": {\n        "value": "n = 4"\n      }\n    }\n  ],\n  [\n    {\n      "from": "human",\n      "value": "Diameter $AB$ of a circle has length a $2$-digit integer (base ten). Reversing the digits gives the length of the perpendicular chord $CD$. The distance from their intersection point $H$ to the center $O$ is a positive rational number. Determine the length of $AB$."\n    },\n    {\n      "from": "assistant",\n      "ground_truth": {\n        "value": "65"\n      }\n    }\n  ]\n]\n'