In [42]:
import gzip
import json
from tqdm import tqdm
import re
from math_verify import verify
import hashlib

def assign_split_by_hash(key: str, train=0.95, val=0.03, test=0.02) -> str:
    """
    Map a key (e.g. the question text) to a dataset split, deterministically.

    The first 8 hex digits of the key's MD5 digest are scaled into [0, 1] and
    compared against the cumulative split fractions, so the same key always
    lands in the same split.

    Args:
        key: Text to hash; encoded as UTF-8 before hashing.
        train, val, test: Split fractions; they must sum to 1 (checked with
            an ``assert``, tolerance 1e-9).

    Returns: 'train' | 'validation' | 'test'
    """
    assert abs(train + val + test - 1.0) < 1e-9
    digest = hashlib.md5(key.encode("utf-8")).hexdigest()
    fraction = int(digest[:8], 16) / 0xFFFFFFFF  # in [0,1]
    # Cumulative upper bounds, checked in order; anything beyond falls to "test".
    upper_bounds = (("train", train), ("validation", train + val))
    for split_name, bound in upper_bounds:
        if fraction < bound:
            return split_name
    return "test"

def load_all_jsonl_gz(path: str):
    """
    Read a gzip-compressed JSON Lines file fully into memory.

    The file is opened in text mode as UTF-8. Each line is stripped of
    surrounding whitespace; lines that are empty after stripping are skipped
    and every other line is decoded with ``json.loads``. Progress is shown
    through a tqdm bar.

    Args:
        path: Location of the ``.jsonl.gz`` file.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.
    """
    with gzip.open(path, "rt", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in tqdm(handle, desc="Loading jsonl.gz"))
        return [json.loads(text) for text in stripped_lines if text]


def split_think_answer_complete(assistant_text: str):
    """
    Split an assistant message into its reasoning trace and its final answer.

    Only call this after has_complete_think() is True (that helper is not
    defined in this cell). If the precondition is violated the function now
    fails loudly instead of returning slices built from a -1 index.

    - think_text: the content inside <think>...</think>
    - answer_text: everything after the closing </think>

    Args:
        assistant_text: Full assistant message containing one
            ``<think>...</think>`` section.

    Returns:
        ``(think_text, answer_text)``, both stripped of surrounding whitespace.

    Raises:
        ValueError: if ``<think>`` is absent, or no ``</think>`` follows it.
    """
    THINK_OPEN = "<think>"
    THINK_CLOSE = "</think>"

    open_at = assistant_text.find(THINK_OPEN)
    if open_at == -1:
        raise ValueError("assistant_text has no <think> tag")

    # Look for the closing tag only after the opening one, so a stray
    # "</think>" earlier in the text cannot be paired with a later "<think>".
    body_start = open_at + len(THINK_OPEN)
    close_at = assistant_text.find(THINK_CLOSE, body_start)
    if close_at == -1:
        raise ValueError("assistant_text has no </think> tag after <think>")

    think_text = assistant_text[body_start:close_at].strip()
    answer_text = assistant_text[close_at + len(THINK_CLOSE):].strip()
    return think_text, answer_text



# def extract_boxed_answer(text: str):
#     _BOXED_RE = re.compile(r"\\boxed\{([^}]*)\}")
#     if not isinstance(text, str):
#         return None
#     m = _BOXED_RE.search(text)
#     return m.group(1).strip() if m else None


def extract_answer_math_verify(text: str):
    """
    Use math_verify to extract a final answer candidate from model output.

    Args:
        text: Model output (or a tail of it) to search for an answer.

    Returns:
        ``str(parsed[0])`` for the first element returned by
        ``math_verify.parse`` (a sympy-ish string), or None when ``text`` is
        not a non-blank string or nothing could be parsed.
    """
    if not isinstance(text, str) or not text.strip():
        return None

    # Imported lazily, after the cheap input guard, so rejecting empty input
    # never depends on math_verify being importable.
    from math_verify import LatexExtractionConfig, parse

    parsed = parse(
        text,
        extraction_mode="first_match",
        extraction_config=[
            LatexExtractionConfig(
                boxed_match_priority=0,          # prefer \boxed{...} when present
                try_extract_without_anchor=True  # more tolerant to messy outputs
            )
        ],
    )
    if not parsed:
        return None

    # parsed elements can be sympy objects and/or strings depending on the expression
    return str(parsed[0])


def is_int_strict(x) -> bool:
    """
    Tell whether ``x`` is written exactly like a canonical integer.

    ``x`` qualifies when ``int(x)`` succeeds and the integer's string form
    equals ``str(x)`` with surrounding whitespace removed. Values such as
    ``"042"``, ``"+5"``, ``"4.0"`` or ``4.0`` therefore do not qualify.
    None, and anything for which the conversion raises, yields False.
    """
    if x is None:
        return False
    try:
        canonical = str(int(x))
        as_written = str(x).strip()
    except Exception:
        return False
    return canonical == as_written

In [43]:
# Load the raw records once per kernel session. The guard keeps re-running this
# cell cheap, while a fresh kernel (Restart Kernel -> Run All) still gets `data`
# defined instead of failing with NameError on the lines below.
if "data" not in globals():
    data = load_all_jsonl_gz("openthoughts3_math_complete_cot.jsonl.gz")
print("N =", len(data))
print("keys =", data[0].keys())

N = 274290
keys = dict_keys(['difficulty', 'source', 'domain', 'conversations'])


In [44]:
# Build one HF-ready record per conversation whose solution ends with a
# \boxed{...} answer. Results are collected in `hf_ready`; `kept_idx` numbers
# the records that were kept (skipped conversations do not consume an index).
hf_ready = []
kept_idx = 0

# Number of trailing characters of the solution inspected for the final
# \boxed{...} answer. The same window is used both for the filter and for the
# extraction: previously the filter looked at 100 characters but only the last
# 50 were parsed, so an answer starting 51-100 characters from the end passed
# the filter and was then truncated before being parsed.
ANSWER_TAIL_CHARS = 100

# Loop-invariant prompt prefix prepended to every question.
instruction = 'Let\'s think step by step and solve this problem. '

for element in tqdm(data):
    # First turn is used as the question, last turn as the solution.
    question = element["conversations"][0]["value"]
    solution = element["conversations"][-1]["value"]

    answer_tail = solution[-ANSWER_TAIL_CHARS:]
    if "boxed" not in answer_tail:
        continue

    extracted_answer = extract_answer_math_verify(answer_tail)
    is_answer_int = is_int_strict(extracted_answer)

    hf_ready_data = {
            "data_source": "",
            "prompt": [
                {
                    "role": "user",
                    "content": instruction + question,
                }
            ],
            "ability": "math",
            # NOTE(review): ground_truth is the full solution text, not
            # `extracted_answer` -- confirm this is what the reward rule expects.
            "reward_model": {"style": "rule", "ground_truth": solution},
            "extra_info": {
                "split": assign_split_by_hash(question),
                "index": kept_idx,
                "answer": solution, # We should always train with thinking traces
                "question": question,
                "is_answer_int": is_answer_int,
            },
        }
    hf_ready.append(hf_ready_data)
    kept_idx += 1

100%|██████████| 274290/274290 [03:39<00:00, 1249.73it/s]


In [46]:
from datasets import Dataset, DatasetDict

def to_datasetdict(hf_ready):
    """
    Group records by ``extra_info["split"]`` and wrap them in a DatasetDict.

    Records whose split is not one of 'train', 'validation' or 'test' are
    dropped silently. A split that ends up with no records is left out of the
    result entirely.

    Args:
        hf_ready: Iterable of record dicts, each carrying
            ``record["extra_info"]["split"]``.

    Returns:
        A ``DatasetDict`` with one ``Dataset`` per non-empty split.
    """
    buckets = {name: [] for name in ("train", "validation", "test")}
    for record in hf_ready:
        bucket = buckets.get(record["extra_info"]["split"])
        if bucket is not None:
            bucket.append(record)

    non_empty = {
        name: Dataset.from_list(rows) for name, rows in buckets.items() if rows
    }
    return DatasetDict(non_empty)

ds = to_datasetdict(hf_ready)
print(ds)

  import pynvml  # type: ignore[import]


DatasetDict({
    train: Dataset({
        features: ['data_source', 'prompt', 'ability', 'reward_model', 'extra_info'],
        num_rows: 235018
    })
    validation: Dataset({
        features: ['data_source', 'prompt', 'ability', 'reward_model', 'extra_info'],
        num_rows: 7134
    })
    test: Dataset({
        features: ['data_source', 'prompt', 'ability', 'reward_model', 'extra_info'],
        num_rows: 4720
    })
})


In [None]:
from huggingface_hub import create_repo

# Target dataset repository on the Hugging Face Hub.
repo_id = "michaelw-cerebras/openthoughts3-math-gsm8kstyle"

# Create the dataset repo as private before uploading. exist_ok=True is passed
# so that re-running this cell against an already-created repo is tolerated
# (see the huggingface_hub create_repo reference for the exact semantics).
create_repo(repo_id, repo_type="dataset", private=True, exist_ok=True)

# Upload every split held in `ds` to that repo. This performs network I/O and
# needs Hub credentials to be configured in the environment.
ds.push_to_hub(repo_id)

  from .autonotebook import tqdm as notebook_tqdm


{'type': 'user',
 'id': '667f1dc8f96afc102e299817',
 'name': 'michaelw-cerebras',
 'fullname': 'Michael Wang',
 'canPay': False,
 'billingMode': 'prepaid',
 'periodEnd': 1769904000,
 'isPro': False,
 'avatarUrl': '/avatars/fa57efad9857fc259a05f24cbc365821.svg',
 'orgs': [{'type': 'org',
   'id': '640fbc14208821a59b767b3e',
   'name': 'cerebras',
   'fullname': 'Cerebras',
   'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/1678752782877-640fbb91bd54b1efde3678b2.png',
   'isEnterprise': True,
   'plan': 'team'}],
 'auth': {'type': 'access_token',
  'accessToken': {'displayName': 'cerebras-download',
   'role': 'fineGrained',
   'createdAt': '2024-07-25T16:04:09.182Z',
   'fineGrained': {'canReadGatedRepos': True,
    'global': ['discussion.write', 'post.write'],
    'scoped': [{'entity': {'_id': '667f1dc8f96afc102e299817',
       'type': 'user',
       'name': 'michaelw-cerebras'},
      'permissions': ['repo.content.read',
       'repo.write',
       'inference.en

In [None]:
# {'difficulty': None, 
#  'source': 'ai2-adapt-dev/openmath-2-math', 
#  'domain': 'math', 
#  'conversations': [{'from': 'human', 'value': 'A bookshelf has 5 shelves, and each shelf can hold up to 3 books. In how many ways can 6 distinct books be placed on the bookshelf such that no shelf contains more than 2 books by the same author, and no two books by the same author are on adjacent shelves?'}, 
#                    {'from': 'gpt', 'value': '<think> \n</think>\n\nTo determine the number of ways to place 6 distinct books on a bookshelf with 5 shelves, considering each shelf can hold up to 3 books, and the constraints that no shelf contains more than 2 books by the same author and no two books by the same author are on adjacent shelves:\n\n1. **Interpretation of Constraints**:\n   - Each shelf can hold up to 3 books, but no shelf can have more than 2 books by the same author.\n   - No two books by the same author can be on adjacent shelves.\n\n2. **Assumption**:\n   - Since the books are distinct, if all books are by different authors, the constraints are automatically satisfied (each author has only one book).\n\n3.'}]}