In [2]:
import os

from openai import OpenAI
from datasets import load_dataset

import verifiers as vf

In [3]:
api = "openai"
num_samples = 10
max_tokens = 500
# collect V3/R1 rollouts from API
if api == "deepseek":
    base_url = "https://api.deepseek.com"
    api_key = os.getenv("DEEPSEEK_API_KEY")
    model_name = "deepseek-chat" # DeepSeek V3-0324
    client = OpenAI(base_url=base_url, api_key=api_key)
elif api == "openai":
    # just for testing :) not for distillation :)
    api_key = os.getenv("OPENAI_API_KEY")
    model_name = "gpt-4.1" 
    client = OpenAI(api_key=api_key)
else:
    raise ValueError(f"Invalid API: {api}")
sampling_args = {
    "max_tokens": max_tokens,
    "temperature": 0.7,
}


In [None]:
dataset = load_dataset('agentlans/wikipedia-paragraphs', split='train')
dataset = dataset.map(lambda x: {'question': x['text'], 'answer': x['text']})

parser = vf.XMLParser(['think', 'answer'], answer_field="answer")
system_prompt = f"""Respond in the following format:
{parser.get_format_str()}

Summarize the given text in 3 sentences."""

def sentence_reward_func(completion, **kwargs) -> float:
    """
    Count the number of sentences in the completion.
    """
    response = parser.parse_answer(completion) or ''
    return 1.0 if len(response.split('.')) == 3 else 0.0

def lcs_reward_func(completion, answer, **kwargs) -> float:
    """
    LCS ratio of the prompt and the parsed completion.    
    """
    def lcs_ratio(x: str, y: str) -> float:
        """
        Return the longest common subsequence ratio of x and y.
        """
        from difflib import SequenceMatcher
        return SequenceMatcher(None, x, y).ratio()
    response = parser.parse_answer(completion) or ''
    return lcs_ratio(response, answer)

rubric = vf.Rubric(funcs=[
    sentence_reward_func,
    lcs_reward_func,
    parser.get_format_reward_func(),
], weights=[1.0, 0.2, 0.2])
judge_prompt = """
Is this a good summary?

Original text:
{answer}

Summary:
{response}
"""
judge_rubric = vf.JudgeRubric(judge_prompt=judge_prompt)
rubric_group = vf.RubricGroup([rubric, judge_rubric])

vf_env = vf.SingleTurnEnv(
    eval_dataset=dataset,
    system_prompt=system_prompt,
    parser=parser,
    rubric=rubric_group,
    max_concurrent=10
)

In [4]:
from verifiers.envs.reasoninggym_env import ReasoningGymEnv
client = OpenAI()
model_name = "gpt-4.1-mini"
vf_env = ReasoningGymEnv(
    client=client, model=model_name, gym='leg_counting',
    num_samples=1000, num_eval_samples=100, seed=42)
vf_env.evaluate(client=client, model=model_name,
                sampling_args=sampling_args, num_samples=num_samples)

Setting TOKENIZERS_PARALLELISM=false for forked processes.


Map (num_proc=32):   0%|          | 0/1000 [00:00<?, ? examples/s]

Setting TOKENIZERS_PARALLELISM=false for forked processes.


Map (num_proc=32):   0%|          | 0/100 [00:00<?, ? examples/s]

Running rollouts


Running 10 rollouts: 100%|██████████| 10/10 [00:06<00:00,  1.49it/s]


Rollouts done: 10


Evaluating 10 rollouts: 100%|██████████| 10/10 [00:00<00:00, 2228.41it/s]


{'question': ['Your task is to count how many legs there are in total when given a list of animals.\n\nNow, how many legs are there in total if you have 7 dogs, 5 butterflys, 13 flatworms, 3 ants, 8 starfishs, 8 spiders?\n',
  'Your task is to count how many legs there are in total when given a list of animals.\n\nNow, how many legs are there in total if you have 5 insects, 12 fireflys, 3 giraffes, 13 elephants, 12 jellyfishs, 4 woodlouses, 8 ants, 12 praying mantiss?\n',
  'Your task is to count how many legs there are in total when given a list of animals.\n\nNow, how many legs are there in total if you have 3 butterflys, 9 cockroachs, 4 shrimps, 8 jellyfishs, 9 lions, 1 dog?\n',
  'Your task is to count how many legs there are in total when given a list of animals.\n\nNow, how many legs are there in total if you have 13 crickets, 2 praying mantiss, 8 fireflys?\n',
  'Your task is to count how many legs there are in total when given a list of animals.\n\nNow, how many legs are there 

In [None]:
# columns = ['prompt', 'completion', 'answer', 'reward']
# use deepseek-chat for multiturn rollouts (V3-0324)
results = vf_env.evaluate(
    client=client, model=model_name, 
    sampling_args=sampling_args, num_samples=num_samples) 

print("Rewards:")
for k, v in results.items():
    if 'reward' in k:
        print(k, '-', v) 

#dataset_dsv3 = vf_env.make_dataset(results)
# filter to top half of rows by rewards
#dataset_dsv3 = dataset_dsv3.sort("reward", reverse=True).select(range(len(dataset_dsv3) // 2))
# save to hub
#dataset_dsv3.push_to_hub("V3-wiki-paragraphs-test")

Setting TOKENIZERS_PARALLELISM=false for forked processes.


Map (num_proc=32):   0%|          | 0/1000 [00:00<?, ? examples/s]

TypeError: cannot pickle 'SSLContext' object

In [2]:
import reasoning_gym as rg

In [4]:
data = rg.create_dataset('leg_counting', size=1000, seed=42)
data

<reasoning_gym.arithmetic.leg_counting.LegCountingDataset at 0x7af9757eeb10>

In [15]:
x = next(data)
x

AttributeError: 'LegCountingDataset' object has no attribute '_current_idx'

In [21]:
dataset[0]

{'prompt': 'Your task is to count how many legs there are in total when given a list of animals.\n\nNow, how many legs are there in total if you have 3 sea slugs, 12 deers, 2 giraffes, 11 elephants?\n',
 'answer': 0,
 'task': 'leg_counting'}

In [23]:
data[0]

{'question': 'Your task is to count how many legs there are in total when given a list of animals.\n\nNow, how many legs are there in total if you have 3 sea slugs, 12 deers, 2 giraffes, 11 elephants?\n',
 'answer': '100',
 'metadata': {'source_dataset': 'leg_counting',
  'source_index': 0,
  'animals': {'sea slug': 3, 'deer': 12, 'giraffe': 2, 'elephant': 11},
  'num_animals': 4,
  'total_legs': 100,
  'difficulty': {'num_animals': (3, 10), 'num_instances': (1, 15)}}}

In [25]:
data.score_answer(answer=str(100), entry=data[0])

1.0

In [20]:
from datasets import Dataset
isinstance(data, Dataset)

for i in range(10):
    print(data[i])

import reasoning_gym as rg
def rg_to_hf(data) -> Dataset:
    rows = []
    for i, x in enumerate(data):
        row = {
            'prompt': x['question'],
            'answer': i,
            'task': x['metadata']['source_dataset'],
        }
        rows.append(row)
    return Dataset.from_list(rows)

dataset = rg_to_hf(data)

{'question': 'Your task is to count how many legs there are in total when given a list of animals.\n\nNow, how many legs are there in total if you have 3 sea slugs, 12 deers, 2 giraffes, 11 elephants?\n', 'answer': '100', 'metadata': {'source_dataset': 'leg_counting', 'source_index': 0, 'animals': {'sea slug': 3, 'deer': 12, 'giraffe': 2, 'elephant': 11}, 'num_animals': 4, 'total_legs': 100, 'difficulty': {'num_animals': (3, 10), 'num_instances': (1, 15)}}}
{'question': 'Your task is to count how many legs there are in total when given a list of animals.\n\nNow, how many legs are there in total if you have 6 sheeps, 11 dogs, 12 praying mantiss?\n', 'answer': '140', 'metadata': {'source_dataset': 'leg_counting', 'source_index': 1, 'animals': {'sheep': 6, 'dog': 11, 'praying mantis': 12}, 'num_animals': 3, 'total_legs': 140, 'difficulty': {'num_animals': (3, 10), 'num_instances': (1, 15)}}}
{'question': 'Your task is to count how many legs there are in total when given a list of animals.

In [9]:
rg.utils.SYSTEM_PROMPTS["DeepSeekZero"]

'A conversation between User and Assistant. The user asks a question, and the Assistant solves it.\nThe assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think>\n<answer>answer here</answer>\nDo not explain your reasoning inside the answer tags, provide only the final answer. When an example is provided, you should strictly follow the format of the output/answer in that example.\n'

In [11]:
import reasoning_gym as rg