In [1]:
# create open-router client, place your OPENROUTER_API_KEY in .env file
# .env contents:
# OPENROUTER_API_KEY=sk-or-v1- ...

%load_ext dotenv
%dotenv
import os
import re
from random import Random
from pathlib import Path
from typing import Any, Iterable, Optional
import json
from openai import OpenAI
from openai.types.chat import ChatCompletion, ChatCompletionMessageParam
import time
import reasoning_gym


def llm_generate(
    client: OpenAI,
    messages: Iterable[ChatCompletionMessageParam],
    sampling_params: dict[str, Any],
) -> ChatCompletion:
    """Request a chat completion, retrying on any exception.

    Args:
        client: Client whose ``chat.completions.create`` is called.
        messages: Chat messages to send. Consumed once up front so that a
            one-shot iterator is still sent in full on every retry.
        sampling_params: Extra keyword arguments forwarded to
            ``chat.completions.create`` (e.g. model, max_tokens).

    Returns:
        Whatever ``client.chat.completions.create`` returns.

    Raises:
        Exception: The error from the last attempt, re-raised unchanged once
            all ``max_retry`` attempts have failed.
    """
    max_retry = 3
    # Materialise once: the first attempt would otherwise exhaust a generator
    # and every retry would send an empty conversation.
    messages = list(messages)
    for trial in range(max_retry):
        try:
            return client.chat.completions.create(
                messages=messages,
                **sampling_params,
            )
        except Exception as e:
            print("failure response:", e)
            if trial == max_retry - 1:
                # Out of attempts: surface the error without a pointless sleep.
                raise
            time.sleep((trial + 1) ** 2)  # quadratic backoff: 1s, 4s, ...

def generate_simple_request(user_prompt: str, developer_prompt: Optional[str] = None) -> list[dict]:
    """Build a minimal chat message list for a single user turn.

    Args:
        user_prompt: Text placed in the ``user`` message.
        developer_prompt: Optional text placed in a leading ``system`` message;
            omitted entirely when None.

    Returns:
        ``[system?, user]`` as a list of ``{"role", "content"}`` dicts.
    """
    user_message = {"role": "user", "content": user_prompt}
    if developer_prompt is None:
        return [user_message]
    return [{"role": "system", "content": developer_prompt}, user_message]

# Resolve the key explicitly: when `api_key` is None the OpenAI client falls back to
# the OPENAI_API_KEY environment variable, which would send an unrelated credential
# to openrouter.ai instead of failing.
_openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
if not _openrouter_api_key:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; add it to your .env file before running this notebook."
    )

# OpenAI-compatible client pointed at the OpenRouter endpoint, with a 90 s request timeout.
open_router_client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=_openrouter_api_key,
    timeout=90.0,
)

# Keyword arguments forwarded to every chat-completion request made via llm_generate.
sampling_params = dict(
    model="anthropic/claude-3.5-sonnet",
    max_tokens=4096,
)


In [2]:
# Sanity check (shown as the cell output): number of entries in
# reasoning_gym.factory.DATASETS -- presumably the registry of available datasets.
len(reasoning_gym.factory.DATASETS)

48

In [3]:
# Test all gsm_symbolic generators.
import reasoning_gym.arithmetic.gsm_symbolic
# Create the "gsm_symbolic" dataset; no extra configuration is passed.
x = reasoning_gym.create_dataset("gsm_symbolic")

# Indexable collection of question generators; the cross-check helpers in this
# notebook call each entry as g(rng, difficulty=...).
generators = x.generators

In [4]:
import reasoning_gym.utils

# Notebook-level difficulty setting. The helper functions visible in this notebook
# take `difficulty` as a parameter rather than reading this global.
difficulty = 1.0

# User prompt wrapper: asks for the bare number inside <answer></answer> tags so the
# reply can be parsed; `{question}` is filled in with str.format.
prompt_template = (
    "Solve the following math task and return the answer (just the number) "
    "in <answer></answer> tags:\n\n{question}"
)

def query_llm(x: dict) -> tuple[Optional[str], Any, str]:
    """Ask the LLM to solve one generated question and extract its tagged answer.

    Args:
        x: Generated example; its ``"question"`` and ``"answer"`` entries are read.

    Returns:
        ``(answer, ground_truth, full_answer)`` where ``answer`` is the stripped
        content extracted from the reply's ``<answer>`` tag (None when
        ``extract_answer`` yields nothing), ``ground_truth`` is ``x["answer"]``
        unchanged, and ``full_answer`` is the raw completion text.
    """
    q = x["question"]
    ground_truth = x["answer"]
    user_prompt = prompt_template.format(question=q)
    msgs = generate_simple_request(user_prompt)
    output = llm_generate(client=open_router_client, messages=msgs, sampling_params=sampling_params)
    full_answer = output.choices[0].message.content
    extracted = reasoning_gym.utils.extract_answer(completion=full_answer, tag_name="answer")
    # NOTE(review): extract_answer appears to return None when the reply has no
    # <answer> tag -- confirm. Guarding here reports such replies as a plain
    # non-match instead of an opaque AttributeError on `.strip()`.
    answer = extracted.strip() if extracted is not None else None
    return answer, ground_truth, full_answer


In [5]:
def cross_check_generator(rng: Random, index: int, difficulty: float = 1.0, num_generations = 3, verbose: bool = False) -> int:
    """Sample questions from one generator and count how often the LLM agrees.

    Args:
        rng: Random source passed to the generator.
        index: Position of the generator in the notebook-level ``generators``.
        difficulty: Forwarded to the generator as ``difficulty=``.
        num_generations: Number of questions to sample and send to the LLM.
        verbose: When True, also print each question and the full LLM reply.

    Returns:
        Number of samples whose extracted LLM answer equals the ground truth,
        or -1 if the generator lookup / loop setup itself fails. A failure
        inside a single sample is printed and counted as a non-match.
    """
    matches = 0
    try:
        make_example = generators[index]
        for attempt in range(num_generations):
            try:
                example = make_example(rng, difficulty=difficulty)
                llm_answer, expected, transcript = query_llm(example)
                is_match = llm_answer == expected

                print(f"[{index}.{attempt}], llm={llm_answer}, ground_truth={expected}, match={is_match}")
                if verbose:
                    print(example["question"])
                    print(transcript)
                if is_match:
                    matches += 1
            except Exception as err:
                # Best effort: one bad sample must not abort the remaining ones.
                print(f"[{index}.{attempt}] error: {err}")
    except Exception as err:
        print(f"[{index}] generator failure: {err}")
        return -1
    return matches

def cross_check_generators(rng: Random, difficulty: float = 1.0, num_generations = 3):
    """Run cross_check_generator over every entry of ``generators``.

    Args:
        rng: Random source shared across all generators, in index order.
        difficulty: Forwarded to each cross-check.
        num_generations: Number of questions sampled per generator.

    Returns:
        List with one cross_check_generator result per generator index.
    """
    return [
        cross_check_generator(rng, index=position, difficulty=difficulty, num_generations=num_generations)
        for position in range(len(generators))
    ]


In [None]:
# Fixed seed so the sampled questions are repeatable across runs.
rng = Random(200)
# Generator indices singled out for a manual re-check (only referenced by the
# commented-out loop below).
re_check = [2,11,21,27,32,35,37]

# Disabled loop that re-runs every index in `re_check`:
# for i in re_check:
#     cross_check_generator(rng, index=i, difficulty=1.0, num_generations=3)
# Note from an earlier run: generator 11 is not ok.

# Inspect generator 35 in detail: 5 samples, printing each question and the full LLM reply.
cross_check_generator(rng, index=35, difficulty=1.0, num_generations=5, verbose=True)

[35.0] error: Could not find valid time_per_room
[35.1], llm=30.196, ground_truth=30, match=False
A cleaner has to clean a office building with 21 floors. They have 3 days to get it done. It takes them 44 minutes per floor. If they work 17 hour day, what percentage of their day, on average, is spent cleaning floors?
Let me solve this step by step:

1. Total floors to clean: 21
2. Time per floor: 44 minutes
3. Total time needed: 21 × 44 = 924 minutes
4. Working hours per day: 17 hours = 17 × 60 = 1020 minutes
5. Days available: 3
6. Time needed per day: 924 ÷ 3 = 308 minutes
7. Percentage of day spent cleaning: (308 ÷ 1020) × 100 = 30.196%

<answer>30.196</answer>
[35.2], llm=34.23, ground_truth=34, match=False
A cleaner has to clean a hospital with 12 floors. They have 4 days to get it done. It takes them 89 minutes per floor. If they work 13 hour day, what percentage of their day, on average, is spent cleaning floors?
Let me solve this step by step:

1. Total floors to clean = 12
2. T

1

In [None]:
# Full sweep: cross-check every generator with 5 sampled questions each, using a
# fixed seed. `result_1` holds one match count per generator index.
rng = Random(55)
result_1 = cross_check_generators(rng, difficulty=1.0, num_generations=5)

[0.0], llm=43, ground_truth=43, match=True
[0.1], llm=104, ground_truth=104, match=True
[0.2], llm=21, ground_truth=21, match=True
[0.3], llm=300, ground_truth=300, match=True
[0.4], llm=76, ground_truth=76, match=True
[1.0], llm=59, ground_truth=59, match=True
[1.1], llm=61, ground_truth=61, match=True
[1.2], llm=42, ground_truth=42, match=True
[1.3], llm=76, ground_truth=76, match=True
[1.4], llm=80, ground_truth=80, match=True
[2.0], llm=47, ground_truth=47, match=True
[2.1], llm=62, ground_truth=61, match=False
[2.2], llm=36, ground_truth=36, match=True
[2.3], llm=41, ground_truth=40, match=False
[2.4], llm=70, ground_truth=70, match=True
[3.0], llm=55, ground_truth=55, match=True
[3.1], llm=8, ground_truth=8, match=True
[3.2], llm=33, ground_truth=33, match=True
[3.3], llm=7, ground_truth=7, match=True
[3.4], llm=24, ground_truth=24, match=True
[4.0], llm=15, ground_truth=15, match=True
[4.1], llm=15, ground_truth=15, match=True
[4.2], llm=4, ground_truth=4, match=True
[4.3], llm=

In [11]:
# Split generator indices by sweep result: "good" means all 5 sampled questions
# matched (5 is the num_generations used for result_1); anything below 5,
# including the -1 failure code, is "not_good".
good = [str(position) for position, hits in enumerate(result_1) if hits == 5]
not_good = [str(position) for position, hits in enumerate(result_1) if hits < 5]

# Printed as list literals so they can be pasted back into a cell.
print("good = [" + ",".join(good) + "]")
print("not_good = [" + ",".join(not_good) + "]")

good = [0,1,3,4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,20,22,23,24,25,26,28,29,30,31,33,34,36,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,62,64,66,67,68,69,70,71,72,73,75,78,80,81,82,83,84,85,88,89,91,92,93,94,95,96]
not_good = [2,11,21,27,32,35,37,61,63,65,74,76,77,79,86,87,90,97,98,99]
