# Evaluating ORA (gpt-4o) and base gpt-4o on MATH-500

### Configuration

In [2]:
def print_answer(response):
    """Print an agent response: its thoughts first, then its answer.

    Reads ``response["thoughts"]`` (strings, joined one per line) and
    ``response["answer"]``.
    """
    print("Thoughts:", "\n".join(response["thoughts"]), sep="\n")
    print("Answer:", response["answer"], sep="\n")

In [3]:
from openai import OpenAI
import os

In [4]:
# Shared OpenAI client; the API key is read from the OPENAI_API_KEY
# environment variable rather than being hardcoded in the notebook.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [5]:
def _chat_completion(model: str, messages: list) -> str:
    """Send ``messages`` to ``model`` and return the first choice's text.

    Args:
        model: Model name passed to the chat completions endpoint.
        messages: Chat messages (dicts with ``role``/``content``), forwarded
            unchanged.

    Returns:
        The content of the first returned choice, or ``""`` when the API
        returns no text content, so callers always receive a ``str``.
    """
    completion = openai_client.chat.completions.create(
        model=model,
        messages=messages,
    )
    # `message.content` is optional in the OpenAI SDK; normalise a missing
    # value to an empty string to honour the declared return type.
    return completion.choices[0].message.content or ""


def call_strong(messages: list) -> str:
    """Answer ``messages`` with the ``gpt-4o`` model."""
    return _chat_completion("gpt-4o", messages)


def call_mini(messages: list) -> str:
    """Answer ``messages`` with the ``gpt-4o-mini`` model."""
    return _chat_completion("gpt-4o-mini", messages)

In [6]:
from reasonable import DefaultReasoningAgent

In [7]:
# Reasoning agent built on top of gpt-4o: both the main and the "thoughts"
# callbacks are `call_strong`.
# NOTE(review): `max_steps=10` presumably caps the number of reasoning steps
# per question — confirm against the `reasonable` package.
agent = DefaultReasoningAgent(
    main_function=call_strong,
    thoughts_function=call_strong,
    max_steps=10,
    verbose=True
)

ReasoningAgent initialized. Verbose mode is on.


In [8]:
import re

def _extract_tag(text, tag):
    """Return the stripped contents of the first ``<tag>...</tag>`` in ``text``, or ``None``."""
    match = re.search(rf"<{tag}>(.*?)</{tag}>", text or "", re.DOTALL)
    return match.group(1).strip() if match else None


def _parse_score(raw):
    """Turn the judge's ``<score>`` text into an int clamped to 0..10 (0 if unparsable)."""
    if raw is None:
        return 0
    # Judges sometimes answer "8/10" or "7.5"; take the first number instead
    # of letting int() raise and abort a long evaluation run.
    number = re.search(r"-?\d+(?:\.\d+)?", raw)
    if number is None:
        return 0
    return max(0, min(10, int(float(number.group()))))


def _parse_verdict(raw):
    """Map the judge's ``<verdict>`` text to 1 (correct) or 0 (anything else)."""
    if raw is None:
        return 0
    verdict = raw.lower()
    # Check the negative forms first: "incorrect" contains "correct".
    if "incorrect" in verdict or "not correct" in verdict:
        return 0
    return 1 if "correct" in verdict else 0


def evaluate_llm_response(question, llm_answer_func, evaluator_func):
    """Ask a model one question and have a judge model grade the answer.

    Args:
        question: Dict with ``"question"``, ``"solution"`` and ``"answer"`` keys.
        llm_answer_func: Callable taking a list of chat messages and returning
            the answer text of the model under test.
        evaluator_func: Callable with the same signature, used as the judge.

    Returns:
        Dict with keys ``question``, ``correct_answer``, ``llm_answer``,
        ``score`` (int in 0..10, 0 when the judge gave no usable score) and
        ``verdict`` (1 when the judge said "correct", otherwise 0).
    """
    question_text = question["question"]
    correct_solution = question["solution"]
    correct_answer = question["answer"]

    llm_answer = llm_answer_func([{"role": "user", "content": question_text}])

    # Every line ends with an explicit newline: adjacent string literals are
    # concatenated verbatim, so without it the instructions and the format
    # template run together into one line.
    evaluation_prompt = (
        f"Question: {question_text}\n"
        f"Correct solution: {correct_solution}\n"
        f"Correct answer: {correct_answer}\n"
        f"Answer by student: {llm_answer}\n"
        "Please evaluate the correctness of the student's answer above.\n"
        "Evaluate the student's answer in <evaluation> tag and provide a verdict ('correct' or 'incorrect') in <verdict> tag and score (from 0 to 10) in <score> tag.\n"
        "Respond in this format:\n"
        "<evaluation>\n"
        "...\n"
        "</evaluation>\n"
        "<verdict>\n"
        "...\n"
        "</verdict>\n"
        "<score>\n"
        "...\n"
        "</score>"
    )

    evaluation_result = evaluator_func([{"role": "user", "content": evaluation_prompt}])

    return {
        "question": question_text,
        "correct_answer": correct_answer,
        "llm_answer": llm_answer,
        "score": _parse_score(_extract_tag(evaluation_result, "score")),
        "verdict": _parse_verdict(_extract_tag(evaluation_result, "verdict")),
    }

### Loading dataset

In [25]:
from datasets import load_dataset

# Load the "HuggingFaceH4/MATH-500" dataset; its "test" split is the pool the
# 50 evaluation questions are drawn from below.
ds = load_dataset("HuggingFaceH4/MATH-500")

Let's randomly choose 50 questions from the dataset.

In [9]:
import random

In [10]:
# Project every test-split record onto the three fields the evaluator reads,
# renaming "problem" to "question".
MATH50_ds = [
    {
        "question": example["problem"],
        "solution": example["solution"],
        "answer": example["answer"],
    }
    for example in ds["test"]
]

In [11]:
# Shuffle with a fixed seed so a fresh "Restart & Run All" draws the same
# questions; a private Random instance leaves the global RNG state untouched.
random.Random(42).shuffle(MATH50_ds)

In [12]:
# Keep the first 50 questions of the shuffled list as the evaluation sample.
MATH50_ds = MATH50_ds[:50]

In [13]:
# Preview a few sampled questions instead of dumping the whole list into the
# notebook output.
MATH50_ds[:3]

[{'question': 'Roslyn has ten boxes. Five of the boxes contain pencils, four of the boxes contain pens, and two of the boxes contain both pens and pencils. How many boxes contain neither pens nor pencils?',
  'solution': 'Of the 5 boxes with pencils, 2 have pens also, so $5-2=3$ have pencils only.  Similarly, $4-2 =2$ of the boxes have pens only:\n\n\n[asy]\nunitsize(0.05cm);\nlabel("Pencils", (2,74));\nlabel("Pens", (80,74));\ndraw(Circle((30,45), 22));\ndraw(Circle((58, 45), 22));\nlabel("$2$", (44, 45));\nlabel(scale(0.8)*"$3$",(28,58));\nlabel(scale(0.8)*"$2$",(63,58));\n[/asy]\n\nThat gives us $3+2+2=7$ boxes with pens, pencils, or both.  This leaves $10-7 = \\boxed{3}$ with neither.',
  'answer': '3'},
 {'question': 'What is the last nonzero digit to the right of the decimal point in the decimal expansion of $\\frac{137}{500}$?',
  'solution': 'We can rewrite $\\frac{137}{500}$ in the form $\\frac{274}{1000}$, so $\\frac{137}{500} = \\frac{274}{1000} = 0.274$ and the last nonzero

### Evaluating base model

In [14]:
from tqdm import tqdm

In [15]:
# Running tally for the base model: number of correct verdicts, summed judge
# scores, and the per-question result dicts.
base_model_test_info = {"total": 0, "score": 0, "answers": []}

max_total = 50
max_score = 50 * 10

for question in tqdm(MATH50_ds):
    # gpt-4o both answers the question and judges the answer.
    foo = evaluate_llm_response(question, call_strong, call_strong)
    base_model_test_info["answers"].append(foo)
    base_model_test_info["total"] += foo["verdict"]
    base_model_test_info["score"] += foo["score"]

    # Report running metrics over the questions answered so far.
    answered = len(base_model_test_info["answers"])
    print(f"Current accuracy: {base_model_test_info['total']/answered}/1.0")
    print(f"Current score: {base_model_test_info['score']}/{10*answered}")

  2%|▏         | 1/50 [00:14<12:12, 14.94s/it]

Current accuracy: 1.0/1.0
Current score: 10/10


  4%|▍         | 2/50 [00:30<12:28, 15.59s/it]

Current accuracy: 1.0/1.0
Current score: 20/20


  6%|▌         | 3/50 [00:51<13:58, 17.85s/it]

Current accuracy: 0.6666666666666666/1.0
Current score: 24/30


  8%|▊         | 4/50 [01:54<27:23, 35.72s/it]

Current accuracy: 0.75/1.0
Current score: 32/40


 10%|█         | 5/50 [02:09<21:08, 28.19s/it]

Current accuracy: 0.8/1.0
Current score: 42/50


 12%|█▏        | 6/50 [02:28<18:28, 25.19s/it]

Current accuracy: 0.8333333333333334/1.0
Current score: 52/60


 14%|█▍        | 7/50 [02:48<16:43, 23.34s/it]

Current accuracy: 0.8571428571428571/1.0
Current score: 62/70


 16%|█▌        | 8/50 [03:15<17:16, 24.67s/it]

Current accuracy: 0.875/1.0
Current score: 72/80


 18%|█▊        | 9/50 [03:58<20:43, 30.34s/it]

Current accuracy: 0.7777777777777778/1.0
Current score: 76/90


 20%|██        | 10/50 [04:13<16:58, 25.45s/it]

Current accuracy: 0.8/1.0
Current score: 86/100


 22%|██▏       | 11/50 [04:33<15:31, 23.89s/it]

Current accuracy: 0.8181818181818182/1.0
Current score: 96/110


 24%|██▍       | 12/50 [05:03<16:22, 25.86s/it]

Current accuracy: 0.8333333333333334/1.0
Current score: 106/120


 26%|██▌       | 13/50 [05:28<15:40, 25.43s/it]

Current accuracy: 0.7692307692307693/1.0
Current score: 109/130


 28%|██▊       | 14/50 [06:06<17:35, 29.31s/it]

Current accuracy: 0.7857142857142857/1.0
Current score: 119/140


 30%|███       | 15/50 [06:29<15:59, 27.40s/it]

Current accuracy: 0.7333333333333333/1.0
Current score: 120/150


 32%|███▏      | 16/50 [06:54<15:03, 26.57s/it]

Current accuracy: 0.75/1.0
Current score: 130/160


 34%|███▍      | 17/50 [07:15<13:45, 25.01s/it]

Current accuracy: 0.7647058823529411/1.0
Current score: 140/170


 36%|███▌      | 18/50 [07:37<12:47, 23.99s/it]

Current accuracy: 0.7777777777777778/1.0
Current score: 150/180


 38%|███▊      | 19/50 [07:46<10:08, 19.64s/it]

Current accuracy: 0.7894736842105263/1.0
Current score: 160/190


 40%|████      | 20/50 [08:13<10:56, 21.89s/it]

Current accuracy: 0.75/1.0
Current score: 163/200


 42%|████▏     | 21/50 [08:43<11:42, 24.22s/it]

Current accuracy: 0.7619047619047619/1.0
Current score: 173/210


 44%|████▍     | 22/50 [09:19<12:54, 27.67s/it]

Current accuracy: 0.7727272727272727/1.0
Current score: 183/220


 46%|████▌     | 23/50 [10:04<14:52, 33.07s/it]

Current accuracy: 0.7391304347826086/1.0
Current score: 186/230


 48%|████▊     | 24/50 [10:17<11:42, 27.03s/it]

Current accuracy: 0.75/1.0
Current score: 196/240


 50%|█████     | 25/50 [10:50<11:54, 28.58s/it]

Current accuracy: 0.76/1.0
Current score: 206/250


 52%|█████▏    | 26/50 [11:31<12:56, 32.34s/it]

Current accuracy: 0.7692307692307693/1.0
Current score: 216/260


 54%|█████▍    | 27/50 [12:12<13:24, 35.00s/it]

Current accuracy: 0.7407407407407407/1.0
Current score: 222/270


 56%|█████▌    | 28/50 [12:35<11:31, 31.45s/it]

Current accuracy: 0.7142857142857143/1.0
Current score: 226/280


 58%|█████▊    | 29/50 [12:54<09:43, 27.77s/it]

Current accuracy: 0.7241379310344828/1.0
Current score: 236/290


 60%|██████    | 30/50 [13:32<10:16, 30.80s/it]

Current accuracy: 0.7/1.0
Current score: 239/300


 62%|██████▏   | 31/50 [13:55<09:00, 28.44s/it]

Current accuracy: 0.7096774193548387/1.0
Current score: 249/310


 64%|██████▍   | 32/50 [14:23<08:31, 28.41s/it]

Current accuracy: 0.71875/1.0
Current score: 259/320


 66%|██████▌   | 33/50 [14:41<07:05, 25.04s/it]

Current accuracy: 0.7272727272727273/1.0
Current score: 269/330


 68%|██████▊   | 34/50 [14:59<06:09, 23.11s/it]

Current accuracy: 0.7352941176470589/1.0
Current score: 279/340


 70%|███████   | 35/50 [16:02<08:43, 34.89s/it]

Current accuracy: 0.7428571428571429/1.0
Current score: 289/350


 72%|███████▏  | 36/50 [16:14<06:33, 28.10s/it]

Current accuracy: 0.75/1.0
Current score: 299/360


 74%|███████▍  | 37/50 [16:31<05:21, 24.71s/it]

Current accuracy: 0.7567567567567568/1.0
Current score: 309/370


 76%|███████▌  | 38/50 [17:37<07:27, 37.26s/it]

Current accuracy: 0.7368421052631579/1.0
Current score: 313/380


 78%|███████▊  | 39/50 [18:00<06:00, 32.79s/it]

Current accuracy: 0.7435897435897436/1.0
Current score: 323/390


 80%|████████  | 40/50 [18:49<06:18, 37.88s/it]

Current accuracy: 0.725/1.0
Current score: 326/400


 82%|████████▏ | 41/50 [19:24<05:32, 36.94s/it]

Current accuracy: 0.7073170731707317/1.0
Current score: 329/410


 84%|████████▍ | 42/50 [19:46<04:19, 32.49s/it]

Current accuracy: 0.7142857142857143/1.0
Current score: 339/420


 86%|████████▌ | 43/50 [20:31<04:13, 36.23s/it]

Current accuracy: 0.7209302325581395/1.0
Current score: 349/430


 88%|████████▊ | 44/50 [21:01<03:25, 34.24s/it]

Current accuracy: 0.7272727272727273/1.0
Current score: 359/440


 90%|█████████ | 45/50 [21:53<03:18, 39.64s/it]

Current accuracy: 0.7333333333333333/1.0
Current score: 368/450


 92%|█████████▏| 46/50 [22:16<02:18, 34.58s/it]

Current accuracy: 0.7391304347826086/1.0
Current score: 378/460


 94%|█████████▍| 47/50 [22:24<01:20, 26.81s/it]

Current accuracy: 0.7446808510638298/1.0
Current score: 388/470


 96%|█████████▌| 48/50 [22:45<00:50, 25.01s/it]

Current accuracy: 0.75/1.0
Current score: 398/480


 98%|█████████▊| 49/50 [23:06<00:23, 23.82s/it]

Current accuracy: 0.7551020408163265/1.0
Current score: 408/490


100%|██████████| 50/50 [23:40<00:00, 28.42s/it]

Current accuracy: 0.74/1.0
Current score: 411/500





Saving results to JSON

In [17]:
import json

# Persist the base-model results so the long evaluation run need not be repeated.
with open("base_model_test_info.json", mode='w', encoding='utf-8') as results_out:
    json.dump(
        base_model_test_info,
        results_out,
        ensure_ascii=False,
        indent=4,
    )

Saving questions to JSON

In [None]:
# Persist the sampled questions alongside the results.
with open("MATH50_ds.json", mode='w', encoding='utf-8') as questions_out:
    json.dump(
        MATH50_ds,
        questions_out,
        ensure_ascii=False,
        indent=4,
    )

Loading the saved results from JSON and rebuilding the question list

In [10]:
import json

In [21]:
# Reload the previously saved base-model results.
with open("base_model_test_info.json", mode='r', encoding='utf-8') as results_in:
    base_model_test_info = json.load(results_in)

In [None]:
# Copy the saved per-question result dicts into a fresh list.
MATH50_ds = list(base_model_test_info["answers"])

In [35]:
# Recover the full records (with reference solutions) for the evaluated
# questions. MATH50_ds holds result dicts here, so a problem string can never
# be `in` that list directly; compare against the question texts instead.
# Plain-string entries are accepted too, in case the list holds raw texts.
selected_questions = {
    item["question"] if isinstance(item, dict) else item
    for item in MATH50_ds
}

new_MATH50_ds = [
    {
        "question": example["problem"],
        "solution": example["solution"],
        "answer": example["answer"],
    }
    for example in ds["test"]
    if str(example["problem"]) in selected_questions
]

In [37]:
# Sanity check: every one of the 50 sampled questions should have been found.
len(new_MATH50_ds)

50