In [3]:
import pandas as pd
import re

aime24 = pd.read_csv("BudgetForcing_V2/aime24_Phi_BudgetForcing.csv")
gpqa = pd.read_csv("BudgetForcing_V2/gpqa_Phi_BudgetForcing.csv")
math500 = pd.read_csv("BudgetForcing_V2/math500_Phi_BudgetForcing.csv")

In [4]:
math500 = math500[["answer","ZeroShot_ReasoningTrace_ans","Wait_1_ReasoningTrace_ans","Wait_2_ReasoningTrace_ans","Wait_3_ReasoningTrace_ans"]]
gpqa = gpqa[["Correct Answer", "shuffled_correct_option", "ZeroShot_ReasoningTrace_ans","Wait_1_ReasoningTrace_ans","Wait_2_ReasoningTrace_ans","Wait_3_ReasoningTrace_ans"]]
aime24 = aime24[["answer","ZeroShot_ReasoningTrace_ans","Wait_1_ReasoningTrace_ans","Wait_2_ReasoningTrace_ans","Wait_3_ReasoningTrace_ans"]]

# Save the modified DataFrames to new CSV files
math500.to_csv("BudgetForcing_V2/math500_Phi_BudgetForcing_extracted.csv", index=False)
gpqa.to_csv("BudgetForcing_V2/gpqa_Phi_BudgetForcing_extracted.csv", index=False)
aime24.to_csv("BudgetForcing_V2/aime24_Phi_BudgetForcing_extracted.csv", index=False)

# GPQA Accuracy

In [6]:
def matches(row):
    correct = row['shuffled_correct_option'].strip()
    model_answer = str(row['model_answer_extracted']).strip()
    return (
        correct == model_answer or
        f">{correct}<" in model_answer
    )

matched_count = gpqa.apply(matches, axis=1).sum()
print(f"Number of matching rows: {matched_count}")

KeyError: 'model_answer_extracted'

# AIME24 Accuracy

In [8]:
def extract_numbers(text):
    # Extract integers, decimals, scientific notation, and negative numbers
    return re.findall(r"-?\d+(?:\.\d+)?(?:e[+-]?\d+)?", str(text).lower())

def matches(row, answer_col, model_answer_col):
    answer_numbers = extract_numbers(row[answer_col])
    model_numbers = extract_numbers(row[model_answer_col])

    # Check if all numbers from answer are in model_answer
    return all(num in model_numbers for num in answer_numbers)

waits = ["ZeroShot_ReasoningTrace_ans","Wait_1_ReasoningTrace_ans","Wait_2_ReasoningTrace_ans","Wait_3_ReasoningTrace_ans"]

for wait in waits:
    print(f"Checking matches for {wait}")
    matched_count = aime24.apply(
        lambda row: matches(row, "answer", wait), axis=1
    ).sum()
    print(f"Number of matching rows for {wait}: {matched_count}")

Checking matches for ZeroShot_ReasoningTrace_ans
Number of matching rows for ZeroShot_ReasoningTrace_ans: 2
Checking matches for Wait_1_ReasoningTrace_ans
Number of matching rows for Wait_1_ReasoningTrace_ans: 1
Checking matches for Wait_2_ReasoningTrace_ans
Number of matching rows for Wait_2_ReasoningTrace_ans: 1
Checking matches for Wait_3_ReasoningTrace_ans
Number of matching rows for Wait_3_ReasoningTrace_ans: 1


# MATH500

In [9]:
def extract_numbers(text):
    # Extract integers, decimals, scientific notation, and negative numbers
    return re.findall(r"-?\d+(?:\.\d+)?(?:e[+-]?\d+)?", str(text).lower())

def matches(row, answer_col, model_answer_col):
    answer_numbers = extract_numbers(row[answer_col])
    model_numbers = extract_numbers(row[model_answer_col])

    # Check if all numbers from answer are in model_answer
    return all(num in model_numbers for num in answer_numbers)

waits = ["ZeroShot_ReasoningTrace_ans","Wait_1_ReasoningTrace_ans","Wait_2_ReasoningTrace_ans","Wait_3_ReasoningTrace_ans"]

for wait in waits:
    print(f"Checking matches for {wait}")
    matched_count = math500.apply(
        lambda row: matches(row, "answer", wait), axis=1
    ).sum()
    print(f"Number of matching rows for {wait}: {matched_count}")

Checking matches for ZeroShot_ReasoningTrace_ans
Number of matching rows for ZeroShot_ReasoningTrace_ans: 227
Checking matches for Wait_1_ReasoningTrace_ans
Number of matching rows for Wait_1_ReasoningTrace_ans: 219
Checking matches for Wait_2_ReasoningTrace_ans
Number of matching rows for Wait_2_ReasoningTrace_ans: 227
Checking matches for Wait_3_ReasoningTrace_ans
Number of matching rows for Wait_3_ReasoningTrace_ans: 229
