In [33]:
import pandas as pd
import re

aime24 = pd.read_csv("BudgetForcing_V2/aime24_Phi_BudgetForcing.csv")
gpqa = pd.read_csv("BudgetForcing_V2/gpqa_Phi_BudgetForcing.csv")
math500 = pd.read_csv("BudgetForcing_V2/math500_Phi_BudgetForcing.csv")

In [35]:
math500 = math500[["answer","ZeroShot_ReasoningTrace","Wait_1_ReasoningTrace","Wait_2_ReasoningTrace","Wait_3_ReasoningTrace"]]
gpqa = gpqa[["Correct Answer", "shuffled_correct_option", "ZeroShot_ReasoningTrace_ans","Wait_1_ReasoningTrace_ans","Wait_2_ReasoningTrace_ans","Wait_3_ReasoningTrace_ans"]]
aime24 = aime24[["answer","ZeroShot_ReasoningTrace","Wait_1_ReasoningTrace","Wait_2_ReasoningTrace","Wait_3_ReasoningTrace"]]

cols = ["ZeroShot_ReasoningTrace", "Wait_1_ReasoningTrace", "Wait_2_ReasoningTrace", "Wait_3_ReasoningTrace"]

for df in [math500, aime24]:
    for col in cols:
        df[col] = df[col].astype(str).apply(lambda x: ' '.join(x.split()[:]))

# Save the modified DataFrames to new CSV files
math500.to_csv("BudgetForcing_V2/math500_Phi_BudgetForcing_extracted.csv", index=False)
gpqa.to_csv("BudgetForcing_V2/gpqa_Phi_BudgetForcing_extracted.csv", index=False)
aime24.to_csv("BudgetForcing_V2/aime24_Phi_BudgetForcing_extracted.csv", index=False)

# GPQA Accuracy

In [40]:
def matches(row, correct_answer_col, correct_option_col, model_answer_col):
    correct_option = row['shuffled_correct_option'].strip()
    correct_answer = row['Correct Answer'].strip()
    model_answer = str(row[model_answer_col]).strip()

    # Convert to lowercase for case-insensitive search
    lowered = model_answer.lower()
    last_correct_index = lowered.rfind("correct")

    # Slice only the part after the last occurrence of "correct"
    if last_correct_index != -1:
        model_answer = model_answer[last_correct_index + len("correct"):].strip()

    return (
        correct_answer in model_answer or
        f"({correct_option})" in model_answer or
        f"{correct_option}" in model_answer
    )
    
waits = [
    "ZeroShot_ReasoningTrace_ans",
    "Wait_1_ReasoningTrace_ans",
    "Wait_2_ReasoningTrace_ans",
    "Wait_3_ReasoningTrace_ans"
]    

for wait in waits:
    matched_count = gpqa.apply(
        lambda row: matches(row, "Correct Answer", "shuffled_correct_option", wait), axis=1
    ).sum()
    print(f"Number of matching rows for {wait}: {matched_count}")


Number of matching rows for ZeroShot_ReasoningTrace_ans: 108
Number of matching rows for Wait_1_ReasoningTrace_ans: 107
Number of matching rows for Wait_2_ReasoningTrace_ans: 118
Number of matching rows for Wait_3_ReasoningTrace_ans: 121


# AIME24 Accuracy

In [28]:
def extract_numbers(text):
    # Extract integers, decimals, scientific notation, and negative numbers
    return re.findall(r"-?\d+(?:\.\d+)?(?:e[+-]?\d+)?", str(text).lower())

def matches(row, answer_col, model_answer_col):
    answer_numbers = extract_numbers(row[answer_col])
    model_numbers = extract_numbers(row[model_answer_col])

    # Check if all numbers from answer are in model_answer
    return all(num in model_numbers for num in answer_numbers)

waits = ["ZeroShot_ReasoningTrace","Wait_1_ReasoningTrace","Wait_2_ReasoningTrace","Wait_3_ReasoningTrace"]

for wait in waits:
    print(f"Checking matches for {wait}")
    matched_count = aime24.apply(
        lambda row: matches(row, "answer", wait), axis=1
    ).sum()
    print(f"Number of matching rows for {wait}: {matched_count}")

Checking matches for ZeroShot_ReasoningTrace
Number of matching rows for ZeroShot_ReasoningTrace: 2
Checking matches for Wait_1_ReasoningTrace
Number of matching rows for Wait_1_ReasoningTrace: 2
Checking matches for Wait_2_ReasoningTrace
Number of matching rows for Wait_2_ReasoningTrace: 2
Checking matches for Wait_3_ReasoningTrace
Number of matching rows for Wait_3_ReasoningTrace: 3


# MATH500

In [29]:
def extract_numbers(text):
    # Extract integers, decimals, scientific notation, and negative numbers
    return re.findall(r"-?\d+(?:\.\d+)?(?:e[+-]?\d+)?", str(text).lower())

def matches(row, answer_col, model_answer_col):
    answer_numbers = extract_numbers(row[answer_col])
    model_numbers = extract_numbers(row[model_answer_col])

    # Check if all numbers from answer are in model_answer
    return all(num in model_numbers for num in answer_numbers)

waits = ["ZeroShot_ReasoningTrace","Wait_1_ReasoningTrace","Wait_2_ReasoningTrace","Wait_3_ReasoningTrace"]

for wait in waits:
    print(f"Checking matches for {wait}")
    matched_count = math500.apply(
        lambda row: matches(row, "answer", wait), axis=1
    ).sum()
    print(f"Number of matching rows for {wait}: {matched_count}")

Checking matches for ZeroShot_ReasoningTrace
Number of matching rows for ZeroShot_ReasoningTrace: 269
Checking matches for Wait_1_ReasoningTrace
Number of matching rows for Wait_1_ReasoningTrace: 290
Checking matches for Wait_2_ReasoningTrace
Number of matching rows for Wait_2_ReasoningTrace: 303
Checking matches for Wait_3_ReasoningTrace
Number of matching rows for Wait_3_ReasoningTrace: 320
