In [1]:
import pandas as pd
import re

aime24 = pd.read_csv("ZeroShot/aime24_Phi4mini_HF_zeroshot_single.csv")
gpqa = pd.read_csv("ZeroShot/gpqa_Phi4mini_HF_zeroshot_single.csv")
math500 = pd.read_csv("ZeroShot/math500_Phi4mini_HF_zeroshot_single.csv")

In [None]:
pattern = r"<reasoning>.*?</reasoning>\s*<answer>(.*?)</answer>"

def extract_or_trim(text):
    if pd.isna(text):
        return ""
    match = re.fullmatch(r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>", text, flags=re.DOTALL)
    if match:
        # Extract content inside <answer>...</answer>
        extracted = re.search(pattern, text, flags=re.DOTALL)
        return extracted.group(1).strip() if extracted else ""
    else:
        # Return last 15 words
        return ' '.join(text.split()[-20:])

# Apply to the column and create a new column
math500['model_answer_extracted'] = math500['model_response_raw'].apply(extract_or_trim)
gpqa['model_answer_extracted'] = gpqa['model_response_raw'].apply(extract_or_trim)
aime24['model_answer_extracted'] = aime24['model_response_raw'].apply(extract_or_trim)


math500 = math500[["answer","model_answer_extracted"]]
gpqa = gpqa[["Correct Answer", "shuffled_correct_option_label", "model_answer_extracted"]]
aime24 = aime24[["answer","model_answer_extracted"]]


# Save the modified DataFrames to new CSV files
math500.to_csv("ZeroShot/math500_Phi4mini_HF_zeroshot_single_extracted.csv", index=False)
gpqa.to_csv("ZeroShot/gpqa_Phi4mini_HF_zeroshot_single_extracted.csv", index=False)
aime24.to_csv("ZeroShot/aime24_Phi4mini_HF_zeroshot_single_extracted.csv", index=False)

# GPQA Accuracy

In [3]:
def matches(row):
    correct = row['shuffled_correct_option_label'].strip()
    model_answer = str(row['model_answer_extracted']).strip()
    return (
        correct == model_answer or
        f">{correct}<" in model_answer
    )

matched_count = gpqa.apply(matches, axis=1).sum()
print(f"Number of matching rows: {matched_count}")

Number of matching rows: 135


# AIME24 Accuracy

In [4]:
def matches(row):
    correct = str(row['answer']).strip()
    model_answer = str(row['model_answer_extracted']).strip()
    return (
        correct in model_answer
    )

matched_count = aime24.apply(matches, axis=1).sum()
print(f"Number of matching rows: {matched_count}")

Number of matching rows: 1


# MATH500

In [None]:
def extract_numbers(text):
    # Extract integers, decimals, scientific notation, and negative numbers
    return re.findall(r"-?\d+(?:\.\d+)?(?:e[+-]?\d+)?", str(text).lower())

def matches(row):
    answer_numbers = extract_numbers(row['answer'])
    model_numbers = extract_numbers(row['model_answer_extracted'])

    # Check if all numbers from answer are in model_answer
    return all(num in model_numbers for num in answer_numbers)

matched_count = math500.apply(matches, axis=1).sum()
print(f"Number of matching rows: {matched_count}")

Number of matching rows: 257
