In [1]:
# assumption for AIME : -1 cannot be an answer

import re
from typing import List
import pandas as pd
import numpy as np

from datasets import load_dataset

#### Dataset

In [2]:
# load dataset: fetch the AIME 2024 problems, then keep only the 'train' split
aime24_splits = load_dataset("Maxwell-Jia/AIME_2024")
aime24 = aime24_splits['train']

In [3]:
# display the dataset summary (feature names and number of rows)
aime24

Dataset({
    features: ['ID', 'Problem', 'Solution', 'Answer'],
    num_rows: 30
})

In [4]:
# display the first record to see the fields stored for one problem
aime24[0]

{'ID': '2024-II-4',
 'Problem': 'Let $x,y$ and $z$ be positive real numbers that satisfy the following system of equations: \n\\[\\log_2\\left({x \\over yz}\\right) = {1 \\over 2}\\]\n\\[\\log_2\\left({y \\over xz}\\right) = {1 \\over 3}\\]\n\\[\\log_2\\left({z \\over xy}\\right) = {1 \\over 4}\\]\nThen the value of $\\left|\\log_2(x^4y^3z^2)\\right|$ is $\\tfrac{m}{n}$ where $m$ and $n$ are relatively prime positive integers. Find $m+n$.',
 'Solution': 'Denote $\\log_2(x) = a$, $\\log_2(y) = b$, and $\\log_2(z) = c$.\n\nThen, we have:\n$a-b-c = \\frac{1}{2}$,\n$-a+b-c = \\frac{1}{3}$,\n$-a-b+c = \\frac{1}{4}$.\n\nNow, we can solve to get $a = \\frac{-7}{24}, b = \\frac{-9}{24}, c = \\frac{-5}{12}$.\nPlugging these values in, we obtain $|4a + 3b + 2c|  = \\frac{25}{8} \\implies \\boxed{033}$.',
 'Answer': 33}

In [5]:
# true answers: map each problem ID to its reference answer
aime24_ids = aime24['ID']
aime24_answers = aime24['Answer']
ans_dict = {problem_id: true_answer for problem_id, true_answer in zip(aime24_ids, aime24_answers)}

#### Results files

In [8]:
# load generations
# multiple entries with same ID (multiple runs)

# Directory that holds the result CSVs. It is the only machine-specific part
# of the path, so it is kept in one place instead of being repeated per file.
RESULTS_DIR = "/Users/z5652395/formalizing-reasoning/results"

# Result files previously used with this notebook (assign one to RESULTS_NAME):
#   AIME_2024_Qwen2.5-Math-7B-Instruct_runs_8.csv
#   AIME_2024_Qwen3-8B_runs_8.csv
#   AIME_2024_Qwen3-8B_think_runs_8.csv
#   AIME_2024_Qwen3-8B_4_options_runs_1.csv
#   AIME_2024_Qwen2.5-Math-7B-Instruct_runs_8_38912.csv
#   AIME_2024_Qwen3-8B_runs_8_38912.csv
RESULTS_NAME = "AIME_2024_Qwen3-8B_runs_8_38912.csv"

# full path of the CSV that the following cells read
results_file = "{}/{}".format(RESULTS_DIR, RESULTS_NAME)

In [9]:
# one row per generation; the same problem ID appears once for every run
results_df = pd.read_csv(results_file)

id_list = results_df['ID'].values.tolist()
gen_list = results_df['Generated'].values.tolist()

# compile answers by id: problem ID -> list of generations, in file order
gen_dict = {}
for p_id, generation in zip(id_list, gen_list):
    gen_dict.setdefault(p_id, []).append(generation)

#### Functions for matching boxed answers

In [10]:
# matching functions

def naive_match(gen: str) -> List[str]:
    """Extract the content of the first ``boxed{...}`` in ``gen`` by plain string search.

    Args:
        gen: generated text to search.

    Returns:
        A one-element list holding the text between ``boxed{`` and the next
        ``}``, or an empty list when ``gen`` is not a string, contains no
        ``boxed{``, or the box is never closed. Nested braces are not
        balanced: the content stops at the first ``}``.
    """
    # non-string input has no .find(); report "no match" instead of raising
    if not isinstance(gen, str):
        return []

    box = 'boxed{'
    box_begin = gen.find(box)
    # find() returns -1 when absent; slicing with that index would return
    # an arbitrary piece of the text instead of "no answer"
    if box_begin == -1:
        return []

    content_begin = box_begin + len(box)
    box_end = gen.find('}', content_begin)
    # an unterminated box has no well-defined content
    if box_end == -1:
        return []

    return [gen[content_begin:box_end]]

def reg_match(gen: str) -> List[str]:
    """Extract the content of every ``boxed{...}`` in ``gen`` with a regex.

    Args:
        gen: generated text to search.

    Returns:
        The captured contents in order of appearance; an empty list when
        there is no match or when ``gen`` is not a string. The match is
        non-greedy and stops at the first ``}``, so nested braces are cut
        short, and ``.`` does not cross a newline.
    """
    # re.findall raises TypeError (not AttributeError) on non-string input,
    # for instance a float NaN, so check the type explicitly.
    if not isinstance(gen, str):
        return []
    return re.findall(r'boxed{(.+?)}', gen)

#### pass@1 accuracy

In [11]:
# pass@1
# assumption : -1 cannot be an answer, so -1 is used as the sentinel for a
# generation with no boxed answer or a boxed answer that is not an integer

pass_dict = {}
# Iterate over the unique problem IDs. id_list holds one entry per generation,
# so looping over it would rescore every problem once per run. gen_dict keeps
# the IDs in order of first appearance, so the resulting order is unchanged.
for p_id, generations in gen_dict.items():

    answer = ans_dict[p_id]

    p = []
    for gen in generations:

        # match boxed{.} (naive_match(gen) can be swapped in here)
        gen_answers = reg_match(gen)

        # take the first match as the answer
        if len(gen_answers) > 0:
            try:
                gen_ans = int(gen_answers[0])
            # incorrect gen: boxed content is not an integer
            except ValueError:
                gen_ans = -1
        else:
            gen_ans = -1

        # 1 if this run matched the reference answer, else 0
        p.append(int(gen_ans == answer))

    pass_dict[p_id] = p

pass_dict

{'2024-II-4': [1, 1, 1, 1, 1, 1, 1, 1],
 '2024-II-12': [1, 1, 1, 1, 1, 1, 0, 1],
 '2024-I-4': [1, 1, 1, 1, 1, 1, 1, 1],
 '2024-I-3': [1, 1, 1, 1, 1, 1, 1, 1],
 '2024-I-8': [0, 0, 0, 0, 0, 0, 0, 0],
 '2024-I-12': [0, 0, 0, 0, 0, 0, 0, 0],
 '2024-I-11': [0, 0, 0, 0, 0, 1, 0, 0],
 '2024-II-11': [1, 1, 1, 1, 1, 1, 1, 1],
 '2024-I-2': [1, 1, 1, 1, 1, 1, 1, 1],
 '2024-II-6': [1, 1, 1, 1, 1, 1, 1, 1],
 '2024-I-7': [1, 1, 1, 1, 1, 1, 1, 1],
 '2024-II-3': [1, 1, 1, 1, 1, 1, 1, 0],
 '2024-I-1': [1, 1, 1, 1, 1, 1, 1, 1],
 '2024-II-7': [1, 1, 1, 1, 0, 0, 1, 1],
 '2024-I-6': [1, 1, 1, 1, 1, 1, 1, 1],
 '2024-I-13': [1, 1, 1, 1, 1, 1, 1, 1],
 '2024-I-15': [1, 1, 1, 1, 1, 1, 1, 1],
 '2024-II-15': [0, 0, 0, 0, 0, 0, 0, 0],
 '2024-II-10': [1, 1, 1, 1, 1, 1, 1, 1],
 '2024-II-9': [0, 0, 0, 0, 0, 0, 0, 0],
 '2024-II-14': [1, 1, 0, 1, 1, 1, 1, 0],
 '2024-II-5': [1, 0, 1, 1, 1, 0, 1, 1],
 '2024-I-9': [0, 1, 1, 1, 1, 0, 1, 1],
 '2024-II-2': [1, 1, 1, 0, 0, 1, 1, 1],
 '2024-II-1': [1, 1, 1, 1, 1, 1, 1, 1],
 '2

In [12]:
# avg pass@1: fraction of correct runs for each problem
# can measure problem difficulty with this
# another way to sort problems
avg_pass_dict = {
    problem_id: sum(run_flags) / len(run_flags)
    for problem_id, run_flags in pass_dict.items()
}

avg_pass_dict

{'2024-II-4': 1.0,
 '2024-II-12': 0.875,
 '2024-I-4': 1.0,
 '2024-I-3': 1.0,
 '2024-I-8': 0.0,
 '2024-I-12': 0.0,
 '2024-I-11': 0.125,
 '2024-II-11': 1.0,
 '2024-I-2': 1.0,
 '2024-II-6': 1.0,
 '2024-I-7': 1.0,
 '2024-II-3': 0.875,
 '2024-I-1': 1.0,
 '2024-II-7': 0.75,
 '2024-I-6': 1.0,
 '2024-I-13': 1.0,
 '2024-I-15': 1.0,
 '2024-II-15': 0.0,
 '2024-II-10': 1.0,
 '2024-II-9': 0.0,
 '2024-II-14': 0.75,
 '2024-II-5': 0.75,
 '2024-I-9': 0.75,
 '2024-II-2': 0.75,
 '2024-II-1': 1.0,
 '2024-I-10': 1.0,
 '2024-II-8': 0.125,
 '2024-I-14': 1.0,
 '2024-I-5': 0.75,
 '2024-II-13': 1.0}

In [13]:
# overall pass@1: unweighted mean of the per-problem averages
overall_pass_at_1 = sum(avg_pass_dict.values()) / len(avg_pass_dict)
print("pass@1 : {}".format(overall_pass_at_1))

pass@1 : 0.75


#### Accuracy per run

In [None]:
# TODO: compute the accuracy of each individual run

#### Variance by problem

In [14]:
# accuracy variance by problem
# one way to sort problems
# one row per problem, one column per run
# NOTE(review): assumes every problem has the same number of runs — confirm
pass_arrays = np.array(list(pass_dict.values()))
# variance along the run axis, with NumPy's default ddof
per_problem_var = pass_arrays.var(axis=1).tolist()
acc_vars = {problem_id: run_var for problem_id, run_var in zip(pass_dict, per_problem_var)}
acc_vars

{'2024-II-4': 0.0,
 '2024-II-12': 0.109375,
 '2024-I-4': 0.0,
 '2024-I-3': 0.0,
 '2024-I-8': 0.0,
 '2024-I-12': 0.0,
 '2024-I-11': 0.109375,
 '2024-II-11': 0.0,
 '2024-I-2': 0.0,
 '2024-II-6': 0.0,
 '2024-I-7': 0.0,
 '2024-II-3': 0.109375,
 '2024-I-1': 0.0,
 '2024-II-7': 0.1875,
 '2024-I-6': 0.0,
 '2024-I-13': 0.0,
 '2024-I-15': 0.0,
 '2024-II-15': 0.0,
 '2024-II-10': 0.0,
 '2024-II-9': 0.0,
 '2024-II-14': 0.1875,
 '2024-II-5': 0.1875,
 '2024-I-9': 0.1875,
 '2024-II-2': 0.1875,
 '2024-II-1': 0.0,
 '2024-I-10': 0.0,
 '2024-II-8': 0.109375,
 '2024-I-14': 0.0,
 '2024-I-5': 0.1875,
 '2024-II-13': 0.0}

In [15]:
# average var: unweighted mean of the per-problem variances
all_vars = list(acc_vars.values())
mean_var = sum(all_vars) / len(all_vars)
print('avg var : {}'.format(mean_var))

avg var : 0.052083333333333336
