In [1]:
import os
import pandas as pd
import glob
import json
import re
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

## Load Completions

In [2]:
def load_all_jsonl_to_dataframe(folder_path):
    """Load all JSONL files from a folder into a single pandas DataFrame.

    Files are read in sorted path order so that the row order of the result
    is reproducible across runs and filesystems. Every row is tagged with a
    ``source_file`` column holding the base name of the file it came from.
    A file that fails to load is reported and skipped (best effort).

    Args:
        folder_path: Directory searched (non-recursively) for ``*.jsonl``.

    Returns:
        The concatenation of all successfully loaded files with a fresh
        0..N-1 index, or an empty DataFrame when nothing could be loaded.
    """
    # glob returns matches in arbitrary order; sort for a deterministic result.
    jsonl_files = sorted(glob.glob(os.path.join(folder_path, "*.jsonl")))

    dfs = []
    for file_path in jsonl_files:
        file_name = os.path.basename(file_path)
        try:
            df = pd.read_json(file_path, lines=True)
            df['source_file'] = file_name  # track which file each row came from
            dfs.append(df)
            print(f"Loaded {len(df)} rows from {file_name}")
        except Exception as e:
            # Deliberately best-effort: one bad file must not abort the load.
            print(f"Error loading {file_path}: {e}")

    if not dfs:
        print("No valid files found")
        return pd.DataFrame()

    combined_df = pd.concat(dfs, ignore_index=True)
    # Report the number of files actually loaded, not merely found.
    print(f"Created DataFrame with {len(combined_df)} rows from {len(dfs)} files")
    return combined_df

# Usage
# Load every JSONL file found directly under `folder_path` into `combined_data`.
folder_path = "../data/deepscaler_aime_sample_eval_check/"
combined_data = load_all_jsonl_to_dataframe(folder_path)

Loaded 4 rows from 0.jsonl
Loaded 4 rows from 1.jsonl
Created DataFrame with 8 rows from 2 files


## Display sample data

In [3]:
# Display the first few rows
# combined_data.head(10)

In [4]:
# combined_data.iloc[0]

## Solution Extraction Utilities

Functions to extract solutions from model outputs and compute correctness.

In [5]:
def remove_boxed(s):
    r"""Strip the ``\boxed`` wrapper from a boxed expression.

    Two spellings are accepted:

    * ``\boxed{<answer>}`` -> ``<answer>``
    * ``\boxed <answer>``  -> ``<answer>`` (space-delimited form, which
      ``last_boxed_only_string`` can return)

    Args:
        s: The boxed expression, or None.

    Returns:
        The wrapped content, or None when ``s`` is None or is not one of the
        two forms above.
    """
    if s is None:
        return None

    spaced = "\\boxed "
    if s.startswith(spaced):
        # Space-delimited form has no closing brace to strip.
        return s[len(spaced):]

    braced = "\\boxed{"
    if s.startswith(braced) and s.endswith("}"):
        return s[len(braced):-1]

    return None


def last_boxed_only_string(string):
    r"""Return the last boxed expression in ``string``, wrapper included.

    The last occurrence of ``\boxed`` decides how the answer is read:

    * ``\boxed <answer>`` (space-delimited): everything after the space up to
      the next ``$`` is returned, prefixed with ``\boxed ``.
    * otherwise: the text from ``\boxed`` through the brace that closes the
      first brace group opened after it.

    ``\fbox`` is used as a fallback when no ``\boxed`` is present.

    Args:
        string: Text to search.

    Returns:
        The boxed expression, or None when there is no ``\boxed``/``\fbox``
        or its brace group is never closed.
    """
    idx = string.rfind("\\boxed")
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None
    elif string.startswith("\\boxed ", idx):
        # Only honour the space-delimited form when it is the *last* \boxed;
        # an earlier "\boxed " in prose must not override a later \boxed{...}.
        return "\\boxed " + string[idx + len("\\boxed "):].split("$")[0]

    # Walk forward from the marker and stop at the brace that balances the
    # first one opened.
    depth = 0
    for i in range(idx, len(string)):
        ch = string[i]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return string[idx:i + 1]

    return None

def extract_solution(text):
    """Return the answer inside the last boxed expression of ``text``.

    The last boxed expression is located with ``last_boxed_only_string`` and
    unwrapped with ``remove_boxed``; the result is None when nothing can be
    extracted.
    """
    boxed_expression = last_boxed_only_string(text)
    return remove_boxed(boxed_expression)

def correctness_reward_func(response: str, actual_answers: str) -> float:
    """Score a response by exact match of its extracted answer.

    Args:
        response: Model output to extract the answer from.
        actual_answers: Ground-truth answer. It is coerced to ``str`` before
            comparing, matching the reward computation used for the
            DataFrame, so a numeric ground truth still matches.

    Returns:
        1.0 when the extracted answer equals the ground truth as text, else
        0.0 (including when no answer could be extracted).
    """
    extracted_answer = extract_solution(response)
    # The extracted answer is always text (or None); compare text with text.
    return 1.0 if extracted_answer == str(actual_answers) else 0.0

## Transform for Analysis

In [6]:
# Distinct (prompt, answer) pairs; column selection already yields a new frame.
check = combined_data[['prompt', 'answer']].drop_duplicates()
print(f"len(check): {len(check)}")

len(check): 8


In [7]:
# Build the analysis frame: alias the prompt/answer columns, drop the
# bookkeeping column, then expand to one row per entry of `outputs`.
df = (
    combined_data
    .assign(
        question_text=combined_data['prompt'],
        ground_truth=combined_data['answer'],
    )
    .drop(columns=['source_file'])
    .explode('outputs')
)

# Each exploded `outputs` entry is indexed by its 'output' key to get the text.
df['response'] = df['outputs'].apply(lambda sample: sample['output'])
df['extracted_solution'] = df['response'].apply(extract_solution)

# Rows without an extracted solution are kept on purpose; they simply score 0.
# To drop them instead:
#   df = df[df['extracted_solution'].notna()]

# Exact string match between the extracted answer and the ground truth.
df['reward'] = (
    df['extracted_solution'] == df['ground_truth'].astype(str)
).astype(float)
# df
# df[df['extracted_solution'].isna()]

## Spot Check Traces

In [8]:
# Which prompt to inspect, and the positions used when printing traces below.
prompt_id = 0
correct_pos = 0
incorrect_pos = 0

# All rows for the chosen prompt, then split by reward.
prompt = df.loc[df['prompt_id'] == prompt_id]
prompt_correct = prompt.loc[prompt['reward'] == 1.0]
prompt_incorrect = prompt.loc[prompt['reward'] == 0.0]

In [9]:
# Print the prompt text of the row at position `correct_pos` within `prompt`.
print(prompt.iloc[correct_pos]['prompt'])

A list of positive integers has the following properties:
$\bullet$ The sum of the items in the list is $30$.
$\bullet$ The unique mode of the list is $9$.
$\bullet$ The median of the list is a positive integer that does not appear in the list itself.
 Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.


In [10]:
# Print one response graded correct; positions into `prompt_correct` are
# selected with `correct_pos` (not `incorrect_pos`).
print(prompt_correct.iloc[correct_pos]['response'])

 Let's think step by step and output the final answer within \boxed.

To determine the sum of squares, we need to find all the elements in the list, then compute their squares and sum them.

First, note that the sum of the list is 30, unique mode is 9, and the median doesn't appear in the list.

We know that the list must contain at least two 9s because the mode is unique and 9 is the mode. So let's try to figure out the possible number of elements.

Wait, let's think step by step.

Firstly, since the unique mode is 9, which means that all other numbers appear fewer times than 9. So 9 must appear more times than any other number.

Therefore, in the list, 9 occurs the maximum number of times, and the counts of all other numbers are less than that.

Given that the sum is 30, and all numbers are positive integers.

We also know that the median is a positive integer that does not appear in the list.

So first, let's figure out the number of elements.

Let’s denote n as the number of elemen

In [11]:
# Print one response graded incorrect.
# NOTE(review): the position is hard-coded to 1 while `incorrect_pos` is left
# unused here — confirm whether `incorrect_pos` was intended.
print(prompt_incorrect.iloc[1]['response'])

 Let's think step by step and output the final answer within \boxed.
Alright, so I've got this problem here about a list of positive integers with three properties. Let me try to parse each one and see what I can figure out step by step. Maybe after breaking it down, it'll all make sense. Okay, let's go.

First, the problem says: "The sum of the items in the list is 30." That's straightforward. So if we denote the list as [a1, a2, a3, ..., an], where each ai is a positive integer, then a1 + a2 + a3 + ... + an = 30.

Next property: "The unique mode of the list is 9." Hmm, mode is the number that appears most frequently in the list. And it's a unique mode, meaning 9 appears more times than any other number in the list. So in other words, the number of times 9 appears is greater than any other number's count. So if another number appears x times, 9 must appear at least x + 1 times? Wait, maybe not necessarily, but at least the number of times it appears is the highest. So since it's the u

## Measure Performance (pass@k)

In [12]:
def pass_at_k(n, c, k):
    """
    Estimate pass@k from n samples of which c are correct.

    Evaluates 1 - C(n - c, k) / C(n, k) as a product of k-dependent factors
    rather than through explicit binomial coefficients.

    :param n: total number of samples
    :param c: number of correct samples
    :param k: k in pass@$k$
    :return: estimated probability that at least one of k samples is correct
    """
    if c <= 0:
        # No correct sample means nothing can pass. Without this guard a
        # k larger than n would fall into the branch below and report 1.0.
        return 0.0
    if n - c < k:
        # Fewer than k incorrect samples: every k-subset contains a correct one.
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# pass_at_k(16, 2, 1)

In [13]:
# Per-question summary of the rewards: number of graded responses, number
# correct (sum of rewards) and mean accuracy.
perf = (
    df
    .groupby(['question_text', 'ground_truth'])['reward']
    .agg(count='count', sum_reward='sum', accuracy='mean')
)

# print(f"len(perf): {len(perf)}")

# If only answers with an extracted \boxed{...} answer are kept, count isn't
# always 16, so the per-row count is passed to pass_at_k instead of a constant.
# assert(len(perf[perf['count'] != 16]) == 0)
# perf[f'pass@{k}'] = perf['sum_reward'].apply(lambda x: pass_at_k(16, x, k))

k = 1
perf[f'pass@{k}'] = perf.apply(
    lambda row: pass_at_k(row['count'], row['sum_reward'], k),
    axis=1,
)
# perf

In [14]:
# One plain-dict record per question; the answer and the pass@k value are
# stored as strings.
tweak_dataset = [
    dict(
        problem=row['question_text'],
        answer=str(row['ground_truth']),
        **{f'pass@{k}': str(row[f'pass@{k}'])},
    )
    for _label, row in perf.reset_index().iterrows()
]
# tweak_dataset

In [15]:
# Keep the column label under its own name: binding it to `pass_at_k` would
# replace the pass_at_k() function with a string, so re-running the cell that
# computes perf would fail.
pass_at_k_col = f'pass@{k}'
for p in tweak_dataset:
    print(f"problem {p['problem']}")
    print(f"answer {p['answer']}")
    print(f"{pass_at_k_col} {p[pass_at_k_col]}")
    print()

problem A list of positive integers has the following properties:
$\bullet$ The sum of the items in the list is $30$.
$\bullet$ The unique mode of the list is $9$.
$\bullet$ The median of the list is a positive integer that does not appear in the list itself.
 Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.
answer 236
pass@1 0.3749999999999998

problem Let $A$, $B$, $C$, and $D$ be point on the hyperbola $\frac{x^2}{20}- \frac{y^2}{24} = 1$ such that $ABCD$ is a rhombus whose diagonals intersect at the origin. Find the greatest real number that is less than $BD^2$ for all such rhombi. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.
answer 480
pass@1 0.1875

problem Let $ABC$ be a triangle inscribed in circle $\omega$. Let the tangents to $\omega$ a