In [1]:
import os
import pandas as pd
import glob
import json
import re
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

## Load Completions

In [2]:
def load_all_jsonl_to_dataframe(folder_path):
    """Load every ``*.jsonl`` file in a folder into one pandas DataFrame.

    Each file is parsed with ``pd.read_json(..., lines=True)`` and tagged with
    a ``source_file`` column holding the file's base name. Files that fail to
    load are reported and skipped, so loading is best-effort.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.jsonl`` files.

    Returns:
        The concatenation of all successfully loaded files with a fresh
        0..N-1 index, or an empty DataFrame when nothing could be loaded.
    """
    # glob returns matches in arbitrary, filesystem-dependent order; sort so
    # the row order of the result is reproducible across runs and machines.
    jsonl_files = sorted(glob.glob(os.path.join(folder_path, "*.jsonl")))

    dfs = []
    for file_path in jsonl_files:
        file_name = os.path.basename(file_path)
        try:
            df = pd.read_json(file_path, lines=True)
            df['source_file'] = file_name  # track which file each row came from
            dfs.append(df)
            print(f"Loaded {len(df)} rows from {file_name}")
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the load.
            print(f"Error loading {file_path}: {e}")

    if not dfs:
        print("No valid files found")
        return pd.DataFrame()

    combined_df = pd.concat(dfs, ignore_index=True)
    # Count the files that actually loaded, not every file that was globbed.
    print(f"Created DataFrame with {len(combined_df)} rows from {len(dfs)} files")
    return combined_df

# Usage
# Folder holding the JSONL result files. The path is relative, so it is
# resolved against the kernel's current working directory.
folder_path = "../data/light_r1_dpo_DeepScaleR_1_5_B_results_5000_plus/"
# One DataFrame with the rows of every file that loaded successfully.
combined_data = load_all_jsonl_to_dataframe(folder_path)

Loaded 6 rows from 4.jsonl
Loaded 6 rows from 2.jsonl
Loaded 6 rows from 0.jsonl
Loaded 6 rows from 1.jsonl
Loaded 6 rows from 3.jsonl
Created DataFrame with 30 rows from 5 files


## Display sample data

In [3]:
# Display the first few rows
# combined_data.head(10)

In [4]:
# combined_data.iloc[0]

## Solution Extraction Utilities

Functions to extract solutions from model outputs and compute correctness.

In [5]:
def remove_boxed(s):
    """Strip the LaTeX box wrapper from ``s`` and return what it contains.

    Recognised forms:

    * ``\\boxed{...}`` and ``\\fbox{...}``: returns the text between the
      leading command-plus-brace and the final ``}``.
    * ``\\boxed <text>`` (space-delimited): returns everything after the space.

    Args:
        s: A string expected to consist of exactly one boxed expression, or None.

    Returns:
        The unwrapped contents, or None when ``s`` is None or is not in one of
        the recognised forms.
    """
    if s is None:
        return None

    # Space-delimited form, e.g. "\boxed 5".
    spaced = "\\boxed "
    if s.startswith(spaced):
        return s[len(spaced):]

    # Brace-delimited forms; the string must also end with the closing brace.
    for left in ("\\boxed{", "\\fbox{"):
        if s.startswith(left) and s.endswith("}"):
            return s[len(left):-1]

    return None


def last_boxed_only_string(string):
    """Return the last ``\\boxed`` / ``\\fbox`` expression found in ``string``.

    Only the *final* ``\\boxed`` occurrence decides the result, so an earlier
    mention such as "put the answer in \\boxed form" cannot mask a later
    ``\\boxed{...}`` answer:

    * If the final occurrence is ``\\boxed <text>`` (space-delimited), returns
      ``"\\boxed "`` plus the text that follows, up to the next ``$`` or the
      end of the string.
    * Otherwise returns the span from the command through the first ``}`` at
      which the braces opened since the command are balanced.
    * ``\\fbox`` is considered only when no ``\\boxed`` is present.

    Args:
        string: Text to search.

    Returns:
        The matched expression including its command, or None when the input
        is not a string, no command is found, or the braces never balance.
    """
    if not isinstance(string, str):
        return None

    spaced = "\\boxed "
    idx = string.rfind("\\boxed")
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None
    elif string.startswith(spaced, idx):
        # Space-delimited form: the argument runs until the closing "$".
        return spaced + string[idx + len(spaced):].split("$")[0]

    # Brace-delimited form: scan forward until the braces balance.
    depth = 0
    for i in range(idx, len(string)):
        ch = string[i]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return string[idx:i + 1]

    # Ran off the end without closing the box.
    return None

def extract_solution(text):
    """Return the contents of the last boxed expression in ``text``.

    Locates the expression with ``last_boxed_only_string`` and unwraps it with
    ``remove_boxed``; the result is whatever ``remove_boxed`` returns.
    """
    boxed = last_boxed_only_string(text)
    return remove_boxed(boxed)

def correctness_reward_func(response: str, actual_answers: str) -> float:
    """Score a response by exact match of its extracted solution.

    Returns 1.0 when ``extract_solution(response)`` equals ``actual_answers``
    and 0.0 otherwise.
    """
    if extract_solution(response) == actual_answers:
        return 1.0
    return 0.0

## Transform for Analysis

In [6]:
# Distinct (prompt, answer) pairs in the loaded data.
check = combined_data[['prompt', 'answer']].drop_duplicates()
print(f"len(check): {len(check)}")

len(check): 30


In [7]:
# Build one row per sampled completion: alias the prompt/answer columns, drop
# the bookkeeping column, explode the list in `outputs`, then extract and score
# each response against the stringified ground truth.
#
# Optional (disabled): keep only completions where a solution was extracted:
#   df = df[df['extracted_solution'].notna()]
df = (
    combined_data
    .assign(
        question_text=lambda frame: frame['prompt'],
        ground_truth=lambda frame: frame['answer'],
    )
    .drop(columns=['source_file'])
    .explode('outputs')
    .assign(
        response=lambda frame: frame['outputs'].map(lambda sample: sample['output']),
        extracted_solution=lambda frame: frame['response'].map(extract_solution),
        reward=lambda frame: (
            frame['extracted_solution'] == frame['ground_truth'].astype(str)
        ).astype(float),
    )
)
# df
# df[df['extracted_solution'].isna()]

## Spot Check Traces

In [8]:
# Which question to inspect, and which of its correct / incorrect samples
# (by position) to print in the cells below.
prompt_id = 0
correct_pos = 0
incorrect_pos = 6

# All completions for the chosen question, then split by reward.
prompt = df.loc[df['prompt_id'] == prompt_id]
prompt_correct = prompt.loc[prompt['reward'] == 1.0]
prompt_incorrect = prompt.loc[prompt['reward'] == 0.0]

In [9]:
# Show the question text; every row in `prompt` was selected by the same prompt_id.
print(prompt.iloc[correct_pos]['prompt'])

Every morning Aya goes for a $9$-kilometer-long walk and stops at a coffee shop afterwards. When she walks at a constant speed of $s$ kilometers per hour, the walk takes her 4 hours, including $t$ minutes spent in the coffee shop. When she walks $s+2$ kilometers per hour, the walk takes her 2 hours and 24 minutes, including $t$ minutes spent in the coffee shop. Suppose Aya walks at $s+\frac{1}{2}$ kilometers per hour. Find the number of minutes the walk takes her, including the $t$ minutes spent in the coffee shop. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.


In [10]:
# Print the response at position `correct_pos` among this prompt's reward == 1.0 rows.
# NOTE(review): raises IndexError when the prompt has fewer than correct_pos + 1 such rows.
print(prompt_correct.iloc[correct_pos]['response'])

 Your response is... in the past year, I was an approach.\n<think>
Alright, let's try to solve this problem step by step. First, let's understand what's given.

We have Aya who walks 9 kilometers every morning. Depending on her speed, the time she takes changes. Additionally, she spends some time in a coffee shop. We are given two scenarios:

1. When she walks at speed s km/h, the total time including the coffee shop time is 4 hours.
2. When she increases her speed to s+2 km/h, the total time becomes 2 hours and 24 minutes, which includes the same t minutes in the coffee shop.

We need to find the time in minutes when she walks at speed s + 0.5 km/h, including the t minutes in the coffee shop.

Let's break this down.

First, let's define the variables:

- Let s be her walking speed in km/h.
- Let t be the time she spends in the coffee shop in minutes.
- Let d be the distance walked before entering the coffee shop, which is 9 km total? Wait, no. Wait, when she starts walking, she goes 9

In [11]:
# Print the response at position `incorrect_pos` among this prompt's reward == 0.0 rows.
# NOTE(review): raises IndexError when the prompt has fewer than incorrect_pos + 1 such rows.
print(prompt_incorrect.iloc[incorrect_pos]['response'])

 Let's think step by step and output the final answer within \boxed.
Okay, so let's try to figure out this problem step by step. It might take a while because it's a bit complex, but I'll try to break it down.

First, let's understand the given information:

Aya walks 9 kilometers every morning and stops at a coffee shop. The time she takes includes both walking and the time spent in the shop.

We have two scenarios:

1. When Aya walks at speed s km/h, it takes her 4 hours in total, including t minutes in the shop.

2. When she increases her speed to s + 2 km/h, the total time becomes 2 hours and 24 minutes, still including t minutes in the shop.

We need to find out the total time (including t) if her speed is s + 1/2 km/h. Then, convert that time into minutes.

Let me note down all the given information and try to find s and t.

First, for the first case:

She walks 9 km at speed s km/h, taking her walking time plus the shop time.

So, total time is walking time plus t.

Total time i

## Measure Performance

In [12]:
def pass_at_k(n, c, k):
    """Compute pass@k from ``n`` samples of which ``c`` are correct.

    :param n: total number of samples
    :param c: number of correct samples
    :param k: k in pass@k
    :return: 1.0 when fewer than ``k`` samples are incorrect; otherwise
        ``1 - prod(1 - k / i)`` for ``i`` from ``n - c + 1`` to ``n`` inclusive
    """
    incorrect = n - c
    if incorrect < k:
        return 1.0

    denominators = np.arange(incorrect + 1, n + 1)
    return 1.0 - np.prod(1.0 - k / denominators)

# pass_at_k(16, 2, 1)

In [13]:
# Per-question summary of the sampled completions: how many there are, how many
# were rewarded, and the mean reward.
perf = (
    df.groupby(['question_text', 'ground_truth'])['reward']
    .agg(count='count', sum_reward='sum', accuracy='mean')
)

# If only completions with an extracted \boxed{...} answer are kept, `count`
# is not always 16, so pass@k is computed from each question's own count
# rather than a fixed sample size.
k = 1
perf[f'pass@{k}'] = perf.apply(
    lambda row: pass_at_k(row['count'], row['sum_reward'], k),
    axis=1,
)
# perf

In [14]:
# One plain-dict record per question: the problem text, the answer as a string,
# and the pass@k value as a string.
tweak_dataset = [
    {
        'problem': record['question_text'],
        'answer': str(record['ground_truth']),
        f'pass@{k}': str(record[f'pass@{k}']),
    }
    for row_label, record in perf.reset_index().iterrows()
]
# tweak_dataset

In [15]:
# Unweighted mean of pass@k over the rows of `perf` (one row per question/answer group).
overall_perf = perf[[f'pass@{k}']].mean()
overall_perf

pass@1    0.055208
dtype: float64

In [16]:
# Key under which each record stores its metric. It is deliberately not named
# `pass_at_k`: that would rebind the pass_at_k() function to a string and make
# the performance cell fail when it is run again.
pass_at_k_key = f'pass@{k}'
for p in tweak_dataset:
    print(f"problem {p['problem']}")
    print(f"answer {p['answer']}")
    print(f"{pass_at_k_key} {p[pass_at_k_key]}")
    print()

problem A list of positive integers has the following properties:
$\bullet$ The sum of the items in the list is $30$.
$\bullet$ The unique mode of the list is $9$.
$\bullet$ The median of the list is a positive integer that does not appear in the list itself.
 Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.
answer 236
pass@1 0.03125

problem Alice and Bob play the following game. A stack of $n$ tokens lies before them. The players take turns with Alice going first. On each turn, the player removes either $1$ token or $4$ tokens from the stack. Whoever removes the last token wins. Find the number of positive integers $n$ less than or equal to $2024$ for which there exists a strategy for Bob that guarantees that Bob will win the game regardless of Alice's play. Let's think step by step and output the final answer within \boxed{}. Let's thin