In [1]:
# Path of the JSONL file that this notebook writes `tweak_dataset` to.
OUTPUT_FILE = "grpo_tweak_dataset.jsonl"

In [2]:
import os
import pandas as pd
import glob
import json
import re
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

## Load Completions

In [3]:
def load_all_jsonl_to_dataframe(folder_path):
    """Load all JSONL files from a folder into a single pandas DataFrame.

    Every ``*.jsonl`` file directly inside ``folder_path`` is read with
    ``pd.read_json(..., lines=True)``. A ``source_file`` column holding the
    file's base name is added to each frame before they are concatenated.

    Files are processed in sorted path order so that the row order of the
    result is reproducible across runs and filesystems. A file that fails to
    load is reported and skipped (best effort); it does not abort the load.

    :param folder_path: directory to search for ``*.jsonl`` files.
    :return: the concatenated DataFrame (with a fresh 0..n-1 index), or an
        empty DataFrame when no file could be loaded.
    """
    # glob returns matches in arbitrary (filesystem-dependent) order; sort so
    # that the combined frame is deterministic.
    jsonl_files = sorted(glob.glob(os.path.join(folder_path, "*.jsonl")))

    dfs = []
    for file_path in jsonl_files:
        file_name = os.path.basename(file_path)
        try:
            file_df = pd.read_json(file_path, lines=True)
        except Exception as e:
            # Best effort: report the unreadable file and carry on.
            print(f"Error loading {file_path}: {e}")
            continue

        file_df['source_file'] = file_name  # track which file each row came from
        dfs.append(file_df)
        print(f"Loaded {len(file_df)} rows from {file_name}")

    if not dfs:
        print("No valid files found")
        return pd.DataFrame()

    combined_df = pd.concat(dfs, ignore_index=True)
    # Report the number of files actually loaded, not the number matched.
    print(f"Created DataFrame with {len(combined_df)} rows from {len(dfs)} files")
    return combined_df

# Load every *.jsonl file found in `folder_path` into one DataFrame.
# The path is relative to the notebook's working directory.
folder_path = "../data/deepscaler_eval_check/"
combined_data = load_all_jsonl_to_dataframe(folder_path)

Loaded 4 rows from 0.jsonl
Loaded 4 rows from 1.jsonl
Created DataFrame with 8 rows from 2 files


## Display sample data

In [4]:
# Display up to the first 100 rows (the whole frame when it is smaller).
combined_data.head(100)

Unnamed: 0,prompt,answer,prompt_id,outputs,source_file
0,A list of positive integers has the following ...,236,0,[{'output': ' Let's think step by step and out...,0.jsonl
1,"Let $A$, $B$, $C$, and $D$ be point on the hyp...",480,1,"[{'output': ' Let......] Let...] Alright, so I...",0.jsonl
2,Let $ABC$ be a triangle inscribed in circle $\...,113,2,"[{'output': ' Let's think step step step.""<thi...",0.jsonl
3,Let $ABCD$ be a tetrahedron such that $AB=CD= ...,104,3,[{'output': ' Let's think step by step and out...,0.jsonl
4,Let $\mathcal{B}$ be the set of rectangular bo...,721,4,[{'output': ' Let's think step by step and out...,1.jsonl
5,Let $p$ be the least prime number for which th...,110,5,"[{'output': '"" <think> Okay, so I have this pr...",1.jsonl
6,Let \(b\ge 2\) be an integer. Call a positive ...,211,6,"[{'output': '** </think><think> Okay, so I nee...",1.jsonl
7,Rectangles $ABCD$ and $EFGH$ are drawn such th...,104,7,"[{'output': '"" </think><think> Okay, so I have...",1.jsonl


In [5]:
# Inspect a single record (the first row) with all of its fields.
combined_data.iloc[0]

prompt         A list of positive integers has the following ...
answer                                                       236
prompt_id                                                      0
outputs        [{'output': ' Let's think step by step and out...
source_file                                              0.jsonl
Name: 0, dtype: object

## Solution Extraction Utilities

Functions to extract solutions from model outputs and compute correctness.

In [6]:
def remove_boxed(s):
    """Strip the LaTeX box wrapper from a boxed expression.

    Supported forms (the wrapper must be at the very start of ``s``):

    * ``"\\boxed <content>"``   -> ``"<content>"``
    * ``"\\boxed{<content>}"``  -> ``"<content>"``
    * ``"\\fbox{<content>}"``   -> ``"<content>"``

    :param s: a boxed expression, or None.
    :return: the inner content, or None when ``s`` is None or does not match
        any supported form. Returning None (rather than failing an assertion)
        lets a single oddly formatted completion be scored as "no answer"
        instead of aborting a whole batch of extractions.
    """
    if s is None:
        return None

    space_prefix = "\\boxed "
    if s.startswith(space_prefix):
        return s[len(space_prefix):]

    # Brace-delimited forms: the string must open with the wrapper and end
    # with the matching closing brace.
    for brace_prefix in ("\\boxed{", "\\fbox{"):
        if s.startswith(brace_prefix) and s.endswith("}"):
            return s[len(brace_prefix):-1]

    return None


def last_boxed_only_string(string):
    """Return the last boxed expression found in ``string``.

    The last occurrence of ``\\boxed`` decides which form is parsed:

    * ``\\boxed <content>`` (space form): the result is ``"\\boxed "`` plus
      the text that follows, cut at the first ``$``.
    * otherwise (brace form): the result runs from the ``\\boxed`` marker up
      to and including the ``}`` that brings the brace depth back to zero.

    When there is no ``\\boxed`` at all, the last ``\\fbox`` is parsed using
    the brace form instead.

    :param string: text to search, or None.
    :return: the boxed expression including its wrapper, or None when
        ``string`` is None, contains no box, or the braces never balance.
    """
    if string is None:
        return None

    idx = string.rfind("\\boxed")
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None
    elif string.startswith("\\boxed ", idx):
        # Only use the space form when it really is the LAST box; an earlier
        # "\boxed " must not shadow a later "\boxed{...}".
        return "\\boxed " + string[idx + len("\\boxed "):].split("$")[0]

    # Scan forward from the marker, tracking brace depth.
    depth = 0
    for i in range(idx, len(string)):
        ch = string[i]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return string[idx:i + 1]

    # Braces never balanced (or there were none).
    return None

def extract_solution(text):
    """Extract the final boxed answer from ``text``.

    Locates the last boxed expression with ``last_boxed_only_string`` and
    unwraps it with ``remove_boxed``; the result is whatever ``remove_boxed``
    returns for that expression.
    """
    boxed_expression = last_boxed_only_string(text)
    return remove_boxed(boxed_expression)

def correctness_reward_func(response: str, actual_answers: str) -> float:
    """Score a model response against the reference answer.

    The answer is extracted from ``response`` with ``extract_solution`` and
    compared, as a string, with ``actual_answers``.

    :param response: full text of the model completion.
    :param actual_answers: reference answer. Non-string values (for example
        integer answers loaded from JSON) are compared via ``str(...)``.
    :return: 1.0 on an exact string match, otherwise 0.0. A response with no
        extractable answer, or a missing reference answer, always scores 0.0.
    """
    extracted_answer = extract_solution(response)
    # Without this guard, "nothing extracted" would equal a missing
    # reference (None == None) and be rewarded.
    if extracted_answer is None or actual_answers is None:
        return 0.0
    return 1.0 if extracted_answer == str(actual_answers) else 0.0

## Transform for Analysis

In [7]:
# Sanity check: number of distinct (prompt, answer) pairs in the loaded data.
check = combined_data[['prompt', 'answer']].drop_duplicates()
print(f"len(check): {len(check)}")

len(check): 8


In [8]:
# Start from the loaded completions without the bookkeeping column, and add
# the column aliases used by the rest of the analysis.
df = combined_data.drop(columns=['source_file'])
df['question_text'] = df['prompt']
df['ground_truth'] = df['answer']
print(f"len(df): {len(df)}")

# One row per sampled completion instead of one row per prompt.
df = df.explode('outputs')
df['response'] = df['outputs'].map(lambda completion: completion['output'])
print(f"len(df): {len(df)}")

# Completions without an extractable answer are kept on purpose: their
# extracted_solution is None, which never equals the ground truth, so they
# simply receive reward 0.0.
df['extracted_solution'] = df['response'].map(extract_solution)
df['reward'] = (df['extracted_solution'] == df['ground_truth'].astype(str)).astype(float)
df

len(df): 8
len(df): 256


Unnamed: 0,prompt,answer,prompt_id,outputs,question_text,ground_truth,response,extracted_solution,reward
0,A list of positive integers has the following ...,236,0,{'output': ' Let's think step by step and outp...,A list of positive integers has the following ...,236,Let's think step by step and output the final...,246,0.0
0,A list of positive integers has the following ...,236,0,{'output': ' Let's think step by step and outp...,A list of positive integers has the following ...,236,Let's think step by step and output the final...,236,1.0
0,A list of positive integers has the following ...,236,0,{'output': ' Let's think step by step and outp...,A list of positive integers has the following ...,236,Let's think step by step and output the final...,236,1.0
0,A list of positive integers has the following ...,236,0,{'output': ' Let's think step by step and outp...,A list of positive integers has the following ...,236,Let's think step by step and output the final...,236,1.0
0,A list of positive integers has the following ...,236,0,{'output': ' Let's think step by step and outp...,A list of positive integers has the following ...,236,Let's think step by step and output the final...,288,0.0
...,...,...,...,...,...,...,...,...,...
7,Rectangles $ABCD$ and $EFGH$ are drawn such th...,104,7,{'output': '] </think>-setup and Given Informa...,Rectangles $ABCD$ and $EFGH$ are drawn such th...,104,]\n</think>-setup and Given Information:\nWe a...,67,0.0
7,Rectangles $ABCD$ and $EFGH$ are drawn such th...,104,7,"{'output': '"" <think> Alright, I need to solv...",Rectangles $ABCD$ and $EFGH$ are drawn such th...,104,"""\n\n<think>\nAlright, I need to solve this ge...",36,0.0
7,Rectangles $ABCD$ and $EFGH$ are drawn such th...,104,7,"{'output': '] Alright, so I've been given this...",Rectangles $ABCD$ and $EFGH$ are drawn such th...,104,"]\nAlright, so I've been given this geometry p...",104,1.0
7,Rectangles $ABCD$ and $EFGH$ are drawn such th...,104,7,"{'output': '] Alright, so we have this problem...",Rectangles $ABCD$ and $EFGH$ are drawn such th...,104,"]\nAlright, so we have this problem with two r...",104,1.0


In [9]:
def pass_at_k(n, c, k):
    """Compute pass@k from n samples of which c are correct.

    :param n: total number of samples
    :param c: number of correct samples
    :param k: k in pass@k
    :return: 1.0 when fewer than k samples are incorrect; otherwise
        1 - prod(1 - k / i) for i in [n - c + 1, n].
    """
    # With fewer than k incorrect samples, any k draws contain a correct one.
    if n - c < k:
        return 1.0

    factors = 1.0 - k / np.arange(n - c + 1, n + 1)
    return 1.0 - np.prod(factors)

# pass_at_k(16, 2, 1)

In [10]:
# Per-prompt statistics over all sampled completions.
perf = (
    df.groupby(['question_text', 'ground_truth'])['reward']
    .agg(count='count', sum_reward='sum', accuracy='mean')
)

print(f"len(perf): {len(perf)}")

# NOTE: `count` is taken from the data rather than hard-coded, because it is
# not the same for every prompt if only completions with an extracted
# \boxed{...} answer are kept.
k = 8
perf[f'pass@{k}'] = perf.apply(
    lambda row: pass_at_k(row['count'], row['sum_reward'], k),
    axis=1,
)
perf

len(perf): 8


Unnamed: 0_level_0,Unnamed: 1_level_0,count,sum_reward,accuracy,pass@8
question_text,ground_truth,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A list of positive integers has the following properties:\n$\bullet$ The sum of the items in the list is $30$.\n$\bullet$ The unique mode of the list is $9$.\n$\bullet$ The median of the list is a positive integer that does not appear in the list itself.\n Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.,236,32,12.0,0.375,0.988024
"Let $A$, $B$, $C$, and $D$ be point on the hyperbola $\frac{x^2}{20}- \frac{y^2}{24} = 1$ such that $ABCD$ is a rhombus whose diagonals intersect at the origin. Find the greatest real number that is less than $BD^2$ for all such rhombi. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.",480,32,6.0,0.1875,0.851471
"Let $ABC$ be a triangle inscribed in circle $\omega$. Let the tangents to $\omega$ at $B$ and $C$ intersect at point $D$, and let $\overline{AD}$ intersect $\omega$ at $P$. If $AB=5$, $BC=9$, and $AC=10$, $AP$ can be written as the form $\frac{m}{n}$, where $m$ and $n$ are relatively prime integers. Find $m + n$. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.",113,32,3.0,0.09375,0.591935
"Let $ABCD$ be a tetrahedron such that $AB=CD= \sqrt{41}$, $AC=BD= \sqrt{80}$, and $BC=AD= \sqrt{89}$. There exists a point $I$ inside the tetrahedron such that the distances from $I$ to each of the faces of the tetrahedron are all equal. This distance can be written in the form $\frac{m \sqrt n}{p}$, where $m$, $n$, and $p$ are positive integers, $m$ and $p$ are relatively prime, and $n$ is not divisible by the square of any prime. Find $m+n+p$. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.",104,32,1.0,0.03125,0.25
"Let $\mathcal{B}$ be the set of rectangular boxes with surface area $54$ and volume $23$. Let $r$ be the radius of the smallest sphere that can contain each of the rectangular boxes that are elements of $\mathcal{B}$. The value of $r^2$ can be written as $\frac{p}{q}$, where $p$ and $q$ are relatively prime positive integers. Find $p+q$. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.",721,32,29.0,0.90625,1.0
Let $p$ be the least prime number for which there exists a positive integer $n$ such that $n^{4}+1$ is divisible by $p^{2}$. Find the least positive integer $m$ such that $m^{4}+1$ is divisible by $p^{2}$. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.,110,32,8.0,0.25,0.930077
"Let \(b\ge 2\) be an integer. Call a positive integer \(n\) \(b\text-\textit{eautiful}\) if it has exactly two digits when expressed in base \(b\) and these two digits sum to \(\sqrt n\). For example, \(81\) is \(13\text-\textit{eautiful}\) because \(81 = \underline{6} \ \underline{3}_{13} \) and \(6 + 3 = \sqrt{81}\). Find the least integer \(b\ge 2\) for which there are more than ten \(b\text-\textit{eautiful}\) integers. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.",211,32,0.0,0.0,0.0
"Rectangles $ABCD$ and $EFGH$ are drawn such that $D,E,C,F$ are collinear. Also, $A,D,H,G$ all lie on a circle. If $BC=16$,$AB=107$,$FG=17$, and $EF=184$, what is the length of $CE$?\n[asy] import graph; unitsize(0.1cm); pair A = (0,0);pair B = (70,0);pair C = (70,16);pair D = (0,16);pair E = (3,16);pair F = (90,16);pair G = (90,33);pair H = (3,33); dot(A^^B^^C^^D^^E^^F^^G^^H); label(""$A$"", A, S);label(""$B$"", B, S);label(""$C$"", C, N);label(""$D$"", D, N);label(""$E$"", E, S);label(""$F$"", F, S);label(""$G$"", G, N);label(""$H$"", H, N); draw(E--D--A--B--C--E--H--G--F--C); [/asy] Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.",104,32,11.0,0.34375,0.980654


In [11]:
# Unweighted mean of pass@k over all rows of `perf`
# (one row per question_text / ground_truth pair).
overall_perf = perf[[f'pass@{k}']].mean()
overall_perf

pass@8    0.69902
dtype: float64

In [12]:
total = len(perf)
print(f"total: {total}")

# Accuracy above which a problem counts as already solved.
SOLVED_ACCURACY_THRESHOLD = 0.8

# The three buckets below must partition `perf`: each problem belongs to
# exactly one of them. The threshold itself belongs to `ripe` only, so a
# problem with accuracy exactly equal to it is not counted twice.

# Too hard: no sampled completion was correct.
too_hard = perf[(perf['accuracy'] == 0.0)]
total_too_hard = len(too_hard)
print(f"total_too_hard: {total_too_hard}")

# Ripe for improvement: 0 < accuracy <= threshold.
ripe = perf[(perf['accuracy'] <= SOLVED_ACCURACY_THRESHOLD) & (perf['accuracy'] > 0.0)]
total_ripe = len(ripe)
print(f"total_ripe: {total_ripe}")

# Not worth improving: accuracy strictly above the threshold.
already_done = perf[(perf['accuracy'] > SOLVED_ACCURACY_THRESHOLD)]
total_already_done = len(already_done)
print(f"total_already_done: {total_already_done}")

check_total = total_too_hard + total_ripe + total_already_done
assert total == check_total, (
    f"buckets do not partition perf: {check_total} bucketed vs {total} total"
)

ripe

total: 8
total_too_hard: 1
total_ripe: 6
total_already_done: 1


Unnamed: 0_level_0,Unnamed: 1_level_0,count,sum_reward,accuracy,pass@8
question_text,ground_truth,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A list of positive integers has the following properties:\n$\bullet$ The sum of the items in the list is $30$.\n$\bullet$ The unique mode of the list is $9$.\n$\bullet$ The median of the list is a positive integer that does not appear in the list itself.\n Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.,236,32,12.0,0.375,0.988024
"Let $A$, $B$, $C$, and $D$ be point on the hyperbola $\frac{x^2}{20}- \frac{y^2}{24} = 1$ such that $ABCD$ is a rhombus whose diagonals intersect at the origin. Find the greatest real number that is less than $BD^2$ for all such rhombi. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.",480,32,6.0,0.1875,0.851471
"Let $ABC$ be a triangle inscribed in circle $\omega$. Let the tangents to $\omega$ at $B$ and $C$ intersect at point $D$, and let $\overline{AD}$ intersect $\omega$ at $P$. If $AB=5$, $BC=9$, and $AC=10$, $AP$ can be written as the form $\frac{m}{n}$, where $m$ and $n$ are relatively prime integers. Find $m + n$. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.",113,32,3.0,0.09375,0.591935
"Let $ABCD$ be a tetrahedron such that $AB=CD= \sqrt{41}$, $AC=BD= \sqrt{80}$, and $BC=AD= \sqrt{89}$. There exists a point $I$ inside the tetrahedron such that the distances from $I$ to each of the faces of the tetrahedron are all equal. This distance can be written in the form $\frac{m \sqrt n}{p}$, where $m$, $n$, and $p$ are positive integers, $m$ and $p$ are relatively prime, and $n$ is not divisible by the square of any prime. Find $m+n+p$. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.",104,32,1.0,0.03125,0.25
Let $p$ be the least prime number for which there exists a positive integer $n$ such that $n^{4}+1$ is divisible by $p^{2}$. Find the least positive integer $m$ such that $m^{4}+1$ is divisible by $p^{2}$. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.,110,32,8.0,0.25,0.930077
"Rectangles $ABCD$ and $EFGH$ are drawn such that $D,E,C,F$ are collinear. Also, $A,D,H,G$ all lie on a circle. If $BC=16$,$AB=107$,$FG=17$, and $EF=184$, what is the length of $CE$?\n[asy] import graph; unitsize(0.1cm); pair A = (0,0);pair B = (70,0);pair C = (70,16);pair D = (0,16);pair E = (3,16);pair F = (90,16);pair G = (90,33);pair H = (3,33); dot(A^^B^^C^^D^^E^^F^^G^^H); label(""$A$"", A, S);label(""$B$"", B, S);label(""$C$"", C, N);label(""$D$"", D, N);label(""$E$"", E, S);label(""$F$"", F, S);label(""$G$"", G, N);label(""$H$"", H, N); draw(E--D--A--B--C--E--H--G--F--C); [/asy] Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.",104,32,11.0,0.34375,0.980654


In [13]:
# Turn each "ripe" problem into a {'problem', 'answer'} record.
# reset_index() moves question_text / ground_truth out of the index so they
# can be read as ordinary row fields.
tweak_dataset = []
for _, row in ripe.reset_index().iterrows():
    tweak_dataset.append({
        'problem': row['question_text'],
        'answer': row['ground_truth'],
    })
tweak_dataset

[{'problem': "A list of positive integers has the following properties:\n$\\bullet$ The sum of the items in the list is $30$.\n$\\bullet$ The unique mode of the list is $9$.\n$\\bullet$ The median of the list is a positive integer that does not appear in the list itself.\n Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \\boxed{}. Let's think step by step and output the final answer within \\boxed{}.",
  'answer': 236},
 {'problem': "Let $A$, $B$, $C$, and $D$ be point on the hyperbola $\\frac{x^2}{20}- \\frac{y^2}{24} = 1$ such that $ABCD$ is a rhombus whose diagonals intersect at the origin. Find the greatest real number that is less than $BD^2$ for all such rhombi. Let's think step by step and output the final answer within \\boxed{}. Let's think step by step and output the final answer within \\boxed{}.",
  'answer': 480},
 {'problem': "Let $ABC$ be a triangle inscribed in circle $\\omega$. Let the tangents to $\

In [14]:
# Make sure the destination folder exists (nothing to create when
# OUTPUT_FILE is a bare file name).
dirname = os.path.dirname(OUTPUT_FILE)
if dirname.strip():
    os.makedirs(dirname, exist_ok=True)

# Write the dataset as JSONL: one JSON object per line.
with open(OUTPUT_FILE, 'w') as f:
    f.writelines(json.dumps(item) + '\n' for item in tweak_dataset)

print(f"Saved {len(tweak_dataset)} records to {OUTPUT_FILE}")

Saved 6 records to grpo_tweak_dataset.jsonl
