In [1]:
OUTPUT_FILE = "grpo_tweak_dataset.jsonl"

In [2]:
import os
import pandas as pd
import glob
import json
import re
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

## Load Completions

In [3]:
def load_all_jsonl_to_dataframe(folder_path):
    """Load all JSONL files from a folder into a pandas DataFrame."""
    # Find all .jsonl files
    jsonl_files = glob.glob(os.path.join(folder_path, "*.jsonl"))
    
    # Create list to hold dataframes
    dfs = []
    
    for file_path in jsonl_files:
        try:
            # Read file into dataframe
            df = pd.read_json(file_path, lines=True)
            df['source_file'] = os.path.basename(file_path)  # Optional: track source file
            dfs.append(df)
            print(f"Loaded {len(df)} rows from {os.path.basename(file_path)}")
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
    
    # Combine all dataframes
    if dfs:
        combined_df = pd.concat(dfs, ignore_index=True)
        print(f"Created DataFrame with {len(combined_df)} rows from {len(jsonl_files)} files")
        return combined_df
    else:
        print("No valid files found")
        return pd.DataFrame()

# Usage
folder_path = "../data/DeepScaleR_1_5_B_results/"
combined_data = load_all_jsonl_to_dataframe(folder_path)

Loaded 12 rows from 4.jsonl
Loaded 12 rows from 6.jsonl
Loaded 12 rows from 2.jsonl
Loaded 12 rows from 0.jsonl
Loaded 12 rows from 7.jsonl
Loaded 12 rows from 5.jsonl
Loaded 12 rows from 1.jsonl
Loaded 12 rows from 3.jsonl
Created DataFrame with 96 rows from 8 files


## Display sample data

In [4]:
# Display the first few rows
combined_data.head(100)

Unnamed: 0,prompt,answer,prompt_id,outputs,source_file
0,What is the median of the following list of $4...,1976.5,48,"[{'output': ' </think><think> Okay, so I nee...",4.jsonl
1,"Ten positive integers include the numbers 3, 5...",11.0,49,"[{'output': '"" <think> Okay, so I've got this...",4.jsonl
2,Cory made a complete list of the prime numbers...,25.0,50,"[{'output': '"" Alright, let's tackle this prob...",4.jsonl
3,"For a given list of three numbers, the operati...",18.0,51,"[{'output': ' <think> Okay, so I have this p...",4.jsonl
4,"Given $(b_1, b_2, ..., b_{12})$ is a list of t...",2048.0,52,"[{'output': '' <think> Okay, so I have this pr...",4.jsonl
...,...,...,...,...,...
91,"Let \( (a_1, a_2, \dots, a_{12}) \) be a list ...",2048.0,43,"[{'output': '[/<think> Okay, so I have this pr...",3.jsonl
92,"Let $(a_1, a_2, \dots ,a_{10})$ be a list of t...",512.0,44,"[{'output': ' {Your answer should be...,}.\} I...",3.jsonl
93,"When the mean, median, and mode of the list\n\...",20.0,45,"[{'output': '\ <think> Okay, so I have this pr...",3.jsonl
94,"When the mean, median, and mode of the list\n\...",20.0,46,"[{'output': ' <think> Okay, so I have this pr...",3.jsonl


In [5]:
combined_data.iloc[0]

prompt         What is the median of the following list of $4...
answer                                                    1976.5
prompt_id                                                     48
outputs        [{'output': '  
</think><think>
Okay, so I nee...
source_file                                              4.jsonl
Name: 0, dtype: object

## Solution Extraction Utilities

Functions to extract solutions from model outputs and compute correctness.

In [6]:
def remove_boxed(s):
    if s is None:
        return None
    
    if "\\boxed " in s:
        left = "\\boxed "
        assert s[:len(left)] == left
        return s[len(left):]

    left = "\\boxed{"

    assert s[:len(left)] == left
    assert s[-1] == "}"

    return s[len(left):-1]


def last_boxed_only_string(string):
    idx = string.rfind("\\boxed")
    if "\\boxed " in string:
        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None

    i = idx
    right_brace_idx = None
    num_left_braces_open = 0
    while i < len(string):
        if string[i] == "{":
            num_left_braces_open += 1
        if string[i] == "}":
            num_left_braces_open -= 1
            if num_left_braces_open == 0:
                right_brace_idx = i
                break
        i += 1

    if right_brace_idx is None:
        retval = None
    else:
        retval = string[idx:right_brace_idx + 1]

    return retval

def extract_solution(text):
    return remove_boxed(last_boxed_only_string(text))

def correctness_reward_func(response: str, actual_answers: str) -> float:
    extracted_answer = extract_solution(response)
    return 1.0 if extracted_answer == actual_answers else 0.0

## Transform for Analysis

In [7]:
check = combined_data.copy()[['prompt', 'answer']].drop_duplicates()
print(f"len(check): {len(check)}")

len(check): 87


In [8]:
df = combined_data.copy()
df['question_text'] = df['prompt']
df['ground_truth'] = df['answer']
df.drop([
    'source_file',
], axis=1, inplace=True)
print(f"len(df): {len(df)}")
df = df.explode('outputs')
df['response'] = df['outputs'].apply(lambda x: x['output'])
print(f"len(df): {len(df)}")
df['extracted_solution'] = df['response'].apply(lambda x: extract_solution(x))

# # # Only keep completions where there was a generated solution.
# # df = df[df['extracted_solution'].notna()]

df['reward'] = (df['extracted_solution'] == df['ground_truth'].astype(str)).astype(float)
df
# df[df['extracted_solution'].isna()]

len(df): 96
len(df): 3072


Unnamed: 0,prompt,answer,prompt_id,outputs,question_text,ground_truth,response,extracted_solution,reward
0,What is the median of the following list of $4...,1976.5,48,"{'output': ' </think><think> Okay, so I need...",What is the median of the following list of $4...,1976.5,"\n</think><think>\nOkay, so I need to find t...",1011,0.0
0,What is the median of the following list of $4...,1976.5,48,"{'output': '** </think><think> Okay, so I nee...",What is the median of the following list of $4...,1976.5,"**\n\n</think><think>\nOkay, so I need to find...",1010,0.0
0,What is the median of the following list of $4...,1976.5,48,"{'output': ' </think><think> Alright, let's t...",What is the median of the following list of $4...,1976.5,"\n</think><think>\nAlright, let's try to figu...",1010.5,0.0
0,What is the median of the following list of $4...,1976.5,48,"{'output': ' </think><think> Okay, so I need...",What is the median of the following list of $4...,1976.5,"\n</think><think>\nOkay, so I need to find t...",1976.5,1.0
0,What is the median of the following list of $4...,1976.5,48,"{'output': '"" Alright, so I need to find the ...",What is the median of the following list of $4...,1976.5,"""\n\nAlright, so I need to find the median of ...",1976.5,1.0
...,...,...,...,...,...,...,...,...,...
95,What is the median of the following list of $4...,1976.5,47,"{'output': '** Alright, let's try to figure ou...",What is the median of the following list of $4...,1976.5,"**\nAlright, let's try to figure out the media...",2022.5,0.0
95,What is the median of the following list of $4...,1976.5,47,"{'output': '** Okay, so I need to find the me...",What is the median of the following list of $4...,1976.5,"**\n\nOkay, so I need to find the median of th...",1010.5,0.0
95,What is the median of the following list of $4...,1976.5,47,"{'output': '**  <think> Okay, so I need to fi...",What is the median of the following list of $4...,1976.5,"**\n \n<think>\nOkay, so I need to find the me...",1012,0.0
95,What is the median of the following list of $4...,1976.5,47,{'output': ' It's known that the median of th...,What is the median of the following list of $4...,1976.5,\nIt's known that the median of the following...,1976,0.0


In [9]:
def pass_at_k(n, c, k):
    """
    :param n: total number of samples :param c: number of correct
    Samples
    :param k: k in pass@$k$
    """
    if n - c < k: return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# pass_at_k(16, 2, 1)

In [10]:
perf = (
    df[['question_text', 'ground_truth', 'reward']]
    .groupby(['question_text', 'ground_truth'])
    .agg(
        count=('reward', 'count'),
        sum_reward=('reward', 'sum'),
        accuracy=('reward', 'mean')
    )
)

print(f"len(perf): {len(perf)}")

# If only answer with an extarcted \boxed{...} answer are kept, count isn't always 16.
# assert(len(perf[perf['count'] != 16]) == 0)
# perf[f'pass@{k}'] = perf['sum_reward'].apply(lambda x: pass_at_k(16, x, k))

k = 8
perf[f'pass@{k}'] = perf.apply(
    lambda x: pass_at_k(x['count'], x['sum_reward'], k), 
    axis=1
)
perf

len(perf): 87


Unnamed: 0_level_0,Unnamed: 1_level_0,count,sum_reward,accuracy,pass@8
question_text,ground_truth,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"A list of $2018$ positive integers has a unique mode, which occurs exactly $10$ times. What is the least number of distinct values that can occur in the list? Let's think step by step and output the final answer within \boxed{}.",225,64,63.0,0.984375,1.0
"A list of $2023$ positive integers has a unique mode, which occurs exactly $11$ times. Determine the least number of distinct values that can occur in the list. Let's think step by step and output the final answer within \boxed{}.",203,32,31.0,0.968750,1.0
"A list of $3042$ positive integers has a unique mode, which occurs exactly $15$ times. Calculate the least number of distinct values that can occur in the list. Let's think step by step and output the final answer within \boxed{}.",218,32,30.0,0.937500,1.0
A list of five positive integers has a median of 3 and a mean of 11. What is the maximum possible value of the list's largest element? Let's think step by step and output the final answer within \boxed{}.,47,32,31.0,0.968750,1.0
A list of five positive integers has a median of 4 and a mean of 15. What is the maximum possible value of the list's largest element? Let's think step by step and output the final answer within \boxed{}.,65,32,32.0,1.000000,1.0
...,...,...,...,...,...
"What is the median of the following list of $4100$ numbers?\n\[1, 2, 3, \ldots, 2050, 1^2, 2^2, 3^2, \ldots, 2050^2\]\nA) $1977.5$\nB) $2004.5$\nC) $2005.5$\nD) $2006.5$\nE) $2025.5$ Let's think step by step and output the final answer within \boxed{}.",2005.5,32,0.0,0.000000,0.0
"What is the median of the following list of numbers that includes integers from $1$ to $2020$, their squares, and their cubes? \[1, 2, 3, \ldots, 2020, 1^2, 2^2, \ldots, 2020^2, 1^3, 2^3, \ldots, 2020^3\]\nA) $2040200$ \nB) $2040201$\nC) $2040202$\nD) $2040203$\nE) $2040204$ Let's think step by step and output the final answer within \boxed{}.",2040201,32,0.0,0.000000,0.0
"When $15$ is appended to a list of integers, the mean is increased by $2$. When $1$ is appended to the enlarged list, the mean of the enlarged list is decreased by $1$. How many integers were in the original list? Let's think step by step and output the final answer within \boxed{}.",4,32,31.0,0.968750,1.0
"When the mean, median, and mode of the list\n\[10,2,5,2,4,2,x\]\nare arranged in increasing order, they form a non-constant arithmetic progression. What is the sum of all possible real values of $x$? Let's think step by step and output the final answer within \boxed{}.",20,64,0.0,0.000000,0.0


In [11]:
overall_perf = perf[[f'pass@{k}']].mean()
overall_perf

pass@8    0.520621
dtype: float64

In [12]:
total = len(perf)
print(f"total: {total}")

# Too hard.
too_hard = perf[(perf['accuracy'] == 0.0)]
total_too_hard = len(too_hard)
print(f"total_too_hard: {total_too_hard}")

# Ripe for improvement
ripe = perf[(perf['accuracy'] <= 0.8) & (perf['accuracy'] > 0.0)]
total_ripe = len(ripe)
print(f"total_ripe: {total_ripe}")

# Not worth improving.
already_done = perf[(perf['accuracy'] >= 0.8)]
total_already_done = len(already_done)
print(f"total_already_done: {total_already_done}")

check_total = total_too_hard + total_ripe + total_already_done
assert(total == check_total)

ripe

total: 87
total_too_hard: 38
total_ripe: 16
total_already_done: 33


Unnamed: 0_level_0,Unnamed: 1_level_0,count,sum_reward,accuracy,pass@8
question_text,ground_truth,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"A list of five positive integers has all of the following properties:\n\n$\bullet$ The only integer in the list that occurs more than once is $8,$\n\n$\bullet$ its median is $9,$ and\n\n$\bullet$ its average (mean) is $10.$\n\nWhat is the largest possible integer that could appear in the list? Let's think step by step and output the final answer within \boxed{}.",15,32,25.0,0.78125,1.0
A list of five positive integers has mean $12$ and range $18$. The mode and median are both $8$. How many different values are possible for the second largest element of the list? Let's think step by step and output the final answer within \boxed{}.,6,32,15.0,0.46875,0.997689
"A list of integers has mode 32 and mean 22. The smallest number in the list is 10. The median m of the list is a member of the list. If the list member m were replaced by m+10, the mean and median of the new list would be 24 and m+10, respectively. If m were instead replaced by m-8, the median of the new list would be m-4. What is m? Let's think step by step and output the final answer within \boxed{}.",20,32,10.0,0.3125,0.969599
A list of seven positive integers has a median of 4 and a mean of 10. What is the maximum possible value of the list's largest element? Let's think step by step and output the final answer within \boxed{}.,52,32,1.0,0.03125,0.25
A list of seven positive integers has a median of 5 and a mean of 15. What is the maximum possible value of the list's largest element? Let's think step by step and output the final answer within \boxed{}.,87,64,50.0,0.78125,0.999999
"A repunit is a positive integer, all of whose digits are 1s. Let $a_{1}<a_{2}<a_{3}<\ldots$ be a list of all the positive integers that can be expressed as the sum of distinct repunits. Compute $a_{111}$. Let's think step by step and output the final answer within \boxed{}.",1223456,32,3.0,0.09375,0.591935
"Carolyn and Paul are playing a game starting with a list of the integers $1$ to $n.$ The rules of the game are:\n\n$\bullet$ Carolyn always has the first turn.\n\n$\bullet$ Carolyn and Paul alternate turns.\n\n$\bullet$ On each of her turns, Carolyn must remove one number from the list such that this number has at least one positive divisor other than itself remaining in the list.\n\n$\bullet$ On each of his turns, Paul must remove from the list all of the positive divisors of the number that Carolyn has just removed, and he can optionally choose one additional number that is a multiple of any divisor he is removing.\n\n$\bullet$ If Carolyn cannot remove any more numbers, then Paul removes the rest of the numbers.\n\nFor example, if $n=8,$ a possible sequence of moves could be considered.\n\nSuppose that $n=8$ and Carolyn removes the integer $4$ on her first turn. Determine the sum of the numbers that Carolyn removes. Let's think step by step and output the final answer within \boxed{}.",12,32,1.0,0.03125,0.25
"Daniel writes over a board, from top to down, a list of positive integer numbers less or equal to 10. Next to each number of Daniel's list, Martin writes the number of times exists this number into the Daniel's list making a list with the same length. If we read the Martin's list from down to top, we get the same\r\nlist of numbers that Daniel wrote from top to down. Find the greatest length of the Daniel's list can have. Let's think step by step and output the final answer within \boxed{}.",10,32,21.0,0.65625,0.999984
"Find the largest positive integer $k{}$ for which there exists a convex polyhedron $\mathcal{P}$ with 2022 edges, which satisfies the following properties:\n[list]\n[*]The degrees of the vertices of $\mathcal{P}$ don’t differ by more than one, and\n[*]It is possible to colour the edges of $\mathcal{P}$ with $k{}$ colours such that for every colour $c{}$, and every pair of vertices $(v_1, v_2)$ of $\mathcal{P}$, there is a monochromatic path between $v_1$ and $v_2$ in the colour $c{}$.\n[/list]\n[i]Viktor Simjanoski, Macedonia[/i] Let's think step by step and output the final answer within \boxed{}.",2,32,11.0,0.34375,0.980654
"Let $(b_1, b_2, ... b_{12})$ be a list of the 12 integers from 4 to 15 inclusive such that for each $2 \le i \le 12$, either $b_i + 1$ or $b_i - 1$ or both appear somewhere before $b_i$ in the list. How many such lists are there? Let's think step by step and output the final answer within \boxed{}.",2048,32,21.0,0.65625,0.999984


In [13]:
tweak_dataset = [
    {
        'problem': r['question_text'],
        'answer': r['ground_truth'],
    }
    for index, r in ripe.reset_index().iterrows()
]
tweak_dataset

[{'problem': "A list of five positive integers has all of the following properties:\n\n$\\bullet$  The only integer in the list that occurs more than once is $8,$\n\n$\\bullet$  its median is $9,$ and\n\n$\\bullet$  its average (mean) is $10.$\n\nWhat is the largest possible integer that could appear in the list? Let's think step by step and output the final answer within \\boxed{}.",
  'answer': 15},
 {'problem': "A list of five positive integers has mean $12$ and range $18$. The mode and median are both $8$. How many different values are possible for the second largest element of the list? Let's think step by step and output the final answer within \\boxed{}.",
  'answer': 6},
 {'problem': "A list of integers has mode 32 and mean 22. The smallest number in the list is 10. The median  m of the list is a member of the list. If the list member  m were replaced by  m+10, the mean and median of the new list would be 24 and  m+10, respectively. If  m were instead replaced by  m-8, the medi

In [14]:
# Create directory if it doesn't exist
dirname = os.path.dirname(OUTPUT_FILE)
if len(dirname.strip()) > 0:
    os.makedirs(dirname, exist_ok=True)

# Save to JSONL file
with open(OUTPUT_FILE, 'w') as f:
    for item in tweak_dataset:
        f.write(json.dumps(item) + '\n')

print(f"Saved {len(tweak_dataset)} records to {OUTPUT_FILE}")

Saved 16 records to grpo_tweak_dataset.jsonl
