In [1]:
OUTPUT_FILE = "../first_principals_1200/tweak_dataset.jsonl"

In [2]:
import os
import pandas as pd
import glob
import json
import re
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

## Load Completions

In [3]:
def load_all_jsonl_to_dataframe(folder_path):
    """Load all JSONL files from a folder into a pandas DataFrame."""
    # Find all .jsonl files
    jsonl_files = glob.glob(os.path.join(folder_path, "*.jsonl"))
    
    # Create list to hold dataframes
    dfs = []
    
    for file_path in jsonl_files:
        try:
            # Read file into dataframe
            df = pd.read_json(file_path, lines=True)
            df['source_file'] = os.path.basename(file_path)  # Optional: track source file
            dfs.append(df)
            print(f"Loaded {len(df)} rows from {os.path.basename(file_path)}")
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
    
    # Combine all dataframes
    if dfs:
        combined_df = pd.concat(dfs, ignore_index=True)
        print(f"Created DataFrame with {len(combined_df)} rows from {len(jsonl_files)} files")
        return combined_df
    else:
        print("No valid files found")
        return pd.DataFrame()

# Usage
folder_path = "../data/deepscaler_aime_sample_eval_check/"
combined_data = load_all_jsonl_to_dataframe(folder_path)

Loaded 4 rows from 0.jsonl
Loaded 4 rows from 1.jsonl
Created DataFrame with 8 rows from 2 files


## Display sample data

In [4]:
# Display the first few rows
combined_data.head(10)

Unnamed: 0,prompt,answer,prompt_id,outputs,source_file
0,A list of positive integers has the following ...,236,0,[{'output': ' Let's think step by step and out...,0.jsonl
1,"Let $A$, $B$, $C$, and $D$ be point on the hyp...",480,1,"[{'output': ' Let......] Let...] Alright, so I...",0.jsonl
2,Let $ABC$ be a triangle inscribed in circle $\...,113,2,"[{'output': ' Let's think step step step.""<thi...",0.jsonl
3,Let $ABCD$ be a tetrahedron such that $AB=CD= ...,104,3,[{'output': ' Let's think step by step and out...,0.jsonl
4,Let $\mathcal{B}$ be the set of rectangular bo...,721,4,[{'output': ' Let's think step by step and out...,1.jsonl
5,Let $p$ be the least prime number for which th...,110,5,"[{'output': '"" <think> Okay, so I have this pr...",1.jsonl
6,Let \(b\ge 2\) be an integer. Call a positive ...,211,6,"[{'output': '** </think><think> Okay, so I nee...",1.jsonl
7,Rectangles $ABCD$ and $EFGH$ are drawn such th...,104,7,"[{'output': '"" </think><think> Okay, so I have...",1.jsonl


In [5]:
print(combined_data.iloc[0]['prompt'])

A list of positive integers has the following properties:
$\bullet$ The sum of the items in the list is $30$.
$\bullet$ The unique mode of the list is $9$.
$\bullet$ The median of the list is a positive integer that does not appear in the list itself.
 Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.


## Solution Extraction Utilities

Functions to extract solutions from model outputs and compute correctness.

In [6]:
def remove_boxed(s):
    if s is None:
        return None

    left = "\\boxed{"

    if s[:len(left)] != left:
        return None
    if s[-1] != "}":
        return None

    return s[len(left):-1]


def last_boxed_only_string(string):
    idx = string.rfind("\\boxed")
    if "\\boxed " in string:
        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None

    i = idx
    right_brace_idx = None
    num_left_braces_open = 0
    while i < len(string):
        if string[i] == "{":
            num_left_braces_open += 1
        if string[i] == "}":
            num_left_braces_open -= 1
            if num_left_braces_open == 0:
                right_brace_idx = i
                break
        i += 1

    if right_brace_idx is None:
        retval = None
    else:
        retval = string[idx:right_brace_idx + 1]

    return retval

def extract_solution(text):
    return remove_boxed(last_boxed_only_string(text))

def correctness_reward_func(response: str, actual_answers: str) -> float:
    extracted_answer = extract_solution(response)
    return 1.0 if extracted_answer == actual_answers else 0.0

## Transform for Analysis

In [7]:
check = combined_data.copy()[['prompt', 'answer']].drop_duplicates()
print(f"len(check): {len(check)}")

len(check): 8


In [8]:
df = combined_data.copy()
df['question_text'] = df['prompt']
df['ground_truth'] = df['answer']
df.drop([
    'source_file',
], axis=1, inplace=True)
# print(f"len(df): {len(df)}")
df = df.explode('outputs')
df['response'] = df['outputs'].apply(lambda x: x['output'])
# print(f"len(df): {len(df)}")
df['extracted_solution'] = df['response'].apply(lambda x: extract_solution(x))

# # # Only keep completions where there was a generated solution.
# # df = df[df['extracted_solution'].notna()]

df['reward'] = (df['extracted_solution'] == df['ground_truth'].astype(str)).astype(float)
# df
# df[df['extracted_solution'].isna()]

## Spot Check Traces

In [9]:
prompt_id = 0
correct_pos = 0
incorrect_pos = 6

prompt = df[df.prompt_id == prompt_id]
prompt_correct = prompt[prompt.reward == 1.0]
prompt_incorrect = prompt[prompt.reward == .0]

In [10]:
print(prompt.iloc[correct_pos]['prompt'])

A list of positive integers has the following properties:
$\bullet$ The sum of the items in the list is $30$.
$\bullet$ The unique mode of the list is $9$.
$\bullet$ The median of the list is a positive integer that does not appear in the list itself.
 Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \boxed{}. Let's think step by step and output the final answer within \boxed{}.


In [11]:
print(prompt_correct.iloc[correct_pos]['response'])

 Let's think step by step and output the final answer within \boxed.

To determine the sum of squares, we need to find all the elements in the list, then compute their squares and sum them.

First, note that the sum of the list is 30, unique mode is 9, and the median doesn't appear in the list.

We know that the list must contain at least two 9s because the mode is unique and 9 is the mode. So let's try to figure out the possible number of elements.

Wait, let's think step by step.

Firstly, since the unique mode is 9, which means that all other numbers appear fewer times than 9. So 9 must appear more times than any other number.

Therefore, in the list, 9 occurs the maximum number of times, and the counts of all other numbers are less than that.

Given that the sum is 30, and all numbers are positive integers.

We also know that the median is a positive integer that does not appear in the list.

So first, let's figure out the number of elements.

Let’s denote n as the number of elemen

In [12]:
print(prompt_incorrect.iloc[incorrect_pos]['response'])

 Let's think step by step and output the final answer within \boxed.
Alright, so I need to find the sum of the squares of all the items in a list that satisfies three conditions:

1. The sum of the items is 30.
2. The unique mode is 9. So, 9 must appear more times than any other number in the list, and all other numbers appear strictly less than the number of times 9 appears.
3. The median is a positive integer that doesn't appear in the list itself. Also, the median must be an integer, so perhaps the list length is odd? Wait, not necessarily, but if the median is an integer, maybe the list has an odd number of elements? Hmm.

Alright, let me break this down step by step.

First, the list has a sum of 30. Since 9 is the unique mode, 9 must occur more times than any other number. Let's denote the number of times 9 appears as 'k'. So, all other numbers appear less than 'k' times.

Moreover, the median being an integer but not in the list. Since the median is the middle value when the lis

## Generate Practice Problems

In [13]:
def median_of_list(max_len_list=10, max_int_in_list=10):
    import random
    import statistics
    n = random.randint(1,max_len_list)
    l = [random.randint(1,max_int_in_list) for i in range(n)]
    l_str = ", ".join([str(e) for e in l])
    question = f"What is the median of the list {l_str} ?"
    answer = str(statistics.median(l))
    return {'question': question, 'answer': answer}

median_of_list(max_len_list=10, max_int_in_list=30)

{'question': 'What is the median of the list 9, 14, 30, 18, 13, 4, 17, 21, 13, 13 ?',
 'answer': '13.5'}

In [14]:
def avg_of_list(max_len_list=10, max_int_in_list=10):
    import random
    n = random.randint(1,max_len_list)
    l = [random.randint(1,max_int_in_list) for i in range(n-1)]
    # print(f"sum(l): {sum(l)}")
    # print(f"float(n): {float(n)}")
    # print(f"sum(l) % n: {sum(l) % n}")
    l = l + [n - sum(l) % n]
    # print(f"l: {l}")
    l_str = ", ".join([str(e) for e in l])
    question = f"What is the average of the list {l_str} ?"
    # print(f"sum(l) / len(l): {sum(l) / len(l)}")
    # print(f"float(n): {float(n)}")
    assert sum(l) / len(l) == float(int(sum(l) / len(l)))
    answer = str(sum(l) / len(l))
    return {'question': question, 'answer': answer}

avg_of_list(max_len_list=10, max_int_in_list=30)

{'question': 'What is the average of the list 1 ?', 'answer': '1.0'}

In [15]:
def sum_of_list(max_len_list=10, max_int_in_list=10):
    import random
    n = random.randint(1,max_len_list)
    l = [random.randint(1,max_int_in_list) for i in range(n)]
    l_str = ", ".join([str(e) for e in l])
    question = f"What is sum of the list {l_str} ?"
    answer = str(sum(l))
    return {'question': question, 'answer': answer}

sum_of_list(max_len_list=10, max_int_in_list=30)

{'question': 'What is sum of the list 16, 27, 28, 21, 5 ?', 'answer': '97'}

In [16]:
def count_mode_of_list(max_len_list=10, max_int_in_list=10):
    import random
    import statistics
    assert 2 <= max_len_list
    n = random.randint(2,max_len_list)
    m = random.randint(2,max_len_list)
    l = set([random.randint(1,max_int_in_list) for i in range(n)])
    x = set([random.randint(1,max_int_in_list) for i in range(m)])
    cnt_in_list = random.randint(2,n)
    # print(f"cnt_in_list: {cnt_in_list}")
    # print(f"x: {x}")
    l = list(l - x)
    if cnt_in_list > 0:
        for an_x in x:
            l = list(l) + [an_x]*cnt_in_list
    random.shuffle(l)
    # print(f"l: {l}")
    l_str = ", ".join([str(e) for e in l])
    question = f"How many modes are there in {l_str} ?"
    assert len(statistics.multimode(l)) == len(x)
    answer = str(len(x))
    return {'question': question, 'answer': answer}

count_mode_of_list(max_len_list=10, max_int_in_list=30)

{'question': 'How many modes are there in 16, 13, 21, 16, 21, 4 ?',
 'answer': '2'}

In [17]:
def mode_of_list(max_len_list=10, max_int_in_list=10):
    import statistics
    import random
    assert 2 <= max_len_list
    n = random.randint(2,max_len_list)
    x = random.randint(1,max_int_in_list)
    l = set([random.randint(1,max_int_in_list) for i in range(n)])
    cnt_in_list = random.randint(2,n)
    # print(f"cnt_in_list: {cnt_in_list}")
    l = list(l - set([x]))
    if cnt_in_list > 0:
        l = list(l) + [x]*cnt_in_list
    random.shuffle(l)
    # print(f"l: {l}")
    l_str = ", ".join([str(e) for e in l])
    question = f"What is mode of {l_str} ?"
    assert statistics.mode(l) == x
    answer = str(x)
    return {'question': question, 'answer': answer}

mode_of_list(max_len_list=10, max_int_in_list=30)

{'question': 'What is mode of 26, 28, 22, 27, 28, 13, 28, 28 ?',
 'answer': '28'}

In [18]:
def count_in_list(max_len_list=10, max_int_in_list=10):
    import random
    n = random.randint(1,max_len_list)
    x = random.randint(1,max_int_in_list)
    # print(f"n: {n}")
    # print(f"x: {x}")
    l = set([random.randint(1,max_int_in_list) for i in range(n)])
    in_list = random.randint(0,1)
    cnt_in_list = random.randint(1,n) if in_list else 0
    # print(f"cnt_in_list: {cnt_in_list}")
    l = list(l - set([x]))
    if cnt_in_list > 0:
        l = list(l) + [x]*cnt_in_list
    random.shuffle(l)
    # print(f"l: {l}")
    l_str = ", ".join([str(e) for e in l])
    question = f"How many times is {x} in the list of numbers {l_str} ?"
    assert l.count(x) == cnt_in_list
    answer = str(cnt_in_list)
    return {'question': question, 'answer': answer}

count_in_list(max_len_list=10, max_int_in_list=30)

{'question': 'How many times is 17 in the list of numbers 2, 28, 7 ?',
 'answer': '0'}

In [24]:
problem_generators = [median_of_list, avg_of_list, sum_of_list, count_mode_of_list, mode_of_list, count_in_list]
cnt_per_gen = 200
print(f"len(problem_generators): {len(problem_generators)}")
print(f"cnt_per_gen: {cnt_per_gen}")


tweak_dataset = [
    f()
    for _i in range(cnt_per_gen)
    for f in problem_generators
]

SYSTEM_PROMPT = "Let's think step by step and output the final answer within \\boxed{}."

tweak_dataset = [
    {
        'problem': f"{p['question']} {SYSTEM_PROMPT}",
        'answer': p['answer']
    }
    for p in tweak_dataset
]

len(tweak_dataset)

len(problem_generators): 6
cnt_per_gen: 200


1200

In [25]:
tweak_dataset[0]

{'problem': "What is the median of the list 8, 5 ? Let's think step by step and output the final answer within \\boxed{}.",
 'answer': '6.5'}

In [26]:
# Create directory if it doesn't exist
dirname = os.path.dirname(OUTPUT_FILE)
if len(dirname.strip()) > 0:
    os.makedirs(dirname, exist_ok=True)

# Save to JSONL file
with open(OUTPUT_FILE, 'w') as f:
    for item in tweak_dataset:
        f.write(json.dumps(item) + '\n')

print(f"Saved {len(tweak_dataset)} records to {OUTPUT_FILE}")

Saved 1200 records to ../first_principals_1200/tweak_dataset.jsonl
