In [1]:
K_CLOSEST_MATCHES_RAW = 600
INPUT_QUESTIONS_FILE = "ripe_dataset.jsonl"
OUTPUT_FILE = "../top_600_s1-full-59k_similar/tweak_dataset.jsonl"

In [2]:
import json
import ast
import os
import bm25s
from datasets import load_dataset, Dataset
import pandas as pd
import re

In [3]:
def get_questions(split="train", num_samples=None) -> Dataset:
    data = load_dataset('simplescaling/data_ablation_full59K', split=split)
    
    if num_samples is not None:
        # Randomly sample the specified number of examples
        data = data.shuffle(seed=42).select(range(min(num_samples, len(data))))
    
    return data

In [4]:
def remove_boxed(s):
    if s is None:
        return None

    left = "\\boxed{"

    if s[:len(left)] != left:
        return None
    if s[-1] != "}":
        return None

    return s[len(left):-1]


def last_boxed_only_string(string):
    idx = string.rfind("\\boxed")
    if "\\boxed " in string:
        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None

    i = idx
    right_brace_idx = None
    num_left_braces_open = 0
    while i < len(string):
        if string[i] == "{":
            num_left_braces_open += 1
        if string[i] == "}":
            num_left_braces_open -= 1
            if num_left_braces_open == 0:
                right_brace_idx = i
                break
        i += 1

    if right_brace_idx is None:
        retval = None
    else:
        retval = string[idx:right_brace_idx + 1]

    return retval

def extract_solution(text):
    return remove_boxed(last_boxed_only_string(text))

def correctness_reward_func(response: str, actual_answer: str) -> float:
    extracted_answer = extract_solution(response)
    return 1.0 if extracted_answer == actual_answer else 0.0

In [5]:
def get_deduped(dict_list):
    
    df = pd.DataFrame(dict_list)
    print(f"No dedupe: {len(dict_list)}")

    # Drop duplicate questions and answers
    df_unique = df.drop_duplicates()
    print(f"Question answer dedupe: {len(df_unique)}")

    # Remove questions with multiple source of truth answers
    check = df_unique.groupby('problem').agg(set)
    check['set_size'] = check['answer'].apply(len)
    check = check[check['set_size'] == 1]
    out = check.drop('set_size', axis=1).explode('answer').reset_index()
    print(f"Multiple answer filter: {len(out)}")
    # display(out)
    return out.to_dict('records')

In [7]:
data_raw = get_questions()
# data_raw = get_questions(num_samples=10) # Debug

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

In [8]:
idx = 1
for k in data_raw[idx].keys():
  print(k)
  print(data_raw[idx][k])
  print()

solution
To prove that \( BT = 2PT \), we will use geometric properties and reflections. Here is the detailed step-by-step solution:

1. **Reflect \( C \) and \( T \) across \( P \) to get \( C' \) and \( T' \) respectively.**
   - Since \( P \) is the intersection of \( AT \) and \( CS \), reflecting \( C \) and \( T \) across \( P \) will help us use symmetry in the problem.

2. **Note that \( \triangle ACC' \) is equilateral.**
   - Given \( AB = AC \) and \( \angle BAC = 40^\circ \), we know that \( \angle BCA = \angle ABC = 70^\circ \).
   - Reflecting \( C \) across \( P \) to get \( C' \) implies that \( \angle ACC' = 60^\circ \) because \( \angle ACP = \angle CCP = 30^\circ \) each.
   - Therefore, \( \triangle ACC' \) is equilateral, and \( A \) is the circumcenter of \( \triangle CC'B \).

3. **Show that \( CT \parallel C'T' \).**
   - Since \( T \) is reflected to \( T' \) across \( P \), \( CT \parallel C'T' \) and \( CT = C'T' \).

4. **Prove that \( AC'BT \) is cyclic.**


In [9]:

# Get question, answer, solution, trace.
tweak_dataset = [
    {
        'problem': r['question'],
        'metadata': ast.literal_eval(r['metadata']),
        'solution': r['solution'],
        'attempt': r['attempt']
    }
    for r in data_raw
]

# Extract answer and reward from trace.
tweak_dataset = [
    {
        'problem': r['problem'],
        'answer': r['metadata']['answer'],
        'solution': r['solution'],
        'attempt': r['attempt'],
        'extracted_answer': extract_solution(r['attempt']),
        'reward': correctness_reward_func(response = r['attempt'], actual_answer = r['metadata']['answer'])
    }
    for r in tweak_dataset
    if 'answer' in r['metadata']
]

# Only keep correct traces.
tweak_dataset = [
    r
    for r in tweak_dataset
    if r['reward'] == 1.0 
]

len(tweak_dataset)

11882

In [10]:
data_filtered = get_deduped(tweak_dataset)

No dedupe: 11882
Question answer dedupe: 11882
Multiple answer filter: 11882


### Index Records

In [11]:
data = {i: {**record} for i, record in enumerate(data_filtered)}
data[0]

{'problem': '"Modulo $m$ graph paper" consists of a grid of $m^2$ points, representing all pairs of integer residues $(x,y)$ where $0\\le x, y <m$. To graph a congruence on modulo $m$ graph paper, we mark every point $(x,y)$ that satisfies the congruence. For example, a graph of $y\\equiv x^2\\pmod 5$ would consist of the points $(0,0)$, $(1,1)$, $(2,4)$, $(3,4)$, and $(4,1)$.\n\nThe graph of $$3x\\equiv 4y-1 \\pmod{35}$$has a single $x$-intercept $(x_0,0)$ and a single $y$-intercept $(0,y_0)$, where $0\\le x_0,y_0<35$.\n\nWhat is the value of $x_0+y_0$?',
 'answer': '32',
 'solution': {'To find the $x$-intercept, we plug in $0$ for $y$ and solve $$3x\\equiv 4(0)-1 \\pmod{35}.$$Multiplying both sides by $12$, we get $$36x \\equiv -12\\pmod{35}$$and thus $x\\equiv -12\\pmod{35}$. Translating this to the interval $0\\le x<35$, we have $x\\equiv 23\\pmod{35}$, so the $x$-intercept on our graph is at $(23,0)$.\n\nTo find the $y$-intercept, we plug in $0$ for $x$ and solve $$3(0)\\equiv 4y-

### Toy BM25 Search

In [12]:
### Toy BM25 Search

# Create your corpus here
corpus = [
    "a cat is a feline and likes to purr",
    "a dog is the human's best friend and loves to play",
    "a bird is a beautiful animal that can fly",
    "a fish is a creature that lives in water and swims",
]

corpus_with_ids = [(idx, doc) for idx, doc in enumerate(corpus)]
[print(idx_and_doc) for idx_and_doc in corpus_with_ids]
corpus_ids = [idx for idx, doc in corpus_with_ids]

# Create the BM25 model and index the corpus
retriever = bm25s.BM25(corpus=corpus, method="robertson") # With original method from Robertson.
retriever.index(bm25s.tokenize(corpus))

# Query the corpus and get top-k results
query = "does the fish purr like a cat?"
results, scores = retriever.retrieve(bm25s.tokenize(query), k=2, corpus=corpus_ids)

# Let's see what we got!
print(f"\nquery: {query}")
for score, result in zip(scores[0], results[0]):
    print(f"{score:.4f}: {result}")

(0, 'a cat is a feline and likes to purr')
(1, "a dog is the human's best friend and loves to play")
(2, 'a bird is a beautiful animal that can fly')
(3, 'a fish is a creature that lives in water and swims')


Split strings:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/4 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]


query: does the fish purr like a cat?
0.7449: 0
0.3389: 3


### Actual BM25 Search

In [17]:
# Initialize corpus with items from query,item pairs data as well as from random pool.

corpus_docs = [f"{v['problem']} {v['solution']} {v['attempt']}" for k, v in data.items()]
corpus_ids = [k for k, v in data.items()]
# print(corpus_ids[0])
# print(corpus_docs[0])

# Create the BM25 model and index the corpus
retriever = bm25s.BM25(corpus=corpus_docs, method="robertson") # With original method from Robertson.
retriever.index(bm25s.tokenize(corpus_docs))

Split strings:   0%|          | 0/11882 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/11882 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/11882 [00:00<?, ?it/s]

In [18]:
def load_jsonl(file_path):
    with open(file_path, 'r') as f:
        return [json.loads(line) for line in f]

input_questions = load_jsonl(INPUT_QUESTIONS_FILE)
input_questions[0]

{'problem': "A list of positive integers has the following properties:\n$\\bullet$ The sum of the items in the list is $30$.\n$\\bullet$ The unique mode of the list is $9$.\n$\\bullet$ The median of the list is a positive integer that does not appear in the list itself.\n Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \\boxed{}.",
 'answer': '236'}

In [20]:
# from bs4 import BeautifulSoup

# def extract_meta_keywords(file_path):
#     with open(file_path, 'r') as f:
#         html_content = f.read()
    
#     soup = BeautifulSoup(html_content, 'html.parser')
#     # Find the META tag with name="keywords"
#     meta_tag = soup.find('meta', attrs={'name': 'keywords'})
    
#     # If found, return the content attribute
#     if meta_tag:
#         return meta_tag.get('content')
#     else:
#         return None

# math_topics_str = extract_meta_keywords("mathematicsdictionary.com.html")
# math_topics = math_topics_str.split(',')
# print(len(math_topics))

# # Initialize corpus with items from query,item pairs data as well as from random pool.

# corpus_docs_math_topics = math_topics
# corpus_ids_math_topics = [i for i, v in enumerate(math_topics)]

# # Create the BM25 model and index the corpus
# retriever_math_topics = bm25s.BM25(corpus=corpus_docs_math_topics, method="robertson") # With original method from Robertson.
# retriever_math_topics.index(bm25s.tokenize(corpus_docs_math_topics))

# # Test
# query = input_questions[0]['problem']
# tokenized_query = bm25s.tokenize(query)

# results, scores = retriever_math_topics.retrieve(
#     tokenized_query, k=(K_CLOSEST_MATCHES_RAW+100), corpus=corpus_ids_math_topics, show_progress=False, # return_as="documents", 
# )
# results
# # ' '.join(results[0])

# # Let's see what we got!
# math_topics_keywords = set([
#     math_topics[result].strip().lower()
#     for score, result in zip(scores[0], results[0])
#     if score >= 2.0
# ])
# print(f"\nquery: {query}\n")
# print(f"\nkeywords: {math_topics_keywords}\n")
# # i = 0
# # for score, result in zip(scores[0], results[0]):
# #     if i > 50:
# #         break
# #     else:
# #         i += 1
# #     print(f"{score:.4f} (result {result}): {wiktionary_titles[result]}\n")

In [23]:
# Debug for comparison, use Claude to gen test difference with high quality extracted keywords.
llm_gen_keywords = [
  "positive integers",
  "list properties",
  "sum of integers",
  "mode of list",
  "unique mode",
  "median",
  "integer median",
  "median constraint",
  "sum of squares",
  "number theory",
  "arithmetic sequence",
  "statistical measures",
  "constrained list",
  "math competition",
  "integer constraints",
  "numerical properties",
  "discrete mathematics",
  "combinatorial constraints",
  "number relations",
  "mathematical puzzle"
]

", ".join(llm_gen_keywords)

'positive integers, list properties, sum of integers, mode of list, unique mode, median, integer median, median constraint, sum of squares, number theory, arithmetic sequence, statistical measures, constrained list, math competition, integer constraints, numerical properties, discrete mathematics, combinatorial constraints, number relations, mathematical puzzle'

In [24]:
# Test
gold_answer = input_questions[0]['answer']
# query = input_questions[0]['problem']
query = ", ".join(llm_gen_keywords)
# query = ", ".join(math_topics_keywords)
tokenized_query = bm25s.tokenize(query)

results, scores = retriever.retrieve(
    tokenized_query, k=(K_CLOSEST_MATCHES_RAW+100), corpus=corpus_ids, show_progress=False, # return_as="documents", 
)
results
# ' '.join(results[0])

# Let's see what we got!
print(f"\nquery: {query}\n")
i = 0
for score, result in zip(scores[0], results[0]):
    if i > 50:
        break
    else:
        i += 1
    print(f"{score:.4f} (result {result}): {data[result]['problem']} (solution:{data[result]['answer']})\n")

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]


query: positive integers, list properties, sum of integers, mode of list, unique mode, median, integer median, median constraint, sum of squares, number theory, arithmetic sequence, statistical measures, constrained list, math competition, integer constraints, numerical properties, discrete mathematics, combinatorial constraints, number relations, mathematical puzzle

32.6572 (result 291): A collection of five positive integers has mean 4.4, unique mode 3 and median 4. If an 8 is added to the collection, what is the new median? Express your answer as a decimal to the nearest tenth. (solution:4.5)

31.7589 (result 10015): There is a set of five positive integers whose average (mean) is 5, whose median is 5, and whose only mode is 8. What is the difference between the largest and smallest integers in the set? (solution:7)

30.1922 (result 8809): The data in the stem and leaf plot shown are the long jump distances, in centimeters, that the girls team of Pseudo H.S. made at practice today

In [25]:
def get_deduped_no_gold_answer(dict_list, gold_answer):
    
    df = pd.DataFrame(dict_list)
    print(f"No dedupe: {len(dict_list)}")

    # Drop duplicate questions and answers
    df_unique = df.drop_duplicates()
    print(f"Question answer dedupe: {len(df_unique)}")

    # Remove questions with multiple source of truth answers
    check = df_unique.groupby('problem').agg(set)
    check['set_size'] = check['answer'].apply(len)
    check = check[check['set_size'] == 1]
    out = check.drop('set_size', axis=1).explode(['answer', 'match_score', 'teacher_trace']).reset_index()
    print(f"Multiple answer filter: {len(out)}")
    out = out[out.answer != gold_answer]
    print(f"Gold answer filter: {len(out)}")
    out = out.sort_values(by='match_score', ascending=False)
    out = out.drop('match_score', axis=1)
    # display(out)
    return out.to_dict('records')

tweak_dataset = [
    {
        'problem': data[result]['problem'],
        'answer': data[result]['answer'],
        'teacher_trace': list(data[result]['attempt'])[0],
        'match_score': score,
    }
    for score, result in zip(scores[0], results[0])
]
# tweak_dataset[0]['answer'] = '236' # To debug filter
tweak_dataset = get_deduped_no_gold_answer(tweak_dataset, gold_answer)[:K_CLOSEST_MATCHES_RAW]
print(f"Limited to desired amount: {len(tweak_dataset)}")
tweak_dataset[:50]

No dedupe: 700
Question answer dedupe: 700
Multiple answer filter: 700
Gold answer filter: 700
Limited to desired amount: 600


[{'problem': 'A collection of five positive integers has mean 4.4, unique mode 3 and median 4. If an 8 is added to the collection, what is the new median? Express your answer as a decimal to the nearest tenth.',
  'answer': '4.5',
  'teacher_trace': 'Solution:\nLet the initial collection of five positive integers in non-decreasing order be $a, b, c, d, e$.\nWe are given:\n1. Mean = 4.4, so $\\frac{a+b+c+d+e}{5} = 4.4$, which means $a+b+c+d+e = 22$.\n2. Unique mode = 3.\n3. Median = 4, so $c = 4$.\n\nSince the mode is 3 and unique, the number 3 must appear more frequently than any other number. Given the sorted order and median 4, we must have $a=3$ and $b=3$. The collection starts with $3, 3, 4, d, e$.\n\nSubstituting into the sum equation: $3+3+4+d+e = 22 \\implies 10+d+e = 22 \\implies d+e = 12$.\nSince the mode is uniquely 3, $d > 3$. Also, $d \\ge c = 4$.\nWe have $4 \\le d \\le e$.\n\nIf $d=4$, the collection is $3, 3, 4, 4, e$. For the mode to be uniquely 3, $e \\neq 4$, so $e > 

In [39]:
# Create directory if it doesn't exist
dirname = os.path.dirname(OUTPUT_FILE)
if len(dirname.strip()) > 0:
    os.makedirs(dirname, exist_ok=True)

# Save to JSONL file
with open(OUTPUT_FILE, 'w') as f:
    for item in tweak_dataset:
        f.write(json.dumps(item) + '\n')

print(f"Saved {len(tweak_dataset)} records to {OUTPUT_FILE}")

Saved 600 records to ../top_600_s1-full-59k_similar/tweak_dataset.jsonl


In [40]:
# def load_jsonl(file_path):
#     with open(file_path, 'r') as f:
#         return [json.loads(line) for line in f]

def load_jsonl(file_path):
    SYSTEM_PROMPT = "Let's think step by step and output the final answer within \\boxed{}."
    with open(file_path, 'r') as f:
        data = [json.loads(line) for line in f]
        data = [{
            'prompt': f"{x['problem']} {SYSTEM_PROMPT}",
            **x
        } for x in data]
        return data

In [41]:
reloaded = load_jsonl(OUTPUT_FILE)
print(len(reloaded))
reloaded

600


[{'prompt': "There is a set of five positive integers whose average (mean) is 5, whose median is 5, and whose only mode is 8. What is the difference between the largest and smallest integers in the set? Let's think step by step and output the final answer within \\boxed{}.",
  'problem': 'There is a set of five positive integers whose average (mean) is 5, whose median is 5, and whose only mode is 8. What is the difference between the largest and smallest integers in the set?',
  'answer': '7',
  'teacher_trace': 'Solution:\nLet the set of five positive integers be $\\{a, b, c, d, e\\}$ in non-decreasing order, so $a \\le b \\le c \\le d \\le e$.\n\nWe are given:\n1. The average is 5: $\\frac{a+b+c+d+e}{5} = 5 \\implies a+b+c+d+e = 25$.\n2. The median is 5: Since there are 5 elements, the median is the middle element, $c = 5$.\n3. The only mode is 8: The number 8 appears more frequently than any other number in the set.\n\nSubstituting $c = 5$, the set is $\\{a, b, 5, d, e\\}$, and $a \