In [1]:
K_CLOSEST_MATCHES_RAW = 100
INPUT_QUESTIONS_FILE = "ripe_dataset.jsonl"
OUTPUT_FILE = "../top_100_light_r1_similar/tweak_dataset.jsonl"

In [2]:
import json
import os
import bm25s
from datasets import load_dataset, Dataset
import pandas as pd

In [3]:
def get_questions(split="train", num_samples=None) -> Dataset:
    data = load_dataset('qihoo360/Light-R1-SFTData', split=split)
    
    if num_samples is not None:
        # Randomly sample the specified number of examples
        data = data.shuffle(seed=42).select(range(min(num_samples, len(data))))
    
    return data

In [4]:
def remove_boxed(s):
    if s is None:
        return None

    left = "\\boxed{"

    if s[:len(left)] != left:
        return None
    if s[-1] != "}":
        return None

    return s[len(left):-1]


def last_boxed_only_string(string):
    idx = string.rfind("\\boxed")
    if "\\boxed " in string:
        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None

    i = idx
    right_brace_idx = None
    num_left_braces_open = 0
    while i < len(string):
        if string[i] == "{":
            num_left_braces_open += 1
        if string[i] == "}":
            num_left_braces_open -= 1
            if num_left_braces_open == 0:
                right_brace_idx = i
                break
        i += 1

    if right_brace_idx is None:
        retval = None
    else:
        retval = string[idx:right_brace_idx + 1]

    return retval

def extract_solution(text):
    return remove_boxed(last_boxed_only_string(text))

def correctness_reward_func(response: str, actual_answers: str) -> float:
    extracted_answer = extract_solution(response)
    return 1.0 if extracted_answer == actual_answers else 0.0

In [5]:
def get_deduped(dict_list):
    
    df = pd.DataFrame(dict_list)
    print(f"No dedupe: {len(dict_list)}")

    # Drop duplicate questions and answers
    df_unique = df.drop_duplicates()
    print(f"Question answer dedupe: {len(df_unique)}")

    # Remove questions with multiple source of truth answers
    check = df_unique.groupby('problem').agg(set)
    check['set_size'] = check['answer'].apply(len)
    check = check[check['set_size'] == 1]
    out = check.drop('set_size', axis=1).explode('answer').reset_index()
    print(f"Multiple answer filter: {len(out)}")
    # display(out)
    return out.to_dict('records')

In [6]:
data_raw = get_questions()
# data_raw = get_questions(num_samples=10) # Debug
# data_raw[0]

In [7]:
tweak_dataset = [
    {
        'problem': r['conversations'][0]['value'],
        'answer': extract_solution(r['conversations'][1]['value']),
        # 'full_answer': r['conversations'][1]['value']
    }
    for r in data_raw
]
len(tweak_dataset)

79439

In [8]:
tweak_dataset[4]

{'problem': 'If we divide a square into parts with three lines, then among the resulting polygons, there is always one whose diameter is not less than $\\sqrt{13}$. What is the minimum length of the side of the square? (The diameter of a polygon is the distance between its two farthest vertices.)\n\n\n\n\n\nTranslating the text as requested, while preserving the original formatting and line breaks.',
 'answer': '6'}

In [9]:
data_filtered = get_deduped(tweak_dataset)

No dedupe: 79439
Question answer dedupe: 75978
Multiple answer filter: 75978


### Index Records

In [10]:
data = {i: {**record} for i, record in enumerate(data_filtered)}
data[0]

{'problem': "\t1. Baron Munchhausen was told that some polynomial $P(x)=a_{n} x^{n}+\\ldots+a_{1} x+a_{0}$ is such that $P(x)+P(-x)$ has exactly 45 distinct real roots. Baron doesn't know the value of $n$. Nevertheless he claims that he can determine one of the coefficients $a_{n}, \\ldots, a_{1}, a_{0}$ (indicating its position and value). Isn't Baron mistaken?\n\t\n\tBoris Frenkin",
 'answer': 'a_0 = 0'}

### Toy BM25 Search

In [11]:
### Toy BM25 Search

# Create your corpus here
corpus = [
    "a cat is a feline and likes to purr",
    "a dog is the human's best friend and loves to play",
    "a bird is a beautiful animal that can fly",
    "a fish is a creature that lives in water and swims",
]

corpus_with_ids = [(idx, doc) for idx, doc in enumerate(corpus)]
[print(idx_and_doc) for idx_and_doc in corpus_with_ids]
corpus_ids = [idx for idx, doc in corpus_with_ids]

# Create the BM25 model and index the corpus
retriever = bm25s.BM25(corpus=corpus, method="robertson") # With original method from Robertson.
retriever.index(bm25s.tokenize(corpus))

# Query the corpus and get top-k results
query = "does the fish purr like a cat?"
results, scores = retriever.retrieve(bm25s.tokenize(query), k=2, corpus=corpus_ids)

# Let's see what we got!
print(f"\nquery: {query}")
for score, result in zip(scores[0], results[0]):
    print(f"{score:.4f}: {result}")

(0, 'a cat is a feline and likes to purr')
(1, "a dog is the human's best friend and loves to play")
(2, 'a bird is a beautiful animal that can fly')
(3, 'a fish is a creature that lives in water and swims')


Split strings:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/4 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]


query: does the fish purr like a cat?
0.7449: 0
0.3389: 3


### Actual BM25 Search

In [12]:
# Initialize corpus with items from query,item pairs data as well as from random pool.

corpus_docs = [v['problem'] for k, v in data.items()]
corpus_ids = [k for k, v in data.items()]
# print(corpus_ids[0])
# print(corpus_docs[0])

# Create the BM25 model and index the corpus
retriever = bm25s.BM25(corpus=corpus_docs, method="robertson") # With original method from Robertson.
retriever.index(bm25s.tokenize(corpus_docs))

Split strings:   0%|          | 0/75978 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/75978 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/75978 [00:00<?, ?it/s]

In [13]:
def load_jsonl(file_path):
    with open(file_path, 'r') as f:
        return [json.loads(line) for line in f]

input_questions = load_jsonl(INPUT_QUESTIONS_FILE)
input_questions

[{'problem': "A list of positive integers has the following properties:\n$\\bullet$ The sum of the items in the list is $30$.\n$\\bullet$ The unique mode of the list is $9$.\n$\\bullet$ The median of the list is a positive integer that does not appear in the list itself.\n Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \\boxed{}.",
  'answer': '236'},
 {'problem': "Let $A$, $B$, $C$, and $D$ be point on the hyperbola $\\frac{x^2}{20}- \\frac{y^2}{24} = 1$ such that $ABCD$ is a rhombus whose diagonals intersect at the origin. Find the greatest real number that is less than $BD^2$ for all such rhombi. Let's think step by step and output the final answer within \\boxed{}.",
  'answer': '480'},
 {'problem': "Let $ABC$ be a triangle inscribed in circle $\\omega$. Let the tangents to $\\omega$ at $B$ and $C$ intersect at point $D$, and let $\\overline{AD}$ intersect $\\omega$ at $P$. If $AB=5$, $BC=9$, and $AC=10$, $AP$ c

In [14]:
# Test
gold_answer = input_questions[0]['answer']
query = input_questions[0]['problem']
# gold_answer = '236' # to filter out anything with same answer.
# query = """
# A list of positive integers has the following properties:\n
#  The sum of the items in the list is 
# .\n
#  The unique mode of the list is 
# .\n
#  The median of the list is a positive integer that does not appear in the list itself.\n Find the sum of the squares of all the items in the list.
# """
tokenized_query = bm25s.tokenize(query)

results, scores = retriever.retrieve(
    tokenized_query, k=(K_CLOSEST_MATCHES_RAW+100), corpus=corpus_ids, show_progress=False, # return_as="documents", 
)
results
# ' '.join(results[0])

# Let's see what we got!
print(f"\nquery: {query}\n")
i = 0
for score, result in zip(scores[0], results[0]):
    if i > 50:
        break
    else:
        i += 1
    print(f"{score:.4f}: {data[result]['problem']} (solution:{data[result]['answer']})\n")

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]


query: A list of positive integers has the following properties:
$\bullet$ The sum of the items in the list is $30$.
$\bullet$ The unique mode of the list is $9$.
$\bullet$ The median of the list is a positive integer that does not appear in the list itself.
 Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \boxed{}.

35.8845: Find all functions $f:\mathbb{Z}_{>0}\rightarrow\mathbb{Z}_{>0}$ that satisfy the following two conditions:
[list]$\bullet\ f(n)$ is a perfect square for all $n\in\mathbb{Z}_{>0}$
$\bullet\ f(m+n)=f(m)+f(n)+2mn$ for all $m,n\in\mathbb{Z}_{>0}$.[/list] (solution:n^2)

35.4359: A list of $11$ positive integers has a mean of $10$ , a median of $9$ , and a unique mode of $8$ . What is the largest possible value of an integer in the list? (solution:35)

29.0591: Henry starts with a list of the first 1000 positive integers, and performs a series of steps on the list. At each step, he erases any nonpo

In [15]:
def get_deduped_no_gold_answer(dict_list, gold_answer):
    
    df = pd.DataFrame(dict_list)
    print(f"No dedupe: {len(dict_list)}")

    # Drop duplicate questions and answers
    df_unique = df.drop_duplicates()
    print(f"Question answer dedupe: {len(df_unique)}")

    # Remove questions with multiple source of truth answers
    check = df_unique.groupby('problem').agg(set)
    check['set_size'] = check['answer'].apply(len)
    check = check[check['set_size'] == 1]
    out = check.drop('set_size', axis=1).explode('answer').reset_index()
    print(f"Multiple answer filter: {len(out)}")
    out = out[out.answer != gold_answer]
    print(f"Gold answer filter: {len(out)}")
    # display(out)
    return out.to_dict('records')

tweak_dataset = [
    {
        'problem': data[result]['problem'],
        'answer': data[result]['answer']
    }
    for result in results[0]
]
# tweak_dataset[0]['answer'] = '236' # To debug filter
tweak_dataset = get_deduped_no_gold_answer(tweak_dataset, gold_answer)[:K_CLOSEST_MATCHES_RAW]
print(f"Limited to desired amount: {len(tweak_dataset)}")
tweak_dataset[:50]

No dedupe: 200
Question answer dedupe: 200
Multiple answer filter: 200
Gold answer filter: 200
Limited to desired amount: 100


[{'problem': '\nMárcio starts with a natural number and replaces it with the sum of its digits, repeating the process until he finally gets a single-digit number. For example, Márcio replaces 1784102 with 23, and then with 8. He also applies this process to lists of $N$ natural numbers, replacing each number in the list with the sum of its digits, continuing the process until he ends with a final list where each number is a single digit.\n\n(a) Starting with $3^{2009}$, what is the final single-digit number?\n\n(b) Starting with $17^{2009}$, what is the final single-digit number?\n\n(c) Starting with the list of the first 20092009 natural numbers, does the final list have more 4s or 5s? How many 9s are in the final list?',
  'answer': '2232445'},
 {'problem': '\nSvetlana takes a triplet of numbers and transforms it by the rule: at each step, each number is replaced by the sum of the other two. What is the difference between the largest and smallest numbers in the triplet on the 1580th 

In [16]:
# Create directory if it doesn't exist
dirname = os.path.dirname(OUTPUT_FILE)
if len(dirname.strip()) > 0:
    os.makedirs(dirname, exist_ok=True)

# Save to JSONL file
with open(OUTPUT_FILE, 'w') as f:
    for item in tweak_dataset:
        f.write(json.dumps(item) + '\n')

print(f"Saved {len(tweak_dataset)} records to {OUTPUT_FILE}")

Saved 100 records to ../top_100_light_r1_similar/tweak_dataset.jsonl


In [17]:
# def load_jsonl(file_path):
#     with open(file_path, 'r') as f:
#         return [json.loads(line) for line in f]

def load_jsonl(file_path):
    SYSTEM_PROMPT = "Let's think step by step and output the final answer within \\boxed{}."
    with open(file_path, 'r') as f:
        data = [json.loads(line) for line in f]
        data = [{
            'prompt': f"{x['problem']} {SYSTEM_PROMPT}",
            **x
        } for x in data]
        return data

In [18]:
reloaded = load_jsonl(OUTPUT_FILE)
print(len(reloaded))
reloaded

100


[{'prompt': "\nMárcio starts with a natural number and replaces it with the sum of its digits, repeating the process until he finally gets a single-digit number. For example, Márcio replaces 1784102 with 23, and then with 8. He also applies this process to lists of $N$ natural numbers, replacing each number in the list with the sum of its digits, continuing the process until he ends with a final list where each number is a single digit.\n\n(a) Starting with $3^{2009}$, what is the final single-digit number?\n\n(b) Starting with $17^{2009}$, what is the final single-digit number?\n\n(c) Starting with the list of the first 20092009 natural numbers, does the final list have more 4s or 5s? How many 9s are in the final list? Let's think step by step and output the final answer within \\boxed{}.",
  'problem': '\nMárcio starts with a natural number and replaces it with the sum of its digits, repeating the process until he finally gets a single-digit number. For example, Márcio replaces 17841