In [1]:
K_CLOSEST_MATCHES_RAW = 100
INPUT_QUESTIONS_FILE = "ripe_dataset.jsonl"
OUTPUT_FILE = "../top_100_similar/tweak_dataset.jsonl"

In [2]:
import json
import os
import bm25s
from datasets import load_dataset, Dataset
import pandas as pd

In [3]:
def get_deepscaler_questions(split="train", num_samples=None) -> Dataset:
    """Load questions from DeepScaleR Preview Dataset with optional sample size."""
    # qihoo360/Light-R1-SFTData
    data = load_dataset('agentica-org/DeepScaleR-Preview-Dataset', split=split)
    
    if num_samples is not None:
        # Randomly sample the specified number of examples
        data = data.shuffle(seed=42).select(range(min(num_samples, len(data))))
    
    return data

In [4]:
data_raw = get_deepscaler_questions()
# data_raw = get_deepscaler_questions(num_samples=10) # Debug

In [5]:
data_raw[0]

{'problem': 'The operation $\\otimes$ is defined for all nonzero numbers by $a \\otimes b = \\frac{a^{2}}{b}$. Determine $[(1 \\otimes 2) \\otimes 3] - [1 \\otimes (2 \\otimes 3)]$.',
 'answer': '-\\frac{2}{3}',
 'solution': '1. **Apply the operation $\\otimes$ to the innermost parentheses first:**\n   \\[\n   (1 \\otimes 2) \\otimes 3 = \\left(\\frac{1^2}{2}\\right) \\otimes 3 = \\frac{1}{2} \\otimes 3\n   \\]\n   \\[\n   1 \\otimes (2 \\otimes 3) = 1 \\otimes \\left(\\frac{2^2}{3}\\right) = 1 \\otimes \\frac{4}{3}\n   \\]\n\n2. **Calculate each part using the definition of $\\otimes$:**\n   \\[\n   \\frac{1}{2} \\otimes 3 = \\frac{\\left(\\frac{1}{2}\\right)^2}{3} = \\frac{\\frac{1}{4}}{3} = \\frac{1}{12}\n   \\]\n   \\[\n   1 \\otimes \\frac{4}{3} = \\frac{1^2}{\\frac{4}{3}} = \\frac{1}{\\frac{4}{3}} = \\frac{3}{4}\n   \\]\n\n3. **Subtract the two results:**\n   \\[\n   \\left(\\frac{1}{12}\\right) - \\left(\\frac{3}{4}\\right) = \\frac{1}{12} - \\frac{9}{12} = -\\frac{8}{12} = -\\f

In [6]:
len(data_raw)

40315

### Index Records

In [7]:
data = {i: {**record} for i, record in enumerate(data_raw)}
data[0]

{'problem': 'The operation $\\otimes$ is defined for all nonzero numbers by $a \\otimes b = \\frac{a^{2}}{b}$. Determine $[(1 \\otimes 2) \\otimes 3] - [1 \\otimes (2 \\otimes 3)]$.',
 'answer': '-\\frac{2}{3}',
 'solution': '1. **Apply the operation $\\otimes$ to the innermost parentheses first:**\n   \\[\n   (1 \\otimes 2) \\otimes 3 = \\left(\\frac{1^2}{2}\\right) \\otimes 3 = \\frac{1}{2} \\otimes 3\n   \\]\n   \\[\n   1 \\otimes (2 \\otimes 3) = 1 \\otimes \\left(\\frac{2^2}{3}\\right) = 1 \\otimes \\frac{4}{3}\n   \\]\n\n2. **Calculate each part using the definition of $\\otimes$:**\n   \\[\n   \\frac{1}{2} \\otimes 3 = \\frac{\\left(\\frac{1}{2}\\right)^2}{3} = \\frac{\\frac{1}{4}}{3} = \\frac{1}{12}\n   \\]\n   \\[\n   1 \\otimes \\frac{4}{3} = \\frac{1^2}{\\frac{4}{3}} = \\frac{1}{\\frac{4}{3}} = \\frac{3}{4}\n   \\]\n\n3. **Subtract the two results:**\n   \\[\n   \\left(\\frac{1}{12}\\right) - \\left(\\frac{3}{4}\\right) = \\frac{1}{12} - \\frac{9}{12} = -\\frac{8}{12} = -\\f

### Toy BM25 Search

In [8]:
### Toy BM25 Search

# Create your corpus here
corpus = [
    "a cat is a feline and likes to purr",
    "a dog is the human's best friend and loves to play",
    "a bird is a beautiful animal that can fly",
    "a fish is a creature that lives in water and swims",
]

corpus_with_ids = [(idx, doc) for idx, doc in enumerate(corpus)]
[print(idx_and_doc) for idx_and_doc in corpus_with_ids]
corpus_ids = [idx for idx, doc in corpus_with_ids]

# Create the BM25 model and index the corpus
retriever = bm25s.BM25(corpus=corpus, method="robertson") # With original method from Robertson.
retriever.index(bm25s.tokenize(corpus))

# Query the corpus and get top-k results
query = "does the fish purr like a cat?"
results, scores = retriever.retrieve(bm25s.tokenize(query), k=2, corpus=corpus_ids)

# Let's see what we got!
print(f"\nquery: {query}")
for score, result in zip(scores[0], results[0]):
    print(f"{score:.4f}: {result}")

(0, 'a cat is a feline and likes to purr')
(1, "a dog is the human's best friend and loves to play")
(2, 'a bird is a beautiful animal that can fly')
(3, 'a fish is a creature that lives in water and swims')


Split strings:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/4 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]


query: does the fish purr like a cat?
0.7449: 0
0.3389: 3


### Actual BM25 Search

In [9]:
# Initialize corpus with items from query,item pairs data as well as from random pool.

corpus_docs = [v['problem'] for k, v in data.items()]
corpus_ids = [k for k, v in data.items()]
# print(corpus_ids[0])
# print(corpus_docs[0])

# Create the BM25 model and index the corpus
retriever = bm25s.BM25(corpus=corpus_docs, method="robertson") # With original method from Robertson.
retriever.index(bm25s.tokenize(corpus_docs))

Split strings:   0%|          | 0/40315 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/40315 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/40315 [00:00<?, ?it/s]

In [10]:
def load_jsonl(file_path):
    with open(file_path, 'r') as f:
        return [json.loads(line) for line in f]

input_questions = load_jsonl(INPUT_QUESTIONS_FILE)
input_questions

[{'problem': "A list of positive integers has the following properties:\n$\\bullet$ The sum of the items in the list is $30$.\n$\\bullet$ The unique mode of the list is $9$.\n$\\bullet$ The median of the list is a positive integer that does not appear in the list itself.\n Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \\boxed{}.",
  'answer': '236'},
 {'problem': "Let $A$, $B$, $C$, and $D$ be point on the hyperbola $\\frac{x^2}{20}- \\frac{y^2}{24} = 1$ such that $ABCD$ is a rhombus whose diagonals intersect at the origin. Find the greatest real number that is less than $BD^2$ for all such rhombi. Let's think step by step and output the final answer within \\boxed{}.",
  'answer': '480'},
 {'problem': "Let $ABC$ be a triangle inscribed in circle $\\omega$. Let the tangents to $\\omega$ at $B$ and $C$ intersect at point $D$, and let $\\overline{AD}$ intersect $\\omega$ at $P$. If $AB=5$, $BC=9$, and $AC=10$, $AP$ c

In [11]:
# Test
gold_answer = input_questions[0]['answer']
query = input_questions[0]['problem']
# gold_answer = '236' # to filter out anything with same answer.
# query = """
# A list of positive integers has the following properties:\n
#  The sum of the items in the list is 
# .\n
#  The unique mode of the list is 
# .\n
#  The median of the list is a positive integer that does not appear in the list itself.\n Find the sum of the squares of all the items in the list.
# """
tokenized_query = bm25s.tokenize(query)

results, scores = retriever.retrieve(
    tokenized_query, k=(K_CLOSEST_MATCHES_RAW+100), corpus=corpus_ids, show_progress=False, # return_as="documents", 
)
results
# ' '.join(results[0])

# Let's see what we got!
print(f"\nquery: {query}\n")
i = 0
for score, result in zip(scores[0], results[0]):
    if i > 50:
        break
    else:
        i += 1
    print(f"{score:.4f}: {data[result]['problem']} (solution:{data[result]['answer']})\n")

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]


query: A list of positive integers has the following properties:
$\bullet$ The sum of the items in the list is $30$.
$\bullet$ The unique mode of the list is $9$.
$\bullet$ The median of the list is a positive integer that does not appear in the list itself.
 Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \boxed{}.

44.0008: A list of five positive integers has all of the following properties:

$\bullet$  The only integer in the list that occurs more than once is $8,$

$\bullet$  its median is $9,$ and

$\bullet$  its average (mean) is $10.$

What is the largest possible integer that could appear in the list? (solution:15)

31.5324: Suppose that $k \geq 2$ is a positive integer. An in-shuffle is performed on a list with $2 k$ items to produce a new list of $2 k$ items in the following way: - The first $k$ items from the original are placed in the odd positions of the new list in the same order as they appeared in t

In [12]:
def get_deduped_no_gold_answer(dict_list, gold_answer):
    
    df = pd.DataFrame(dict_list)
    print(f"No dedupe: {len(dict_list)}")

    # Drop duplicate questions and answers
    df_unique = df.drop_duplicates()
    print(f"Question answer dedupe: {len(df_unique)}")

    # Remove questions with multiple source of truth answers
    check = df_unique.groupby('problem').agg(set)
    check['set_size'] = check['answer'].apply(len)
    check = check[check['set_size'] == 1]
    out = check.drop('set_size', axis=1).explode('answer').reset_index()
    print(f"Multiple answer filter: {len(out)}")
    out = out[out.answer != gold_answer]
    print(f"Gold answer filter: {len(out)}")
    # display(out)
    return out.to_dict('records')

tweak_dataset = [
    {
        'problem': data[result]['problem'],
        'answer': data[result]['answer']
    }
    for result in results[0]
]
# tweak_dataset[0]['answer'] = '236' # To debug filter
tweak_dataset = get_deduped_no_gold_answer(tweak_dataset, gold_answer)[:K_CLOSEST_MATCHES_RAW]
print(f"Limited to desired amount: {len(tweak_dataset)}")
tweak_dataset[:50]

No dedupe: 200
Question answer dedupe: 185
Multiple answer filter: 181
Gold answer filter: 181
Limited to desired amount: 100


[{'problem': '\nSvetlana takes a triplet of numbers and transforms it by the rule: at each step, each number is replaced by the sum of the other two. What is the difference between the largest and smallest numbers in the triplet on the 1580th step of applying this rule, if the initial triplet of numbers was $\\{80, 71, 20\\}$? If the problem allows for multiple answers, list them all as a set.',
  'answer': '60'},
 {'problem': '15 balls numbered 1 through 15 are placed in a bin. Joe produces a list of four numbers by performing the following sequence four times: he chooses a ball, records the number, and places the ball back in the bin. Finally, Joe chooses to make a unique list by selecting 3 numbers from these 4, and forgetting the order in which they were drawn. How many different lists are possible?',
  'answer': '202500'},
 {'problem': 'A list of $2018$ positive integers has a unique mode, which occurs exactly $10$ times. What is the least number of distinct values that can occur 

In [13]:
# Create directory if it doesn't exist
dirname = os.path.dirname(OUTPUT_FILE)
if len(dirname.strip()) > 0:
    os.makedirs(dirname, exist_ok=True)

# Save to JSONL file
with open(OUTPUT_FILE, 'w') as f:
    for item in tweak_dataset:
        f.write(json.dumps(item) + '\n')

print(f"Saved {len(tweak_dataset)} records to {OUTPUT_FILE}")

Saved 100 records to ../top_100_similar/tweak_dataset.jsonl


In [14]:
# def load_jsonl(file_path):
#     with open(file_path, 'r') as f:
#         return [json.loads(line) for line in f]

def load_jsonl(file_path):
    SYSTEM_PROMPT = "Let's think step by step and output the final answer within \\boxed{}."
    with open(file_path, 'r') as f:
        data = [json.loads(line) for line in f]
        data = [{
            'prompt': f"{x['problem']} {SYSTEM_PROMPT}",
            **x
        } for x in data]
        return data

In [15]:
reloaded = load_jsonl(OUTPUT_FILE)
print(len(reloaded))
reloaded

100


[{'prompt': "\nSvetlana takes a triplet of numbers and transforms it by the rule: at each step, each number is replaced by the sum of the other two. What is the difference between the largest and smallest numbers in the triplet on the 1580th step of applying this rule, if the initial triplet of numbers was $\\{80, 71, 20\\}$? If the problem allows for multiple answers, list them all as a set. Let's think step by step and output the final answer within \\boxed{}.",
  'problem': '\nSvetlana takes a triplet of numbers and transforms it by the rule: at each step, each number is replaced by the sum of the other two. What is the difference between the largest and smallest numbers in the triplet on the 1580th step of applying this rule, if the initial triplet of numbers was $\\{80, 71, 20\\}$? If the problem allows for multiple answers, list them all as a set.',
  'answer': '60'},
 {'prompt': "15 balls numbered 1 through 15 are placed in a bin. Joe produces a list of four numbers by performin