In [43]:
TOY_AIME_24_QUESTIONS_FILE = "ripe_dataset.jsonl"
QUESTIONS_LIM = 10
CONCEPT_LIM = 100

CONCEPTS_OUTPUT_FILE = "../synth-data/syn-concepts.jsonl"
PROPERTIES_OUTPUT_FILE = "../synth-data/syn-prop.jsonl"
CODE_OUTPUT_FILE = "../synth-data/syn-gen-code.jsonl"
OUTPUT_FILE = "../synth-data/syn-problems.jsonl"

In [2]:
import json
import ast
import os
import bm25s
from datasets import load_dataset, Dataset
import pandas as pd
import re
from dotenv import load_dotenv
from groq import Groq
# from cerebras.cloud.sdk import Cerebras

In [3]:
# Load environment variables from .env file
load_dotenv()

True

In [21]:
def load_jsonl(file_path):
    with open(file_path, 'r') as f:
        return [json.loads(line) for line in f]

def save_to_jsonl(file_path, a_list):
    # Create directory if it doesn't exist
    dirname = os.path.dirname(file_path)
    if len(dirname.strip()) > 0:
        os.makedirs(dirname, exist_ok=True)

    # Save to JSONL file
    with open(file_path, 'w') as f:
        for item in a_list:
            f.write(json.dumps(item) + '\n')

    print(f"Saved {len(a_list)} records to {file_path}")

### Debug Toy AIME 24 Questions

In [49]:
input_questions = load_jsonl(TOY_AIME_24_QUESTIONS_FILE)
data_with_answers = input_questions[:1]
data_with_answers
print(data_with_answers[0]['problem'])

A list of positive integers has the following properties:
$\bullet$ The sum of the items in the list is $30$.
$\bullet$ The unique mode of the list is $9$.
$\bullet$ The median of the list is a positive integer that does not appear in the list itself.
 Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \boxed{}.


### Load S1 data with answers

In [35]:
# def get_questions(split="train", num_samples=None) -> Dataset:
#     data = load_dataset('simplescaling/data_ablation_full59K', split=split)
    
#     if num_samples is not None:
#         # Randomly sample the specified number of examples
#         data = data.shuffle(seed=42).select(range(min(num_samples, len(data))))
    
#     return data

In [36]:
# data_raw = get_questions()
# # data_raw = get_questions(num_samples=10) # Debug

# print(len(data_raw))

In [37]:
# data_with_answers = []
# for k in range(len(data_raw)):
#   answer = ast.literal_eval(data_raw[k]['metadata']).get('answer', None)
#   if answer is not None:
#     data_with_answers.append({
#       'problem': data_raw[k]['question'],
#       'answer': answer
#     })

# print(len(data_with_answers))

In [38]:
# print(data_raw[0].keys())
# print()

# for k in range(3):
#   print("Question:")
#   print(data_with_answers[k]['problem'])
#   print("Answer:")
#   print(data_with_answers[k]['answer'])
#   print()

### Use Inference APIs to generate math concepts used.

In [39]:
groq_client = Groq()

def run_groq_inference_qwq_32b(groq_client, prompt):
    completion = groq_client.chat.completions.create(
        model="qwen-qwq-32b",
        messages=[
            {
                "role": "user",
                "content": prompt
            },
        ],
        temperature=0.6,
        max_completion_tokens=131072,
        top_p=0.9,
        stream=False,
        stop=None,
    )
    return completion.choices[0].message.content

def extract_after_think(text):
    # Find the last occurrence of </think> and extract everything after it
    pattern = r'</think>(.*?)$'
    match = re.search(pattern, text, re.DOTALL)  # re.DOTALL to match newlines
    if match:
        return match.group(1).strip()
    else:
        return None

In [53]:
all_concepts = set()
num_questions = min(len(data_with_answers), QUESTIONS_LIM) if QUESTIONS_LIM else len(data_with_answers)

for k in range(min(len(data_with_answers), 10)):
  prompt = f"""
  What are the math concepts used in the problem below?

  Don't solve problem, just give the math concepts used as a json list.

  Please only return the math concepts used as a json list, nothing else.

  Example answers: ["median", "mode", "sum of squares", ...], ["subsets", "permutations", "combinations", ...], ["linear equations", "systems of equations", "substitution", ...], etc.

  {data_with_answers[k]['problem']}
  """
  print("="*100)
  print("Prompt:")
  print(prompt)
  think_answer = run_groq_inference_qwq_32b(groq_client, prompt)
  answer = extract_after_think(think_answer)
  print("\nAnswer:")
  print(answer)
  print()
  if answer is not None:
    concepts = json.loads(answer)
    for concept in concepts:
      if concept not in all_concepts:
        all_concepts.add(concept)
  
  save_to_jsonl(CONCEPTS_OUTPUT_FILE, all_concepts)


Prompt:

  What are the math concepts used in the problem below?

  Don't solve problem, just give the math concepts used as a json list.

  Please only return the math concepts used as a json list, nothing else.

  Example answers: ["median", "mode", "sum of squares", ...], ["subsets", "permutations", "combinations", ...], ["linear equations", "systems of equations", "substitution", ...], etc.

  A list of positive integers has the following properties:
$\bullet$ The sum of the items in the list is $30$.
$\bullet$ The unique mode of the list is $9$.
$\bullet$ The median of the list is a positive integer that does not appear in the list itself.
 Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \boxed{}.
  

Answer:
["sum", "mode", "median", "sum of squares"]

Saved 4 records to ../synth-data/syn-concepts.jsonl


In [54]:
all_properties = set()
all_concepts_list = list(all_concepts)
num_concepts = min(len(all_concepts), CONCEPT_LIM) if CONCEPT_LIM else len(all_concepts)

for k in range(num_concepts):
  prompt = f"""
  Please list key math properties for the concept below.

  Don't solve a problem, just give all key math properties as a json list.

  Please only return the all key math properties as a json list, nothing else.

  Example answer for input "logarithm math properties": ["log_b(a) = c if and only if b^c = a", "mode", "log_b(M*N) = log_b(M) + log_b(N) (Product Rule)", ...]

  {all_concepts_list[k]}
  """
  print("="*100)
  print("Prompt:")
  print(prompt)
  think_answer = run_groq_inference_qwq_32b(groq_client, prompt)
  answer = extract_after_think(think_answer)
  print("\nAnswer:")
  print(answer)
  print()
  if answer is not None:
    properties = json.loads(answer)
    for property in properties:
      if property not in all_properties:
        all_properties.add(property)
  
save_to_jsonl(PROPERTIES_OUTPUT_FILE, all_properties)

Prompt:

  Please list key math properties for the concept below.

  Don't solve a problem, just give all key math properties as a json list.

  Please only return the all key math properties as a json list, nothing else.

  Example answer for input "logarithm math properties": ["log_b(a) = c if and only if b^c = a", "mode", "log_b(M*N) = log_b(M) + log_b(N) (Product Rule)", ...]

  mode
  

Answer:
["The mode is the value that appears most frequently in a dataset.", "A dataset can be unimodal (one mode), bimodal (two modes), trimodal (three modes), or multimodal (multiple modes).", "A dataset where all values occur with the same frequency has no mode.", "The mode can be applied to both qualitative (categorical) and quantitative data.", "The mode is not influenced by extreme values or outliers.", "In a histogram, the mode corresponds to the bin(s) with the highest frequency.", "The mode is one of the three common measures of central tendency, alongside the mean and median."]

Prompt:

