In [1]:
TOY_AIME_24_QUESTIONS_FILE = "ripe_dataset.jsonl"
QUESTIONS_LIM = 10
CONCEPT_LIM = 100
CONCEPTS_ASKED_PER_QUESTION = 7
PROPERTIES_ASKED_PER_CONCEPTS = 7
NUM_PROPERTIES_FOR_PROBLEM_GEN = 5
PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MIN = 2
PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MAX = 30
PROBLEM_NUM_RANDOM_INTEGERS_START = 1
PROBLEM_NUM_RANDOM_INTEGERS_END = 4

MAX_VALIDATED_PROBLEMS_TO_GEN = 10

MAX_LLM_RETRIES = 10
MAX_CODE_GEN_LLM_RETRIES = 2 * MAX_LLM_RETRIES
NUM_CHECKS_ALL_IMPLEMENTATIONS_GEN_SAME_RESULT = 100

INT_AGGREGATORS = [
    "sum of squares",
    "sum of cubes",
    "alternating sum",
    "the sum of values modulo 7",
    "the sum of values modulo 11",
]

CONCEPTS_OUTPUT_FILE = "../synth-data/syn-concepts.jsonl"
PROPERTIES_OUTPUT_FILE = "../synth-data/syn-prop.jsonl"
OUTPUT_FILE = "../synth-data/syn-problems.jsonl"

In [2]:
import json
import ast
import os
import itertools
import re
import random
import traceback
import types

from dotenv import load_dotenv
from groq import Groq
# from cerebras.cloud.sdk import Cerebras

In [3]:
# Load environment variables from .env file
load_dotenv()

True

In [4]:
def load_jsonl(file_path):
    with open(file_path, 'r') as f:
        return [json.loads(line) for line in f]

def save_to_jsonl(file_path, a_list):
    # Create directory if it doesn't exist
    dirname = os.path.dirname(file_path)
    if len(dirname.strip()) > 0:
        os.makedirs(dirname, exist_ok=True)

    # Save to JSONL file
    with open(file_path, 'w') as f:
        for item in a_list:
            f.write(json.dumps(item) + '\n')

    print(f"Saved {len(a_list)} records to {file_path}")

### Debug Toy AIME 24 Questions

In [5]:
input_questions = load_jsonl(TOY_AIME_24_QUESTIONS_FILE)
data_with_answers = input_questions[:1]
data_with_answers
print(data_with_answers[0]['problem'])

A list of positive integers has the following properties:
$\bullet$ The sum of the items in the list is $30$.
$\bullet$ The unique mode of the list is $9$.
$\bullet$ The median of the list is a positive integer that does not appear in the list itself.
 Find the sum of the squares of all the items in the list. Let's think step by step and output the final answer within \boxed{}.


### Load S1 data with answers

In [6]:
# def get_questions(split="train", num_samples=None) -> Dataset:
#     data = load_dataset('simplescaling/data_ablation_full59K', split=split)
    
#     if num_samples is not None:
#         # Randomly sample the specified number of examples
#         data = data.shuffle(seed=42).select(range(min(num_samples, len(data))))
    
#     return data

In [7]:
# data_raw = get_questions()
# # data_raw = get_questions(num_samples=10) # Debug

# print(len(data_raw))

In [8]:
# data_with_answers = []
# for k in range(len(data_raw)):
#   answer = ast.literal_eval(data_raw[k]['metadata']).get('answer', None)
#   if answer is not None:
#     data_with_answers.append({
#       'problem': data_raw[k]['question'],
#       'answer': answer
#     })

# print(len(data_with_answers))

In [9]:
# print(data_raw[0].keys())
# print()

# for k in range(3):
#   print("Question:")
#   print(data_with_answers[k]['problem'])
#   print("Answer:")
#   print(data_with_answers[k]['answer'])
#   print()

### Use Inference APIs to generate math concepts used and properties of those concepts.

In [10]:
groq_client = Groq()

def run_groq_inference_qwq_32b(groq_client, prompt):
    completion = groq_client.chat.completions.create(
        model="qwen-qwq-32b",
        messages=[
            {
                "role": "user",
                "content": prompt
            },
        ],
        temperature=0.6,
        max_completion_tokens=131072,
        top_p=0.9,
        stream=False,
        stop=None,
    )
    return completion.choices[0].message.content

def extract_after_think(text):
    # Find the last occurrence of </think> and extract everything after it
    pattern = r'</think>(.*?)$'
    match = re.search(pattern, text, re.DOTALL)  # re.DOTALL to match newlines
    if match:
        return match.group(1).strip()
    else:
        return None

def extract_last_python_code(text):
    """Extract the code from the last ```python ... ``` code block in the text."""
    # Pattern to match Python code blocks
    pattern = r"```python\s*(.*?)\s*```"
    
    # Find all matches, using re.DOTALL to match across multiple lines
    matches = re.findall(pattern, text, re.DOTALL)
    
    # Return the last match if any matches are found
    if matches:
        return matches[-1].strip()
    else:
        return ""

def extract_last_json_code(text):
    """Extract the code from the last ```json ... ``` code block in the text."""
    # Pattern to match json data blocks
    pattern = r"```json\s*(.*?)\s*```"
    
    # Find all matches, using re.DOTALL to match across multiple lines
    matches = re.findall(pattern, text, re.DOTALL)
    
    # Return the last match if any matches are found
    if matches:
        return matches[-1].strip()
    else:
        return ""

def replace_random_integer(a_list_of_str, orig_random_integer, new_random_integer):
  return [a_str.replace(str(orig_random_integer), str(new_random_integer)) for a_str in a_list_of_str]

def load_and_execute_code(code_string):
    """
    Load code from a string into Python context and make its functions available
    
    Args:
        code_string: String containing Python code to execute
        
    Returns:
        Dictionary containing all definitions from the executed code
    """
    # Create a namespace dictionary to hold the functions
    namespace = {}
    
    # Execute the code in this namespace
    exec(code_string, namespace)
    
    # Return the namespace that now contains all defined functions/classes
    return namespace

def find_integers_in_string(text):
    """
    Find all integers in a string using regular expressions.
    
    Args:
        text: String to search for integers
        
    Returns:
        List of integers found in the string
    """
    # Pattern matches integers (including negative numbers)
    # \b ensures we match whole numbers, not parts of words/other numbers
    pattern = r'\b-?\d+\b'
    
    # Find all matches
    matches = re.findall(pattern, text)
    
    # Convert matches to integers
    integers = [int(match) for match in matches]
    
    return integers

def is_array_of_strings(the_data):
    # Check if it's a list/array
    if not isinstance(the_data, list):
        return False
    
    # Check if all elements are strings
    return all(isinstance(item, str) for item in the_data)

In [11]:
all_concepts = set()
num_questions = min(len(data_with_answers), QUESTIONS_LIM) if QUESTIONS_LIM else len(data_with_answers)

for k in range(min(len(data_with_answers), 10)):
  prompt = f"""
  What are the math concepts used in the problem below and related math concepts?
  
  Try to list {CONCEPTS_ASKED_PER_QUESTION} concepts and if ambiguous, add their field of math in parenthesis.

  Don't solve problem, just give the math concepts used as a json list.

  Please only return the math concepts as a json list in ```json ``` tags.

  Example answers:
  ```json
  ["median (statistics)", "mode (statistics)", "mean (statistics)", "frequency (statistics)", "sum", "sum of squares", ...], ["set", "subsets", "permutations", "combinations", "probability", "coin flips", ...], ["linear equations", "systems of equations", "substitution", ...], etc.
  ```

  {data_with_answers[k]['problem']}
  """
  print("="*100)
  print("Prompt:")
  print(prompt)
  retry_count = 0
  while retry_count < MAX_LLM_RETRIES:
    try:
      think_answer = run_groq_inference_qwq_32b(groq_client, prompt)
    except Exception as e:
      print(f"Error running groq inference, skipping: {e}")
      continue
    answer = extract_last_json_code(extract_after_think(think_answer))
    print("\nAnswer:")
    print(answer)
    print()
    if answer is None:
      print("Extracted answer is None, skipping.")
      continue

    concepts = ast.literal_eval(answer) # ast doesn't error out on latex formulas.
    if not is_array_of_strings(concepts):
      print("Concepts are not a list of strings, skipping.")
      continue

    for concept in concepts:
      if concept not in all_concepts:
        all_concepts.add(concept)
    
    break

  if retry_count >= MAX_LLM_RETRIES:
    print(f"Tried to generate concepts {retry_count} times, failed and stopped.\n")
  
  save_to_jsonl(CONCEPTS_OUTPUT_FILE, all_concepts)

Prompt:

  What are the math concepts used in the problem below and related math concepts?
  
  Try to list 7 concepts and if ambiguous, add their field of math in parenthesis.

  Don't solve problem, just give the math concepts used as a json list.

  Please only return the math concepts as a json list in ```json ``` tags.

  Example answers:
  ```json
  ["median (statistics)", "mode (statistics)", "mean (statistics)", "frequency (statistics)", "sum", "sum of squares", ...], ["set", "subsets", "permutations", "combinations", "probability", "coin flips", ...], ["linear equations", "systems of equations", "substitution", ...], etc.
  ```

  A list of positive integers has the following properties:
$\bullet$ The sum of the items in the list is $30$.
$\bullet$ The unique mode of the list is $9$.
$\bullet$ The median of the list is a positive integer that does not appear in the list itself.
 Find the sum of the squares of all the items in the list. Let's think step by step and output the f

In [12]:
all_properties = set()
all_concepts_list = list(all_concepts)
num_concepts = min(len(all_concepts), CONCEPT_LIM) if CONCEPT_LIM else len(all_concepts)

for k in range(num_concepts):
  prompt = f"""
  Please list key algebraic math properties leveraging a formula for the concept below. Try to list {PROPERTIES_ASKED_PER_CONCEPTS} algebraic math properties leveraging a formula.

  Just return all key algebraic math properties leveraging formulas as a json list, nothing else.
  
  Don't include a algebraic math properties that don't use a formula. Please only use precise formulas (no approximations).

  Please only return the math concepts as a json list in ```json ``` tags.

  Example answer for input "logarithm math properties":
  ```json
  ["log_b(a) = c if and only if b^c = a", "log_b(M*N) = log_b(M) + log_b(N) (Product Rule)", ...]
  ```

  {all_concepts_list[k]}
  """
  print("="*100)
  print("Prompt:")
  print(prompt)
  retry_count = 0
  while retry_count < MAX_LLM_RETRIES:
    try:
      think_answer = run_groq_inference_qwq_32b(groq_client, prompt)
    except Exception as e:
      print(f"Error running groq inference, skipping: {e}")
      continue
    answer = extract_last_json_code(extract_after_think(think_answer))
    print("\nAnswer:")
    print(answer)
    print()
    if answer is None:
      print("Extracted answer is None, skipping.")
      continue
    
    properties = ast.literal_eval(answer) # ast doesn't error out on latex formulas.
    if not is_array_of_strings(concepts):
      print("Properties are not a list of strings, skipping.")
      continue
    
    for property in properties:
      contextualized_property = f"Property of {all_concepts_list[k]}: {property}"
      if contextualized_property not in all_properties:
          all_properties.add(contextualized_property)
    
    break

  if retry_count >= MAX_LLM_RETRIES:
    print(f"Tried to properties {retry_count} times, failed and stopped.\n")
  
  save_to_jsonl(PROPERTIES_OUTPUT_FILE, all_properties)

Prompt:

  Please list key algebraic math properties leveraging a formula for the concept below. Try to list 7 algebraic math properties leveraging a formula.

  Just return all key algebraic math properties leveraging formulas as a json list, nothing else.
  
  Don't include a algebraic math properties that don't use a formula. Please only use precise formulas (no approximations).

  Please only return the math concepts as a json list in ```json ``` tags.

  Example answer for input "logarithm math properties":
  ```json
  ["log_b(a) = c if and only if b^c = a", "log_b(M*N) = log_b(M) + log_b(N) (Product Rule)", ...]
  ```

  multiset (discrete mathematics)
  

Answer:
[
  "μ_{M + N}(x) = μ_M(x) + μ_N(x) (Multiset Sum)",
  "μ_{M ∩ N}(x) = min(μ_M(x), μ_N(x)) (Multiset Intersection)",
  "μ_{M - N}(x) = max(μ_M(x) - μ_N(x), 0) (Multiset Difference)",
  "|M| = \\sum_{x} μ_M(x) (Cardinality of Multiset)",
  "M = N ⇨ ∀x, μ_M(x) = μ_N(x) (Multiset Equality)",
  "M ⊆ N ⇨ ∀x, μ_M(x) ≤ μ_N(x) 

In [13]:
def generate_problem(sampled_properties, inspiration_integers, solution_space_size, print_prompt=False):
  sampled_properties_str = '\n'.join(sampled_properties)
  prompt = f"""
Here are math properties:
{sampled_properties_str}

Please create a problem where the solution space is a list of exactly {solution_space_size} random integers
in the range [{PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MIN}, {PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MAX}] and the problem has a unique solution.

The problem should start with statement: A list of {solution_space_size} integers in the range [{PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MIN}, {PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MAX}].

Solving the problem must include a non-trivial step using at most 1 of the algebraic formulas in the math properties highlighted above and
1 or more input integers {inspiration_integers}.

Don't include statements indicating that one of the integers is in a directly specified range of integers or in a directly specified set of integers.

Please return the problem as a json list of strings where each string is a problem requirement, including start statement and the final question, and nothing else.
  """
  if print_prompt:
    print(prompt)
  think_answer = run_groq_inference_qwq_32b(groq_client, prompt)
  sampled_problem = extract_after_think(think_answer)
  return sampled_problem

def expand_problem(sampled_properties, inspiration_integers, solution_space_size, previous_problem_statements, previous_solution, print_prompt=False):
  sampled_properties_str = '\n'.join(sampled_properties)
  prompt = f"""
Here are math properties:
{sampled_properties_str}

Here is a problem:
{previous_problem_statements}
========================================

Here is a unique solution to the problem:
{previous_solution}
========================================

Please iterate on the problem above and expand its solution space to a list of exactly {solution_space_size} random integers
in the range [{PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MIN}, {PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MAX}] so that the problem has a unique solution.

The problem should start with statement: A list of {solution_space_size} integers in the range [{PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MIN}, {PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MAX}].

Don't remove requirements of the original problem, just modify or add to them in order to expand the solution space to a list of
exactly {solution_space_size} random integers in the range [{PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MIN}, {PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MAX}] so that the problem has a unique solution.

Solving the problem must include at least 1 non-trivial step using at most 1 of the new algebraic formulas in the math properties highlighted
above (along with original algebraic formulas) and 1 or more new input integers from {inspiration_integers}.

Don't include statements indicating that one of the integers is in a directly specified range of integers or in a directly specified set of integers.

Please return the problem as a json list of strings where each string is a problem requirement, including start statement and the final question, and nothing else.
  """
  if print_prompt:
    print(prompt)
  think_answer = run_groq_inference_qwq_32b(groq_client, prompt)
  sampled_problem = extract_after_think(think_answer)
  return sampled_problem

def generate_check_solution_code(sampled_problem, activated_inspiration_integers_params, solution_space_size, previous_code_and_error=None, print_prompt=False):
  prompt = f"""
Here is a problem: {sampled_problem}
========================================

The problem uses the random integer parameters {activated_inspiration_integers_params}, we'll call them params.

Please write code for a python method called 'check_solution' that takes as input the proposed list of exactly {solution_space_size} integers
that form a solution to the problem along with an extra params dictionary, and checks if the solution is valid.

Example Problem: 'Median of the list of 5 integers is 10.'
Example params: {{'guiding_param_1': 10}}

Example code:
def check_solution(solution, **params):
  import statistics
  m = statistics.median(solution)
  return m == params['guiding_param_1']

Please only return code in in ```python ``` tags.
  """
  if previous_code_and_error is not None and len(previous_code_and_error.strip()) > 0:
    prompt = f"""
    {prompt}

    Here is a previous version of the code that you generated and the error it produced when trying to run it, please try to fix the code to avoid the error:
    {previous_code_and_error}
    """
  if print_prompt:
    print(prompt)
  think_answer = run_groq_inference_qwq_32b(groq_client, prompt)
  answer = extract_after_think(think_answer)
  code = extract_last_python_code(answer)  # Using the previous function
  return code

def brute_force_solutions(check_solution, num_inputs, **params):
    """
    Parametrized brute force solution finder that works with any number of inputs.
    
    Args:
        check_solution: Function that checks if a solution is valid
        num_inputs: Number of input variables to iterate through
        params: Additional parameters to pass to check_solution
        
    Returns:
        Set of solution tuples
    """
    solutions = set()
    STOP_AT_MAX_SOLUTIONS = 20
    
    # Generate all combinations using itertools
    for values in itertools.product(range(PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MIN, PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MAX + 1), repeat=num_inputs):
      if check_solution(list(values), **params):
          solutions.add(values)
          if len(solutions) >= STOP_AT_MAX_SOLUTIONS:
              return solutions
    
    return solutions

In [14]:
# # Test Debug.
# condition_method_str = """
# def check_solution(solution, **params):
#   import statistics
#   m = statistics.median(solution)
#   return m == params['guiding_param_1']
# """
# namespace = {}
# exec(condition_method_str, namespace)
# check_solution = namespace['check_solution']

# print(brute_force_solutions(check_solution, 3, **{
#   'guiding_param_1': 10
# }))

In [15]:
def gen_problem_with_retry(sampled_properties, inspiration_integers, solution_space_size, max_retries=MAX_LLM_RETRIES, debug=False):
  total_iterations = 0

  while total_iterations < max_retries:
    total_iterations += 1

    try:
      problem = generate_problem(sampled_properties, inspiration_integers, solution_space_size, print_prompt=debug)
    except Exception as e:
      if debug:
        print(f"Error generating problem: {e}")
      continue
    if problem is None or len(problem.strip()) == 0:
      if debug:
        print("No problem generated, skipping.")
      continue

    if debug:
      print("Problem:")
      print(problem)
      print()
    
    problem_statements_str = extract_last_json_code(problem)
    if problem_statements_str is None or len(problem_statements_str.strip()) == 0:
      problem_statements_str = problem
    
    try:
      problem_statements = ast.literal_eval(problem_statements_str) # ast doesn't error out on latex formulas.
    except:
      print("Error parsing problem statements.")
      problem_statements = None
      continue
    if problem_statements is None or len(problem_statements) == 0:
      if debug:
        print("No problem statements extracted into a list, skipping.")
      continue

    if not is_array_of_strings(problem_statements):
      print("Problem statements are not a list of strings, skipping.")
      continue

    # Print problem.
    if debug:
      print("Problem statements:")
      for statement in problem_statements:
        print(statement)
      print()
    
    if total_iterations >= MAX_LLM_RETRIES:
      print(f"Tried to generate problem {total_iterations} times, failed and stopped.\n")
      return None

    return problem_statements

def expand_problem_with_retry(sampled_properties, inspiration_integers, solution_space_size, previous_problem_statements, previous_solution, max_retries=MAX_LLM_RETRIES, debug=False):
  total_iterations = 0

  while total_iterations < max_retries:
    total_iterations += 1

    try:
      problem = expand_problem(sampled_properties, inspiration_integers, solution_space_size, previous_problem_statements, previous_solution, print_prompt=debug)
    except Exception as e:
      if debug:
        print(f"Error expanding problem: {e}")
      continue
    if problem is None or len(problem.strip()) == 0:
      if debug:
        print("No expanded problem generated during iteration, skipping.")
      continue

    if debug:
      print("Expanded problem:")
      print(problem)
      print()
    
    problem_statements_str = extract_last_json_code(problem)
    if problem_statements_str is None or len(problem_statements_str.strip()) == 0:
      problem_statements_str = problem
    
    try:
      problem_statements = ast.literal_eval(problem_statements_str) # ast doesn't error out on latex formulas.
    except:
      print("Error parsing expanded problem statements.")
      problem_statements = None
      continue
    if problem_statements is None or len(problem_statements) == 0:
      if debug:
        print("No expanded problem statements extracted into a list, skipping.")
      continue

    if not is_array_of_strings(problem_statements):
      print("Problem statements are not a list of strings, skipping.")
      continue

    # Print problem.
    if debug:
      print("Expanded problem statements:")
      for statement in problem_statements:
        print(statement)
      print()
  
    if total_iterations >= MAX_LLM_RETRIES:
      print(f"Tried to expand problem {total_iterations} times, failed and stopped.\n")
      return None

    return problem_statements

def generate_check_solution_code_with_retry(full_problem_statement, activated_inspiration_integers_params, solution_space_size, max_retries=MAX_CODE_GEN_LLM_RETRIES, debug=False):
  new_code_iterations = 0
  code_fix_iterations = 0
  previous_code_and_error = None
  while new_code_iterations < max_retries:
    if code_fix_iterations >= MAX_CODE_GEN_LLM_RETRIES:
        print(f"Tried to fix check solution code {code_fix_iterations} times, starting over.\n")
        code_fix_iterations = 0
        previous_code_and_error = None
    
    if previous_code_and_error is None:
      new_code_iterations += 1
    else:
      code_fix_iterations += 1
    
    try:
      check_solution_function_str = generate_check_solution_code(
        full_problem_statement, activated_inspiration_integers_params, solution_space_size, previous_code_and_error=previous_code_and_error, print_prompt=debug
      )
    except Exception as e:
      if debug:
        print(f"Error generating check solution code: {e}")
      continue
    if check_solution_function_str is None or len(check_solution_function_str.strip()) == 0:
      if debug:
        print("No check solution code generated, skipping.")
      continue

    if debug:
      print("Check solution code:")
      print(check_solution_function_str)
      print()

    try:
      check_solution_function = load_and_execute_code(check_solution_function_str).get('check_solution', None)
    except Exception as e:
      stack_trace = traceback.format_exc()
      if debug:
        print(f"Error loading and executing check solution code:\n Error:\n{e}\n Stack Trace:\n{stack_trace}")
      previous_code_and_error = f"{check_solution_function_str}\nError:{e}\nStack Trace:\n{stack_trace}"
      continue
    if check_solution_function is None:
      if debug:
        print("No check solution function found, skipping.")
      continue

    # Try running the check solution function with the input integers.
    try:
      # Generate all solutions.
      possible_solutions = brute_force_solutions(check_solution_function, solution_space_size, **activated_inspiration_integers_params)
      if debug:
        print(f"possible_solutions: {possible_solutions}\n")
    except Exception as e:
      stack_trace = traceback.format_exc()
      if debug:
        print(f"Error checking all brute force solutions:\n Error:\n{e}\n Stack Trace:\n{stack_trace}")
      previous_code_and_error = f"{check_solution_function_str}\nError:{e}\nStack Trace:\n{stack_trace}"
      continue

    return possible_solutions, check_solution_function, check_solution_function_str

  print(f"Tried to generate new check solution for problem {new_code_iterations} times, failed and stopped.\n")
  return None, None, None

In [None]:
state = types.SimpleNamespace(
    previous_validated_problem=None,
    statiteration_per_problem_and_solution_space_size=0,
    solution_space_size = PROBLEM_NUM_RANDOM_INTEGERS_START,
    inspiration_integers = None,
    sampled_properties = None,
)

def gen_inspiration_integers_and_properties(state):
  inspiration_integers = [random.randint(PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MIN, PROBLEM_RANDOM_INTEGER_INPUT_RANGE_MAX) for i in range(state.solution_space_size)]
  sampled_properties = random.sample(list(all_properties), NUM_PROPERTIES_FOR_PROBLEM_GEN)
  return inspiration_integers, sampled_properties

def display_validated_problem(validated_problem):
  print("Validated problem:")
  for statement in validated_problem['problem_statements']:
    print(statement)
  print(f"\nValidated list of integers solution: {validated_problem['possible_solution']}\n")
  print(f"Validated check solution code:\n{validated_problem['check_solution_function_str']}")

def reset_problem_state(state, previous_validated_problem=None, solution_space_size=PROBLEM_NUM_RANDOM_INTEGERS_START):
  state.previous_validated_problem = previous_validated_problem
  state.iteration_per_problem_and_solution_space_size = 0
  state.solution_space_size = solution_space_size
  state.inspiration_integers, state.sampled_properties = gen_inspiration_integers_and_properties(state)



validated_problems = []
reset_problem_state(state)
total_iterations = 0
while len(validated_problems) < MAX_VALIDATED_PROBLEMS_TO_GEN and total_iterations < MAX_VALIDATED_PROBLEMS_TO_GEN * PROBLEM_NUM_RANDOM_INTEGERS_START * 10:
  total_iterations += 1
  state.iteration_per_problem_and_solution_space_size += 1

  if state.previous_validated_problem is None:  # Create initial low solution space problem.
    problem_statements = gen_problem_with_retry(state.sampled_properties, state.inspiration_integers, state.solution_space_size, max_retries=MAX_LLM_RETRIES, debug=True)
  else:  # Expand problem solution space.
    problem_statements = expand_problem_with_retry(
      state.sampled_properties, state.inspiration_integers, state.solution_space_size,
      state.previous_validated_problem['problem_statements'], state.previous_validated_problem['possible_solution'],
      max_retries=MAX_LLM_RETRIES, debug=True
    )
  
  if problem_statements is None or len(problem_statements) == 0:
    print(f"Did not generate an expanded problem, skipping to next expanded problem.\n")
    reset_problem_state(state)
    continue

  # Get params that were actually used by LLM in generated problem statements.
  # Skip first statement since it just sets number of integers which is already accounted for in code.
  full_problem_statement = ' '.join(problem_statements[1:])
  activated_inspiration_integers = find_integers_in_string(full_problem_statement)
  activated_inspiration_integers_params = {
    f"guiding_param_{idx+1}": i
    for idx, i in enumerate(activated_inspiration_integers)
  }
  print(f"Activated inspiration integers params:\n{activated_inspiration_integers_params}\n")

  # Generete code to check a possible solution.
  possible_solutions, check_solution_function, check_solution_function_str = generate_check_solution_code_with_retry(
    full_problem_statement, activated_inspiration_integers_params, state.solution_space_size, max_retries=MAX_CODE_GEN_LLM_RETRIES, debug=True
  )

  if possible_solutions is None:
    print(f"Did not generate possible solutions for this expanded problem, skipping to next problem.\n")
    reset_problem_state(state)
    continue

  # If unique solution, updated new validated problem and print it.
  if len(possible_solutions) == 1:
    previous_validated_problem = {
      'problem_statements': problem_statements,
      'check_solution_function_str': check_solution_function_str,
      'possible_solution': list(possible_solutions)[0],
    }
    display_validated_problem(previous_validated_problem) # Print problem.
    reset_problem_state(state, previous_validated_problem=previous_validated_problem, solution_space_size=state.solution_space_size + 1)

  if state.solution_space_size > PROBLEM_NUM_RANDOM_INTEGERS_END:
    print(f"Expanded solution space of problem to {state.solution_space_size - 1}, stopping with validated problem:")
    display_validated_problem(state.previous_validated_problem)
    validated_problems.append(state.previous_validated_problem)
    save_to_jsonl(OUTPUT_FILE, validated_problems)
    reset_problem_state(state)

  if len(validated_problems) >= MAX_VALIDATED_PROBLEMS_TO_GEN:
    print("="*100)
    print(f"\n\nGenerated {len(validated_problems)} validated problems, stopping.\n\n")
    print("="*100)
    break

print("="*100)
print(f"\n\nGenerated {len(validated_problems)} validated problems, stopping.\n\n")
print("="*100)


Here are math properties:
Property of frequency (statistics): Joint frequency in a two-way table: f_{ij} = count of occurrences in cell (i,j)
Property of mode (statistics): M = \lfloor (n+1)p \rfloor \text{ for a binomial}(n,p) \text{ distribution}
Property of average (arithmetic): Sum of deviations from the mean is zero: Σ(x_i - A) = 0
Property of sum (arithmetic): S_n = \frac{n}{2}(a_1 + a_n) (Arithmetic Series Sum Formula)
Property of median (statistics): Median = Q_2 (second quartile)

Please create a problem where the solution space is a list of exactly 1 random integers
in the range [2, 30] and the problem has a unique solution.

The problem should start with statement: A list of 1 integers in the range [2, 30].

Solving the problem must include a non-trivial step using at most 1 of the algebraic formulas in the math properties highlighted above and
1 or more input integers [11].

Don't include statements indicating that one of the integers is in a directly specified range of in

In [17]:
reloaded_validated_problems = load_jsonl(OUTPUT_FILE)
for idx, problem in enumerate(reloaded_validated_problems):
  print(f"ID: {idx} ================================================================================================")
  display_validated_problem(problem)
  print()

Validated problem:
A list of 4 integers in the range [2, 30.
The first integer is twice the average of the first five terms of an arithmetic sequence starting at 1 with a common difference of 3.
The second integer is the median of the first five terms of an arithmetic sequence starting at4 with a common difference of6.
The third integer is the cumulative frequency of the first two integers in a dataset where each integer's frequency is half its value.
The fourth integer is the median of the first five term of an arithmetic sequence starting at14 with a common difference of4.
The second integer must be a power of2.

Validated list of integers solution: [14, 16, 15, 22]

Validated check solution code:
def check_solution(solution, **guiding_params):
    if len(solution) != 4:
        return False
    a, b, c, d = solution

    # Check first integer
    start1 = guiding_params['guiding_param_1']
    diff1 = guiding_params['guiding_param_2']
    required_a = 2 * (start1 + 2 * diff1)
    if 