### Configure logging and tracebacks

In [1]:
# LITELLM_LOG must be exported before litellm is imported for the first time
# (directly here, or indirectly through dspy): LiteLLM picks the level up when
# its logging module is first loaded, so setting it afterwards is too late on a
# fresh kernel.
import asyncio
import os

os.environ["LITELLM_LOG"] = "ERROR"

from litellm.litellm_core_utils import logging_worker
from roma_dspy.utils import log_async_execution
import dspy

# `provide_traceback` is an option on `dspy.settings`; assigning it on the
# `dspy` module itself would only create an unused module attribute.
dspy.settings.provide_traceback = True

dspy.disable_litellm_logging()
def _run_logging_inline(async_coroutine):
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        asyncio.run(async_coroutine)
    else:
        loop.create_task(async_coroutine)

# mlflow.set_tracking_uri("http://localhost:5000")
# mlflow.set_experiment("dspy_experiment_test")
# mlflow.dspy.autolog(
#     # Log the optimization progress
#     log_compiles=True,
#     # Log the evaluation results
#     log_evals=True,
#     # Log traces from module executions
#     log_traces=True
# )

# log_async_execution(verbose=True)  # DEBUG level
# import logging
# for name in ["openai", "openai._base_client", "httpx", "httpcore"]:
#     logging.getLogger(name).setLevel(logging.WARNING)

# Bypass LiteLLM's global logging worker: `start` becomes a no-op and both
# enqueue entry points are routed straight to `_run_logging_inline`.
_worker = logging_worker.GLOBAL_LOGGING_WORKER
_worker.start = lambda: None
for _hook_name in ("enqueue", "ensure_initialized_and_enqueue"):
    setattr(_worker, _hook_name, _run_logging_inline)


### Configure the DSPy modules and the recursive solver

In [2]:
import dspy 
from prompt_optimization.seed_prompts import ATOMIZER_PROMPT, PLANNER_PROMPT, AGGREGATOR_PROMPT, ATOMIZER_DEMOS, PLANNER_DEMOS
from roma_dspy import RecursiveSolverModule, RecursiveSolverFactory, Executor, Atomizer, Planner, Aggregator

# Generation settings shared by every role; LM caching is switched off.
LM_OPTIONS = {"temperature": 0.5, "max_tokens": 128000, "cache": False}
GPT_OSS_MODEL = "cerebras/gpt-oss-120b"
QWEN_MODEL = "cerebras/qwen-3-235b-a22b-instruct-2507"

# One LM per role: executor/aggregator share one model, atomizer/planner the other.
executor_lm = dspy.LM(GPT_OSS_MODEL, **LM_OPTIONS)
atomizer_lm = dspy.LM(QWEN_MODEL, **LM_OPTIONS)
planner_lm = dspy.LM(QWEN_MODEL, **LM_OPTIONS)
aggregator_lm = dspy.LM(GPT_OSS_MODEL, **LM_OPTIONS)

# Instantiate the four solver components.
atomizer = Atomizer(lm=atomizer_lm)
planner = Planner(lm=planner_lm)
executor = Executor(lm=executor_lm)
aggregator = Aggregator(lm=aggregator_lm)

# Attach the seed prompt to each component and, where seed demos exist,
# append them to the component's predictor. The executor is left untouched.
# NOTE(review): the `optimized_program` repr shown later in this notebook still
# lists 'Signature for task atomization.' as the atomizer predictor's
# instructions. Confirm that assigning `signature.instructions` actually
# reaches `_predictor.predict` (that run also resumed from a saved GEPA state,
# so the repr may simply predate these prompts).
for _module, _prompt, _demos in (
    (atomizer, ATOMIZER_PROMPT, ATOMIZER_DEMOS),
    (planner, PLANNER_PROMPT, PLANNER_DEMOS),
    (aggregator, AGGREGATOR_PROMPT, None),
):
    _module.signature.instructions = _prompt
    if _demos is not None:
        _module._predictor.predict.demos.extend(_demos)

# Assemble the solver factory from the four components, then wrap it in the
# module object that is passed to the optimizer.
solver = RecursiveSolverFactory(
    atomizer, planner, executor, aggregator,
    max_depth=1,
    enable_logging=False,
)
dspy_module = RecursiveSolverModule(solver_factory=solver, visualize=True)

In [3]:
from prompt_optimization.datasets import load_aimo_datasets

# Requested sizes for the train / validation / test splits; the seed is fixed.
SPLIT_SIZES = {"train_size": 32, "val_size": 8, "test_size": 8}
train_set, val_set, test_set = load_aimo_datasets(**SPLIT_SIZES, seed=42)

In [4]:
from prompt_optimization import ComponentJudge, MetricWithFeedback

# Build the judge and wrap it in MetricWithFeedback; the resulting `metric`
# is what gets passed to GEPA as its `metric` argument.
judge = ComponentJudge()
metric = MetricWithFeedback(judge)

In [5]:
from dspy import GEPA

# LM handed to GEPA as its `reflection_lm`.
reflection_lm = dspy.LM(
    model="openrouter/anthropic/claude-sonnet-4.5",
    temperature=0.75,
    max_tokens=128000,
)

# The budget is set explicitly through `max_metric_calls`; the `auto="light"`
# preset is intentionally left unused.
# NOTE(review): the captured run output contains "Loading gepa state from run
# dir" and starts at iteration 9, so reusing this fixed `log_dir` appears to
# resume an earlier run instead of starting fresh. Confirm that is intended.
gepa_options = {
    "component_selector": "round_robin",
    "max_metric_calls": 225,
    "add_format_failure_as_feedback": True,
    "num_threads": 12,
    "track_stats": True,
    "log_dir": "logs/aime_test",
    "use_wandb": True,
    "wandb_init_kwargs": {"project": "aime_test"},
    "reflection_minibatch_size": 8,
    "reflection_lm": reflection_lm,
}
optimizer = GEPA(metric=metric, **gepa_options)

In [None]:
# Run the optimizer on the solver module, passing the train and validation splits.
optimized_program = optimizer.compile(dspy_module, trainset=train_set, valset=val_set)

2025/10/09 11:18:01 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 225 metric calls of the program. This amounts to 5.62 full evals on the train+val set.
2025/10/09 11:18:01 INFO dspy.teleprompt.gepa.gepa: Using 8 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget. GEPA requires you to provide the smallest valset that is just large enough to match your downstream task distribution, while providing as large trainset as possible.
[34m[1mwandb[0m: Currently logged in as: [33msalzubi[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [dspy, litellm, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Weave is installed but not imported. Add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
GEPA Optimization:   0%|          | 0/225 [00:00<?, ?rollouts/s]2025/10/09 11:18:02 INFO dspy.teleprompt.gepa.gepa: Loading gepa state from run dir
2025/10/09 11:18:02 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Base program full valset score: 0.25
2025/10/09 11:18:02 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Selected program 4 score: 0.875


Average Metric: 8.00 / 8 (100.0%): 100%|██████████| 8/8 [00:20<00:00,  2.55s/it]

2025/10/09 11:18:23 INFO dspy.evaluate.evaluate: Average Metric: 8 / 8 (100.0%)
2025/10/09 11:18:23 INFO dspy.teleprompt.gepa.gepa: Iteration 10: All subsample scores perfect. Skipping.
2025/10/09 11:18:23 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Reflective mutation did not propose a new candidate
GEPA Optimization:  92%|█████████▏| 208/225 [00:20<00:01, 10.20rollouts/s]2025/10/09 11:18:23 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Selected program 4 score: 0.875



Average Metric: 4.00 / 8 (50.0%): 100%|██████████| 8/8 [01:20<00:00, 10.00s/it] 

2025/10/09 11:19:43 INFO dspy.evaluate.evaluate: Average Metric: 4 / 8 (50.0%)



Feedback: None
Score: 0
Feedback: None
Score: 0
Feedback: None
Score: 1
Feedback: None
Score: 0
Feedback: None
Score: 1
Feedback: None
Score: 1


2025/10/09 11:23:05 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Proposed new text for planner._predictor.predict: You are a mathematical problem decomposition assistant. Your task is to break down complex competition mathematics problems into structured, executable subtasks.

**Input Format:**
- You will receive a mathematical problem statement in the `goal` field
- The problem will typically be from mathematical competitions (AMC, AIME, IMO, etc.)

**Your Task:**
Generate a complete problem decomposition consisting of:

1. **Reasoning Section**: 
   - Analyze the problem structure and mathematical domain
   - Identify key insights, theorems, or techniques needed
   - Outline the overall solution strategy
   - Explain how subtasks will fit together
   - Note any important mathematical properties or constraints
   - Work through key derivations and intermediate steps in detail
   - For problems involving sequences, recurrences, or periodic behavior, explicitly compute initial terms to

0,1
base_program_full_valset_score,▁
best_program_as_per_agg_score,▁
best_program_as_per_agg_score_valset,▁
best_score_on_train_val,▁
best_score_on_valset,▁
best_valset_agg_score,▁
iteration,▁▅█
linear_pareto_front_program_idx,▁
new_program_idx,▁
new_subsample_score,▁

0,1
base_program_full_valset_score,0.25
best_program_as_per_agg_score,1
best_program_as_per_agg_score_valset,1
best_score_on_train_val,0.875
best_score_on_valset,0.875
best_valset_agg_score,0.875
iteration,11
linear_pareto_front_program_idx,1
new_instruction_planner._predictor.predict,You are a mathematic...
new_program_idx,7




In [7]:
# Display the compiled program; its repr lists each predictor with its signature.
optimized_program

atomizer._predictor.predict = Predict(StringSignature(goal -> reasoning, is_atomic, node_type
    instructions='Signature for task atomization.'
    goal = Field(annotation=str required=True description='Task to atomize' json_schema_extra={'__dspy_field_type': 'input', 'desc': 'Task to atomize', 'prefix': 'Goal:'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    is_atomic = Field(annotation=bool required=True description='True if task can be executed directly' json_schema_extra={'__dspy_field_type': 'output', 'desc': 'True if task can be executed directly', 'prefix': 'Is Atomic:'})
    node_type = Field(annotation=NodeType required=True description='Type of node to process (PLAN or EXECUTE)' json_schema_extra={'__dspy_field_type': 'output', 'desc': 'Type of node to process (PLAN or EXECUTE)', 'prefix': 'Node Type:'})
))
planner._predictor.predic