## Ape (gpt-4o-mini)

### Load dataset (GSM8K)

In [2]:
import dotenv
import json
from dspy.datasets.gsm8k import parse_integer_answer
from ape.types import DatasetItem

# Load environment variables from a local .env file (if any).
dotenv.load_dotenv()


def _load_jsonl(path):
    """Read a JSON Lines file and build one DatasetItem per record.

    Args:
        path: Path of the .jsonl file; each non-blank line must hold one JSON
            object whose keys are valid DatasetItem keyword arguments.

    Returns:
        A list of DatasetItem objects, in file order.

    Raises:
        FileNotFoundError: if `path` does not exist (no existence check is
            done beforehand; the error from `open` is propagated as is).
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 keeps decoding independent of the platform's default
    # locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a stray empty line at the end of the file) are
        # skipped instead of being handed to json.loads, which would raise.
        return [DatasetItem(**json.loads(line)) for line in f if line.strip()]


trainset = _load_jsonl('trainset.jsonl')
testset = _load_jsonl('testset.jsonl')

print(f"Loaded {len(trainset)} items from trainset.jsonl")
print(f"Loaded {len(testset)} items from testset.jsonl")

Loaded 500 items from trainset.jsonl
Loaded 1319 items from testset.jsonl


### Set up evaluation metric

In [3]:
from ape import BaseMetric
from typing import Any, Dict, Optional
from dataclasses import dataclass

@dataclass
class MetricResult:
    """Outcome of scoring a single prediction.

    Attributes:
        score: Numeric score assigned to the prediction.
        intermediate_values: Optional mapping of extra values recorded with
            the score; None when nothing extra is recorded.
    """

    # Required: the numeric score.
    score: float
    # Optional: extra values attached to the score (defaults to None).
    intermediate_values: Optional[Dict[str, Any]] = None

# Exact-match metric on the parsed integer answer
class GSM8KMetric(BaseMetric):
    """Scores a prediction 1.0 when its parsed answer equals the gold one, else 0.0."""

    def compute(self, inputs: dict, gold: dict, pred: dict, trace=None):
        """Compare the gold and predicted "answer" entries.

        Both answers are converted to `str`, passed through
        `parse_integer_answer`, cast to `int`, and compared for equality.

        Args:
            inputs: Accepted for interface compatibility; not used here.
            gold: Reference outputs; must contain an "answer" key.
            pred: Model outputs; expected to be a dict with an "answer" key.
            trace: Accepted for interface compatibility; not used here.

        Returns:
            MetricResult with score 1.0 on a match and 0.0 otherwise. A `pred`
            that is not a dict, or that has no "answer" key, scores 0.0.
        """
        # Guard: anything that is not a dict carrying an "answer" scores zero.
        if not (isinstance(pred, dict) and "answer" in pred):
            return MetricResult(score=0.0)

        # Gold is parsed before the prediction, as in a left-to-right comparison.
        expected = int(parse_integer_answer(str(gold["answer"])))
        predicted = int(parse_integer_answer(str(pred["answer"])))

        return MetricResult(score=float(expected == predicted))

### Set up MIPRO and load base prompt

In [4]:
from ape import MIPRO

# Optimizer configuration: one model proposes instructions, another runs them.
mipro = MIPRO(
    metric=GSM8KMetric(),  # metric used to score predictions
    task_model="gpt-4o-mini",  # model that runs the prompt
    prompt_model="gpt-4o",  # model that generates the instructions
    num_candidates=10,  # how many candidate instructions get generated
    minibatch_size=50,  # examples used per minibatch
    verbose=True,
)

In [5]:
from ape import Prompt

# Load the base prompt from disk; it is passed as `student` to the optimizer below.
gsm8k_base_prompt = Prompt.load_file("gsm8k-base.prompt")

### Start Optimizing

In [5]:
from ape.types import ResponseFormat
from ape.types.response_format import JsonSchema

optimized_prompt = await mipro.optimize(
    student=gsm8k_base_prompt,
    task_description="Solve math problems, come up with short factoid answers.",
    trainset=trainset,
    testset=testset,
    log_dir="./.gsm8k_logs",  # all logs will be saved here
    eval_kwargs={
        "max_errors": 3,
    },
    max_bootstrapped_demos=5,  # maximum number of fewshot examples to use
    max_labeled_demos=5,  # maximum number of labeled examples to use
    max_steps=20,  # maximum number of optimization steps
    goal_score=1.0,  # goal score to achieve, stop optimization if achieved
    response_format=ResponseFormat(type="json_object"),
    requires_permission_to_run=False,
)

[2m2024-08-29T17:49:41.987573Z[0m [[31m[1merror    [0m] [1mError reformatting prompt: ape.types.response_format.ResponseFormat() argument after ** must be a mapping, not str. Retrying...[0m [[0m[1m[34mape.optimizer.utils[0m][0m [36mfilename[0m=[35mlogging.py[0m [36mlineno[0m=[35m32[0m


Working on seed: -3
Working on seed: -2
Working on seed: -1
Working on seed: 0
Working on seed: 1
Working on seed: 2
Working on seed: 3
Working on seed: 4
Working on seed: 5
Working on seed: 6
fewshot candidates: [[], [DatasetItem(inputs={'question': 'Karen is a dog groomer. Rottweilers take 20 minutes to groom, border collies take 10 minutes to groom, and chihuahuas take 45 minutes to groom because they ferociously resist. How many minutes will it take Karen to groom 6 Rottweilers, 9 border collies and 1 chihuahua?'}, outputs={'gold_reasoning': 'First find the total time Karen spending grooming Rottweilers: 20 minutes/Rottweiler * 6 Rottweilers = <<20*6=120>>120 minutes Then find the total time she spends grooming border collies: 10 minutes/border collie * 9 border collies = <<10*9=90>>90 minutes Then add the time spent grooming all three kinds of dogs to find her total grooming time: 120 minutes + 90 minutes + 45 minutes = <<120+90+45=255>>255 minutes', 'answer': '255'}, metadata={'s

[I 2024-08-29 10:50:34,100] A new study created in memory with name: no-name-b4a19719-aa2f-4727-8cc3-2a52a33c0b72
[I 2024-08-29 10:50:50,908] Trial 0 finished with value: 0.88 and parameters: {'instruction': 1, 'fewshot': 2}. Best is trial 0 with value: 0.88.
[I 2024-08-29 10:50:56,681] Trial 1 finished with value: 0.92 and parameters: {'instruction': 6, 'fewshot': 2}. Best is trial 1 with value: 0.92.
[I 2024-08-29 10:51:02,873] Trial 2 finished with value: 0.84 and parameters: {'instruction': 8, 'fewshot': 6}. Best is trial 1 with value: 0.92.
[I 2024-08-29 10:51:08,197] Trial 3 finished with value: 0.94 and parameters: {'instruction': 4, 'fewshot': 5}. Best is trial 3 with value: 0.94.
[I 2024-08-29 10:51:19,265] Trial 4 finished with value: 0.82 and parameters: {'instruction': 3, 'fewshot': 8}. Best is trial 3 with value: 0.94.
[I 2024-08-29 10:51:24,290] Trial 5 finished with value: 0.96 and parameters: {'instruction': 2, 'fewshot': 3}. Best is trial 5 with value: 0.96.
[I 2024-08

### Evaluate Prompt

In [6]:
from ape.evaluate import Evaluate
import asyncio

# Evaluator over the full test set, scored with the GSM8K metric.
evaluate = Evaluate(testset, metric=GSM8KMetric())

async def run_evaluation(prompt=None, evaluator=None, timeout=300):
    """Evaluate a prompt under a time limit, print the outcome, and return the score.

    Args:
        prompt: Prompt to evaluate. Defaults to the notebook-level
            `optimized_prompt`.
        evaluator: Callable that takes the prompt and returns an awaitable
            yielding the score. Defaults to the notebook-level `evaluate`.
        timeout: Time limit in seconds for the evaluation (default 300).

    Returns:
        The score produced by the evaluator, or None when the evaluation
        timed out or raised. Failures are reported with a printed message
        instead of being propagated, so the notebook keeps running.
    """
    # Defaults are resolved at call time so the notebook globals are only
    # needed when the caller does not pass explicit values.
    if evaluator is None:
        evaluator = evaluate
    if prompt is None:
        prompt = optimized_prompt

    try:
        score = await asyncio.wait_for(evaluator(prompt), timeout=timeout)
    except asyncio.TimeoutError:
        # With the default timeout this prints "... after 5 minutes".
        print(f"Evaluation timed out after {timeout / 60:g} minutes")
        return None
    except Exception as e:
        # Include the exception type: str(e) alone can be uninformative
        # (a KeyError('answer') would print just 'answer').
        print(f"An error occurred during evaluation: {type(e).__name__}: {e}")
        return None

    print(score)
    # Return the score as well so callers can store and compare it.
    return score

# Run the evaluation; the coroutine prints the score, or a message on timeout/error.
await run_evaluation()

0.9317664897649734


In [7]:
# Show the optimized prompt (model, messages and metadata).
print(optimized_prompt)

model='gpt-4o-mini' messages=[{'role': 'system', 'content': 'You are a math tutor who explains problems step-by-step to help a student understand the solution. Your goal is to solve math problems clearly and thoroughly, showing all necessary steps. Make sure to format your output as a JSON object. Each step should be clearly explained, and calculations should be shown inline as needed. Remember to conclude with a concise and accurate final answer.'}, {'role': 'user', 'content': 'Task demonstrations:\n{_FEWSHOT_}\n\nNow perform the task for the following question:\n{question}'}] metadata={'inputs': {'question': 'The question to be answered.'}, 'outputs': {'answer': 'The answer to the question.'}, 'response_format': ResponseFormat(type='json_object', json_schema=None), 'fewshot': [DatasetItem(inputs={'question': 'John uses the bathroom every 50 minutes.  How many times does he use the bathroom during a 2.5-hour movie?'}, outputs={'gold_reasoning': 'The movie last 2.5*60=<<2.5*60=150>>150

## Compare with chain-of-thought (CoT) prompting

### 4o-mini with CoT

In [6]:
from ape.evaluate import Evaluate
import asyncio

# CoT baseline prompt, loaded from disk.
cot_prompt_4o_mini = Prompt.load_file("gsm8k-cot-4o-mini.prompt")

# Evaluator over the test set with the GSM8K metric (same construction as in the earlier evaluation cell).
evaluate = Evaluate(testset, metric=GSM8KMetric())

async def run_evaluation(semaphore, prompt=None, evaluator=None, timeout=300):
    """Evaluate a prompt while holding `semaphore`, print the outcome, and return the score.

    Args:
        semaphore: Async context manager (e.g. `asyncio.Semaphore`) held for
            the duration of the evaluation.
        prompt: Prompt to evaluate. Defaults to the notebook-level
            `cot_prompt_4o_mini`.
        evaluator: Callable that takes the prompt and returns an awaitable
            yielding the score. Defaults to the notebook-level `evaluate`.
        timeout: Time limit in seconds for the evaluation (default 300).

    Returns:
        The score produced by the evaluator, or None when the evaluation
        timed out or raised. Failures are reported with a printed message
        instead of being propagated, so the notebook keeps running.
    """
    # Defaults are resolved at call time so the notebook globals are only
    # needed when the caller does not pass explicit values.
    if evaluator is None:
        evaluator = evaluate
    if prompt is None:
        prompt = cot_prompt_4o_mini

    async with semaphore:
        try:
            score = await asyncio.wait_for(evaluator(prompt), timeout=timeout)
        except asyncio.TimeoutError:
            # With the default timeout this prints "... after 5 minutes".
            print(f"Evaluation timed out after {timeout / 60:g} minutes")
            return None
        except Exception as e:
            # Include the exception type: str(e) alone can be uninformative
            # (a KeyError('answer') would print just 'answer').
            print(f"An error occurred during evaluation: {type(e).__name__}: {e}")
            return None

        print(score)
        # Return the score as well so callers can store and compare it.
        return score

# Semaphore with 5 permits; only one evaluation is started below, so it never blocks here.
semaphore = asyncio.Semaphore(5)

await run_evaluation(semaphore)

0.9075056861258529
