## Ape (gpt-4o-mini)

### Load dataset (GSM8K)

In [1]:
import dotenv
import json
from dspy.datasets.gsm8k import parse_integer_answer
from ape.types import DatasetItem

# load environment variables
dotenv.load_dotenv()

# Check if jsonl files exist
# Load from jsonl files
with open('trainset.jsonl', 'r') as f:
    trainset = [DatasetItem(**json.loads(line)) for line in f]
with open('testset.jsonl', 'r') as f:
    testset = [DatasetItem(**json.loads(line)) for line in f]
    
print(f"Loaded {len(trainset)} items from trainset.jsonl")
print(f"Loaded {len(testset)} items from testset.jsonl")

  from .autonotebook import tqdm as notebook_tqdm


Loaded 500 items from trainset.jsonl
Loaded 1319 items from testset.jsonl


  warn(


### Setup evaluation metric

In [3]:
from ape import BaseMetric

# Set up the metric
class GSM8KMetric(BaseMetric):
    def compute(self, inputs: dict, gold: dict, pred: dict, trace=None):
        if not isinstance(pred, dict):
            return False
        if "answer" not in pred:
            return False
        return int(parse_integer_answer(str(gold["answer"]))) == int(
            parse_integer_answer(str(pred))
        )

### Setup MIPRO and load base prompt

In [9]:
from ape import MIPRO

mipro = MIPRO(
    prompt_model="gpt-4o", # model that will generate instruction
    task_model="gpt-4o-mini", # model that will run the prompt
    metric=GSM8KMetric(), # metric
    verbose=True,
    num_candidates=10, # number of candidate instructions that will be generated
    minibatch_size=50, # number of examples to use in each minibatch
    update_prompt_after_full_eval=False # set False to update prompt if evaluation on minibatch has highest score
)

In [10]:
from ape import Prompt

gsm8k_base_prompt = Prompt.load_file("gsm8k-base.prompt")

### Start Optimizing

In [11]:
from ape.types import ResponseFormat
from ape.types.response_format import JsonSchema

optimized_prompt = await mipro.optimize(
    student=gsm8k_base_prompt,
    task_description="Solve math problems, come up with short factoid answers.",
    trainset=trainset,
    testset=testset,
    log_dir="./.gsm8k_logs",  # all logs will be saved here
    eval_kwargs={
        "max_errors": 3,
    },
    max_bootstrapped_demos=5,  # maximum number of fewshot examples to use
    max_labeled_demos=5,  # maximum number of labeled examples to use
    max_steps=20,  # maximum number of optimization steps
    goal_score=1.0,  # goal score to achieve, stop optimization if achieved
    response_format=ResponseFormat(type="json_object"),
    requires_permission_to_run=False,
)

[2m2024-08-29T07:52:46.221021Z[0m [[31m[1merror    [0m] [1mError parsing outputs: Expecting ',' delimiter: line 6 column 626 (char 742)[0m [[0m[1m[34mape.optimizer.utils[0m][0m [36mfilename[0m=[35mlogging.py[0m [36mlineno[0m=[35m32[0m
[2m2024-08-29T07:52:46.221616Z[0m [[31m[1merror    [0m] [1m{
  "total_cans_collected": 12,
  "cans_grandparents_house": 36,
  "cans_from_neighbor": 46,
  "cans_from_dad": 250,
  "total_cans": 12         	  	  	 																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																

### Evaluate Prompt

In [7]:
from ape.evaluate import Evaluate
import asyncio

evaluate = Evaluate(testset, metric=GSM8KMetric())

async def run_evaluation():
    try:
        score = await asyncio.wait_for(evaluate(optimized_prompt), timeout=300)  # 5 minutes timeout
        print(score)
    except asyncio.TimeoutError:
        print("Evaluation timed out after 5 minutes")
    except Exception as e:
        print(f"An error occurred during evaluation: {str(e)}")

await run_evaluation()

0.934040940106141


In [8]:
print(optimized_prompt)

model='gpt-4o-mini' messages=[{'role': 'system', 'content': 'You are an expert math tutor proficient in solving word problems. Your task is to answer the given questions by breaking them down into detailed, step-by-step reasoning.\n\nYou must provide your output as a JSON object containing:\n1. gold_reasoning: a step-by-step explanation of how to solve the problem.\n2. answer: the final answer to the problem.\n\nEnsure that your reasoning is detailed, breaking down each part of the problem clearly and logically.'}, {'role': 'user', 'content': '{_FEWSHOT_}\n\nNow solve the following problem step by step:\n\nQuestion:\n{question}'}] metadata={'inputs': {'question': 'The question to be answered.'}, 'outputs': {'answer': 'The answer to the question.'}, 'response_format': ResponseFormat(type='json_object', json_schema=None), 'fewshot': [DatasetItem(inputs={'question': 'Megan pays $16 for a shirt that costs $22 before sales. What is the amount of the discount?'}, outputs={'gold_reasoning': '

## Compare with CoT 

### 4o-mini with CoT

In [None]:
from ape.evaluate import Evaluate
import asyncio

cot_prompt_4o_mini = Prompt.load_file("gsm8k-cot-4o-mini.prompt")

evaluate = Evaluate(testset, metric=GSM8KMetric())

async def run_evaluation():
    try:
        score = await asyncio.wait_for(evaluate(cot_prompt_4o_mini), timeout=300)  # 5 minutes timeout
        print(score)
    except asyncio.TimeoutError:
        print("Evaluation timed out after 5 minutes")
    except Exception as e:
        print(f"An error occurred during evaluation: {str(e)}")

await run_evaluation()