## Load & Preprocess Dataset

In [None]:
import json
import random
import matplotlib.pyplot as plt

from typing import Dict, List, Any, Union

from dotenv import load_dotenv
from openai import AsyncOpenAI
from datasets import load_dataset

# Load variables from a local .env file before anything else in this cell runs.
load_dotenv()

# Fetch BoolQ and materialize both splits as plain lists of row dicts.
# The "validation" split is what this notebook uses as its test set.
ds = load_dataset("google/boolq")

trainset_ds = ds["train"]
trainset_raw = trainset_ds.to_list()

testset_ds = ds["validation"]
testset_raw = testset_ds.to_list()

# Fixed seed so the two subsamples below are reproducible. The train sample
# is drawn first; swapping the two draws would change both subsets.
random.seed(1)
trainset_random = random.sample(trainset_raw, 200)
testset_random = random.sample(testset_raw, 100)

from ape.common.types import DatasetItem


def _to_dataset_item(row):
    """Wrap one raw BoolQ row: question/passage as inputs, answer as output."""
    return DatasetItem(
        inputs={"question": row["question"], "passage": row["passage"]},
        outputs={"answer": row["answer"]},
    )


trainset = [_to_dataset_item(row) for row in trainset_random]
testset = [_to_dataset_item(row) for row in testset_random]

## Prepare Prompt to Optimize

In [2]:
from ape.common import Prompt

# Prompt template for the student model. `{question}` and `{passage}` are the
# placeholders filled per dataset item; the braces of the JSON example are
# doubled, presumably so that formatting the prompt leaves them as literal
# braces (confirm against `Prompt.format`).
prompt_with_passage = """\
For given question and passage, return True or False.

respond in JSON format:
{{
    "thought": "<thought>",
    "answer": "<True or False>"
}}

question: {question}
passage: {passage}
"""

# JSON schema of the expected reply: a free-text reasoning string plus a
# boolean answer, with no extra keys allowed.
_prediction_properties = {
    "thought": {
        "type": "string",
        "description": "The reasoning process of the prediction",
    },
    "answer": {
        "type": "boolean",
        "description": "The prediction of the given question",
    },
}

_prediction_schema = {
    "type": "object",
    "properties": _prediction_properties,
    "required": ["thought", "answer"],
    "additionalProperties": False,
}

# Structured-output specification passed along with the prompt.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "Prediction",
        "strict": True,
        "schema": _prediction_schema,
    },
}

# The prompt to optimize: a single system message holding the template.
_system_message = {"role": "system", "content": prompt_with_passage}

student_prompt = Prompt(
    messages=[_system_message],
    model="gpt-4o-mini",
    temperature=0.0,
    name="Boolean Question Answer Bot",
    response_format=response_format,
)


## Prepare Generator, Metric, and Global Metric

In [3]:
from ape.common.generator import BaseGenerator
from ape.common.metric import BaseMetric
from ape.common.global_metric import BaseGlobalMetric
from ape.common.types import MetricResult, GlobalMetricResult

# define generator, metric, global metric
openai = AsyncOpenAI()

# Token-usage records, one dict per successful completion call.
# `Classifier.generate` appends to this list, so it must exist before the
# first call. Without it the append raises NameError inside the generator's
# `try`, the broad `except` swallows it, and every item comes back as the
# "error" prediction. Re-running this cell resets the records.
simulation_cost: List[Dict[str, Any]] = []


class Classifier(BaseGenerator):
    """Generator that answers one item with a single chat-completion call."""

    async def generate(
        self,
        prompt: Prompt,
        inputs: Dict[str, Any],
    ) -> Union[Dict[str, Any], str]:
        """Format `prompt` with `inputs`, query the model and parse its JSON reply.

        Args:
            prompt: Supplies the messages (after formatting), the model name
                and the response format of the request.
            inputs: Values substituted into the prompt via `prompt.format`.

        Returns:
            The decoded JSON reply. If formatting, the request or the parsing
            raises, the exception is printed and the fallback
            `{"thought": "error", "answer": None}` is returned instead, so a
            single failing item does not abort the whole run.
        """
        try:
            messages = prompt.format(**inputs).messages
            response = await openai.chat.completions.create(
                model=prompt.model,
                messages=messages,
                response_format=prompt.response_format,
                temperature=0.0,
            )
            # Usage is bookkeeping only; a missing usage payload must not
            # discard an otherwise valid prediction.
            if response.usage is not None:
                simulation_cost.append(response.usage.model_dump())
            return json.loads(response.choices[0].message.content)
        except Exception as e:
            # Deliberate best-effort: report the failure and fall back.
            print(e)
            return {
                "thought": "error",
                "answer": None,
            }
    
class BoolQMetric(BaseMetric):
    """Exact-match metric: 1.0 when the predicted answer equals the gold one."""

    async def compute(
        self,
        dataset_item: DatasetItem,
        pred: Dict[str, Any],
    ) -> MetricResult:
        """Score a single prediction against its dataset item.

        Args:
            dataset_item: Item whose `["outputs"]["answer"]` is the gold label.
            pred: Prediction whose `"answer"` entry is compared to the label.

        Returns:
            A result with score 1.0 on equality and 0.0 otherwise; any
            exception raised while reading or comparing also yields 0.0.
        """
        try:
            is_match = pred["answer"] == dataset_item["outputs"]["answer"]
            return MetricResult(score=1.0 if is_match else 0.0)
        except Exception:
            # A malformed prediction or item counts as a miss.
            return MetricResult(score=0.0)

class GlobalBoolQMetric(BaseGlobalMetric):
    """Aggregate metric: the arithmetic mean of the per-item scores."""

    async def compute(
        self,
        results: List[MetricResult],
    ) -> GlobalMetricResult:
        """Average the `score` of every entry in `results`.

        Args:
            results: Per-item metric results to aggregate.

        Returns:
            A result holding the mean score, or 0.0 when `results` is empty
            or when any exception is raised during aggregation.
        """
        try:
            item_scores = [item.score for item in results]
            if len(results) > 0:
                mean_score = sum(item_scores) / len(item_scores)
            else:
                mean_score = 0.0
            return GlobalMetricResult(score=mean_score)
        except Exception:
            # Any failure while aggregating is reported as a zero score.
            return GlobalMetricResult(score=0.0)
        

## Select Trainer & Run

In [None]:
from ape.core.trainer import (
    TextGradientTrainer,
    ExpelTrainer,
    FewShotTrainer,
    EvoPromptTrainer,
    DspyMiproTrainer,
    OptunaTrainer,
)

# define trainer 
trainer = FewShotTrainer(
    generator=Classifier(),
    metric=BoolQMetric(),
    global_metric=GlobalBoolQMetric(),
    testmode=True # If True, trainer will run prompts for validation set and save results.
)

# run trainer
optimized_prompt, report = await trainer.train(
    prompt=student_prompt,
    trainset=trainset,
    valset=testset,
)


## Print Optimized Prompt

In [None]:
# Show each message of the optimized prompt on its own line.
for optimized_message in optimized_prompt.messages:
    print(optimized_message)

## Print Benchmark Test Results

In [None]:
# visualize experiment results
def visualize_scores(report):
    """Plot training and validation scores per iteration and show the figure.

    Reads `report.scores`, taking the "score" and "val_score" entry of each
    element, draws both series as line plots and labels every point with its
    value rounded to two decimals.
    """
    history = report.scores
    train_curve = [entry["score"] for entry in history]
    val_curve = [entry["val_score"] for entry in history]
    steps = range(1, len(train_curve) + 1)  # iterations are numbered from 1

    plt.figure(figsize=(10, 6))
    plt.plot(steps, train_curve, label='Training Set', marker='o')
    plt.plot(steps, val_curve, label='Validation Set', marker='s')

    # Shared placement for the value labels drawn just above each marker.
    label_style = dict(ha='center', va='bottom', fontsize=8)
    for step, train_value, val_value in zip(steps, train_curve, val_curve):
        plt.text(step, train_value, f'{train_value:.2f}', color='blue', **label_style)
        plt.text(step, val_value, f'{val_value:.2f}', color='green', **label_style)

    plt.title('Training and Validation Scores over Iterations')
    plt.xlabel('Iteration')
    plt.ylabel('Score')
    plt.legend()
    plt.show()


visualize_scores(report)