# Synth GEPA Demo - Banking77

Prompt optimization using Synth's GEPA algorithm on the Banking77 intent classification task.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/synth-laboratories/synth-ai/blob/main/demos/gepa_banking77/gepa_banking77_prompt_optimization.ipynb)

**Structure:**
1. **Setup** - Install dependencies and configure the API key
2. **Task Definition** - Banking77 classification task
3. **Local API** - Expose the task for optimization
4. **Optimize** - Run GEPA to discover better prompts
5. **Evaluate** - Compare the baseline and optimized prompts on held-out test data

In [None]:
import sys

if "google.colab" in sys.modules:
    # Colab-only bootstrap; nothing is installed when running elsewhere.
    import subprocess
    import os

    print("Installing dependencies...")
    # synth-ai is installed without its dependency tree, so every package the
    # notebook needs must be listed explicitly in the second install.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "--no-deps", "synth-ai"])
    subprocess.check_call([
        sys.executable, "-m", "pip", "install", "-q",
        "nest_asyncio", "openai", "httpx", "pydantic", "rich", "tqdm",
        "aiohttp", "pynacl", "fastapi", "uvicorn", "python-dotenv",
        # The setup cell imports `datasets`; with --no-deps above nothing else
        # would pull it in.
        "datasets",
    ])

    cloudflared_path = "/usr/local/bin/cloudflared"
    if not os.path.exists(cloudflared_path):
        # Download under a temporary name and move it into place only after
        # the transfer and chmod succeed. Writing straight to the final path
        # would leave a stub file behind on a failed download, and the
        # existence check above would then skip the download on every rerun.
        download_path = cloudflared_path + ".download"
        subprocess.check_call([
            "wget", "-q",
            "https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64",
            "-O", download_path,
        ])
        os.chmod(download_path, 0o755)
        os.replace(download_path, cloudflared_path)

    print("Done!")

## Step 1: Setup

In [None]:
import json
import os

import nest_asyncio
from datasets import load_dataset
from synth_ai.core.env import PROD_BASE_URL, mint_demo_api_key

# Patch asyncio first so later cells can use `await` / blocking SDK calls from
# inside the notebook's event loop (presumed purpose of nest_asyncio here).
nest_asyncio.apply()

# Backend base URL used by the rollout client and the eval jobs.
SYNTH_API_BASE = PROD_BASE_URL

# Mint a demo key and publish it through the environment as well as a global.
API_KEY = mint_demo_api_key()
os.environ["SYNTH_API_KEY"] = API_KEY

# Show only the first 20 characters of the key.
print(f"API Key: {API_KEY[:20]}...")

## Step 2: Task Definition

Banking77 is an intent classification task with 77 possible intents.

In [None]:
# The 77 intent labels rendered into every user prompt as the menu of choices.
# NOTE(review): `Refund_not_showing_up` (capital R) and
# `reverted_card_payment?` (trailing `?`) are irregular; they presumably mirror
# the upstream dataset's own label names, which rollouts compare predictions
# against — confirm before normalising them.
BANKING77_LABELS = [
    "activate_my_card",
    "age_limit",
    "apple_pay_or_google_pay",
    "atm_support",
    "automatic_top_up",
    "balance_not_updated_after_bank_transfer",
    "balance_not_updated_after_cheque_or_cash_deposit",
    "beneficiary_not_allowed",
    "cancel_transfer",
    "card_about_to_expire",
    "card_acceptance",
    "card_arrival",
    "card_delivery_estimate",
    "card_linking",
    "card_not_working",
    "card_payment_fee_charged",
    "card_payment_not_recognised",
    "card_payment_wrong_exchange_rate",
    "card_swallowed",
    "cash_withdrawal_charge",
    "cash_withdrawal_not_recognised",
    "change_pin",
    "compromised_card",
    "contactless_not_working",
    "country_support",
    "declined_card_payment",
    "declined_cash_withdrawal",
    "declined_transfer",
    "direct_debit_payment_not_recognised",
    "disposable_card_limits",
    "edit_personal_details",
    "exchange_charge",
    "exchange_rate",
    "exchange_via_app",
    "extra_charge_on_statement",
    "failed_transfer",
    "fiat_currency_support",
    "get_disposable_virtual_card",
    "get_physical_card",
    "getting_spare_card",
    "getting_virtual_card",
    "lost_or_stolen_card",
    "lost_or_stolen_phone",
    "order_physical_card",
    "passcode_forgotten",
    "pending_card_payment",
    "pending_cash_withdrawal",
    "pending_top_up",
    "pending_transfer",
    "pin_blocked",
    "receiving_money",
    "Refund_not_showing_up",
    "request_refund",
    "reverted_card_payment?",
    "supported_cards_and_currencies",
    "terminate_account",
    "top_up_by_bank_transfer_charge",
    "top_up_by_card_charge",
    "top_up_by_cash_or_cheque",
    "top_up_failed",
    "top_up_limits",
    "top_up_reverted",
    "topping_up_by_card",
    "transaction_charged_twice",
    "transfer_fee_charged",
    "transfer_into_account",
    "transfer_not_received_by_recipient",
    "transfer_timing",
    "unable_to_verify_identity",
    "verify_my_identity",
    "verify_source_of_funds",
    "verify_top_up",
    "virtual_card_not_working",
    "visa_or_mastercard",
    "why_verify_identity",
    "wrong_amount_of_cash_received",
    "wrong_exchange_rate_for_cash_withdrawal",
]

# Name of the single tool the model is forced to call during a rollout.
TOOL_NAME = "banking77_classify"
# Function-tool definition passed in the chat-completions `tools` list.
# It takes one required string argument, `intent`; the schema does not
# restrict that string to the known labels.
TOOL_SCHEMA = {
    "type": "function",
    "function": {
        "name": TOOL_NAME,
        "description": "Return the predicted banking77 intent label.",
        "parameters": {
            "type": "object",
            "properties": {"intent": {"type": "string"}},
            "required": ["intent"],
        },
    },
}


def format_available_intents(labels: list) -> str:
    """Render ``labels`` as a 1-based numbered list, one label per line."""
    numbered = [f"{position}. {label}" for position, label in enumerate(labels, start=1)]
    return "\n".join(numbered)


# Load the test split up front and report its size as a quick sanity check.
dataset = load_dataset("banking77", split="test", trust_remote_code=False)
# Label names as declared by the dataset's `label` feature.
label_names = dataset.features["label"].names
print(f"Loaded {len(dataset)} samples, {len(label_names)} intents")

## Step 3: Local API

Expose the task via HTTP so Synth can run optimization against it.

In [None]:
from synth_ai.sdk.localapi import LocalAPIConfig, create_local_api
from synth_ai.sdk.task.contracts import RolloutMetrics, RolloutRequest, RolloutResponse, TaskInfo
from synth_ai.sdk.tunnels import TunnelBackend, TunneledLocalAPI

# Starting system prompt: served by the baseline task app and used as the
# system-message pattern GEPA starts optimizing from.
BASELINE_SYSTEM_PROMPT = """You are an expert banking assistant that classifies customer queries into banking intents. Given a customer message, respond with exactly one intent label from the provided list using the `banking77_classify` tool."""

# User-message template; `{query}` and `{available_intents}` are placeholders
# filled in per sample.
USER_PROMPT = "Customer Query: {query}\n\nAvailable Intents:\n{available_intents}\n\nClassify this query into one of the above banking intents using the tool call."


class Banking77Dataset:
    """Lazily loaded, per-split cache over the ``banking77`` dataset."""

    def __init__(self):
        # split name -> loaded dataset object
        self._cache = {}
        # Filled from the first split that gets loaded.
        self._label_names = None

    def _load_split(self, split: str):
        """Return the dataset for ``split``, loading and caching it on first use."""
        try:
            return self._cache[split]
        except KeyError:
            pass

        loaded = load_dataset("banking77", split=split, trust_remote_code=False)
        self._cache[split] = loaded
        if self._label_names is None:
            self._label_names = loaded.features["label"].names
        return loaded

    def size(self, split: str) -> int:
        """Return the number of rows in ``split``."""
        rows = self._load_split(split)
        return len(rows)

    def sample(self, *, split: str, index: int) -> dict:
        """Return one example from ``split`` as a plain dict.

        ``index`` wraps around the split length, so any integer seed maps to a
        valid row. The result carries the wrapped index, the split name, the
        query text and the label translated to its string name.
        """
        rows = self._load_split(split)
        position = index % len(rows)
        record = rows[position]
        return {
            "index": position,
            "split": split,
            "text": record["text"],
            "label": self._label_names[record["label"]],
        }


def create_banking77_local_api(system_prompt: str):
    """Build a local task API that scores ``system_prompt`` on Banking77.

    Args:
        system_prompt: System message sent with every rollout this app serves.

    Returns:
        The object produced by ``create_local_api`` for the assembled
        ``LocalAPIConfig``.
    """
    ds = Banking77Dataset()
    # Load both splits eagerly so the first rollout does not pay for it.
    ds._load_split("train")
    ds._load_split("test")

    # The intent menu is the same for every rollout; render it once.
    available_intents = format_available_intents(BANKING77_LABELS)

    def normalize(label: str) -> str:
        """Lower-case a label and treat underscores as spaces for comparison."""
        return label.lower().replace("_", " ")

    def parse_predicted_intent(response):
        """Return the ``intent`` string from the first tool call, or ``None``.

        ``None`` covers every malformed reply: no choices, no tool call,
        arguments that are not valid JSON, a missing ``intent`` key, or a
        non-string value.
        """
        try:
            tool_call = response.choices[0].message.tool_calls[0]
            intent = json.loads(tool_call.function.arguments)["intent"]
        except (IndexError, KeyError, TypeError, AttributeError, json.JSONDecodeError):
            return None
        return intent if isinstance(intent, str) else None

    async def run_rollout(request: RolloutRequest, fastapi_request) -> RolloutResponse:
        """Classify one sample and return a 0/1 reward.

        The split comes from ``request.env.config`` (default ``"train"``) and
        the row from ``request.env.seed``. ``fastapi_request`` is accepted but
        not used.
        """
        sample = ds.sample(
            split=request.env.config.get("split", "train"),
            index=request.env.seed,
        )

        messages = [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                # Reuse the shared template so this text cannot drift from the
                # user pattern handed to the optimizer.
                "content": USER_PROMPT.format(
                    query=sample["text"],
                    available_intents=available_intents,
                ),
            },
        ]

        from openai import AsyncOpenAI

        client = AsyncOpenAI(
            base_url=f"{SYNTH_API_BASE}/api/openai/v1",
            api_key=API_KEY,
        )
        response = await client.chat.completions.create(
            model=request.policy.config.get("model", "gpt-4.1-nano"),
            messages=messages,
            tools=[TOOL_SCHEMA],
            tool_choice={"type": "function", "function": {"name": TOOL_NAME}},
        )

        # A reply that cannot be parsed is a wrong answer, not a crashed
        # rollout: it scores 0.0 like any other miss.
        predicted = parse_predicted_intent(response)
        expected = sample["label"]
        matched = predicted is not None and normalize(predicted) == normalize(expected)
        reward = 1.0 if matched else 0.0

        return RolloutResponse(
            run_id=request.run_id,
            metrics=RolloutMetrics(outcome_reward=reward),
            trace=None,
        )

    def provide_taskset_description():
        """Describe the available splits and their sizes."""
        return {
            "splits": ["train", "test"],
            "sizes": {"train": ds.size("train"), "test": ds.size("test")},
        }

    def provide_task_instances(seeds):
        """Yield one ``TaskInfo`` per seed, always drawn from the train split."""
        for seed in seeds:
            sample = ds.sample(split="train", index=seed)
            yield TaskInfo(
                task={"id": "banking77", "name": "Banking77"},
                dataset={"id": "banking77", "split": sample["split"], "index": sample["index"]},
                inference={"tool": TOOL_NAME},
                limits={"max_turns": 1},
                task_metadata={"query": sample["text"], "expected_intent": sample["label"]},
            )

    return create_local_api(
        LocalAPIConfig(
            app_id="banking77",
            name="Banking77",
            description="Banking77 intent classification",
            provide_taskset_description=provide_taskset_description,
            provide_task_instances=provide_task_instances,
            rollout=run_rollout,
            cors_origins=["*"],
        )
    )


# Build the task app around the baseline prompt, then expose it through a
# tunnel so its URL can be handed to the optimizer and the eval jobs.
print("Starting local API...")
baseline_app = create_banking77_local_api(BASELINE_SYSTEM_PROMPT)

# Top-level `await`: this cell must run in a notebook / async REPL.
baseline_tunnel = await TunneledLocalAPI.create_for_app(
    app=baseline_app,
    local_port=None,  # presumably lets the SDK pick a free port — confirm
    backend=TunnelBackend.CloudflareManagedTunnel,
    progress=True,
)
# URL of the tunneled baseline app; reused as the GEPA task app URL and for
# the baseline evaluation.
BASELINE_LOCAL_API_URL = baseline_tunnel.url
print(f"Local API URL: {BASELINE_LOCAL_API_URL}")

## Step 4: Run GEPA

GEPA evolves prompts over multiple generations, selecting the best performers.

In [None]:
from synth_ai.sdk.api.train.prompt_learning import PromptLearningJob

# GEPA job configuration. The optimizer calls back into the tunneled baseline
# task app for rollouts.
config = {
    "prompt_learning": {
        "algorithm": "gepa",
        "task_app_url": BASELINE_LOCAL_API_URL,
        "env_name": "banking77",
        # Starting prompt: the baseline system message plus the user template.
        "initial_prompt": {
            "messages": [
                {"role": "system", "order": 0, "pattern": BASELINE_SYSTEM_PROMPT},
                {"role": "user", "order": 1, "pattern": USER_PROMPT},
            ],
            # One entry per placeholder in the user template.
            "wildcards": {"query": "REQUIRED", "available_intents": "OPTIONAL"},
        },
        # Model whose prompt is being optimized.
        "policy": {
            "model": "gpt-4.1-nano",
            "provider": "openai",
            "inference_mode": "synth_hosted",
            "temperature": 0.0,
            "max_completion_tokens": 256,
        },
        "gepa": {
            "env_name": "banking77",
            # Seeds 0-49 for evaluation, 50-59 for validation. The rollout maps
            # a seed to a dataset row and defaults to the train split when the
            # request's env config names no split (presumably the case here —
            # confirm).
            "evaluation": {
                "seeds": list(range(50)),
                "validation_seeds": list(range(50, 60)),
            },
            "rollout": {"budget": 80, "max_concurrent": 8, "minibatch_size": 8},
            "proposer_effort": "MEDIUM",
            "proposer_output_tokens": "FAST",
            "mutation": {"rate": 0.3},
            "population": {"initial_size": 4, "num_generations": 3, "children_per_generation": 3},
            "archive": {"size": 5, "pareto_set_size": 10},
        },
    },
}

# Submit the job, then block until it finishes (up to one hour, polling every
# 3 seconds).
job = PromptLearningJob.from_dict(config)
job_id = job.submit()
print(f"Job ID: {job_id}")

result = job.poll_until_complete(timeout=3600.0, interval=3.0, progress=True)
print(f"\nStatus: {result.status.value}")
if result.succeeded:
    print(f"Best Score: {result.best_score:.1%}")

## Step 5: Evaluate

Compare baseline vs optimized prompts on held-out test samples.

In [None]:
from synth_ai.sdk.api.eval import EvalJob, EvalJobConfig
from synth_ai.sdk.learning.prompt_learning_client import PromptLearningClient

if result.succeeded:
    pl_client = PromptLearningClient()
    prompt_results = await pl_client.get_prompts(result.job_id)
    optimized_prompt = next(
        s["content"]
        for s in prompt_results.top_prompts[0]["template"]["sections"]
        if s["role"] == "system"
    )

    print(f"Best training score: {prompt_results.best_score:.1%}")
    print(f"\nOptimized prompt:\n{optimized_prompt[:300]}...")

    optimized_app = create_banking77_local_api(optimized_prompt)
    optimized_tunnel = await TunneledLocalAPI.create_for_app(
        app=optimized_app,
        local_port=None,
        backend=TunnelBackend.CloudflareManagedTunnel,
        progress=True,
    )

    EVAL_SEEDS = list(range(100, 150))

    def run_eval(url: str, name: str):
        job = EvalJob(
            EvalJobConfig(
                local_api_url=url,
                backend_url=SYNTH_API_BASE,
                api_key=API_KEY,
                env_name="banking77",
                seeds=EVAL_SEEDS,
                policy_config={"model": "gpt-4.1-nano", "provider": "openai"},
                env_config={"split": "test"},
                concurrency=10,
            )
        )
        job.submit()
        return job.poll_until_complete(timeout=600.0, interval=2.0, progress=True)

    print("\nEvaluating baseline...")
    baseline_eval = run_eval(BASELINE_LOCAL_API_URL, "baseline")

    print("\nEvaluating optimized...")
    optimized_eval = run_eval(optimized_tunnel.url, "optimized")

    if baseline_eval.succeeded and optimized_eval.succeeded:
        lift = optimized_eval.mean_score - baseline_eval.mean_score
        print(f"\n{'=' * 50}")
        print(f"Baseline:  {baseline_eval.mean_score:.1%}")
        print(f"Optimized: {optimized_eval.mean_score:.1%}")
        print(f"Lift:      {lift:+.1%}")
else:
    print(f"Optimization failed: {result.error}")

In [None]:
from synth_ai.sdk.tunnels import cleanup_all

# Final teardown; presumably shuts down every tunnel opened above — confirm
# against the synth_ai.sdk.tunnels documentation.
cleanup_all()
print("Done!")