# Synth GEPA Demo - Banking77 (Production)

This notebook demonstrates prompt optimization using Synth's GEPA algorithm.

**Banking77** is a task where an AI classifies customer service requests into one of 77 banking intents.

**Run in Google Colab:** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/synth-laboratories/synth-ai/blob/main/demos/gepa_banking77/gepa_banking77_prompt_optimization.ipynb)

**Structure:**
1. **Setup** - Install dependencies and configure API keys
2. **Business Logic** - A simple Banking77 classification app
3. **Before/After** - Preview: ~78% baseline → ~92% optimized
4. **Local API** - Expose the classifier over HTTP through a Cloudflare tunnel
5. **Optimize** - Run GEPA to discover better prompts
6. **Evaluate** - Formal eval on held-out data

In [None]:
# Step 0: Install dependencies (run this first on Colab)
#
# Detects Colab by checking whether the `google.colab` module is already
# loaded. On Colab the cell installs the Python packages and downloads the
# cloudflared binary; elsewhere it only prints what must be installed by hand.
# The `!` lines are IPython shell escapes, so this cell only runs in a notebook.
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Google Colab - installing dependencies...")
    # numpy is pinned below 2 (NOTE(review): presumably for compatibility with
    # the preinstalled Colab stack - confirm before lifting the pin).
    !pip install -q "numpy<2" synth-ai httpx fastapi uvicorn datasets nest_asyncio
    
    # Install cloudflared
    # Order matters: download the binary, mark it executable, then run it once
    # as a smoke test.
    !wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O /usr/local/bin/cloudflared
    !chmod +x /usr/local/bin/cloudflared
    !cloudflared --version
    
    print("Dependencies installed!")
else:
    # Outside Colab nothing is installed; the user is told what is required.
    print("Not in Colab - assuming dependencies are already installed")
    print("Required: pip install synth-ai httpx fastapi uvicorn datasets nest_asyncio")
    print("Required: brew install cloudflare/cloudflare/cloudflared (macOS)")

## Step 1: Setup

Configure all imports, API keys, and environment keys in one place.

In [None]:
# Step 1: Setup - All imports, config, and API keys
import os, sys, json, asyncio
import httpx
import nest_asyncio
nest_asyncio.apply()

from datasets import load_dataset
from openai import AsyncOpenAI
from synth_ai.core.env import PROD_BASE_URL, mint_demo_api_key

# Backend endpoint used by the eval jobs later in the notebook.
SYNTH_API_BASE = PROD_BASE_URL

# Local ports for the baseline and optimized task apps.
# Either may be set to None to let the tunnel helper auto-select a free port.
LOCAL_API_PORT = 8001
OPTIMIZED_LOCAL_API_PORT = 8002

# Synth API key: prefer the environment, otherwise mint a demo key.
API_KEY = os.getenv('SYNTH_API_KEY', '')
if API_KEY:
    print(f'\nUsing SYNTH_API_KEY: {API_KEY[:20]}...')
else:
    print('\nNo SYNTH_API_KEY found, minting demo key...')
    API_KEY = mint_demo_api_key()
    print(f'Demo API Key: {API_KEY[:25]}...')

# Export the key (possibly freshly minted) so the SDK can read it.
os.environ['SYNTH_API_KEY'] = API_KEY

# OpenAI key is optional; without it the Step 3 preview is skipped.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')
if not OPENAI_API_KEY:
    print('\nNo OPENAI_API_KEY found - Step 3 (preview) will be skipped')
else:
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
    print(f'\nOpenAI API key found: {OPENAI_API_KEY[:20]}...')

# Note: the environment API key is handled by create_local_api() on app
# startup, so ensure_localapi_auth() does not need to be called here.

print(f"\n{'=' * 50}")
print('SETUP COMPLETE')
print('=' * 50)

## Step 2: Business Logic - A Simple Banking77 Classifier

Here's a simple prompt-based app that classifies customer queries into banking intents.

This is **normal business logic** - just an async function that calls OpenAI. No Synth-specific code here. You could use this exact code in any application.

In [None]:
# Step 2: Business Logic - Banking77 Classification Pipeline
#
# This is a simple prompt app for Banking77 intent classification.
# Run this cell to see the core business logic - no Synth dependencies.

# The 77 intent labels offered to the model, in the order they are numbered
# in the user message (see format_available_intents).
# NOTE(review): "Refund_not_showing_up" (capital R) and
# "reverted_card_payment?" (trailing "?") look irregular; presumably they
# mirror the dataset's own label names - confirm before normalizing them,
# because predictions are compared against the dataset labels.
BANKING77_LABELS = [
    "activate_my_card", "age_limit", "apple_pay_or_google_pay", "atm_support", "automatic_top_up",
    "balance_not_updated_after_bank_transfer", "balance_not_updated_after_cheque_or_cash_deposit",
    "beneficiary_not_allowed", "cancel_transfer", "card_about_to_expire", "card_acceptance",
    "card_arrival", "card_delivery_estimate", "card_linking", "card_not_working",
    "card_payment_fee_charged", "card_payment_not_recognised", "card_payment_wrong_exchange_rate",
    "card_swallowed", "cash_withdrawal_charge", "cash_withdrawal_not_recognised", "change_pin",
    "compromised_card", "contactless_not_working", "country_support", "declined_card_payment",
    "declined_cash_withdrawal", "declined_transfer", "direct_debit_payment_not_recognised",
    "disposable_card_limits", "edit_personal_details", "exchange_charge", "exchange_rate",
    "exchange_via_app", "extra_charge_on_statement", "failed_transfer", "fiat_currency_support",
    "get_disposable_virtual_card", "get_physical_card", "getting_spare_card", "getting_virtual_card",
    "lost_or_stolen_card", "lost_or_stolen_phone", "order_physical_card", "passcode_forgotten",
    "pending_card_payment", "pending_cash_withdrawal", "pending_top_up", "pending_transfer",
    "pin_blocked", "receiving_money", "Refund_not_showing_up", "request_refund",
    "reverted_card_payment?", "supported_cards_and_currencies", "terminate_account",
    "top_up_by_bank_transfer_charge", "top_up_by_card_charge", "top_up_by_cash_or_cheque",
    "top_up_failed", "top_up_limits", "top_up_reverted", "topping_up_by_card",
    "transaction_charged_twice", "transfer_fee_charged", "transfer_into_account",
    "transfer_not_received_by_recipient", "transfer_timing", "unable_to_verify_identity",
    "verify_my_identity", "verify_source_of_funds", "verify_top_up", "virtual_card_not_working",
    "visa_or_mastercard", "why_verify_identity", "wrong_amount_of_cash_received",
    "wrong_exchange_rate_for_cash_withdrawal",
]

# Name of the single function tool the model is forced to call.
TOOL_NAME = "banking77_classify"

# OpenAI function-tool definition: one required string argument, "intent".
TOOL_SCHEMA = dict(
    type="function",
    function=dict(
        name=TOOL_NAME,
        description="Return the predicted banking77 intent label.",
        parameters=dict(
            type="object",
            properties=dict(intent=dict(type="string")),
            required=["intent"],
        ),
    ),
)


def format_available_intents(label_names: list) -> str:
    """Render the labels as a 1-based numbered list, one label per line.

    Args:
        label_names: Intent labels in the order they should be numbered.

    Returns:
        Lines of the form ``"<n>. <label>"`` joined by newlines; an empty
        string when ``label_names`` is empty.
    """
    numbered_lines = []
    for position, label in enumerate(label_names, start=1):
        numbered_lines.append(f"{position}. {label}")
    return "\n".join(numbered_lines)


async def classify_banking77_query(
    query: str,
    system_prompt: str,
    model: str = "gpt-4o-mini",
) -> str:
    """Classify a banking query into an intent using OpenAI.

    This is the CORE PIPELINE - clean async code with NO Synth-specific logic.
    The user message lists every label in ``BANKING77_LABELS`` and the model
    is forced to answer through the ``banking77_classify`` tool.

    Args:
        query: The customer query to classify
        system_prompt: System prompt for the model
        model: Model to use (e.g., "gpt-4o-mini")

    Returns:
        The predicted intent label, exactly as the model returned it in the
        tool call's ``intent`` argument (it is not validated against the
        label list).

    Raises:
        ValueError: If the response contains no tool call to read from.
        json.JSONDecodeError: If the tool-call arguments are not valid JSON.
        KeyError: If the tool-call arguments have no ``intent`` field.
    """
    available_intents = format_available_intents(BANKING77_LABELS)

    user_msg = (
        f"Customer Query: {query}\n\n"
        f"Available Intents:\n{available_intents}\n\n"
        "Classify this query into one of the above banking intents using the tool call."
    )

    # A new client is built per call (it reads OPENAI_API_KEY from the
    # environment). Using it as an async context manager closes its HTTP
    # connections as soon as the request is done instead of leaving that to
    # garbage collection.
    async with AsyncOpenAI() as client:
        response = await client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_msg},
            ],
            tools=[TOOL_SCHEMA],
            tool_choice={"type": "function", "function": {"name": TOOL_NAME}},
        )

    # tool_calls can be missing even with a forced tool choice; fail with a
    # clear message rather than "'NoneType' object is not subscriptable".
    tool_calls = response.choices[0].message.tool_calls
    if not tool_calls:
        raise ValueError(
            f"Model {model!r} returned no tool call for query {query!r}"
        )

    args = json.loads(tool_calls[0].function.arguments)
    return args["intent"]


# Load the Banking77 test split once; `dataset` and `label_names` are reused
# by the Step 3 preview.
print("Loading Banking77 dataset...")
dataset = load_dataset("banking77", split="test", trust_remote_code=False)
label_names = dataset.features["label"].names
print(f"Loaded {len(dataset)} test samples with {len(label_names)} intent labels")

# Closing banner for this step, emitted as a single print.
print(
    "\n" + "=" * 50,
    "BUSINESS LOGIC READY",
    "=" * 50,
    "\nclassify_banking77_query(query, system_prompt) -> intent",
    "\nThis is the core app. Now let's see how prompts affect performance...",
    sep="\n",
)

## Step 3: Before/After Preview

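Compare a **baseline prompt** (~78% accuracy) vs an **optimized prompt** (~92% accuracy) on 50 test samples. This step calls OpenAI directly and is skipped when `OPENAI_API_KEY` is not set.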

In [None]:
# Step 3: Before/After Comparison
#
# Compare baseline vs optimized prompts on 50 test samples.
# The optimized prompt was discovered by GEPA - it achieves ~92% vs ~78% baseline.

import os

# Resolve the OpenAI key: process environment first, then Colab's secret
# store. Any failure on the Colab path (module missing, secret missing,
# access denied) is treated as "no key".
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
if not OPENAI_API_KEY:
    try:
        from google.colab import userdata
    except Exception:
        OPENAI_API_KEY = ""
    else:
        try:
            OPENAI_API_KEY = userdata.get("OPENAI_API_KEY") or ""
        except Exception:
            OPENAI_API_KEY = ""

if not OPENAI_API_KEY:
    print("OPENAI_API_KEY not set; skipping Step 3 eval.")
else:
    # Export it so AsyncOpenAI() can pick it up from the environment.
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Starting-point system prompt: a single generic instruction. It is scored in
# the preview below and is also the initial prompt handed to GEPA.
BASELINE_SYSTEM_PROMPT = """You are an expert banking assistant that classifies customer queries into banking intents. Given a customer message, respond with exactly one intent label from the provided list using the `banking77_classify` tool."""

# This optimized prompt was discovered by GEPA - it adds classification strategy and key distinctions
# It is a fixed example used only for the before/after preview; the prompt
# produced by the optimization run later in the notebook is retrieved separately.
OPTIMIZED_SYSTEM_PROMPT = """You are a precise banking intent classifier. Analyze customer queries and classify them into exactly one of the 77 predefined banking intents.

Classification Strategy:
1. IDENTIFY THE PRIMARY ACTION: What does the customer want to DO? (activate, cancel, check, transfer, verify, etc.)
2. IDENTIFY THE SUBJECT: What is it about? (card, transfer, payment, account, etc.)
3. IDENTIFY THE STATE: Is it about something pending, failed, declined, or completed?

Key Intent Distinctions:
- "card_arrival" vs "card_delivery_estimate": Both about card delivery. Use "card_arrival" for "where is my card?" and "card_delivery_estimate" for "how long will it take?"
- "get_physical_card" vs "order_physical_card": Use "order_physical_card" for placing an order, "get_physical_card" for asking HOW to get one
- "pending_*" intents: Transaction is IN PROGRESS, not yet complete
- "failed_*" or "declined_*" intents: Transaction was REJECTED
- "*_not_recognised" intents: Customer doesn't recognize a transaction on their statement
- "verify_*" intents: About verification/authentication processes
- "top_up_*" intents: About adding money TO the account
- "transfer_*" intents: About moving money between accounts

Output the single most appropriate intent using the banking77_classify tool."""

# Test on 50 held-out samples
if OPENAI_API_KEY:
    TEST_INDICES = list(range(100, 150))
    
    
    async def score_prompt(system_prompt: str, indices: list[int], prompt_name: str) -> float:
        """Score a prompt on a set of test samples."""
        correct = 0
        total = len(indices)
        
        for i, idx in enumerate(indices):
            sample = dataset[idx]
            query = sample["text"]
            expected = label_names[sample["label"]]
            
            predicted = await classify_banking77_query(
                query=query,
                system_prompt=system_prompt,
                model="gpt-4o-mini",
            )
            
            # Normalize for comparison
            pred_norm = predicted.lower().replace("_", " ").strip()
            exp_norm = expected.lower().replace("_", " ").strip()
            is_correct = pred_norm == exp_norm
            
            if is_correct:
                correct += 1
            
            if (i + 1) % 10 == 0:
                print(f'  {prompt_name}: {i+1}/{total} done, {correct}/{i+1} correct ({correct/(i+1):.0%})')
        
        accuracy = correct / total
        return accuracy
    
    
    print(f'Testing on {len(TEST_INDICES)} samples (indices {TEST_INDICES[0]}-{TEST_INDICES[-1]})...\n')
    
    print('Scoring BASELINE prompt...')
    baseline_score = await score_prompt(BASELINE_SYSTEM_PROMPT, TEST_INDICES, "Baseline")
    
    print('\nScoring OPTIMIZED prompt...')
    optimized_score = await score_prompt(OPTIMIZED_SYSTEM_PROMPT, TEST_INDICES, "Optimized")
    
    print('\n' + '=' * 60)
    print('BEFORE/AFTER COMPARISON')
    print('=' * 60)
    print(f'\nBASELINE PROMPT:')
    print(f'  "{BASELINE_SYSTEM_PROMPT[:80]}..."')
    print(f'  Accuracy: {baseline_score:.0%} ({int(baseline_score * len(TEST_INDICES))}/{len(TEST_INDICES)})')
    
    print(f'\nOPTIMIZED PROMPT (from GEPA):')
    print(f'  "{OPTIMIZED_SYSTEM_PROMPT[:80]}..."')
    print(f'  Accuracy: {optimized_score:.0%} ({int(optimized_score * len(TEST_INDICES))}/{len(TEST_INDICES)})')
    
    lift = optimized_score - baseline_score
    print(f'\nLIFT: {lift:+.0%}')
    
    if lift > 0:
        print('\n>>> Better prompts = better results!')
        print('>>> Now let\'s see how Synth finds these optimized prompts...')


## Step 4: Setup Local API for GEPA

To run GEPA, we need to expose our classification pipeline via HTTP. This cell:
1. Creates a FastAPI wrapper around our business logic
2. Starts it on a local port
3. Creates a Cloudflare tunnel so Synth can reach it

In [None]:
# Step 4: Setup Local API for Optimization
from synth_ai.sdk.localapi import LocalAPIConfig, create_local_api
from synth_ai.sdk.task.contracts import RolloutMetrics, RolloutRequest, RolloutResponse, TaskInfo
from synth_ai.sdk.tunnels import TunneledLocalAPI, TunnelBackend

# Identifiers under which the task app is registered.
APP_ID = "banking77"
APP_NAME = "Banking77 Intent Classification"

# User-message template with {query} and {available_intents} placeholders;
# the wording matches the message built in classify_banking77_query.
USER_PROMPT = (
    "Customer Query: {query}\n\n"
    "Available Intents:\n{available_intents}\n\n"
    "Classify this query into one of the above banking intents using the tool call."
)


class Banking77Dataset:
    """Lazy dataset loader for Banking77.

    Each split is loaded on first use and cached for the life of the
    instance. Label names are captured from the first loaded split whose
    ``label`` feature exposes ``names``.
    """

    def __init__(self):
        # split name -> loaded dataset
        self._cache = {}
        # list of label strings, filled in by _load_split
        self._label_names = None

    def _load_split(self, split: str):
        """Return the dataset for ``split``, loading and caching it if needed."""
        if split not in self._cache:
            ds = load_dataset("banking77", split=split, trust_remote_code=False)
            self._cache[split] = ds
            if self._label_names is None and hasattr(ds.features.get("label"), "names"):
                self._label_names = ds.features["label"].names
        return self._cache[split]

    def ensure_ready(self, splits) -> None:
        """Eagerly load every split in ``splits``."""
        for split in splits:
            self._load_split(split)

    def size(self, split: str) -> int:
        """Return the number of rows in ``split``."""
        return len(self._load_split(split))

    def sample(self, *, split: str, index: int) -> dict:
        """Return one example as a plain dict.

        Args:
            split: Dataset split to read from.
            index: Any integer; it is wrapped modulo the split size, so seeds
                larger than the split still map to a valid row.

        Returns:
            ``{"index", "split", "text", "label"}`` where ``index`` is the
            wrapped row index and ``label`` is the label name. When the label
            index cannot be mapped to a name (names unavailable, or index out
            of range, including negative values) the placeholder
            ``"label_<idx>"`` is returned instead.

        Raises:
            ZeroDivisionError: If the split is empty.
        """
        ds = self._load_split(split)
        idx = index % len(ds)
        row = ds[idx]
        label_idx = int(row.get("label", 0))
        names = self._label_names
        # The lower bound matters: a negative index such as -1 would
        # otherwise wrap around and silently return the last label name.
        if names and 0 <= label_idx < len(names):
            label_text = names[label_idx]
        else:
            label_text = f"label_{label_idx}"
        return {"index": idx, "split": split, "text": str(row.get("text", "")), "label": label_text}

    @property
    def label_names(self) -> list:
        """Label names, loading the train split first if none are known yet."""
        if self._label_names is None:
            self._load_split("train")
        return self._label_names or []


def create_banking77_local_api(system_prompt: str):
    """Build the Banking77 task app that scores one prompt.

    Loads the train and test splits up front, then wires three callbacks
    (rollout, taskset description, task instances) into a ``LocalAPIConfig``
    and passes it to ``create_local_api``.

    NOTE(review): ``system_prompt`` is captured once here and every rollout
    uses it; nothing in this function reads a candidate prompt from the
    request. Confirm how GEPA's candidate prompts are expected to reach the
    model.

    Args:
        system_prompt: System prompt used for every classification call.

    Returns:
        Whatever ``create_local_api`` returns for the assembled config.
    """
    # Named `data` locally so the notebook-level `dataset` is not shadowed.
    data = Banking77Dataset()
    data.ensure_ready(["train", "test"])

    def _normalize(label: str) -> str:
        # Case-, underscore- and whitespace-insensitive comparison key.
        return label.lower().replace("_", " ").strip()

    async def run_rollout(request: RolloutRequest, fastapi_request) -> RolloutResponse:
        # The seed selects the row; the split defaults to "train".
        example = data.sample(
            split=request.env.config.get("split", "train"),
            index=request.env.seed,
        )

        # Model comes from the policy config (OPENAI_API_KEY is already in
        # the environment).
        prediction = await classify_banking77_query(
            query=example["text"],
            system_prompt=system_prompt,
            model=request.policy.config.get("model", "gpt-4o-mini"),
        )

        # Binary reward: 1.0 on a normalized exact match, else 0.0.
        if _normalize(prediction) == _normalize(example["label"]):
            reward = 1.0
        else:
            reward = 0.0

        return RolloutResponse(
            run_id=request.run_id,
            metrics=RolloutMetrics(outcome_reward=reward),
            trace=None,
            trace_correlation_id=request.policy.config.get("trace_correlation_id"),
        )

    def provide_taskset_description():
        split_names = ["train", "test"]
        return {
            "splits": split_names,
            "sizes": {name: data.size(name) for name in split_names},
        }

    def provide_task_instances(seeds):
        # Task instances are always drawn from the train split.
        for seed in seeds:
            example = data.sample(split="train", index=seed)
            yield TaskInfo(
                task={"id": APP_ID, "name": APP_NAME},
                dataset={"id": APP_ID, "split": example["split"], "index": example["index"]},
                inference={"tool": TOOL_NAME},
                limits={"max_turns": 1},
                task_metadata={"query": example["text"], "expected_intent": example["label"]},
            )

    api_config = LocalAPIConfig(
        app_id=APP_ID,
        name=APP_NAME,
        description=f"{APP_NAME} local API for classifying customer queries into banking intents.",
        provide_taskset_description=provide_taskset_description,
        provide_task_instances=provide_task_instances,
        rollout=run_rollout,
        cors_origins=["*"],
    )
    return create_local_api(api_config)


# Create and start the local API with tunnel
print("Creating baseline local API...")
baseline_app = create_banking77_local_api(BASELINE_SYSTEM_PROMPT)

# Create tunnel - handles server startup, health check, and tunnel creation automatically
print('\nStarting server and provisioning Cloudflare tunnel...')
baseline_tunnel = await TunneledLocalAPI.create_for_app(
    app=baseline_app,
    local_port=LOCAL_API_PORT,  # Optional: None to auto-select port
    backend=TunnelBackend.CloudflareManagedTunnel,
    progress=True,
)
BASELINE_LOCAL_API_URL = baseline_tunnel.url

print(f'\n' + '=' * 50)
print('LOCAL API READY')
print('=' * 50)
print(f'URL: {BASELINE_LOCAL_API_URL}')

## Step 5: Run GEPA Optimization

GEPA (Genetic Evolutionary Prompt Algorithm) evolves prompts over multiple generations.
Each generation evaluates candidates on training samples and selects the best performers.

In [None]:
# Step 5: Run GEPA Optimization
from synth_ai.sdk.api.train.prompt_learning import PromptLearningJob

def run_gepa():
    """Submit a GEPA prompt-learning job and block until it finishes.

    The job starts from ``BASELINE_SYSTEM_PROMPT`` / ``USER_PROMPT`` and
    targets the task app at ``BASELINE_LOCAL_API_URL``. The job id, final
    status and either the best score or the error are printed.

    Returns:
        The result object returned by ``poll_until_complete``.
    """
    env_name = 'banking77'

    initial_prompt = {
        'messages': [
            {'role': 'system', 'order': 0, 'pattern': BASELINE_SYSTEM_PROMPT},
            {'role': 'user', 'order': 1, 'pattern': USER_PROMPT},
        ],
        'wildcards': {'query': 'REQUIRED', 'available_intents': 'OPTIONAL'},
    }

    policy = {
        'model': 'gpt-4.1-nano',
        'provider': 'openai',
        'inference_mode': 'synth_hosted',
        'temperature': 0.0,
        'max_completion_tokens': 256,
    }

    gepa_settings = {
        'env_name': env_name,
        'evaluation': {
            # Training seeds (used during optimization)
            'seeds': list(range(50)),
            # Validation seeds (held-out, checked during optimization)
            'validation_seeds': list(range(50, 60)),
        },
        'rollout': {'budget': 80, 'max_concurrent': 8, 'minibatch_size': 8},
        # Controls mutation model: LOW_CONTEXT, LOW, MEDIUM, HIGH
        'proposer_effort': 'MEDIUM',
        # Controls mutation length: RAPID, FAST, SLOW
        'proposer_output_tokens': 'FAST',
        # llm_model is deprecated - use proposer_effort instead
        'mutation': {'rate': 0.3},
        'population': {'initial_size': 4, 'num_generations': 3, 'children_per_generation': 3},
        'archive': {'size': 5, 'pareto_set_size': 10},
    }

    config_body = {
        'prompt_learning': {
            'algorithm': 'gepa',
            'task_app_url': BASELINE_LOCAL_API_URL,
            'env_name': env_name,
            'initial_prompt': initial_prompt,
            'policy': policy,
            'gepa': gepa_settings,
        },
    }

    print('Creating GEPA job...')
    pl_job = PromptLearningJob.from_dict(
        config_dict=config_body,
        skip_health_check=True,
    )

    job_id = pl_job.submit()
    print(f'Job ID: {job_id}')

    # Poll every 3 seconds for up to an hour.
    outcome = pl_job.poll_until_complete(timeout=3600.0, interval=3.0, progress=True)
    print(f'\nFINAL: {outcome.status.value}')

    if outcome.succeeded:
        print(f'BEST SCORE: {outcome.best_score}')
    elif outcome.failed:
        print(f'ERROR: {outcome.error}')

    return outcome

# Run the optimization now; `result` is read by the evaluation step below.
result = run_gepa()


## Step 6: Evaluate on Held-Out Data

Run formal eval jobs comparing baseline vs optimized prompts on 50 held-out test samples.
This validates that optimization generalizes beyond the training samples.

In [None]:
# Step 6: Evaluate on Held-Out Data
from synth_ai.sdk.api.eval import EvalJob, EvalJobConfig, EvalResult
from synth_ai.sdk.learning.prompt_learning_client import PromptLearningClient
from synth_ai.sdk.learning.prompt_learning_types import PromptResults

# Held-out test samples: seeds 100..149 inclusive.
EVAL_SEEDS = [*range(100, 150)]

def run_eval_job(local_api_url: str, seeds: list[int], mode: str) -> EvalResult:
    """Submit an eval job against a task app and wait for it to finish.

    The job runs on the ``test`` split with ``gpt-4.1-nano``, using the
    notebook-level ``SYNTH_API_BASE`` and ``API_KEY``.

    Args:
        local_api_url: URL of the task app to evaluate.
        seeds: Seeds (sample indices) to evaluate.
        mode: Caller-supplied label (e.g. 'baseline'); currently unused.

    Returns:
        The result returned by ``poll_until_complete``.
    """
    eval_job = EvalJob(
        EvalJobConfig(
            local_api_url=local_api_url,
            backend_url=SYNTH_API_BASE,
            api_key=API_KEY,
            env_name='banking77',
            seeds=seeds,
            policy_config={'model': 'gpt-4.1-nano', 'provider': 'openai'},
            env_config={'split': 'test'},
            concurrency=10,
        )
    )
    eval_job.submit()
    # Poll every 2 seconds for up to 10 minutes.
    outcome = eval_job.poll_until_complete(timeout=600.0, interval=2.0, progress=True)
    return outcome


def extract_system_prompt(prompt_results: PromptResults) -> str:
    """Extract system prompt from the best optimized prompt.

    Reads the first entry of ``prompt_results.top_prompts`` and returns the
    ``content`` of the first section in its template whose ``role`` is
    ``'system'``.

    Args:
        prompt_results: Object exposing a ``top_prompts`` sequence of dicts
            shaped like ``{'template': {'sections': [{'role', 'content'}]}}``.

    Returns:
        The system prompt text.

    Raises:
        ValueError: If ``top_prompts`` is empty or the best prompt has no
            system section.
    """
    top_prompts = prompt_results.top_prompts
    if not top_prompts:
        raise ValueError("No optimized prompts available: top_prompts is empty")

    sections = top_prompts[0]['template']['sections']
    # An explicit loop replaces next() so that a missing system section
    # surfaces as a descriptive error instead of a bare StopIteration.
    for section in sections:
        if section.get('role') == 'system':
            return section['content']
    raise ValueError("Best optimized prompt has no section with role 'system'")


if result.succeeded:
    # Retrieve optimized prompt
    pl_client = PromptLearningClient()
    prompt_results = await pl_client.get_prompts(result.job_id)
    gepa_optimized_system = extract_system_prompt(prompt_results)
    best_train_reward = prompt_results.best_score
    
    print('\n' + '=' * 60)
    print('OPTIMIZATION RESULTS')
    print('=' * 60)
    print(f'\nBest Train Reward: {best_train_reward:.1%}')
    print(f'\nOptimized Prompt:')
    print(gepa_optimized_system[:400] + "..." if len(gepa_optimized_system) > 400 else gepa_optimized_system)
    
    # Create optimized API and run final evaluation
    optimized_app = create_banking77_local_api(gepa_optimized_system)
    optimized_tunnel = await TunneledLocalAPI.create_for_app(
        app=optimized_app,
        local_port=OPTIMIZED_LOCAL_API_PORT,
        backend=TunnelBackend.CloudflareManagedTunnel,
        progress=True,
    )
    OPTIMIZED_LOCAL_API_URL = optimized_tunnel.url
    
    baseline_result = run_eval_job(BASELINE_LOCAL_API_URL, EVAL_SEEDS, 'baseline')
    optimized_result = run_eval_job(OPTIMIZED_LOCAL_API_URL, EVAL_SEEDS, 'optimized')
    
    # Final results
    if baseline_result.succeeded and optimized_result.succeeded:
        eval_lift = optimized_result.mean_score - baseline_result.mean_score
        print('\n' + '=' * 60)
        print('FINAL EVALUATION')
        print('=' * 60)
        print(f'Training Score:  {best_train_reward:.1%}')
        print(f'Held-Out Test:   {optimized_result.mean_score:.1%} (baseline: {baseline_result.mean_score:.1%})')
        print(f'Improvement:     {eval_lift:+.1%}')
        
        if eval_lift > 0:
            print('\n✓ Optimization generalizes to held-out data!')
        elif eval_lift == 0:
            print('\n= Same performance on held-out data')
        else:
            print('\n⚠ Possible overfitting (baseline better on held-out)')
else:
    print(f"Job failed: {result.status.value}")
    if result.error:
        print(f"Error: {result.error}")

In [None]:
# Cleanup
from synth_ai.sdk.tunnels import cleanup_all

# Shut down the tunnels started earlier in the notebook via the SDK helper.
print("Cleaning up cloudflared processes...")
cleanup_all()
print("Demo complete!")