# Synth GEPA Demo - Banking77 (Production)

Self-contained notebook for prompt optimization using GEPA against the **production backend**.

**Run in Google Colab:** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/synth-laboratories/synth-ai/blob/main/demos/gepa_banking77/demo_prod.ipynb)

**What this demo does:**
1. Spins up a local Banking77 classification task app
2. Creates a Cloudflare tunnel to expose it to the internet
3. Runs GEPA prompt optimization on the Synth backend
4. Compares baseline vs optimized prompts on held-out data

In [None]:
# Step 0: Install dependencies (run this first on Colab)
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Google Colab - installing dependencies...")
    !pip install -q httpx pynacl fastapi uvicorn datasets nest_asyncio
    
    # Install cloudflared
    !wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O /usr/local/bin/cloudflared
    !chmod +x /usr/local/bin/cloudflared
    !cloudflared --version
    
    print("Dependencies installed!")
else:
    print("Not in Colab - assuming dependencies are already installed")
    print("Required: pip install httpx pynacl fastapi uvicorn datasets nest_asyncio")
    print("Required: brew install cloudflare/cloudflare/cloudflared (macOS)")

In [None]:
# Step 1: Imports and Config
import os, sys, time, secrets, base64, asyncio, json, threading
from typing import Any, Optional
from contextlib import asynccontextmanager

import httpx
import uvicorn
from nacl.public import PublicKey, SealedBox
from fastapi import FastAPI, Request, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from datasets import load_dataset

# Production backend endpoint, plus the local ports used by the two task apps
# (baseline first, optimized second).
SYNTH_API_BASE = 'https://api.usesynth.ai'
TASK_APP_PORT, OPTIMIZED_TASK_APP_PORT = 8001, 8002

print(f'Backend: {SYNTH_API_BASE}')
print(f'Task App Ports: {TASK_APP_PORT}, {OPTIMIZED_TASK_APP_PORT}')

In [None]:
# Step 2: Check Backend Health
# Stop the notebook early (RuntimeError) when /health does not answer 200.
r = httpx.get(f'{SYNTH_API_BASE}/health', timeout=30)
if r.status_code != 200:
    print(f'WARNING: Backend returned status {r.status_code}')
    # Only the first 200 characters of a long error body are shown.
    if len(r.text) > 200:
        print(f'Response: {r.text[:200]}...')
    else:
        print(f'Response: {r.text}')
    raise RuntimeError(f'Backend not healthy: status {r.status_code}')
else:
    print(f'Backend health: {r.json()}')

In [3]:
# Step 3: Get API Key (use env var or mint demo key)
API_KEY = os.environ.get('SYNTH_API_KEY', '')

if not API_KEY:
    print('No SYNTH_API_KEY found, minting demo key...')
    resp = httpx.post(f'{SYNTH_API_BASE}/api/demo/keys', json={'ttl_hours': 4}, timeout=30)
    resp.raise_for_status()
    API_KEY = resp.json()['api_key']
    # Cell output is saved with the notebook, so only a short prefix of the key is echoed.
    print(f'Demo API Key: {API_KEY[:8]}...')
else:
    # Same masking for a user-supplied key: never echo most of a real credential.
    print(f'Using SYNTH_API_KEY: {API_KEY[:8]}...')

No SYNTH_API_KEY found, minting demo key...
Demo API Key: sk_demo_a3T-ldJZ0ZqDJ0TE7...


In [4]:
# Step 4: Mint and Upload Environment Key
# A fresh random key (64 hex characters) that the task apps will require from callers.
ENVIRONMENT_API_KEY = secrets.token_hex(32)
print(f'Minted env key: {ENVIRONMENT_API_KEY[:12]}...{ENVIRONMENT_API_KEY[-4:]}')

# Fetch the backend's public key (base64 encoded).
pub_resp = httpx.get(
    f'{SYNTH_API_BASE}/api/v1/crypto/public-key',
    headers={'Authorization': f'Bearer {API_KEY}'},
    timeout=30,
)
pub_resp.raise_for_status()
pubkey_b64 = pub_resp.json()['public_key']

# Encrypt the env key with a sealed box built from that public key,
# then base64-encode the ciphertext for transport in JSON.
key_bytes = base64.b64decode(pubkey_b64, validate=True)
box = SealedBox(PublicKey(key_bytes))
ciphertext = box.encrypt(ENVIRONMENT_API_KEY.encode('utf-8'))
ciphertext_b64 = base64.b64encode(ciphertext).decode('ascii')

# Upload the encrypted key under the name ENVIRONMENT_API_KEY.
upload_resp = httpx.post(
    f'{SYNTH_API_BASE}/api/v1/env-keys',
    headers={
        'Authorization': f'Bearer {API_KEY}',
        'Content-Type': 'application/json',
    },
    json={
        'name': 'ENVIRONMENT_API_KEY',
        'ciphertext_b64': ciphertext_b64,
    },
    timeout=30,
)
upload_resp.raise_for_status()
print(f'Uploaded env key: {upload_resp.json()}')

Minted env key: 115860176ce8...47c3
Uploaded env key: {'id': '88c0a188-d511-4f23-a59b-9e1fa46ceda9', 'name': 'ENVIRONMENT_API_KEY', 'updated_at': '2025-12-27T05:15:01.143970Z', 'org_id': 'e77ef3a8-677d-4ddd-92d6-0f114d6bbdaf', 'upserted': True}


In [5]:
# Step 5: Define Banking77 Task App Factory
from urllib.parse import urlparse, urlunparse

# The 77 intent names shown to the model; format_intents() numbers them in this order.
# NOTE(review): "Refund_not_showing_up" (capital R) and "reverted_card_payment?" (trailing "?")
# look irregular - presumably they mirror the upstream dataset's label names; confirm before
# "fixing" them, because score_prediction() compares predictions against the dataset's label text.
BANKING77_LABELS = [
    "activate_my_card", "age_limit", "apple_pay_or_google_pay", "atm_support", "automatic_top_up",
    "balance_not_updated_after_bank_transfer", "balance_not_updated_after_cheque_or_cash_deposit",
    "beneficiary_not_allowed", "cancel_transfer", "card_about_to_expire", "card_acceptance",
    "card_arrival", "card_delivery_estimate", "card_linking", "card_not_working",
    "card_payment_fee_charged", "card_payment_not_recognised", "card_payment_wrong_exchange_rate",
    "card_swallowed", "cash_withdrawal_charge", "cash_withdrawal_not_recognised", "change_pin",
    "compromised_card", "contactless_not_working", "country_support", "declined_card_payment",
    "declined_cash_withdrawal", "declined_transfer", "direct_debit_payment_not_recognised",
    "disposable_card_limits", "edit_personal_details", "exchange_charge", "exchange_rate",
    "exchange_via_app", "extra_charge_on_statement", "failed_transfer", "fiat_currency_support",
    "get_disposable_virtual_card", "get_physical_card", "getting_spare_card", "getting_virtual_card",
    "lost_or_stolen_card", "lost_or_stolen_phone", "order_physical_card", "passcode_forgotten",
    "pending_card_payment", "pending_cash_withdrawal", "pending_top_up", "pending_transfer",
    "pin_blocked", "receiving_money", "Refund_not_showing_up", "request_refund",
    "reverted_card_payment?", "supported_cards_and_currencies", "terminate_account",
    "top_up_by_bank_transfer_charge", "top_up_by_card_charge", "top_up_by_cash_or_cheque",
    "top_up_failed", "top_up_limits", "top_up_reverted", "topping_up_by_card",
    "transaction_charged_twice", "transfer_fee_charged", "transfer_into_account",
    "transfer_not_received_by_recipient", "transfer_timing", "unable_to_verify_identity",
    "verify_my_identity", "verify_source_of_funds", "verify_top_up", "virtual_card_not_working",
    "visa_or_mastercard", "why_verify_identity", "wrong_amount_of_cash_received",
    "wrong_exchange_rate_for_cash_withdrawal",
]
# Name of the single tool the model is asked to call, and its function-calling schema.
# The tool takes one required string argument, "intent".
TOOL_NAME = "banking77_classify"
TOOL_SCHEMA = {
    "type": "function",
    "function": {
        "name": TOOL_NAME,
        "description": "Return the predicted banking77 intent label.",
        "parameters": {
            "type": "object",
            "properties": {
                "intent": {"type": "string"},
            },
            "required": ["intent"],
        },
    },
}

# Dataset cache: split name -> loaded dataset, plus the label-name list
# captured from the first loaded split that exposes one.
_dataset_cache = {}
_label_names = None

def load_dataset_split(split: str):
    """Return the "banking77" dataset for ``split``, loading it at most once.

    The first time a split is requested it is loaded via ``load_dataset`` and
    stored in ``_dataset_cache``; later calls return the cached object. While
    ``_label_names`` is still unset, it is filled from the "label" feature's
    ``names`` attribute when that attribute exists.
    """
    global _label_names
    try:
        return _dataset_cache[split]
    except KeyError:
        pass

    ds = load_dataset("banking77", split=split, trust_remote_code=False)
    _dataset_cache[split] = ds
    if _label_names is None:
        if hasattr(ds.features.get("label"), "names"):
            _label_names = ds.features["label"].names
    return ds

def get_sample(split: str, index: int) -> dict:
    """Fetch one example from ``split`` as a plain dict.

    ``index`` wraps around the split size, so any integer seed maps to a row.
    The returned dict has the keys "index" (the wrapped index), "split",
    "text" and "label". The label is the class name when label names are
    known and the row's label index is in range, otherwise "label_<n>".
    """
    ds = load_dataset_split(split)
    idx = index % len(ds)
    row = ds[idx]
    label_idx = int(row.get("label", 0))

    if _label_names and label_idx < len(_label_names):
        label_text = _label_names[label_idx]
    else:
        label_text = f"label_{label_idx}"

    return {
        "index": idx,
        "split": split,
        "text": str(row.get("text", "")),
        "label": label_text,
    }

def format_intents() -> str:
    """Render BANKING77_LABELS as a numbered list (starting at 1), one label per line."""
    numbered = [f"{pos}. {label}" for pos, label in enumerate(BANKING77_LABELS, start=1)]
    return "\n".join(numbered)

def normalize_intent(intent: str) -> str:
    """Canonicalize an intent for comparison: lowercase, underscores to spaces, trimmed."""
    lowered = intent.lower()
    spaced = lowered.replace("_", " ")
    return spaced.strip()

def score_prediction(predicted: str, expected: str) -> tuple:
    """Compare two intents after normalization.

    Returns ``(is_correct, reward)`` where reward is 1.0 on a match and 0.0 otherwise.
    """
    matched = normalize_intent(predicted) == normalize_intent(expected)
    reward = 1.0 if matched else 0.0
    return matched, reward

def normalize_chat_completion_url(url: str) -> str:
    """Return ``url`` with its path ending in ``/chat/completions``.

    Only the path component is edited; scheme, host, query string (for example a
    ``?cid=...`` parameter) and fragment are carried over unchanged. Trailing
    slashes are stripped from the path before the suffix check, so
    ``.../chat/completions/?cid=x`` becomes ``.../chat/completions?cid=x``.

    Args:
        url: Base inference URL. ``None``, an empty string, or a string made
            only of slashes yields the bare path ``"/chat/completions"``.

    Returns:
        The normalized URL string.
    """
    raw = url or ""
    if not raw.rstrip("/"):
        return "/chat/completions"

    parsed = urlparse(raw)
    # Strip slashes from the path only: stripping the whole URL would also eat a
    # trailing "/" that belongs to the query string or fragment.
    path = parsed.path.rstrip("/")
    if not path.endswith("/chat/completions"):
        path = f"{path}/chat/completions"
    return urlunparse(parsed._replace(path=path))

# Pydantic models
class EnvConfig(BaseModel):
    """Environment section of a rollout request."""
    # Passed to get_sample() by the /rollout handler as the row index (wrapped modulo the split size).
    seed: int = 0
    # Free-form settings; the /rollout handler reads the "split" key (default "train").
    config: dict = {}

class PolicyConfig(BaseModel):
    """Policy section of a rollout request."""
    # Keys read by the /rollout handler: "inference_url" (required there), "model",
    # "max_completion_tokens", "temperature" and "trace_correlation_id".
    config: dict = {}

class RolloutRequest(BaseModel):
    """Request body accepted by the task app's POST /rollout endpoint."""
    # Echoed back unchanged in RolloutResponse.run_id.
    run_id: str = ""
    env: EnvConfig = EnvConfig()
    policy: PolicyConfig = PolicyConfig()
    # Only written to the debug log by the /rollout handler in this notebook.
    mode: str = "rollout"

class RolloutMetrics(BaseModel):
    """Scores reported for one rollout.

    The /rollout handler scores a single example, so it fills every score field
    with the same reward (1.0 for a correct prediction, 0.0 otherwise).
    """
    episode_returns: list = []
    mean_return: float = 0.0
    num_steps: int = 1
    num_episodes: int = 1
    outcome_score: float = 0.0
    events_score: float = 0.0
    # The /rollout handler stores {"correct": <bool>} here.
    details: dict = {}

class RolloutResponse(BaseModel):
    """Response body returned by the task app's POST /rollout endpoint."""
    run_id: str = ""
    # Always {} in this notebook's handler.
    branches: dict = {}
    metrics: RolloutMetrics = RolloutMetrics()
    # Always False in this notebook's handler.
    aborted: bool = False
    # Taken from policy.config["trace_correlation_id"], else from the "cid" query
    # parameter of the inference URL, when either is present.
    trace_correlation_id: Optional[str] = None
    # Messages sent, raw LLM response, correlation id, model name and sample metadata.
    trace: Optional[dict] = None
    # The handler stores {"inference_url": <raw inference_url from the request>} here.
    pipeline_metadata: dict = {}

def _extract_predicted_intent(response_json: Any) -> str:
    """Pull the predicted intent out of a chat-completions style response.

    Looks at the first choice only. Every tool call named TOOL_NAME is inspected
    and the last one carrying a string "intent" argument wins. If no tool call
    yields a non-empty intent, the first whitespace-separated token of the
    message content is used instead. Returns "" when nothing usable is found
    (including malformed or unexpectedly shaped responses).
    """
    if not isinstance(response_json, dict):
        return ""
    choices = response_json.get("choices")
    if not isinstance(choices, list) or not choices or not isinstance(choices[0], dict):
        return ""
    message = choices[0].get("message")
    if not isinstance(message, dict):
        return ""

    predicted = ""
    for call in message.get("tool_calls") or []:
        function = call.get("function") if isinstance(call, dict) else None
        if not isinstance(function, dict) or function.get("name") != TOOL_NAME:
            continue
        try:
            args = json.loads(function.get("arguments") or "{}")
        except (TypeError, ValueError):
            # Malformed arguments: skip this call rather than failing the rollout.
            continue
        intent = args.get("intent") if isinstance(args, dict) else None
        if isinstance(intent, str):
            predicted = intent

    if not predicted:
        content = message.get("content")
        if isinstance(content, str):
            # split() already ignores surrounding whitespace; whitespace-only
            # content produces no tokens and therefore no prediction.
            tokens = content.split()
            if tokens:
                predicted = tokens[0]
    return predicted


def _extract_trace_correlation_id(policy_config: dict) -> Optional[str]:
    """Return the trace correlation id for a rollout.

    Prefers policy_config["trace_correlation_id"]; otherwise falls back to the
    first "cid" query parameter of policy_config["inference_url"]. When neither
    is available the original (falsy) config value is returned unchanged.
    """
    from urllib.parse import urlsplit, parse_qs

    correlation_id = policy_config.get("trace_correlation_id")
    if correlation_id:
        return correlation_id
    try:
        query = urlsplit(policy_config.get("inference_url", "")).query
        cid_values = parse_qs(query or "").get("cid", [])
    except (TypeError, ValueError, AttributeError):
        # Best effort only: an unparsable URL simply means no correlation id.
        return correlation_id
    return cid_values[0] if cid_values else correlation_id


def create_banking77_task_app(system_prompt: str, env_api_key: str):
    """Factory to create a Banking77 task app with a specific system prompt.

    Args:
        system_prompt: System message placed first in every LLM request made by
            the app's /rollout endpoint. It is fixed for the lifetime of the app.
        env_api_key: Key callers must present (``Authorization: Bearer <key>`` or
            ``X-API-Key``) to use /rollout. An empty value disables the check.

    Returns:
        A FastAPI app exposing GET /health, GET /health/rollout and POST /rollout.
        Its lifespan opens a shared httpx.AsyncClient and preloads the "train"
        and "test" splits on startup, and closes the client on shutdown.
    """
    _http_client_holder = {"client": None}

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        _http_client_holder["client"] = httpx.AsyncClient(timeout=120.0)
        try:
            load_dataset_split("train")
            load_dataset_split("test")
            yield
        finally:
            # Close the client even if dataset loading or the app itself failed.
            await _http_client_holder["client"].aclose()

    app = FastAPI(lifespan=lifespan)
    app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])

    def check_auth(request: Request) -> bool:
        """Return True when the request carries env_api_key (or no key is configured)."""
        if not env_api_key:
            return True
        auth = request.headers.get("Authorization", "") or request.headers.get("X-API-Key", "")
        token = auth.replace("Bearer ", "").strip()
        # Constant-time comparison; bytes are used because compare_digest rejects
        # non-ASCII str arguments.
        return secrets.compare_digest(token.encode("utf-8"), env_api_key.encode("utf-8"))

    @app.get("/health")
    async def health(request: Request):
        return {"status": "healthy", "authorized": check_auth(request)}

    @app.get("/health/rollout")
    async def health_rollout(request: Request):
        return {"ok": True, "authorized": check_auth(request)}

    @app.post("/rollout")
    async def rollout(request: Request, body: RolloutRequest):
        """Classify one dataset example through the given inference URL and score it."""
        if not check_auth(request):
            raise HTTPException(status_code=401, detail="Unauthorized")

        policy_config = body.policy.config or {}
        split = (body.env.config or {}).get("split", "train")
        seed = body.env.seed or 0

        # DEBUG: Log received policy config
        print(f"[ROLLOUT] seed={seed}, split={split}, mode={body.mode}", flush=True)
        print(f"[ROLLOUT] policy_config keys: {list(policy_config.keys())}", flush=True)

        sample = get_sample(split, seed)
        intents_list = format_intents()

        user_msg = f"Customer Query: {sample['text']}\n\nAvailable Intents:\n{intents_list}\n\nClassify this query into one of the above banking intents using the tool call."
        messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_msg}]

        inference_url_raw = policy_config.get("inference_url", "")

        # DEBUG: Log raw inference_url before normalization
        print(f"[ROLLOUT] RAW inference_url: {inference_url_raw}", flush=True)

        if not inference_url_raw:
            print("[ROLLOUT] ERROR: Missing inference_url!", flush=True)
            raise HTTPException(status_code=400, detail="Missing inference_url")

        inference_url = normalize_chat_completion_url(inference_url_raw)

        # DEBUG: Log normalized inference_url
        print(f"[ROLLOUT] NORMALIZED inference_url: {inference_url}", flush=True)

        model = policy_config.get("model", "gpt-4.1-nano")
        payload = {
            "model": model,
            "messages": messages,
            "tools": [TOOL_SCHEMA],
            "tool_choice": "required",
            "max_completion_tokens": policy_config.get("max_completion_tokens", 256)
        }
        # Temperature is only sent when it differs from 0.0.
        if policy_config.get("temperature", 0.0) != 0.0:
            payload["temperature"] = policy_config["temperature"]

        # Forward the caller's credential to the inference endpoint as X-API-Key.
        headers = {"Content-Type": "application/json"}
        auth_header = request.headers.get("Authorization") or request.headers.get("X-API-Key")
        if auth_header:
            headers["X-API-Key"] = auth_header.replace("Bearer ", "").strip()

        try:
            print(f"[ROLLOUT] POST to: {inference_url}", flush=True)
            resp = await _http_client_holder["client"].post(inference_url, json=payload, headers=headers)
            print(f"[ROLLOUT] Response status: {resp.status_code}", flush=True)
            if resp.status_code != 200:
                print(f"[ROLLOUT] Error response: {resp.text[:300]}", flush=True)
                if resp.status_code == 307:
                    print(f"[ROLLOUT] 307 REDIRECT! Location: {resp.headers.get('location', 'N/A')}", flush=True)
                raise HTTPException(status_code=resp.status_code, detail=f"LLM error: {resp.text[:500]}")
            response_json = resp.json()
        except httpx.HTTPError as e:
            print(f"[ROLLOUT] HTTPError: {e}", flush=True)
            raise HTTPException(status_code=502, detail=f"LLM call failed: {e}") from e
        except ValueError as e:
            # resp.json() raises a ValueError subclass when the body is not valid JSON.
            print(f"[ROLLOUT] Invalid JSON from LLM: {e}", flush=True)
            raise HTTPException(status_code=502, detail=f"LLM returned invalid JSON: {e}") from e

        # A sentinel that matches no label keeps scoring well-defined when the
        # model produced nothing usable.
        predicted_intent = _extract_predicted_intent(response_json) or "__NO_PREDICTION__"

        expected_intent = sample["label"]
        is_correct, reward = score_prediction(predicted_intent, expected_intent)

        trace_correlation_id = _extract_trace_correlation_id(policy_config)

        llm_model = response_json.get("model") if isinstance(response_json, dict) else None
        trace = {
            "messages": messages,
            "response": response_json,
            "correlation_id": trace_correlation_id,
            "model": llm_model,
            "metadata": {"env": "banking77", "split": sample["split"], "index": sample["index"], "correct": is_correct}
        }
        metrics = RolloutMetrics(
            episode_returns=[reward], mean_return=reward, num_steps=1, num_episodes=1,
            outcome_score=reward, events_score=reward, details={"correct": is_correct}
        )

        return RolloutResponse(
            run_id=body.run_id, branches={}, metrics=metrics, aborted=False,
            trace_correlation_id=trace_correlation_id, trace=trace,
            pipeline_metadata={"inference_url": policy_config.get("inference_url", "")}
        )

    return app

print('Task app factory defined (with debug logging)')

Task app factory defined (with debug logging)


In [None]:
# Step 6: Cloudflare Tunnel Helpers (from SDK)
# nest_asyncio.apply() patches asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably needed because the SDK helpers drive the loop from inside the
# notebook's own running loop - confirm.
import nest_asyncio
nest_asyncio.apply()

# Import tunnel helpers from SDK - no need to define inline!
# These come from the synth_ai package, which must be installed in the kernel's environment.
from synth_ai.sdk.tunnels import (
    rotate_tunnel,
    open_managed_tunnel,
    track_process,
    verify_tunnel_dns_resolution,
    kill_port,
    wait_for_health_check,
)
from synth_ai.sdk.task import run_server_background

print('Tunnel helpers imported from SDK')

In [None]:
# Step 7: Start Baseline Task App with Cloudflare Tunnel

BASELINE_SYSTEM_PROMPT = "You are an expert banking assistant that classifies customer queries into banking intents. Given a customer message, respond with exactly one intent label from the provided list using the `banking77_classify` tool."
# USER_PROMPT mirrors the user message built inside the /rollout handler of
# create_banking77_task_app ({query} -> sample text, {available_intents} -> numbered label
# list); keep the two in sync when editing either one.
USER_PROMPT = "Customer Query: {query}\n\nAvailable Intents:\n{available_intents}\n\nClassify this query into one of the above banking intents using the tool call."

# Create baseline task app
baseline_app = create_banking77_task_app(BASELINE_SYSTEM_PROMPT, ENVIRONMENT_API_KEY)

# Kill port if in use, start server in background
kill_port(TASK_APP_PORT)
run_server_background(baseline_app, TASK_APP_PORT)

# Wait for local health check
print(f'Waiting for baseline task app on port {TASK_APP_PORT}...')
await wait_for_health_check("localhost", TASK_APP_PORT, ENVIRONMENT_API_KEY, timeout=30.0)
print('Baseline task app ready!')

# Get tunnel from backend using SDK function
# The returned mapping is read for its 'hostname' and 'tunnel_token' entries below.
print('\nProvisioning Cloudflare tunnel for baseline...')
baseline_tunnel = await rotate_tunnel(API_KEY, TASK_APP_PORT, reason="baseline_notebook")
BASELINE_TUNNEL_HOSTNAME = baseline_tunnel['hostname']
BASELINE_TASK_APP_URL = f'https://{BASELINE_TUNNEL_HOSTNAME}'
print(f'Baseline tunnel: {BASELINE_TUNNEL_HOSTNAME}')

# Start cloudflared with tracking (auto-cleanup on exit)
track_process(open_managed_tunnel(baseline_tunnel['tunnel_token']))

# Verify tunnel is reachable
print('Waiting for tunnel DNS...')
await verify_tunnel_dns_resolution(BASELINE_TASK_APP_URL, name="baseline", api_key=ENVIRONMENT_API_KEY)

# This public URL is what Step 8 hands to the backend as task_app_url.
print(f'\nBaseline task app URL: {BASELINE_TASK_APP_URL}')

In [8]:
# Step 8: Run GEPA Optimization (using tunnel URL)
async def run_gepa():
    """Create a GEPA prompt-learning job on the backend and poll it to completion.

    The job is configured to call the baseline task app through its tunnel URL
    (BASELINE_TASK_APP_URL) using ENVIRONMENT_API_KEY. Polling happens every
    3 seconds until the status is 'succeeded', 'failed' or 'cancelled'; a progress
    line is printed whenever the status changes and at least every 15 seconds.

    Returns:
        The last job payload (dict) returned by the job status endpoint.

    Raises:
        httpx.HTTPStatusError: if job creation or a status poll returns an error status.
        httpx.HTTPError: on transport-level failures.
    """
    config_body = {
        'prompt_learning': {
            'algorithm': 'gepa',
            'run_local': False,  # Run on remote backend
            'task_app_url': BASELINE_TASK_APP_URL,
            'task_app_api_key': ENVIRONMENT_API_KEY,
            'env_name': 'banking77',
            'initial_prompt': {
                'messages': [
                    {'role': 'system', 'order': 0, 'pattern': BASELINE_SYSTEM_PROMPT},
                    {'role': 'user', 'order': 1, 'pattern': USER_PROMPT},
                ],
                'wildcards': {'query': 'REQUIRED', 'available_intents': 'OPTIONAL'},
            },
            'policy': {'model': 'gpt-4.1-nano', 'provider': 'openai', 'temperature': 0.0, 'max_completion_tokens': 256},
            'gepa': {
                'env_name': 'banking77',
                'evaluation': {'seeds': list(range(30)), 'validation_seeds': list(range(50, 56))},
                'rollout': {'budget': 50, 'max_concurrent': 5, 'minibatch_size': 5},
                'mutation': {'rate': 0.3, 'llm_model': 'gpt-4.1-nano'},
                'population': {'initial_size': 3, 'num_generations': 2, 'children_per_generation': 2},
                'archive': {'size': 5, 'pareto_set_size': 10},
                'token': {'counting_model': 'gpt-4'},
            },
        },
    }

    jobs_url = f'{SYNTH_API_BASE}/api/prompt-learning/online/jobs'
    auth_headers = {'Authorization': f'Bearer {API_KEY}'}
    terminal_statuses = ('succeeded', 'failed', 'cancelled')
    progress_interval_s = 15
    poll_interval_s = 3

    print(f'Creating GEPA job (task_app_url={BASELINE_TASK_APP_URL})...')
    # One client (and connection pool) is shared by job creation and every poll.
    async with httpx.AsyncClient(timeout=30) as client:
        resp = await client.post(
            jobs_url,
            json={'algorithm': 'gepa', 'config_body': config_body},
            headers=auth_headers,
        )
        # Any 2xx counts as success; only real failures are reported as errors.
        if not resp.is_success:
            print(f'ERROR: {resp.status_code} - {resp.text[:500]}')
            resp.raise_for_status()
        job_id = resp.json()['job_id']
        print(f'Job ID: {job_id}')

        print('Polling...')
        start = time.time()
        last_status = None
        last_report = None  # elapsed seconds at the most recent progress line
        job = None

        while True:
            resp = await client.get(f'{jobs_url}/{job_id}', headers=auth_headers)
            resp.raise_for_status()
            job = resp.json()

            status = job['status']
            elapsed = int(time.time() - start)
            best = job.get('best_train_score') or job.get('best_score')

            # Time-based throttle: does not depend on a poll landing on an exact second.
            report_due = last_report is None or elapsed - last_report >= progress_interval_s
            if status != last_status or report_due:
                print(f'    [{elapsed}s] {status} (best={best})')
                last_status = status
                last_report = elapsed

            if status in terminal_statuses:
                break
            await asyncio.sleep(poll_interval_s)

    print(f'\nFINAL: {status}')
    if status == 'succeeded':
        best = job.get('best_score') or job.get('best_train_score')
        print(f'BEST SCORE: {best}')
    elif status == 'failed':
        print(f'ERROR: {job.get("error")}')

    return job

# Top-level await: relies on the notebook kernel running cell code inside an event loop.
# The resulting job payload is read again by Step 9.
job = await run_gepa()

Creating GEPA job (task_app_url=https://task-8001-12511.usesynth.ai)...
Job ID: pl_2606908964e941c8
Polling...
    [0s] queued (best=None)
[ROLLOUT] seed=0, split=train, mode=eval
[ROLLOUT] policy_config keys: ['model', 'provider', 'temperature', 'max_completion_tokens', 'inference_url', 'trace_correlation_id', 'api_base', 'base_url']
[ROLLOUT] RAW inference_url: https://synth-backend-dev-docker.onrender.com/api/interceptor/v1/baseline-0-517dac51?cid=trace_validation-0-b47fe941
[ROLLOUT] NORMALIZED inference_url: https://synth-backend-dev-docker.onrender.com/api/interceptor/v1/baseline-0-517dac51/chat/completions?cid=trace_validation-0-b47fe941
[ROLLOUT] POST to: https://synth-backend-dev-docker.onrender.com/api/interceptor/v1/baseline-0-517dac51/chat/completions?cid=trace_validation-0-b47fe941
[ROLLOUT] Response status: 200
[ROLLOUT] seed=4, split=train, mode=eval
[ROLLOUT] policy_config keys: ['model', 'provider', 'temperature', 'max_completion_tokens', 'inference_url', 'trial_id', '

ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/Users/joshpurtell/Documents/GitHub/monorepo/.venv/lib/python3.11/site-packages/uvicorn/protocols/http/httptools_impl.py", line 409, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/joshpurtell/Documents/GitHub/monorepo/.venv/lib/python3.11/site-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/joshpurtell/Documents/GitHub/monorepo/.venv/lib/python3.11/site-packages/fastapi/applications.py", line 1134, in __call__
    await super().__call__(scope, receive, send)
  File "/Users/joshpurtell/Documents/GitHub/monorepo/.venv/lib/python3.11/site-packages/starlette/applications.py", line 113, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/Users/joshpurtell/Docu

[ROLLOUT] seed=16, split=train, mode=eval
[ROLLOUT] policy_config keys: ['model', 'provider', 'temperature', 'max_completion_tokens', 'inference_url', 'trial_id', 'trace_correlation_id', 'version_id']
[ROLLOUT] RAW inference_url: https://synth-backend-dev-docker.onrender.com/api/interceptor/v1/pl-2606908964e941c8-i0000-t0004-s16?cid=trace_pl-2606908964e941c8-i0000-t0004-s16
[ROLLOUT] NORMALIZED inference_url: https://synth-backend-dev-docker.onrender.com/api/interceptor/v1/pl-2606908964e941c8-i0000-t0004-s16/chat/completions?cid=trace_pl-2606908964e941c8-i0000-t0004-s16
[ROLLOUT] POST to: https://synth-backend-dev-docker.onrender.com/api/interceptor/v1/pl-2606908964e941c8-i0000-t0004-s16/chat/completions?cid=trace_pl-2606908964e941c8-i0000-t0004-s16
[ROLLOUT] seed=0, split=train, mode=eval
[ROLLOUT] policy_config keys: ['model', 'provider', 'temperature', 'max_completion_tokens', 'inference_url', 'trial_id', 'trace_correlation_id', 'version_id']
[ROLLOUT] RAW inference_url: https://syn

In [None]:
# Step 9: Run Formal Eval Jobs (Baseline vs Optimized)

EVAL_SEEDS = list(range(100, 120))  # 20 held-out test samples

async def run_eval_job(task_app_url: str, task_app_api_key: str, seeds: list, mode: str,
                       max_poll_errors: int = 30) -> dict:
    """Run an eval job and wait for completion.

    Creates an eval job on the backend for the 'test' split, polls it every
    2 seconds until it reaches a terminal status, then fetches its results.

    Args:
        task_app_url: Public URL of the task app the backend should call.
        task_app_api_key: Key the backend presents to that task app.
        seeds: Seeds to evaluate.
        mode: Passed through to the backend as the job's 'mode' and used in log lines.
        max_poll_errors: Number of consecutive failed status polls (non-200 answer
            or transport error) tolerated before giving up.

    Returns:
        The JSON payload of the results endpoint on success. On any failure a
        dict of the form {'status': 'failed', 'error': <message>} is returned.
    """
    jobs_url = f'{SYNTH_API_BASE}/api/eval/jobs'
    auth_headers = {'Authorization': f'Bearer {API_KEY}'}
    # NOTE(review): 'cancelled' is assumed to be a possible terminal status, by analogy
    # with the prompt-learning jobs polled in run_gepa - confirm against the eval API.
    terminal_statuses = ('completed', 'failed', 'cancelled')
    progress_interval_s = 10
    poll_interval_s = 2

    # One client (and connection pool) is shared by creation, polling and result fetch.
    async with httpx.AsyncClient(timeout=30) as client:
        resp = await client.post(
            jobs_url,
            json={
                'task_app_url': task_app_url,
                'task_app_api_key': task_app_api_key,
                'env_name': 'banking77',
                'seeds': seeds,
                'env_config': {'split': 'test'},
                'policy': {'model': 'gpt-4.1-nano', 'provider': 'openai'},
                'mode': mode,
                'max_concurrent': 10,
            },
            headers=auth_headers,
        )
        # Any 2xx counts as a successful creation.
        if not resp.is_success:
            print(f'ERROR creating {mode} eval job: {resp.status_code} - {resp.text[:300]}')
            return {'status': 'failed', 'error': resp.text}

        job_id = resp.json()['job_id']
        print(f'  {mode} eval job: {job_id}')

        start = time.time()
        last_report = None       # elapsed seconds at the most recent progress line
        consecutive_errors = 0   # reset after every successful poll
        while True:
            try:
                resp = await client.get(f'{jobs_url}/{job_id}', headers=auth_headers)
                poll_error = None if resp.status_code == 200 else str(resp.status_code)
            except httpx.TransportError as exc:
                # Timeouts and connection hiccups are retried like a non-200 answer.
                poll_error = repr(exc)

            if poll_error is not None:
                consecutive_errors += 1
                print(f'  Error polling: {poll_error}')
                if consecutive_errors >= max_poll_errors:
                    # Bounded retries: a persistent failure must not spin forever.
                    return {
                        'status': 'failed',
                        'error': f'Polling failed {consecutive_errors} times in a row (last: {poll_error})',
                    }
                await asyncio.sleep(poll_interval_s)
                continue

            consecutive_errors = 0
            job = resp.json()
            status = job.get('status', '')
            elapsed = int(time.time() - start)

            if status in terminal_statuses:
                break

            # Time-based throttle: does not depend on a poll landing on an exact second.
            if last_report is None or elapsed - last_report >= progress_interval_s:
                results = job.get('results') or {}
                completed = results.get('completed', 0)
                total = results.get('total', len(seeds))
                print(f'    [{elapsed}s] {status} ({completed}/{total})')
                last_report = elapsed

            await asyncio.sleep(poll_interval_s)

        resp = await client.get(f'{jobs_url}/{job_id}/results', headers=auth_headers)
        if resp.status_code != 200:
            return {'status': 'failed', 'error': f'Failed to get results: {resp.status_code}'}
        return resp.json()

if job['status'] == 'succeeded':
    print("GEPA Job Succeeded!\n")
    
    # Get job details with error handling
    async with httpx.AsyncClient(timeout=30) as client:
        resp = await client.get(
            f'{SYNTH_API_BASE}/api/prompt-learning/online/jobs/{job["job_id"]}',
            headers={'Authorization': f'Bearer {API_KEY}'}
        )
        if resp.status_code != 200:
            print(f'ERROR getting job details: {resp.status_code} - {resp.text[:200]}')
            job_details = {}
        else:
            job_details = resp.json()
    
    best_snapshot = job_details.get('best_snapshot') or {}
    
    # Try to get optimized prompt from job_details or from the job itself
    baseline_score_info = best_snapshot.get('baseline_score_info') or {}
    best_score_info = best_snapshot.get('best_score_info') or {}
    baseline_train = baseline_score_info.get('mean_score') or baseline_score_info.get('score')
    optimized_train = best_score_info.get('mean_score') or best_score_info.get('score') or job.get('best_score')
    
    best_prompt = best_snapshot.get('best_prompt', {})
    best_prompt_messages = best_prompt.get('messages', [])
    
    # Extract optimized system prompt
    optimized_system = None
    for msg in best_prompt_messages:
        if msg.get('role') == 'system':
            optimized_system = msg.get('pattern') or msg.get('content')
            break
    
    # If we didn't get optimized_system from best_snapshot, use baseline (for eval comparison)
    if not optimized_system:
        print("NOTE: Could not extract optimized prompt from job details, using baseline for comparison")
        optimized_system = BASELINE_SYSTEM_PROMPT
    
    print('=' * 60)
    print('BASELINE SYSTEM PROMPT')
    print('=' * 60)
    print(BASELINE_SYSTEM_PROMPT)
    
    print('\n' + '=' * 60)
    print('OPTIMIZED SYSTEM PROMPT (from GEPA)')
    print('=' * 60)
    print(optimized_system[:800] + "..." if len(optimized_system) > 800 else optimized_system)
    
    print('\n' + '=' * 60)
    print('GEPA TRAINING RESULTS')
    print('=' * 60)
    if baseline_train:
        print(f"Baseline Train:  {baseline_train:.1%}")
    if optimized_train:
        print(f"Optimized Train: {optimized_train:.1%}")
    if baseline_train and optimized_train:
        print(f"Training Lift:   {optimized_train - baseline_train:+.1%}")
    
    print('\n' + '=' * 60)
    print(f'FORMAL EVAL JOBS (test split, seeds {EVAL_SEEDS[0]}-{EVAL_SEEDS[-1]})')
    print('=' * 60)
    
    # Start optimized task app on different port using SDK helpers
    print(f'\nStarting optimized task app on port {OPTIMIZED_TASK_APP_PORT}...')
    optimized_app = create_banking77_task_app(optimized_system, ENVIRONMENT_API_KEY)
    
    kill_port(OPTIMIZED_TASK_APP_PORT)
    run_server_background(optimized_app, OPTIMIZED_TASK_APP_PORT)
    await wait_for_health_check("localhost", OPTIMIZED_TASK_APP_PORT, ENVIRONMENT_API_KEY, timeout=30.0)
    print('Optimized task app ready!')
    
    # Get tunnel for optimized using SDK
    print('Provisioning Cloudflare tunnel for optimized...')
    optimized_tunnel = await rotate_tunnel(API_KEY, OPTIMIZED_TASK_APP_PORT, reason="optimized_notebook")
    OPTIMIZED_TUNNEL_HOSTNAME = optimized_tunnel['hostname']
    OPTIMIZED_TASK_APP_URL = f'https://{OPTIMIZED_TUNNEL_HOSTNAME}'
    print(f'Optimized tunnel: {OPTIMIZED_TUNNEL_HOSTNAME}')
    
    # Start cloudflared with tracking (auto-cleanup on exit)
    track_process(open_managed_tunnel(optimized_tunnel['tunnel_token']))
    await verify_tunnel_dns_resolution(OPTIMIZED_TASK_APP_URL, name="optimized", api_key=ENVIRONMENT_API_KEY)
    
    # Run baseline eval
    print('\nRunning BASELINE eval job...')
    baseline_results = await run_eval_job(
        task_app_url=BASELINE_TASK_APP_URL,
        task_app_api_key=ENVIRONMENT_API_KEY,
        seeds=EVAL_SEEDS,
        mode='baseline'
    )
    
    baseline_summary = baseline_results.get('summary', {})
    baseline_eval_score = baseline_summary.get('mean_score')
    print(f'  Baseline eval: {baseline_eval_score:.1%}' if baseline_eval_score is not None else '  Baseline eval: N/A')
    
    # Run optimized eval
    print('\nRunning OPTIMIZED eval job...')
    optimized_results = await run_eval_job(
        task_app_url=OPTIMIZED_TASK_APP_URL,
        task_app_api_key=ENVIRONMENT_API_KEY,
        seeds=EVAL_SEEDS,
        mode='optimized'
    )
    
    optimized_summary = optimized_results.get('summary', {})
    optimized_eval_score = optimized_summary.get('mean_score')
    print(f'  Optimized eval: {optimized_eval_score:.1%}' if optimized_eval_score is not None else '  Optimized eval: N/A')
    
    # Final comparison
    print('\n' + '=' * 60)
    print('FINAL COMPARISON')
    print('=' * 60)
    print(f"Training:")
    if baseline_train:
        print(f"  Baseline:  {baseline_train:.1%}")
    if optimized_train:
        print(f"  Optimized: {optimized_train:.1%}")
    if baseline_train and optimized_train:
        print(f"  Lift:      {optimized_train - baseline_train:+.1%}")
    
    print(f"\nEval (seeds {EVAL_SEEDS[0]}-{EVAL_SEEDS[-1]}, held-out):")
    if baseline_eval_score is not None:
        print(f"  Baseline:  {baseline_eval_score:.1%}")
    if optimized_eval_score is not None:
        print(f"  Optimized: {optimized_eval_score:.1%}")
    if baseline_eval_score is not None and optimized_eval_score is not None:
        eval_lift = optimized_eval_score - baseline_eval_score
        print(f"  Lift:      {eval_lift:+.1%}")
        
        if eval_lift > 0:
            print("\n>>> OPTIMIZATION GENERALIZES TO HELD-OUT DATA!")
        elif eval_lift == 0:
            print("\n=== Same performance on held-out data")
        else:
            print("\n<<< Baseline better on held-out (possible overfitting)")
else:
    print(f"Job did not succeed: {job.get('status')}")

In [None]:
# Step 10: Cleanup
from synth_ai.sdk.tunnels import cleanup_all

print('Cleaning up cloudflared processes...')
# NOTE(review): presumably terminates the cloudflared processes registered through
# track_process() in Steps 7 and 9; whether it also stops the background task-app
# servers is not visible from this notebook - confirm against the SDK.
cleanup_all()
print('Demo complete!')