# Tutorial: Subagent Playground - Worker, Verifier, Generator

Audience:
- Researchers isolating non-planner components to diagnose orchestration failures.

Learning goals:
- Build deterministic test harnesses for Worker/Verifier/Generator.
- Separate planner errors from downstream component errors.


## Outline

1. Create mock tools and worker
2. Build verifier policy simulator
3. Build generator summarizer
4. Run component-level regression tests


In [None]:
from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, Any, List

# Canned answers standing in for a live search backend; keys are lowercase queries.
MOCK_WEB = {
    "capital of france": "Paris",
    "capital of japan": "Tokyo",
}


def mock_google_search(query: str) -> Dict[str, Any]:
    """Look up ``query`` in ``MOCK_WEB`` after lowercasing it.

    Returns a dict that echoes the original ``query`` and carries a ``result``
    entry: the canned answer, or ``"UNKNOWN"`` when the lowercased query has
    no entry in the table.
    """
    normalized = query.lower()
    answer = MOCK_WEB[normalized] if normalized in MOCK_WEB else "UNKNOWN"
    return {"query": query, "result": answer}


In [None]:
@dataclass
class WorkerOutput:
    """Record of one worker tool invocation, as built by ``worker_execute``."""

    # Human-readable description of the tool call that was made.
    command: str
    # Value produced by the tool; its type depends on which tool ran.
    result: Any

def worker_execute(tool_name: str, payload: Dict[str, Any]) -> WorkerOutput:
    """Run the mock tool called ``tool_name`` with ``payload`` and wrap the outcome.

    Supported tools:
      * ``"Google_Search_Tool"`` -- requires ``payload["query"]``; the result
        is whatever ``mock_google_search`` returns for that query.
      * ``"Base_Generator_Tool"`` -- reads the optional ``payload["text"]``
        (default ``""``); the result is that text truncated to 120 characters.

    Raises:
        ValueError: if ``tool_name`` is neither of the tools above.
        KeyError: if the search tool is requested without a ``"query"`` entry.
    """
    if tool_name == "Base_Generator_Tool":
        truncated = payload.get("text", "")[:120]
        return WorkerOutput(command="tool.execute(text=...)", result=truncated)

    # Anything that is not the search tool at this point is unsupported.
    if tool_name != "Google_Search_Tool":
        raise ValueError(f"Unsupported tool in this lab: {tool_name}")

    search_query = payload["query"]
    search_hit = mock_google_search(search_query)
    return WorkerOutput(
        command=f"tool.execute(query={search_query!r})",
        result=search_hit,
    )


In [None]:
def verifier_decision(memory_actions: List[Dict[str, Any]], question: str) -> str:
    """Decide whether the latest memory entry already answers ``question``.

    Simple stand-in policy for this toy example: return ``"STOP"`` when the
    question mentions a known country and the latest result contains that
    country's capital; otherwise return ``"CONTINUE"``.

    Args:
        memory_actions: Action records; only the last record's ``"result"``
            entry is inspected.
        question: The user question, matched case-insensitively.

    Returns:
        ``"STOP"`` or ``"CONTINUE"``.

    Raises:
        KeyError: if the last record has no ``"result"`` entry.
    """
    if not memory_actions:
        return "CONTINUE"

    latest = memory_actions[-1]["result"]
    if isinstance(latest, dict):
        # Search results echo the caller's query next to the answer. Matching
        # on that echo would report STOP whenever the query itself contains
        # the keyword, even if the lookup found nothing, so only the remaining
        # fields count as evidence.
        evidence = " ".join(
            str(value) for key, value in latest.items() if key != "query"
        )
    else:
        evidence = str(latest)
    evidence = evidence.lower()
    asked = question.lower()

    # (topic mentioned in the question, keyword expected in the evidence)
    expected_keywords = (("france", "paris"), ("japan", "tokyo"))
    for topic, keyword in expected_keywords:
        if topic in asked and keyword in evidence:
            return "STOP"

    return "CONTINUE"

def generator_direct(question: str, memory_actions: List[Dict[str, Any]]) -> str:
    """Draft a final answer from the most recent memory entry.

    With no recorded actions a fixed "not enough evidence" message is
    returned; otherwise the question is echoed, followed by the last record's
    ``"result"`` entry formatted into the draft.
    """
    if memory_actions:
        evidence = memory_actions[-1]["result"]
        return f"Question: {question}\nAnswer draft: {evidence}"
    return "I do not have enough evidence."


## Step 1 - Run a component-level integration test

This mimics a minimal non-planner slice: Worker -> Verifier -> Generator.


In [None]:
# Minimal non-planner slice: run the worker once, record its output in memory,
# then let the verifier and the generator act on that memory.
question = "What is the capital of France?"
memory_actions: List[Dict[str, Any]] = []

worker_out = worker_execute("Google_Search_Tool", {"query": "capital of france"})
memory_actions.append({"tool": "Google_Search_Tool", "result": worker_out.result})

decision = verifier_decision(memory_actions, question)
final_answer = generator_direct(question, memory_actions)

print("Worker command:", worker_out.command)
print("Verifier decision:", decision)
# The line break must be written as an escape sequence: a raw newline inside a
# single-quoted string literal is a syntax error.
print("Generator output:\n", final_answer)


## Step 2 - Regression tests

Build a test set to isolate failures by component.


In [None]:
# Each case carries the user question ("q"), the search query handed to the
# worker ("query"), and the verifier decision expected for that pair.
test_cases = [
    {"q": "What is the capital of France?", "query": "capital of france", "expected": "STOP"},
    {"q": "What is the capital of Japan?", "query": "capital of japan", "expected": "STOP"},
    {"q": "What is the capital of Spain?", "query": "capital of spain", "expected": "CONTINUE"},
]

ok = 0
for item in test_cases:
    # Fresh single-entry memory per case so the cases cannot influence each other.
    out = worker_execute("Google_Search_Tool", {"query": item["query"]})
    mem = [{"tool": "Google_Search_Tool", "result": out.result}]
    pred = verifier_decision(mem, item["q"])
    passed = pred == item["expected"]
    if passed:
        ok += 1
    print(item["q"], "->", pred, "| expected", item["expected"], "| pass=", passed)

print(f"pass rate: {ok}/{len(test_cases)}")


## Exercises

1. Replace `verifier_decision` with a stricter policy and compare its false-stop rate against the original policy.
2. Add timeout/error simulation in `worker_execute`.
3. Add generator formatting constraints and evaluate answer faithfulness.
