In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI End-to-End Evaluation for A2A Reimbursement Agent (in memory server)

This notebook demonstrates how to evaluate a Reimbursement A2A Agent using Vertex AI Evaluation services.

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/a2aproject/a2a-samples/blob/main/notebooks/a2a_evaluation.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2Fa2aproject%2Fa2a-samples%2Fmain%2Fnotebooks%2Fa2a_evaluation.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/a2aproject/a2a-samples/main/notebooks/a2a_evaluation.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/a2aproject/a2a-samples/blob/main/notebooks/a2a_evaluation.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

| Author |
| --- |
| [Junjie Bu](https://github.com/mindpower) |

## Prerequisites

-  **Google Cloud Project:** You need a Google Cloud Project with the Vertex AI API enabled.
- **Agent Logic:** The core logic for the Reimbursement Agent (the `ReimbursementAgent` and `ReimbursementAgentExecutor` classes) is defined later in this notebook. The executor implements the A2A `AgentExecutor` interface, i.e. `async def execute(self, context: RequestContext, event_queue: EventQueue) -> None`.

## 1. Setup and Installation

First, install the required dependencies:

In [None]:
%pip install --upgrade --quiet google-cloud-aiplatform httpx "a2a-sdk==0.2.16" "google-adk==1.2.0"

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# The authentication step only applies when the `google.colab` module has been
# loaded into this kernel; in any other environment this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Import libraries

In [None]:
from collections.abc import AsyncIterable
import json
import logging
import os

# General
import random
import string
from typing import Any

from IPython.display import HTML, Markdown, display
from a2a.server.agent_execution import AgentExecutor, RequestContext
from a2a.server.apps.starlette_app import A2AStarletteApplication
from a2a.server.events import EventQueue
from a2a.server.request_handlers.default_request_handler import DefaultRequestHandler
from a2a.server.tasks import InMemoryTaskStore, TaskUpdater
from a2a.types import (
    AgentCapabilities,
    AgentCard,
    AgentSkill,
    DataPart,
    Part,
    Task,
    TaskState,
    TextPart,
)
from a2a.utils import (
    new_agent_parts_message,
    new_agent_text_message,
    new_task,
)
from a2a.utils.errors import MethodNotImplementedError
from google.adk.agents.llm_agent import LlmAgent
from google.adk.artifacts import InMemoryArtifactService

# Build agent with adk
from google.adk.events import Event
from google.adk.memory.in_memory_memory_service import InMemoryMemoryService
from google.adk.runners import Runner
from google.adk.sessions import InMemorySessionService
from google.adk.tools.tool_context import ToolContext

# Evaluate agent
from google.cloud import aiplatform
from google.genai import types
import pandas as pd
import plotly.graph_objects as go

# Starlette Test Client
from starlette.testclient import TestClient
from vertexai.preview.evaluation import EvalTask

## 2. Configuration

In [None]:
# --- Google Cloud Configuration ---
PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
LOCATION = "us-central1"  # @param {type:"string"} Fill in your Google Cloud region

# The template placeholder is a non-empty string, so a bare truthiness check
# would accept it. Treat it as "unset": fall back to the environment first and
# fail fast if no real project id is available.
_PROJECT_ID_PLACEHOLDER = "[your-project-id]"
if not PROJECT_ID or PROJECT_ID == _PROJECT_ID_PLACEHOLDER:
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
if not PROJECT_ID or PROJECT_ID == _PROJECT_ID_PLACEHOLDER:
    raise ValueError("Please set your PROJECT_ID.")

# Export the settings as environment variables for the libraries used later in
# the notebook.
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID
os.environ["GOOGLE_CLOUD_LOCATION"] = LOCATION
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "True"

aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [None]:
# Experiment name passed to EvalTask when the evaluation is run.
EXPERIMENT_NAME = "evaluate-a2a"  # @param {type:"string"}
# Cloud Storage bucket; BUCKET_URI is used as the prefix for evaluation output.
BUCKET_NAME = "[your-bucket-name]"  # @param {type: "string", placeholder: "[your-bucket-name]", isTemplate: true}
BUCKET_URI = f"gs://{BUCKET_NAME}"

## Defining Reimbursement Agent

In [None]:
# Local cache of created request_ids for demo purposes.
# create_request_form() adds to it and reimburse() checks membership. It lives
# only in kernel memory and is emptied whenever this cell is re-run.
request_ids: set[str] = set()


def create_request_form(
    date: str | None = None,
    amount: str | None = None,
    purpose: str | None = None,
) -> dict[str, Any]:
    """Create a request form for the employee to fill out.

    Args:
        date (str): The date of the request. Can be an empty string.
        amount (str): The requested amount. Can be an empty string.
        purpose (str): The purpose of the request. Can be an empty string.

    Returns:
        dict[str, Any]: A dictionary containing the request form data.
    """
    request_id = "request_id_" + str(random.randint(1000000, 9999999))
    request_ids.add(request_id)
    return {
        "request_id": request_id,
        "date": date if date else "<transaction date>",
        "amount": amount if amount else "<transaction dollar amount>",
        "purpose": (
            purpose
            if purpose
            else "<business justification/purpose of the transaction>"
        ),
    }


def return_form(
    form_request: dict[str, Any],
    tool_context: ToolContext,
    instructions: str | None = None,
) -> dict[str, Any]:
    """Returns a structured json object indicating a form to complete.

    Args:
        form_request (dict[str, Any]): The request form data.
        tool_context (ToolContext): The context in which the tool operates.
        instructions (str): Instructions for processing the form. Can be an empty string.

    Returns:
        dict[str, Any]: A JSON dictionary for the form response.
    """
    # NOTE(review): docstring and annotations are kept verbatim because this
    # function is registered as an agent tool and its signature/docstring are
    # presumably used for the tool declaration. Despite the `dict` annotation,
    # the value actually returned is a JSON-encoded *string*;
    # ReimbursementAgentExecutor.execute decodes it with json.loads.

    # The form data may arrive as a JSON string instead of a dict.
    form_data = json.loads(form_request) if isinstance(form_request, str) else form_request

    tool_context.actions.skip_summarization = True
    tool_context.actions.escalate = True

    # JSON-schema style description of each form field.
    field_schemas = {
        "date": {
            "type": "string",
            "format": "date",
            "description": "Date of expense",
            "title": "Date",
        },
        "amount": {
            "type": "string",
            "format": "number",
            "description": "Amount of expense",
            "title": "Amount",
        },
        "purpose": {
            "type": "string",
            "description": "Purpose of expense",
            "title": "Purpose",
        },
        "request_id": {
            "type": "string",
            "description": "Request id",
            "title": "Request ID",
        },
    }
    form_schema = {
        "type": "object",
        "properties": field_schemas,
        # Every key present in the incoming form is marked as required.
        "required": list(form_data.keys()),
    }
    payload = {
        "type": "form",
        "form": form_schema,
        "form_data": form_data,
        "instructions": instructions,
    }
    return json.dumps(payload)


def reimburse(request_id: str) -> dict[str, Any]:
    """Reimburse the amount of money to the employee for a given request_id."""
    # Only ids previously recorded in `request_ids` are accepted.
    if request_id in request_ids:
        status = "approved"
    else:
        status = "Error: Invalid request_id."
    return {"request_id": request_id, "status": status}


class ReimbursementAgent:
    """An agent that handles reimbursement requests.

    Wraps an ADK ``LlmAgent`` together with a ``Runner`` backed by in-memory
    artifact, session and memory services, and exposes the conversation as an
    async stream of progress/result dictionaries.
    """

    SUPPORTED_CONTENT_TYPES = ["text", "text/plain"]

    def __init__(self):
        self._agent = self._build_agent()
        # Fixed user id used for every session created by this agent.
        self._user_id = "remote_agent"
        self._runner = Runner(
            app_name=self._agent.name,
            agent=self._agent,
            artifact_service=InMemoryArtifactService(),
            session_service=InMemorySessionService(),
            memory_service=InMemoryMemoryService(),
        )

    def get_processing_message(self) -> str:
        """Return the status text reported while the agent is still working."""
        return "Processing the reimbursement request..."

    def _build_agent(self) -> LlmAgent:
        """Builds the LLM agent for the reimbursement agent."""
        return LlmAgent(
            model="gemini-2.0-flash-001",
            name="reimbursement_agent",
            description=(
                "This agent handles the reimbursement process for the employees"
                " given the amount and purpose of the reimbursement."
            ),
            # NOTE(review): the prompt below mentions a "request_form method",
            # but the registered tools are create_request_form, reimburse and
            # return_form. Confirm which tool is meant before editing the prompt.
            instruction="""
    You are an agent who handles the reimbursement process for employees.

    When you receive a reimbursement request, you should first create a new request form using create_request_form(). Only provide default values if they are provided by the user, otherwise use an empty string as the default value.
      1. 'Date': the date of the transaction.
      2. 'Amount': the dollar amount of the transaction.
      3. 'Business Justification/Purpose': the reason for the reimbursement.

    Once you created the form, you should return the result of calling return_form with the form data from the create_request_form call.

    Once you received the filled-out form back from the user, you should then check the form contains all required information:
      1. 'Date': the date of the transaction.
      2. 'Amount': the value of the amount of the reimbursement being requested.
      3. 'Business Justification/Purpose': the item/object/artifact of the reimbursement.

    If you don't have all of the information, you should reject the request directly by calling the request_form method, providing the missing fields.


    For valid reimbursement requests, you can then use reimburse() to reimburse the employee.
      * In your response, you should include the request_id and the status of the reimbursement request.

    """,
            tools=[
                create_request_form,
                reimburse,
                return_form,
            ],
        )

    async def stream(self, query, session_id) -> AsyncIterable[dict[str, Any]]:
        """Run the agent on ``query`` and yield progress/result dictionaries.

        Args:
            query: The user's text message.
            session_id: Id of the session to continue; a new session with this
                id is created when none exists yet.

        Yields:
            ``{"is_task_complete": False, "updates": <str>}`` for every
            non-final event, and ``{"is_task_complete": True, "content": ...}``
            for a final response. ``content`` is the joined text of the event,
            the dumped function response (a dict) when the event carries one
            instead of text, or ``""`` when it carries neither.
        """
        session = await self._runner.session_service.get_session(
            app_name=self._agent.name,
            user_id=self._user_id,
            session_id=session_id,
        )
        content = types.Content(role="user", parts=[types.Part.from_text(text=query)])
        if session is None:
            session = await self._runner.session_service.create_session(
                app_name=self._agent.name,
                user_id=self._user_id,
                state={},
                session_id=session_id,
            )
        async for event in self._runner.run_async(
            user_id=self._user_id, session_id=session.id, new_message=content
        ):
            if not event.is_final_response():
                yield {
                    "is_task_complete": False,
                    "updates": self.get_processing_message(),
                }
                continue

            response = ""
            parts = event.content.parts if event.content and event.content.parts else []
            if parts and parts[0].text:
                response = "\n".join(p.text for p in parts if p.text)
            else:
                # Pick the first part that actually carries a function
                # response; the leading part is not guaranteed to be that one.
                function_responses = [
                    p.function_response for p in parts if p.function_response
                ]
                if function_responses:
                    response = function_responses[0].model_dump()
            yield {
                "is_task_complete": True,
                "content": response,
            }

## Implement `AgentExecutor`

In [None]:
class ReimbursementAgentExecutor(AgentExecutor):
    """Reimbursement AgentExecutor Example.

    Bridges the A2A server and ``ReimbursementAgent``: each item streamed by
    the agent is translated into task status updates or artifacts.
    """

    def __init__(self):
        self.agent = ReimbursementAgent()

    async def execute(
        self,
        context: RequestContext,
        event_queue: EventQueue,
    ) -> None:
        """Run the agent for the incoming request and publish task events.

        Args:
            context: Request context providing the user input and, if any, the
                current task.
            event_queue: Queue on which task events are published.
        """
        query = context.get_user_input()
        task = context.current_task

        # This agent always produces Task objects. If this request does
        # not have current task, create a new one and use it.
        if not task:
            task = new_task(context.message)
            # EventQueue.enqueue_event and the TaskUpdater methods are
            # coroutines in the pinned a2a-sdk; without `await` no event is
            # ever emitted.
            await event_queue.enqueue_event(task)
        updater = TaskUpdater(event_queue, task.id, task.contextId)
        # invoke the underlying agent, using streaming results. The streams
        # now are update events.
        async for item in self.agent.stream(query, task.contextId):
            if not item["is_task_complete"]:
                await updater.update_status(
                    TaskState.working,
                    new_agent_text_message(item["updates"], task.contextId, task.id),
                )
                continue

            content = item["content"]
            # If the response is a dictionary, assume its a form
            if isinstance(content, dict):
                # Verify it is a valid form
                if "response" in content and "result" in content["response"]:
                    # The tool result is a JSON string; decode it and ask the
                    # user to complete the form.
                    data = json.loads(content["response"]["result"])
                    await updater.update_status(
                        TaskState.input_required,
                        new_agent_parts_message(
                            [Part(root=DataPart(data=data))],
                            task.contextId,
                            task.id,
                        ),
                        final=True,
                    )
                    continue
                await updater.update_status(
                    TaskState.failed,
                    new_agent_text_message(
                        "Reaching an unexpected state",
                        task.contextId,
                        task.id,
                    ),
                    final=True,
                )
                break

            # Plain-text answer: publish it as an artifact and finish the task.
            await updater.add_artifact(
                [Part(root=TextPart(text=content))], name="form"
            )
            await updater.complete()
            break

    async def cancel(
        self, request: RequestContext, event_queue: EventQueue
    ) -> Task | None:
        """Cancellation is not supported; always raises.

        Raises:
            MethodNotImplementedError: on every call.
        """
        raise MethodNotImplementedError(
            "ReimbursementAgentExecutor does not support cancel operation."
        )

## Define A2A Key Objects

In [None]:
# Capabilities and skill advertised on the agent card.
capabilities = AgentCapabilities(streaming=True)
skill = AgentSkill(
    id="process_reimbursement",
    name="Process Reimbursement Tool",
    description="Helps with the reimbursement process for users given the amount and purpose of the reimbursement.",
    tags=["reimbursement"],
    examples=["Can you reimburse me $20 for my lunch with the clients?"],
)
# Agent card describing the reimbursement agent.
agent_card = AgentCard(
    name="Reimbursement Agent",
    description="This agent handles the reimbursement process for the employees given the amount and purpose of the reimbursement.",
    url="http://localhost/agent",  # Placeholder, not used by TestClient
    # url=f'http://{host}:{port}/',
    version="1.0.0",
    default_input_modes=ReimbursementAgent.SUPPORTED_CONTENT_TYPES,
    default_output_modes=ReimbursementAgent.SUPPORTED_CONTENT_TYPES,
    capabilities=capabilities,
    skills=[skill],
)
# Request handler wiring the executor to an in-memory task store; tasks are
# therefore not persisted beyond this kernel session.
request_handler = DefaultRequestHandler(
    agent_executor=ReimbursementAgentExecutor(),
    task_store=InMemoryTaskStore(),
)
server = A2AStarletteApplication(agent_card=agent_card, http_handler=request_handler)

# Build the Starlette ASGI app
# This `starlette_app` can be served by Uvicorn or used with TestClient
expense_starlette_app = server.build()

In [None]:
# Basic logging setup (helpful for seeing what the handler does)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# TestClient should be used as a context manager or closed explicitly
with TestClient(expense_starlette_app) as client:
    logger.info("\n--- Test 1: Get Agent Card ---")
    response = client.get("/.well-known/agent.json")
    assert response.status_code == 200
    agent_card_data = response.json()
    print(f"SUCCESS: Agent Card received: {agent_card_data['name']}")
    print("A2AClient initialized.")

    print("\n--- Quick Test : Non-streaming RPC - message/send ---")
    message_id_send = "colab-msg-007"
    rpc_request_send_msg = {
        "jsonrpc": "2.0",
        "id": "colab-req-send-msg-1",
        "method": "message/send",
        "params": {
            "message": {
                "role": "user",
                "parts": [
                    {
                        "kind": "text",
                        "text": "Hello Agent, Please reimburse me $20 for my lunch with the clients on 06/01/2025?",
                    }
                ],  # good one
                "messageId": message_id_send,
                "kind": "message",
                "contextId": "colab-session-xyz",
            }
        },
    }
    response = client.post("/", json=rpc_request_send_msg)
    assert response.status_code == 200
    rpc_response_send_msg = response.json()
    print(f"message/send response: {json.dumps(rpc_response_send_msg, indent=2)}")
    print(f"SUCCESS: message/send for '{message_id_send}' passed.")

## Define Eval helper functions

Define a set of helper functions to parse the agent output and display the tutorial results.

In [None]:
# @title helper functions


def get_id(length: int = 8) -> str:
    """Return a random identifier made of lowercase letters and digits.

    Args:
        length: Number of characters to generate (default 8).

    Returns:
        A string of ``length`` randomly chosen characters.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


def parse_a2a_output_to_dictionary(rpc_response_send_msg: dict) -> dict[str, Any]:
    """Extract the final response from an A2A ``message/send`` JSON-RPC reply.

    The response text is the last ``kind == "text"`` part found in the result's
    artifacts. When no (non-empty) text is found, the task state from
    ``result.status.state`` is used instead, or ``""`` if that is missing too
    (for example when the reply has no ``result``).

    Args:
        rpc_response_send_msg: The decoded JSON-RPC response body.

    Returns:
        A dict with ``"response"`` (str) and ``"predicted_trajectory"``. The
        trajectory is always the JSON string ``"[]"`` because this function
        does not extract tool calls from the reply.
    """
    # Missing or null sections are treated as empty instead of raising.
    result = rpc_response_send_msg.get("result")
    if not isinstance(result, dict):
        result = {}

    final_response = ""
    for artifact in result.get("artifacts") or []:
        if not artifact:
            continue
        for part in artifact.get("parts") or []:
            if part.get("kind") == "text" and "text" in part:
                final_response = part["text"]

    if final_response == "":
        # No text artifact: fall back to the task state.
        status = result.get("status") or {}
        final_response = status.get("state", "")

    return {
        "response": str(final_response),
        "predicted_trajectory": json.dumps([]),
    }


def parse_adk_output_to_dictionary(events: list[Event]) -> dict[str, Any]:
    """Summarize ADK events as a final response plus the tool-call trajectory.

    Args:
        events: Events to scan; events without content or parts are skipped.

    Returns:
        A dict with ``"response"`` (the stripped text of the last text part
        whose content role is ``"model"``, or ``""``) and
        ``"predicted_trajectory"`` (a JSON string listing each distinct
        function call as ``{"tool_name", "tool_input"}`` in order of first
        appearance).
    """
    last_model_text = ""
    tool_calls: list[dict[str, Any]] = []

    for event in events:
        content = event.content
        if not (content and content.parts):
            continue

        from_model = content.role == "model"
        for part in content.parts:
            call = part.function_call
            if call:
                entry = {"tool_name": call.name, "tool_input": dict(call.args)}
                # Identical calls are recorded only once.
                if entry not in tool_calls:
                    tool_calls.append(entry)

            # A later model text replaces an earlier one.
            if from_model and part.text:
                last_model_text = part.text.strip()

    return {
        "response": str(last_model_text),
        "predicted_trajectory": json.dumps(tool_calls),
    }


def format_output_as_markdown(output: dict) -> str:
    """Convert the output dictionary to a formatted markdown string.

    Args:
        output: Dict with ``"response"`` and ``"predicted_trajectory"``. The
            trajectory may be a JSON string or an already-decoded list of
            ``{"tool_name", "tool_input"}`` dicts. ``output`` is not modified.

    Returns:
        Markdown with an "AI Response" section and, when the trajectory
        contains at least one call, a "Function Calls" section.
    """
    lines = ["### AI Response", f"{output['response']}", ""]

    # Decode into a local variable so the caller's dict stays untouched and
    # the function can be called repeatedly on the same output.
    trajectory = output["predicted_trajectory"]
    if isinstance(trajectory, str):
        trajectory = json.loads(trajectory) if trajectory else []

    if trajectory:
        lines.append("### Function Calls")
        for call in trajectory:
            lines.append(f"- **Function**: `{call['tool_name']}`")
            lines.append("  - **Arguments**:")
            for key, value in call["tool_input"].items():
                lines.append(f"    - `{key}`: `{value}`")

    return "\n".join(lines) + "\n"


def display_eval_report(eval_result: pd.DataFrame) -> None:
    """Render the summary and row-wise metrics of an evaluation result.

    Args:
        eval_result: Object exposing ``summary_metrics`` (a mapping) and
            ``metrics_table``, such as the value returned by
            ``EvalTask.evaluate`` in this notebook.
    """
    summary = eval_result.summary_metrics
    # One row, one column per summary metric.
    summary_table = pd.DataFrame.from_dict(summary, orient="index").T
    display(Markdown("### Summary Metrics"))
    display(summary_table)

    display(Markdown("### Row-wise Metrics"))
    row_table = eval_result.metrics_table
    display(row_table)


def display_drilldown(row: pd.Series) -> None:
    """Show predicted vs. reference tool calls of one row side by side.

    Nothing is displayed unless both ``row["predicted_trajectory"]`` and
    ``row["reference_trajectory"]`` are lists. Steps are paired positionally;
    for each pair the tool names are rendered as HTML and, when both steps
    have a dict ``tool_input``, every predicted input is printed next to the
    reference value (or ``"N/A"`` when the reference lacks that key).
    """
    if not isinstance(row["predicted_trajectory"], list):
        return
    if not isinstance(row["reference_trajectory"], list):
        return

    box_style = "white-space: pre-wrap; width: 800px; overflow-x: auto;"
    step_pairs = zip(row["predicted_trajectory"], row["reference_trajectory"])

    for predicted, reference in step_pairs:
        names = (predicted["tool_name"], reference["tool_name"])
        display(
            HTML(f"<h3>Tool Names:</h3><div style='{box_style}'>{names}</div>")
        )

        # Inputs are only compared when both sides provide a dict.
        if not isinstance(predicted.get("tool_input"), dict):
            continue
        if not isinstance(reference.get("tool_input"), dict):
            continue

        reference_input = reference["tool_input"]
        for key, predicted_value in predicted["tool_input"].items():
            print("Tool Input Key: ", key)
            reference_value = reference_input[key] if key in reference_input else "N/A"
            print("Tool Values: ", predicted_value, reference_value)
        print("\n")
    display(HTML("<hr>"))


def display_dataframe_rows(
    df: pd.DataFrame,
    columns: list[str] | None = None,
    num_rows: int = 3,
    display_drilldown: bool = False,
) -> None:
    """Displays a subset of rows from a DataFrame, optionally including a drill-down view."""

    if columns:
        df = df[columns]

    base_style = "font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;"
    header_style = base_style + "font-weight: bold;"

    for _, row in df.head(num_rows).iterrows():
        for column in df.columns:
            display(
                HTML(
                    f"<span style='{header_style}'>{column.replace('_', ' ').title()}: </span>"
                )
            )
            display(HTML(f"<span style='{base_style}'>{row[column]}</span><br>"))

        display(HTML("<hr>"))

        if (
            display_drilldown
            and "predicted_trajectory" in df.columns
            and "reference_trajectory" in df.columns
        ):
            display_drilldown(row)


def plot_bar_plot(
    eval_result: pd.DataFrame, title: str, metrics: list[str] = None
) -> None:
    fig = go.Figure()
    data = []

    summary_metrics = eval_result.summary_metrics
    if metrics:
        summary_metrics = {
            k: summary_metrics[k]
            for k, v in summary_metrics.items()
            if any(selected_metric in k for selected_metric in metrics)
        }

    data.append(
        go.Bar(
            x=list(summary_metrics.keys()),
            y=list(summary_metrics.values()),
            name=title,
        )
    )

    fig = go.Figure(data=data)

    # Change the bar mode
    fig.update_layout(barmode="group")
    fig.show()


def display_radar_plot(eval_results, title: str, metrics=None):
    """Plot the summary metrics of an evaluation result on a radar chart.

    Args:
        eval_results: Object exposing a ``summary_metrics`` mapping of metric
            name to value.
        title: Chart title; also used as the trace name.
        metrics: Optional list of substrings. When given, only summary metrics
            whose name contains at least one of them are plotted.
    """
    scores = eval_results.summary_metrics
    if metrics:
        scores = {
            name: scores[name]
            for name in scores
            if any(wanted in name for wanted in metrics)
        }

    axis_names = list(scores.keys())
    values = list(scores.values())
    # The radial axis spans exactly the plotted value range.
    radial_range = [min(values), max(values)]

    fig = go.Figure()
    fig.add_trace(
        go.Scatterpolar(r=values, theta=axis_names, fill="toself", name=title)
    )
    fig.update_layout(
        title=title,
        polar={"radialaxis": {"visible": True, "range": radial_range}},
        showlegend=True,
    )
    fig.show()

### Assemble the A2A agents

The Vertex AI Gen AI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).

In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parses the agent's output to extract the response and the called tools.

In [None]:
def a2a_parsed_outcome(query, context_id=None):
    """Send ``query`` to the in-memory A2A server and parse the reply.

    Args:
        query: The user prompt to send as a text message.
        context_id: Optional A2A context id. The executor uses the context id
            as the agent session id, so by default a fresh id is generated for
            every call; otherwise independent prompts (for example the rows of
            an evaluation dataset) would share one conversation history. Pass
            an explicit id to continue an existing conversation.

    Returns:
        The dictionary produced by ``parse_a2a_output_to_dictionary``.
    """
    if context_id is None:
        context_id = f"colab-session-{get_id()}"

    # TestClient should be used as a context manager or closed explicitly
    with TestClient(expense_starlette_app) as client:
        print("\n--- Get Agent Card ---")
        response = client.get("/.well-known/agent.json")
        assert response.status_code == 200
        agent_card_data = response.json()
        print(f"--- SUCCESS: Agent Card received: {agent_card_data['name']} ---")
        print("--- A2AClient initialized. ---")
        print(f"Query: {query}")

        message_id_send = f"colab-msg-{get_id()}"
        # JSON-RPC 2.0 envelope for a non-streaming `message/send` call.
        rpc_request_send_msg = {
            "jsonrpc": "2.0",
            "id": f"colab-req-send-msg-{get_id()}",
            "method": "message/send",
            "params": {
                "message": {
                    "role": "user",
                    "parts": [{"kind": "text", "text": query}],
                    "messageId": message_id_send,
                    "kind": "message",
                    "contextId": context_id,
                }
            },
        }
        response = client.post("/", json=rpc_request_send_msg)
        assert response.status_code == 200
        rpc_response_send_msg = response.json()
        print(f"SUCCESS: message/send for '{message_id_send}' Finished")
        return parse_a2a_output_to_dictionary(rpc_response_send_msg)

### Test the A2A agent

Query your A2A agent with some quick examples.

In [None]:
# Query the agent with a few sample prompts (the first one is unrelated to
# reimbursements) and render each parsed outcome as markdown.
sample_queries = [
    "Get product details for shoes",
    "Hello Agent, Please reimburse me $20 for my lunch with the clients on 06/01/2025?",
    "Hello Agent, Please reimburse me $311 for my flights from SFO to SEA on 06/11/2025?",
    "Hello Agent, Please reimburse me $50 for my lunch with the clients on Jan 2nd,2024?",
]

for sample_query in sample_queries:
    response = a2a_parsed_outcome(query=sample_query)
    display(Markdown(format_output_as_markdown(response)))

### Prepare Agent Evaluation dataset

To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent.  

This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the reference trajectory, i.e. the sequence of tool calls you expect the agent to make for each prompt.

> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).

Below is an example dataset for the reimbursement agent, containing the user prompts and a reference trajectory for each prompt.

In [None]:
# @title Define eval datasets
# The reference trajectories are left empty in this example: every prompt is
# paired with an empty list.
eval_data_a2a = {
    "prompt": [
        "Get product details for shoes",
        "Hello Agent, Please reimburse me $20 for my lunch with the clients on 06/01/2025?",
        "Hello Agent, Please reimburse me $20 for my lunch with the clients",
        "Please reimburse me $312 for my meal with the clients on 06/05/2025?",
        "Please reimburse me $1234 for my flight to Seattle on 06/11/2025?",
    ],
    # One (empty) trajectory per prompt, in the same order as the prompts.
    "reference_trajectory": [
        [],
        [],
        [],
        [],
        [],
    ],
}

eval_sample_dataset = pd.DataFrame(eval_data_a2a)

In [None]:
# Show the dataset rows; num_rows=30 exceeds the five rows defined above, so
# all of them are displayed.
display_dataframe_rows(eval_sample_dataset, num_rows=30)

### Evaluate final response

Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation.

#### Set response metrics

After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and use existing or custom model-based metrics to determine the quality of the final response.

Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.

In [None]:
# Metrics passed to EvalTask for scoring the agent's final responses.
response_metrics = ["safety", "coherence"]

In [None]:
# A random suffix gives each execution of this cell its own run name.
EXPERIMENT_RUN = f"response-{get_id()}"

response_eval_task = EvalTask(
    dataset=eval_sample_dataset,
    metrics=response_metrics,
    experiment=EXPERIMENT_NAME,
    output_uri_prefix=BUCKET_URI + "/response-metric-eval",
)

# The A2A helper is passed as the runnable; presumably it is invoked for the
# prompts in the dataset (confirm against the EvalTask documentation).
response_eval_result = response_eval_task.evaluate(
    runnable=a2a_parsed_outcome, experiment_run_name=EXPERIMENT_RUN
)

display_eval_report(response_eval_result)

#### Visualize evaluation results

Print a sample of the evaluation results.

In [None]:
# Show up to five rows of the row-wise metrics table.
display_dataframe_rows(response_eval_result.metrics_table, num_rows=5)