feat: BaseEvalService declaration and surrounding data models #1353

Merged
merged 1 commit on Jul 1, 2025
157 changes: 157 additions & 0 deletions src/google/adk/evaluation/base_eval_service.py
@@ -0,0 +1,157 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from abc import ABC
from abc import abstractmethod
from typing import AsyncGenerator
from typing import Optional

from pydantic import alias_generators
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import Field

from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .eval_result import EvalCaseResult


class EvaluateConfig(BaseModel):
"""Contains configurations need to run an evaluations."""

model_config = ConfigDict(
alias_generator=alias_generators.to_camel,
populate_by_name=True,
)

eval_metrics: list[EvalMetric] = Field(
description="""The list of metrics to be used in Eval.""",
)


class InferenceConfig(BaseModel):
"""Contains configurations need to run inferences."""

model_config = ConfigDict(
alias_generator=alias_generators.to_camel,
populate_by_name=True,
)

labels: Optional[dict[str, str]] = Field(
default=None,
description="""Labels with user-defined metadata to break down billed
charges.""",
)


class InferenceRequest(BaseModel):
"""Represent a request to perform inferences for the eval cases in an eval set."""

model_config = ConfigDict(
alias_generator=alias_generators.to_camel,
populate_by_name=True,
)

app_name: str = Field(
description="""The name of the app to which the eval case belongs to."""
)

eval_set_id: str = Field(description="""Id of the eval set.""")

eval_case_ids: Optional[list[str]] = Field(
default=None,
description="""Id of the eval cases for which inferences need to be
generated.

All the eval case ids should belong to the EvalSet.

If the list of eval case ids is empty or not specified, then all the eval cases
in an eval set are evaluated.
""",
)

inference_config: InferenceConfig = Field(
description="""The config to use for inferencing.""",
)


class InferenceResult(BaseModel):
"""Contains inference results for a single eval case."""

model_config = ConfigDict(
alias_generator=alias_generators.to_camel,
populate_by_name=True,
)

app_name: str = Field(
description="""The name of the app to which the eval case belongs to."""
)

eval_set_id: str = Field(description="""Id of the eval set.""")

eval_case_id: str = Field(
description="""Id of the eval case for which inferences were generated.""",
)

inferences: list[Invocation] = Field(
description="""Inferences obtained from the Agent for the eval case."""
)

session_id: Optional[str] = Field(
description="""Id of the inference session."""
)


class EvaluateRequest(BaseModel):
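"""Represents a request to perform metric evaluations on a set of inference results."""
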
model_config = ConfigDict(
alias_generator=alias_generators.to_camel,
populate_by_name=True,
)

inference_results: list[InferenceResult] = Field(
description="""A list of inferences that need to be evaluated.""",
)

evaluate_config: EvaluateConfig = Field(
description="""The config to use for evaluations.""",
)


class BaseEvalService(ABC):
"""A service to run Evals for an ADK agent."""

@abstractmethod
async def perform_inference(
self,
inference_request: InferenceRequest,
) -> AsyncGenerator[InferenceResult, None]:
"""Returns InferenceResult obtained from the Agent as and when they are available.

Args:
inference_request: The request for generating inferences.
"""

@abstractmethod
async def evaluate(
self,
evaluate_request: EvaluateRequest,
) -> AsyncGenerator[EvalCaseResult, None]:
"""Returns EvalCaseResult for each item as and when they are available.

Args:
evaluate_request: The request to perform metric evaluations on the
inferences.
"""
14 changes: 14 additions & 0 deletions src/google/adk/evaluation/eval_metrics.py
@@ -14,16 +14,30 @@

from __future__ import annotations

from enum import Enum
from typing import Optional
from typing import Union

from pydantic import alias_generators
from pydantic import BaseModel
from pydantic import ConfigDict
from typing_extensions import TypeAlias

from .eval_case import Invocation
from .evaluator import EvalStatus


class PrebuiltMetrics(Enum):
TOOL_TRAJECTORY_AVG_SCORE = "tool_trajectory_avg_score"

RESPONSE_EVALUATION_SCORE = "response_evaluation_score"

RESPONSE_MATCH_SCORE = "response_match_score"


MetricName: TypeAlias = Union[str, PrebuiltMetrics]


class EvalMetric(BaseModel):
"""A metric used to evaluate a particular aspect of an eval case."""

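
Because the EvalMetric class body is collapsed above, here is a brief, hedged illustration of the new MetricName alias and PrebuiltMetrics enum; it assumes EvalMetric keeps its existing metric_name (str) and threshold (float) fields, which the evaluators later in this diff read.

# Illustrative only: a prebuilt metric can be referenced via the enum member or
# its string value; EvalMetric.metric_name stores the plain string.
from google.adk.evaluation.eval_metrics import (
    EvalMetric,
    MetricName,
    PrebuiltMetrics,
)

by_enum: MetricName = PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE
by_string: MetricName = "tool_trajectory_avg_score"
assert by_enum.value == by_string

tool_trajectory_metric = EvalMetric(
    metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
    threshold=1.0,
)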
89 changes: 89 additions & 0 deletions src/google/adk/evaluation/metric_evaluator_registry.py
@@ -0,0 +1,89 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import logging

from ..errors.not_found_error import NotFoundError
from .eval_metrics import EvalMetric
from .eval_metrics import MetricName
from .eval_metrics import PrebuiltMetrics
from .evaluator import Evaluator
from .response_evaluator import ResponseEvaluator
from .trajectory_evaluator import TrajectoryEvaluator

logger = logging.getLogger("google_adk." + __name__)


class MetricEvaluatorRegistry:
"""A registry for metric Evaluators."""

_registry: dict[str, type[Evaluator]] = {}

def get_evaluator(self, eval_metric: EvalMetric) -> Evaluator:
"""Returns an Evaluator for the given metric.
A new instance of the Evaluator is returned.
Args:
eval_metric: The metric for which we need the Evaluator.
Raises:
NotFoundError: If there is no evaluator for the metric.
"""
if eval_metric.metric_name not in self._registry:
raise NotFoundError(f"{eval_metric.metric_name} not found in registry.")

return self._registry[eval_metric.metric_name](eval_metric=eval_metric)

def register_evaluator(
self, metric_name: MetricName, evaluator: type[Evaluator]
):
"""Registers an evaluator given the metric name.
If a mapping already exist, then it is updated.
"""
if metric_name in self._registry:
logger.info(
"Updating Evaluator class for %s from %s to %s",
metric_name,
self._registry[metric_name],
evaluator,
)

self._registry[str(metric_name)] = evaluator


def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
"""Returns an instance of MetricEvaluatorRegistry with standard metrics already registered in it."""
metric_evaluator_registry = MetricEvaluatorRegistry()

# Evaluator classes are registered under the metric's string value so that
# get_evaluator, which looks up EvalMetric.metric_name, resolves them.
metric_evaluator_registry.register_evaluator(
metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
evaluator=TrajectoryEvaluator,
)
metric_evaluator_registry.register_evaluator(
metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value,
evaluator=ResponseEvaluator,
)
metric_evaluator_registry.register_evaluator(
metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
evaluator=ResponseEvaluator,
)

return metric_evaluator_registry


DEFAULT_METRIC_EVALUATOR_REGISTRY = _get_default_metric_evaluator_registry()
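
A short, hypothetical sketch of how the registry above is meant to be used: resolve a fresh evaluator for an EvalMetric, or register an override. The import paths follow this diff's file layout, and the EvalMetric fields match their use elsewhere in the PR.

# Hypothetical usage of the default registry defined above.
from google.adk.evaluation.eval_metrics import EvalMetric, PrebuiltMetrics
from google.adk.evaluation.metric_evaluator_registry import (
    DEFAULT_METRIC_EVALUATOR_REGISTRY,
)
from google.adk.evaluation.response_evaluator import ResponseEvaluator

# Resolve a fresh Evaluator instance for a metric; raises NotFoundError if the
# metric name was never registered.
evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator(
    EvalMetric(
        metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
        threshold=0.8,
    )
)

# Register (or override) a mapping; the evaluator class must accept an
# eval_metric keyword argument, since get_evaluator instantiates it that way.
DEFAULT_METRIC_EVALUATOR_REGISTRY.register_evaluator(
    metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
    evaluator=ResponseEvaluator,
)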
19 changes: 18 additions & 1 deletion src/google/adk/evaluation/response_evaluator.py
@@ -38,7 +38,24 @@
class ResponseEvaluator(Evaluator):
"""Runs response evaluation for agents."""

def __init__(self, threshold: float, metric_name: str):
def __init__(
self,
threshold: Optional[float] = None,
metric_name: Optional[str] = None,
eval_metric: Optional[EvalMetric] = None,
):
if (threshold is not None and eval_metric) or (
metric_name is not None and eval_metric
):
raise ValueError(
"Either eval_metric should be specified or both threshold and"
" metric_name should be specified."
)

if eval_metric:
threshold = eval_metric.threshold
metric_name = eval_metric.metric_name

if "response_evaluation_score" == metric_name:
self._metric_name = MetricPromptTemplateExamples.Pointwise.COHERENCE
elif "response_match_score" == metric_name:
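
As a quick illustration of the constructor change above, a hedged sketch of the two mutually exclusive call forms; the thresholds are placeholders.

# Illustrative only: ResponseEvaluator can be built either way, but not both.
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.response_evaluator import ResponseEvaluator

# Explicit threshold and metric name.
coherence_evaluator = ResponseEvaluator(
    threshold=0.7, metric_name="response_evaluation_score"
)

# A single EvalMetric carries both values; combining it with threshold or
# metric_name raises ValueError.
response_match_evaluator = ResponseEvaluator(
    eval_metric=EvalMetric(metric_name="response_match_score", threshold=0.8)
)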
18 changes: 16 additions & 2 deletions src/google/adk/evaluation/trajectory_evaluator.py
@@ -15,7 +15,7 @@
from __future__ import annotations

from typing import Any
from typing import cast
from typing import Optional

from google.genai import types as genai_types
import pandas as pd
@@ -24,6 +24,7 @@
from typing_extensions import override

from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .evaluation_constants import EvalConstants
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
@@ -34,7 +35,20 @@
class TrajectoryEvaluator(Evaluator):
"""Evaluates tool use trajectories for accuracy."""

def __init__(self, threshold: float):
def __init__(
self,
threshold: Optional[float] = None,
eval_metric: Optional[EvalMetric] = None,
):
if threshold is not None and eval_metric:
raise ValueError(
"Either eval_metric should be specified or threshold should be"
" specified."
)

if eval_metric:
threshold = eval_metric.threshold

self._threshold = threshold

@override
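
TrajectoryEvaluator gains the analogous dual-form constructor; a brief hedged sketch with a placeholder threshold.

# Illustrative only: the two supported ways to construct TrajectoryEvaluator.
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.trajectory_evaluator import TrajectoryEvaluator

from_threshold = TrajectoryEvaluator(threshold=1.0)
from_metric = TrajectoryEvaluator(
    eval_metric=EvalMetric(
        metric_name="tool_trajectory_avg_score", threshold=1.0
    )
)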