# Setup

In [None]:
#@title Install Tau-bench and GEPA
# Fetch the tau-bench sources and install them in editable mode from inside
# the checkout.
!git clone https://github.com/sierra-research/tau-bench.git
%cd tau-bench/
!pip install -e . --quiet

# Step back out of the checkout to install the remaining dependencies.
%cd ..
!pip install gepa --quiet

# `retry` provides the decorator imported by the evaluation code further down.
!pip install retry --quiet

# Return to the checkout: a later cell treats the current working directory as
# the repository root when locating the `tau_bench` package.
%cd tau-bench/

In [None]:
#@title Setup and Authentication
import logging
import os

from google.genai import types
#@markdown Configure Vertex AI Access

# Project and region for Vertex AI; GCP_PROJECT is empty by default and has to
# be filled in through the form before running.
GCP_PROJECT = '' #@param {type: 'string'}
GCP_LOCATION = 'us-central1' #@param {type: 'string'}

# Export the Vertex AI settings as environment variables.
# NOTE(review): presumably these are read by the google-genai / litellm
# clients used later — confirm.
os.environ['GOOGLE_GENAI_USE_VERTEXAI'] = 'true'
os.environ['GOOGLE_CLOUD_PROJECT'] = GCP_PROJECT
os.environ['GOOGLE_CLOUD_LOCATION'] = GCP_LOCATION

# Set a logging verbosity suited for this experiment. See
# https://github.com/google/adk-python/issues/1852 for context
class _FilterInferenceWarnings(logging.Filter):
  """Logging filter that drops the "non-text parts in the response" warning.

  Records at any level other than WARNING always pass. A WARNING record is
  dropped only when its formatted message starts with the known prefix.
  """

  def filter(self, record):
    """Returns False for the unwanted warning and True for everything else."""
    if record.levelname == 'WARNING':
      text = record.getMessage()
      if text.startswith('Warning: there are non-text parts in the response:'):
        return False
    return True


# Attach the filter to the logger exposed by `google.genai.types` so that the
# matching warning records are dropped.
types.logger.addFilter(_FilterInferenceWarnings())

In [None]:
#@title Discover Tau-bench Tool Definitions
# This section dynamically discovers and imports tool definitions from the
# tau-bench library. The code below searches for the base 'Tool' class and
# then looks for all of its subclasses in the 'retail' and 'airline' domains.
import sys, pathlib, importlib, inspect, ast

# Root of the checkout (taken from the current working directory), the
# package directory inside it, and the task domains scanned for tools.
REPO_ROOT = pathlib.Path.cwd()
PKG_ROOT = REPO_ROOT.joinpath('tau_bench')
DOMAINS = ['retail', 'airline']

# Put the checkout at the front of the import path unless it is already listed.
if sys.path.count(str(REPO_ROOT)) == 0:
  sys.path.insert(0, str(REPO_ROOT))

def import_Tool():
  """Locates and returns the base `Tool` class of the tau-bench library.

  A fixed list of module paths is probed first, in order. If none of them
  yields a `Tool` attribute, every Python file under `PKG_ROOT` whose text
  contains 'class Tool(' is imported in turn and its `Tool` attribute is
  returned. Any failure while probing a location just moves on to the next.

  Returns:
    The `Tool` class object.

  Raises:
    ImportError: If no probed location provides a `Tool` class.
  """
  known_locations = (
      ('tau_bench.core.tool', 'Tool'),
      ('tau_bench.core.tools', 'Tool'),
      ('tau_bench.tools', 'Tool'),
      ('tau_bench.envs.core.tool', 'Tool'),
      ('tau_bench.envs.core.tools', 'Tool'),
      ('tau_bench.envs.tool', 'Tool'),
  )
  for module_path, attr_name in known_locations:
    try:
      module = importlib.import_module(module_path)
      return getattr(module, attr_name)
    except Exception:
      # Best-effort probing: a missing module or attribute is expected here.
      continue

  # Fallback: scan the package sources for a textual definition of the class.
  for source_file in PKG_ROOT.rglob('*.py'):
    try:
      source_text = source_file.read_text(encoding='utf-8', errors='ignore')
    except Exception:
      continue
    if 'class Tool(' not in source_text:
      continue
    relative = source_file.relative_to(REPO_ROOT).with_suffix('')
    dotted_name = '.'.join(relative.parts)
    try:
      return getattr(importlib.import_module(dotted_name), 'Tool')
    except Exception:
      continue

  raise ImportError('Could not locate the Tool class in tau_bench')

# Resolve the base class once; the subclass checks further down compare
# candidate classes against it.
Tool = import_Tool()

def module_name_from_path(pyfile: pathlib.Path) -> str:
  """Converts a source file path into a dotted module name.

  The path is made relative to `REPO_ROOT`, its suffix is removed, and the
  remaining path components are joined with dots.
  """
  without_suffix = pyfile.relative_to(REPO_ROOT).with_suffix('')
  return '.'.join(without_suffix.parts)

def ast_inherits_tool(pyfile: pathlib.Path) -> list:
  """Finds classes whose base list names `Tool`, without importing the file.

  The file is parsed with `ast`; a class matches when the source text of one
  of its bases is exactly 'Tool' or ends with '.Tool'.

  Args:
    pyfile: Path of the Python source file to inspect.

  Returns:
    Dotted names of the form '<module>.<ClassName>' for every matching class,
    or an empty list when the file cannot be read or parsed.
  """
  try:
    source = pyfile.read_text(encoding='utf-8', errors='ignore')
    tree = ast.parse(source, filename=str(pyfile))
  except Exception:
    return []
  modname = module_name_from_path(pyfile)

  def _base_text(base):
    # Prefer the unparsed source of the base expression; fall back to the
    # plain identifier / attribute name if unparsing is not possible.
    try:
      return ast.unparse(base)
    except Exception:
      if hasattr(base, "id"):
        return base.id
      if hasattr(base, "attr"):
        return base.attr
      return ''

  found = []
  for node in ast.walk(tree):
    if not isinstance(node, ast.ClassDef):
      continue
    names_tool = any(
        text == 'Tool' or text.endswith('.Tool')
        for text in map(_base_text, node.bases)
    )
    if names_tool:
      found.append(f'{modname}.{node.name}')
  return found

def collect_for_domain(domain: str) -> list:
  """Collects the tool classes defined for one domain (e.g. 'retail').

  Every Python file under `<PKG_ROOT>/envs/<domain>/tools` except
  `__init__.py` is imported, and the classes defined in that module that are
  subclasses of `Tool` (other than `Tool` itself) are kept. If anything goes
  wrong while importing or inspecting a module, the file is scanned with
  `ast_inherits_tool` instead.

  Args:
    domain: The domain to search for tools in (e.g., 'retail', 'airline').

  Returns:
    The de-duplicated findings in discovery order. Entries are class objects
    when the import path worked and dotted class-name strings when the AST
    fallback was used. An empty list is returned if the tools directory does
    not exist.
  """
  tools_dir = PKG_ROOT.joinpath('envs', domain, 'tools')
  if not tools_dir.exists():
    return []

  found = []
  for pyfile in sorted(tools_dir.rglob('*.py')):
    if pyfile.name == '__init__.py':
      continue
    modname = module_name_from_path(pyfile)
    try:
      module = importlib.import_module(modname)
      for _, candidate in inspect.getmembers(module, inspect.isclass):
        # Only classes defined in this very module count, not re-exports.
        defined_here = getattr(candidate, '__module__', None) == module.__name__
        if not defined_here or candidate is Tool:
          continue
        try:
          is_tool = issubclass(candidate, Tool)
        except Exception:
          is_tool = False
        if is_tool:
          found.append(candidate)
    except Exception:
      # Import (or inspection) failed: fall back to the AST heuristic.
      found.extend(ast_inherits_tool(pyfile))

  # dict.fromkeys keeps the first occurrence of each entry, in order.
  return list(dict.fromkeys(found))

# Discover the tool classes of every domain, then keep the 'function' payload
# of each tool description whose 'type' is 'function'.
tool_classes_by_domain = {d: collect_for_domain(d) for d in DOMAINS}
tool_definitions_by_domain = {}
for domain, tool_classes in tool_classes_by_domain.items():
  tool_definitions_by_domain[domain] = []
  for tool_class in tool_classes:
    # collect_for_domain falls back to dotted class-name strings when a module
    # cannot be imported; such entries have no get_info() to call, so skip
    # them instead of failing with an AttributeError.
    get_info = getattr(tool_class, 'get_info', None)
    if not callable(get_info):
      logging.warning(
          'Skipping tool %r in domain %r: no get_info() available',
          tool_class,
          domain,
      )
      continue
    tool_info = get_info()
    if tool_info.get('type') != 'function' or not tool_info.get('function'):
      continue
    tool_definitions_by_domain[domain].append(tool_info['function'])

# Classes and Functions

In [None]:
#@title Core Logic for Tau-Bench Evaluation
# This section defines the main components for running tau-bench evaluations and
# integrating them with GEPA:
#   - A custom runner for tau-bench experiments.
#   - Data structures for trajectories and outputs.
#   - A GEPA adapter that bridges GEPA's optimization process with tau-bench.


from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import os
import json
import multiprocessing
import random
from retry import retry
import traceback
from typing import List

import tau_bench_agent as tau_bench_agent_lib
from tau_bench.envs import get_env
from tau_bench.run import display_metrics
from tau_bench.types import EnvRunResult, RunConfig
from litellm import provider_list
from tau_bench.envs.user import UserStrategy


def custom_run(
    config: RunConfig,
    print_results: bool = False,
    custom_system_instruction: str | None = None) -> List[EnvRunResult]:
  """Runs a set of tau-bench tasks with a given agent configuration.

  This is a customized version of the standard tau-bench run function, adapted
  for this experiment's needs. It handles environment setup, agent creation,
  task execution in parallel, and result aggregation.

  Each task runs in its own environment instance and is wrapped in
  `retry(tries=3, delay=10, backoff=2)`. If it still fails, a result with
  reward 0.0 and the error details in `info` is recorded instead of raising.
  After every task the result is appended to a JSON checkpoint file in
  `config.log_dir`; once all trials finish, the file is rewritten with the
  complete result list.

  Args:
    config: A RunConfig object specifying the environment, models, and other
      parameters for the run. It is not modified.
    print_results: If True, prints the result of each task as it completes.
    custom_system_instruction: An optional system instruction to use for the
      agent, overriding the default. An empty string keeps the default.

  Returns:
    A list of EnvRunResult objects, one for each task of each trial.

  Raises:
    ValueError: If the environment, a model provider, the agent strategy, the
      task split or the user strategy in `config` is not supported.
  """
  if config.env not in ['retail', 'airline']:
    raise ValueError('Only retail and airline envs are supported')
  if config.model_provider not in provider_list:
    raise ValueError('Invalid model provider')
  if config.user_model_provider not in provider_list:
    raise ValueError('Invalid user model provider')
  if config.agent_strategy not in [
      'tool-calling', 'act', 'react', 'few-shot'
  ]:
    raise ValueError('Invalid agent strategy')
  if config.task_split not in ['train', 'test', 'dev']:
    raise ValueError('Invalid task split')
  if config.user_strategy not in [item.value for item in UserStrategy]:
    raise ValueError('Invalid user strategy')

  random.seed(config.seed)
  time_str = datetime.now().strftime('%m%d%H%M%S')
  model_name = config.model.split('/')[-1]
  ckpt_filename = (
      f'{config.agent_strategy}-{model_name}-{config.temperature}_range_'
      f'{config.start_index}-{config.end_index}_user-{config.user_model}-'
      f'{config.user_strategy}_{time_str}.json'
  )
  ckpt_path = os.path.join(config.log_dir, ckpt_filename)
  # exist_ok avoids the check-then-create race of a separate exists() test.
  os.makedirs(config.log_dir, exist_ok=True)

  print(f'Loading user with strategy: {config.user_strategy}')
  # This environment only provides the tool list, the wiki and the task count;
  # every task gets its own isolated environment below.
  env = get_env(
    config.env,
    user_strategy=config.user_strategy,
    user_model=config.user_model,
    user_provider=config.user_model_provider,
    task_split=config.task_split,
  )
  if custom_system_instruction:
    env.wiki = custom_system_instruction
  agent = tau_bench_agent_lib.adk_agent_factory(
    tools_info=env.tools_info,
    wiki=env.wiki,
    config=config,
  )
  if config.end_index == -1:
    end_index = len(env.tasks)
  else:
    end_index = min(config.end_index, len(env.tasks))
  results: List[EnvRunResult] = []
  # Serializes the read-modify-write of the checkpoint file across workers.
  lock = multiprocessing.Lock()
  has_explicit_tasks = bool(config.task_ids)
  if has_explicit_tasks:
    print(f'Running tasks {config.task_ids} (checkpoint path: {ckpt_path})')
  else:
    print(
        f'Running tasks {config.start_index} to {end_index} '
        f'(checkpoint path: {ckpt_path})'
    )
  for i in range(config.num_trials):
    # Always work on a fresh copy: shuffling must not reorder the caller's
    # `config.task_ids` list.
    if has_explicit_tasks:
      idxs = list(config.task_ids)
    else:
      idxs = list(range(config.start_index, end_index))
    if config.shuffle:
      random.shuffle(idxs)

    @retry(tries=3, delay=10, backoff=2)
    def _run_with_retry(idx: int) -> EnvRunResult:
      # A dedicated environment per task keeps concurrent tasks independent.
      isolated_env = get_env(
          config.env,
          user_strategy=config.user_strategy,
          user_model=config.user_model,
          task_split=config.task_split,
          user_provider=config.user_model_provider,
          task_index=idx,
      )
      if print_results:
        print(f'Running task {idx}')
      res = agent.solve(
          env=isolated_env,
          task_index=idx,
      )
      return EnvRunResult(
          task_id=idx,
          reward=res.reward,
          info=res.info,
          traj=res.messages,
          trial=i,
      )

    def _run(idx: int) -> EnvRunResult:
      try:
        result = _run_with_retry(idx)
      except Exception as e:
        # All retries failed: record a zero-reward result so that one broken
        # task does not abort the whole run.
        logging.warning('Inference error: %s', str(e))
        result = EnvRunResult(
            task_id=idx,
            reward=0.0,
            info={'error': str(e), 'traceback': traceback.format_exc()},
            traj=[],
            trial=i,
        )

      if print_results:
        print(
            '✅' if result.reward == 1 else '❌',
            f'task_id={idx}',
            # result.info,
        )
        print('-----')
      with lock:
        # Append this result to whatever the checkpoint already holds.
        data = []
        if os.path.exists(ckpt_path):
          with open(ckpt_path, 'r') as f:
            data = json.load(f)
        with open(ckpt_path, 'w') as f:
          json.dump(data + [result.model_dump()], f, indent=2)
      return result

    # The executor is drained inside this iteration, so the closures above
    # always see the current trial number `i`.
    with ThreadPoolExecutor(max_workers=config.max_concurrency) as executor:
      res = list(executor.map(_run, idxs))
      results.extend(res)

  display_metrics(results)

  with open(ckpt_path, 'w') as f:
    json.dump([result.model_dump() for result in results], f, indent=2)
    print(f'\n📄 Results saved to {ckpt_path}\n')
  return results

In [None]:
#@title GEPA Adapter for Tau-bench
# The TauBenchAdapter is the core component that allows GEPA to optimize prompts
# for tau-bench. It implements the GEPAAdapter interface, defining how to:
#   1. `evaluate`: Run a candidate prompt on a batch of tau-bench tasks and
#      return scores.
#   2. `make_reflective_dataset`: Convert evaluation results into a dataset that
#      can be used for reflection and prompt improvement.

from typing import List, Any, TypedDict
import json
import gepa
from gepa.core.adapter import EvaluationBatch, GEPAAdapter


class TauBenchDataInst(TypedDict):
  """One data instance handed to GEPA: a single task of one environment."""

  # Name of the tau-bench environment the task belongs to.
  env: str
  # Identifier of the task; it is passed on to tau-bench as the task index.
  task_id: int


class TauBenchTrajectory(TypedDict):
  """Trajectory captured for one evaluated task."""

  # The messages of the run, one dict per message.
  result_traj: list[dict]


class TauBenchRolloutOutput(TypedDict):
  """Outcome of running one task with a candidate prompt."""

  # Environment the task was run in.
  env: str
  # Identifier of the task that was run.
  task_id: int
  # Reward reported for the run.
  reward: float
  # The `info` payload of the run result.
  task_info: dict


def refine_tau_bench_trajectory(traj: list[dict[str, Any]]) -> None:
  """Removes unnecessary info from the trajectory, in place.

  For every part of every message:
    * keys whose value is falsy (None, '', {}, [], 0, False) are deleted;
    * a 'function_call' entry is reduced to its 'name' and 'args';
    * a 'function_response' entry is reduced to its 'name' and its 'response'
      payload, which is stored under the key 'args'.

  Messages whose 'parts' entry is missing or None are left untouched, and a
  function call/response lacking one of the expected keys gets None for it
  rather than raising.

  Args:
    traj: The trajectory as a list of message dicts, each optionally holding
      a list of part dicts under 'parts'. Modified in place.
  """
  for content in traj:
    # Error results carry no parts, and 'parts' may be absent or None.
    for part in content.get("parts") or ():
      # Drop all fields that are not populated. The keys are collected first
      # because a dict must not change size while it is being iterated.
      for key in [k for k, value in part.items() if not value]:
        del part[key]

      # For function calls / responses only keep function names, input
      # arguments and outputs.
      if fc := part.get("function_call"):
        part["function_call"] = dict(name=fc.get("name"), args=fc.get("args"))
      if fr := part.get("function_response"):
        # NOTE(review): the response payload is stored under 'args'; the key
        # is kept as-is so the reflection input format does not change.
        part["function_response"] = dict(
            name=fr.get("name"), args=fr.get("response")
        )


class TauBenchAdapter(GEPAAdapter[
    TauBenchDataInst,
    TauBenchTrajectory,
    TauBenchRolloutOutput,
]):
  """A GEPA adapter for evaluating agent performance on tau-bench benchmark.

  `evaluate` runs a candidate system instruction on a batch of tau-bench tasks
  through `custom_run`; `make_reflective_dataset` turns the resulting
  trajectories and scores into records for the reflection model.
  """

  def __init__(
    self,
    agent_model='gemini-2.5-flash',
    agent_model_provider='vertex_ai',
    user_model='gemini-2.5-pro',
    user_model_provider='vertex_ai',
    agent_strategy='tool-calling',
    user_strategy='llm',
    system_instruction_name='system_instruction',
    tools_description: list[dict[str, Any]] | None = None,
    max_concurrency=4,
  ):
    """Initializes the TauBenchAdapter.

    Args:
      agent_model: The model to use for the agent.
      agent_model_provider: The provider for the agent model.
      user_model: The model to use for simulating the user.
      user_model_provider: The provider for the user model.
      agent_strategy: The agent strategy to use (e.g., 'tool-calling').
      user_strategy: The user simulation strategy (e.g., 'llm').
      system_instruction_name: The key in the candidate dictionary that holds
        the system instruction.
      tools_description: Describes each of the available tools. This is used
        as context for the prompt proposer.
      max_concurrency: The maximum number of tasks to run in parallel.
    """
    self._agent_model = agent_model
    self._agent_model_provider = agent_model_provider
    self._user_model = user_model
    self._user_model_provider = user_model_provider
    self._agent_strategy = agent_strategy
    self._user_strategy = user_strategy
    self._max_concurrency = max_concurrency
    self._system_instruction_name = system_instruction_name
    self._tools_description = tools_description

  def evaluate(
      self,
      batch: list[TauBenchDataInst],
      candidate: dict[str, str],
      capture_traces: bool = False,
  ) -> EvaluationBatch[TauBenchTrajectory, TauBenchRolloutOutput]:
    """Evaluates a candidate prompt on a batch of tau-bench tasks.

    This method is called by GEPA during the optimization loop. It takes a
    candidate prompt, runs it against the specified tasks from tau-bench, and
    returns the results.

    Args:
      batch: A list of task instances to evaluate on. Each instance specifies
        the environment and task ID. All instances must share one environment.
      candidate: A dictionary containing the components to be evaluated,
        including the system instruction.
      capture_traces: (Not used in this adapter) Whether to capture detailed
        traces. Trajectories are always returned.

    Returns:
      An EvaluationBatch object containing scores, outputs, and trajectories for
      each task in the batch. An empty batch yields an empty EvaluationBatch
      without running anything.

    Raises:
      ValueError: If the batch mixes instances of different environments.
    """
    del capture_traces  # Not used.
    if not batch:
      # With no task ids `custom_run` would fall back to its whole
      # start_index..end_index range, so an empty batch must stop here.
      return EvaluationBatch(scores=[], outputs=[], trajectories=[])

    env = batch[0]['env']
    if any(inst['env'] != env for inst in batch):
      # A single run uses a single environment; evaluating a mixed batch under
      # the first instance's environment would produce misleading scores.
      raise ValueError(
          'All instances of a batch must use the same tau-bench environment'
      )
    task_ids = [inst['task_id'] for inst in batch]
    tau_bench_run_config = RunConfig(
        env=env,
        model=self._agent_model,
        model_provider=self._agent_model_provider,
        user_model=self._user_model,
        user_model_provider=self._user_model_provider,
        agent_strategy=self._agent_strategy,
        user_strategy=self._user_strategy,
        max_concurrency=self._max_concurrency,
        task_ids=task_ids
    )
    tau_bench_results = custom_run(
        tau_bench_run_config,
        custom_system_instruction=candidate.get(self._system_instruction_name),
    )

    outputs = []
    trajectories = []
    scores = []
    for res in tau_bench_results:
      outputs.append(
          TauBenchRolloutOutput(
              env=env,
              task_id=res.task_id,
              reward=res.reward,
              task_info=res.info))
      # The trajectory is trimmed in place before it is handed to GEPA.
      result_traj = res.traj
      refine_tau_bench_trajectory(result_traj)
      trajectories.append(TauBenchTrajectory(result_traj=result_traj))
      scores.append(res.reward)

    return EvaluationBatch(
        scores=scores, outputs=outputs, trajectories=trajectories)

  def make_reflective_dataset(
      self,
      candidate: dict[str, str],
      eval_batch: EvaluationBatch[TauBenchTrajectory, TauBenchRolloutOutput],
      components_to_update: list[str]
  ) -> dict[str, list[dict[str, Any]]]:
    """Creates a dataset for reflection based on evaluation results.

    This method transforms the trajectories and scores from an evaluation run
    into a structured format that a reflection model can use to generate
    suggestions for improving the prompt. Every requested component receives
    the same records: the system instruction plus tool definitions as input,
    the serialized trajectory as output, and a success/failure sentence as
    feedback.

    Args:
      candidate: The candidate that was evaluated.
      eval_batch: The results of the evaluation.
      components_to_update: A list of component names that the reflection
        should focus on improving.

    Returns:
      A dictionary where keys are component names and values are lists of
      data instances for reflection.

    Raises:
      AssertionError: If no record could be produced for any component.
      ValueError: If the trajectories, scores and outputs of `eval_batch`
        differ in length.
    """
    system_instruction = candidate[self._system_instruction_name]

    tool_definitions = json.dumps(
        self._tools_description,
        indent=2,
        default=str,
    )

    inputs = '\n\n'.join([
        f'# System Instruction\n{system_instruction}',
        f'# Tool Definitions\n{tool_definitions}',
    ])

    # The records do not depend on the component, so build (and serialize)
    # them once instead of once per component.
    records: list[dict[str, Any]] = []
    for traj, score, _ in zip(
        eval_batch.trajectories,
        eval_batch.scores,
        eval_batch.outputs,
        strict=True,
    ):
      if score > 0:
        feedback = 'The agent successfully resolved all customer issues'
      else:
        feedback = 'The agent failed to resolve all customer issues correctly'
      records.append({
          'Inputs': inputs,
          'Generated Outputs': json.dumps(traj, indent=2, default=str),
          'Feedback': feedback,
      })

    ret_d: dict[str, list[dict[str, Any]]] = {}
    if records:
      for comp in components_to_update:
        # Each component gets its own list and dict objects.
        ret_d[comp] = [dict(record) for record in records]
    assert ret_d, (
        'empty reflective dataset for components '
        f'{list(components_to_update)}'
    )
    return ret_d


# Run Experiment

In [None]:
#@title Experiment Configuration and Execution
# This section sets up and runs the GEPA optimization experiment.
# Here we define all the parameters for the tau-bench environment, the GEPA
# optimization loop, and the models to be used.

#@markdown Tau-bench config
# tau-bench environment, agent / user-simulator models and run parallelism.
tau_bench_env = 'retail' #@param ['retail', 'airline']
agent_model = 'gemini-2.5-flash' #@param ['gemini-2.5-flash', 'gemini-2.0-flash']
agent_model_provider = 'vertex_ai' #@param ['vertex_ai', 'google']
user_model = 'gemini-2.5-flash' #@param ['gemini-2.5-flash', 'gemini-2.5-pro']
user_model_provider = 'vertex_ai' #@param ['vertex_ai', 'google']
max_concurrency = 8 #@param {type:"integer"}
# Trials per task when the candidates are evaluated on the test tasks.
num_eval_trials = 4 #@param {type: 'integer'}
#@markdown GEPA config
# Number of tasks sampled for GEPA's training and validation sets.
training_set_size = 20 #@param {type:"integer"}
eval_set_size = 20 #@param {type:"integer"}
# Seed for the random task split.
rnd_seed = 42 #@param {type:"integer"}
# Passed to gepa.optimize as `max_metric_calls`.
max_metric_calls = 500 #@param {type:"integer"}
# Model and minibatch size used for GEPA's reflection step.
reflection_model = 'gemini-2.5-pro' #@param ['gemini-2.5-flash', 'gemini-2.5-pro']
reflection_minibatch_size = 3 #@param {type:"integer"}

#@markdown Dataset and Candidate Setup
# Split the task ids of the chosen environment into disjoint training,
# validation and test sets, reproducibly for a given `rnd_seed`.
random.seed(rnd_seed)
domain_to_size = {'retail': 115, 'airline': 50}
all_tasks = set(range(domain_to_size[tau_bench_env]))
# Sample from sorted lists: the iteration order of a set is an implementation
# detail, and the split should depend on the seed only.
training_task_ids = random.sample(sorted(all_tasks), training_set_size)
eval_task_ids = random.sample(
    sorted(all_tasks - set(training_task_ids)),
    eval_set_size,
)
# Everything that is neither trained nor validated on is held out for testing.
test_task_ids = sorted(all_tasks - set(training_task_ids) - set(eval_task_ids))

training_set = [
    TauBenchDataInst(env=tau_bench_env, task_id=task_id)
    for task_id in training_task_ids
]
eval_set = [
    TauBenchDataInst(env=tau_bench_env, task_id=task_id)
    for task_id in eval_task_ids
]

# Key under which the candidate dictionary stores the system instruction.
system_instruction_name = 'system_instruction'

# Deliberately minimal prompt that GEPA starts optimizing from.
SEED_SYSTEM_INSTRUCTION = (
    'you are a customer support agent helping customers resolve their '
    'issues by using the right tools'
)

seed_candidate = {
    system_instruction_name: SEED_SYSTEM_INSTRUCTION,
}

In [None]:
#@title Run GEPA Optimization
# With the configuration and adapter in place, this section creates the adapter
# instance and calls `gepa.optimize()` to start the Automatic Prompt
# Optimization (APO) process.
import litellm

def _query_reflection_lm(prompt):
  """Sends `prompt` to the reflection model and returns the reply text."""
  response = litellm.completion_with_retries(
      model=f'vertex_ai/{reflection_model}',
      messages=[{"role": "user", "content": prompt}],
      num_retries=4, initial_delay=1, max_delay=1,
  )
  return response.choices[0].message.content


# Adapter that lets GEPA score candidate prompts on the chosen environment.
tau_bench_adapter = TauBenchAdapter(
    agent_model=agent_model,
    agent_model_provider=agent_model_provider,
    user_model=user_model,
    user_model_provider=user_model_provider,
    agent_strategy='tool-calling',
    user_strategy='llm',
    system_instruction_name=system_instruction_name,
    tools_description=tool_definitions_by_domain[tau_bench_env],
    max_concurrency=max_concurrency,
)

gepa_results = gepa.optimize(
    seed_candidate=seed_candidate,
    trainset=training_set,
    valset=eval_set,
    task_lm=None,  # this must be None when a custom adapter is used
    adapter=tau_bench_adapter,
    max_metric_calls=max_metric_calls,
    reflection_lm=_query_reflection_lm,
    reflection_minibatch_size=reflection_minibatch_size,
)
# Last expression of the cell: displays (candidate index, validation score).
list(enumerate(gepa_results.val_aggregate_scores))

In [None]:
#@title Evaluate All Candidates


# Baseline system instruction used for comparison in the evaluation below.
# This is the prompt from https://arxiv.org/pdf/2406.12045
# NOTE(review): the text is the retail policy; confirm it is still the
# intended baseline when `tau_bench_env` is set to 'airline'.
DEFAULT_SYSTEM_INSTRUCTION = '''# Retail agent policy

As a retail agent, you can help users cancel or modify pending orders, return or exchange delivered orders, modify their default user address, or provide information about their own profile, orders, and related products.

- At the beginning of the conversation, you have to authenticate the user identity by locating their user id via email, or via name + zip code. This has to be done even when the user already provides the user id.

- Once the user has been authenticated, you can provide the user with information about order, product, profile information, e.g. help the user look up order id.

- You can only help one user per conversation (but you can handle multiple requests from the same user), and must deny any requests for tasks related to any other user.

- Before taking consequential actions that update the database (cancel, modify, return, exchange), you have to list the action detail and obtain explicit user confirmation (yes) to proceed.

- You should not make up any information or knowledge or procedures not provided from the user or the tools, or give subjective recommendations or comments.

- You should at most make one tool call at a time, and if you take a tool call, you should not respond to the user at the same time. If you respond to the user, you should not make a tool call.

- You should transfer the user to a human agent if and only if the request cannot be handled within the scope of your actions.

## Domain basic

- All times in the database are EST and 24 hour based. For example "02:30:00" means 2:30 AM EST.

- Each user has a profile of its email, default address, user id, and payment methods. Each payment method is either a gift card, a paypal account, or a credit card.

- Our retail store has 50 types of products. For each type of product, there are variant items of different options. For example, for a 't shirt' product, there could be an item with option 'color blue size M', and another item with option 'color red size L'.

- Each product has an unique product id, and each item has an unique item id. They have no relations and should not be confused.

- Each order can be in status 'pending', 'processed', 'delivered', or 'cancelled'. Generally, you can only take action on pending or delivered orders.

- Exchange or modify order tools can only be called once. Be sure that all items to be changed are collected into a list before making the tool call!!!

## Cancel pending order

- An order can only be cancelled if its status is 'pending', and you should check its status before taking the action.

- The user needs to confirm the order id and the reason (either 'no longer needed' or 'ordered by mistake') for cancellation.

- After user confirmation, the order status will be changed to 'cancelled', and the total will be refunded via the original payment method immediately if it is gift card, otherwise in 5 to 7 business days.

## Modify pending order

- An order can only be modified if its status is 'pending', and you should check its status before taking the action.

- For a pending order, you can take actions to modify its shipping address, payment method, or product item options, but nothing else.

### Modify payment

- The user can only choose a single payment method different from the original payment method.

- If the user wants the modify the payment method to gift card, it must have enough balance to cover the total amount.

- After user confirmation, the order status will be kept 'pending'. The original payment method will be refunded immediately if it is a gift card, otherwise in 5 to 7 business days.

### Modify items

- This action can only be called once, and will change the order status to 'pending (items modifed)', and the agent will not be able to modify or cancel the order anymore. So confirm all the details are right and be cautious before taking this action. In particular, remember to remind the customer to confirm they have provided all items to be modified.

- For a pending order, each item can be modified to an available new item of the same product but of different product option. There cannot be any change of product types, e.g. modify shirt to shoe.

- The user must provide a payment method to pay or receive refund of the price difference. If the user provides a gift card, it must have enough balance to cover the price difference.

## Return delivered order

- An order can only be returned if its status is 'delivered', and you should check its status before taking the action.

- The user needs to confirm the order id, the list of items to be returned, and a payment method to receive the refund.

- The refund must either go to the original payment method, or an existing gift card.

- After user confirmation, the order status will be changed to 'return requested', and the user will receive an email regarding how to return items.

## Exchange delivered order

- An order can only be exchanged if its status is 'delivered', and you should check its status before taking the action. In particular, remember to remind the customer to confirm they have provided all items to be exchanged.

- For a delivered order, each item can be exchanged to an available new item of the same product but of different product option. There cannot be any change of product types, e.g. modify shirt to shoe.

- The user must provide a payment method to pay or receive refund of the price difference. If the user provides a gift card, it must have enough balance to cover the price difference.

- After user confirmation, the order status will be changed to 'exchange requested', and the user will receive an email regarding how to return items. There is no need to place a new order.
'''


# Instructions to compare on the held-out test tasks: the published policy
# prompt, the seed prompt, and the best candidate found by GEPA.
all_system_instructions = [
    DEFAULT_SYSTEM_INSTRUCTION,
    SEED_SYSTEM_INSTRUCTION,
    # Look the candidate up under the configured component name rather than a
    # hard-coded key, so that changing `system_instruction_name` keeps working.
    gepa_results.best_candidate[system_instruction_name],
]


# Maps each evaluated instruction text to the list of its run results.
system_instruction_to_eval_results = {}
for system_instruction in all_system_instructions:
  tau_bench_run_config = RunConfig(
      env=tau_bench_env,
      model=agent_model,
      model_provider=agent_model_provider,
      user_model=user_model,
      user_model_provider=user_model_provider,
      agent_strategy='tool-calling',
      user_strategy='llm',
      max_concurrency=max_concurrency,
      num_trials=num_eval_trials,
      task_ids=test_task_ids,
  )
  tau_bench_results = custom_run(
      tau_bench_run_config,
      custom_system_instruction=system_instruction,
  )
  # Fraction of runs (over all tasks and trials) that reached reward 1.
  total = len(tau_bench_results)
  numerator = sum(1 for res in tau_bench_results if res.reward == 1)
  print(
      f'average reward (total={total}): {numerator/total if total > 0 else 0}'
  )
  system_instruction_to_eval_results[system_instruction] = tau_bench_results

In [None]:
# Show the optimized system instruction, looked up under the configured
# component name rather than a hard-coded key.
print(gepa_results.best_candidate[system_instruction_name])