# Evaluation Notebook: MLflow Evaluation for WFD Agent

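This notebook runs a few scripted multi-turn conversations through the WFD agent and scores the agent's responses with a small set of simple MLflow metrics (currently an LLM-as-a-judge answer-relevance metric).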

In [1]:
# Imports and environment setup for the evaluation notebook.
import os
import tempfile
import warnings
from typing import List
import pandas as pd

# Silence all Python warnings. The filter is installed before the imports
# below, so it also applies to warnings emitted while those modules load.
# NOTE(review): this is a blanket filter and will also hide deprecation
# warnings; consider narrowing it by category or module.
warnings.filterwarnings('ignore')

# Core MLFlow
import mlflow

# Our agent
from dotenv import load_dotenv
from agent import create_agent

# Runs before any os.getenv() lookup in the later cells
# (presumably populates the environment from a local .env file; confirm).
load_dotenv()
print("✅ All imports successful")

✅ All imports successful


In [2]:
# MLflow setup: choose the tracking directory and select the experiment.
#
# The directory is taken from the MLFLOW_DIR environment variable. A throw-away
# temp directory is created only when that variable is unset or empty. Passing
# tempfile.mkdtemp(...) as the os.getenv() default would evaluate it eagerly and
# leave a new, unused directory behind on every run, even when MLFLOW_DIR is set.
mlflow_dir = os.getenv("MLFLOW_DIR") or tempfile.mkdtemp(prefix="mlflow_debug_")
mlflow_dir = os.path.abspath(mlflow_dir)
mlflow_uri = f"file://{mlflow_dir}"

mlflow.set_tracking_uri(mlflow_uri)
experiment_name = "wfd_agent_evaluation"
mlflow.set_experiment(experiment_name)

print("✅ MLFlow setup complete")
print(f"📁 Tracking URI: {mlflow_uri}")
print(f"🧪 Experiment: {experiment_name}")

Traceback (most recent call last):
  File "/Users/corvi42/Projects/wfd/venv/lib/python3.11/site-packages/mlflow/store/tracking/file_store.py", line 366, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/corvi42/Projects/wfd/venv/lib/python3.11/site-packages/mlflow/store/tracking/file_store.py", line 464, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/corvi42/Projects/wfd/venv/lib/python3.11/site-packages/mlflow/store/tracking/file_store.py", line 1634, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/corvi42/Projects/wfd/venv/lib/python3.11/site-packages/mlflow/store/tracking/file_store.py", line 1627, in _read_helper
    result = read_yaml(r

✅ MLFlow setup complete
📁 Tracking URI: file:///Users/corvi42/Projects/wfd/mlflow_data
🧪 Experiment: wfd_agent_evaluation


In [3]:
# Create the agent under evaluation.
#
# The API key must come from the environment (or the .env file loaded earlier).
# Fail here with a clear message instead of handing None to create_agent and
# hitting a harder-to-read error later.
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
if not ANTHROPIC_API_KEY:
    raise RuntimeError(
        "ANTHROPIC_API_KEY is not set; export it or add it to the .env file "
        "before creating the agent."
    )

# The endpoint and model can be overridden through the environment; the
# defaults are the values this notebook has always used.
# NOTE(review): 0.0.0.0 is normally a bind address; connecting to it works on
# some platforms only. Set MCP_SERVER_URL if the default is unreachable.
MCP_SERVER_URL = os.getenv("MCP_SERVER_URL", "http://0.0.0.0:3000/mcp")
MODEL_ID = os.getenv("MODEL_ID", "claude-3-5-sonnet-20241022")

print("Creating agent...")
agent = create_agent(ANTHROPIC_API_KEY, MCP_SERVER_URL, MODEL_ID)

# Look the tool list up once and reuse it for both messages.
available_tools = agent.get_available_tools()
print(f"✅ Agent created with {len(available_tools)} tools")
print(f"Available tools: {available_tools}")

Creating agent...
Initialized with session ID: 0bb82f4f-5cbb-4262-bb88-d806397c53e9
Server info: {'name': 'restaurant-booking-server', 'version': '1.0.0'}
Available tools: ['search_restaurants', 'get_restaurant_details', 'get_booking_instructions', 'check_availability', 'make_reservation']
MCP session initialized successfully!
Available tools: ['search_restaurants', 'get_restaurant_details', 'get_booking_instructions', 'check_availability', 'make_reservation']
Created LangChain tool: search_restaurants
Created LangChain tool: get_restaurant_details
Created LangChain tool: get_booking_instructions
Created LangChain tool: check_availability
Created LangChain tool: make_reservation
✅ Agent created with 5 tools
Available tools: ['search_restaurants', 'get_restaurant_details', 'get_booking_instructions', 'check_availability', 'make_reservation']


In [4]:
from mlflow.entities import SpanType

# Scripted multi-turn conversations used as the evaluation set.
print("🧪 Evaluating test scenarios\n")

# Thread 1: search for a restaurant, then book it over two follow-up turns.
italian_dinner_thread = [
    "Find me romantic Italian restaurants near Laguna Beach, CA for tonight",
    "Sounds great, can you book a table for two for 7 PM?",
    "The booking will be for 'Arthur Nemo', my number is 123-456-7890",
]

# Thread 2: search, ask about one result by position, then book that result.
japanese_lunch_thread = [
    "I need casual Japanese restaurants near Irvine, CA for lunch",
    "What is the price range for the second restaurant?",
    "Can you book a table for four at the second restaurant for 12:30pm?",
    "Book it under the name Vince Van der Snook, phone: 123-456-7890",
]

# Thread 3: opens with an opinion question, then asks follow-ups about one place.
taiwanese_street_food_thread = [
    "What do you think about Taiwanese street food?",
    "Can you recommend a good Taiwanese street food place near Costa Mesa, CA?",
    "Do they offer takeout?",
    "Can you check if they have vegetarian options?",
    "Can I order online?",
]

test_threads = [
    italian_dinner_thread,
    japanese_lunch_thread,
    taiwanese_street_food_thread,
]

@mlflow.trace(span_type=SpanType.CHAIN, name="Run Test Threads")
def run_conversation(conversation: List[str], thread_id: str) -> List[str]:
    """Send every message of one scripted conversation to the agent, in order.

    The call is traced as a single CHAIN span named "Run Test Threads", and the
    thread id is recorded as an attribute on that span.

    Args:
        conversation: User messages to send, in order.
        thread_id: Identifier passed to ``agent.chat`` along with every message.

    Returns:
        The values returned by ``agent.chat``, one per message, in input order.
    """
    # Attach the thread id to the span opened by the decorator. mlflow.set_tag
    # would tag the active *run* instead (implicitly starting one if none is
    # active), so every conversation would overwrite the same run-level tag.
    span = mlflow.get_current_active_span()
    if span is not None:  # no active span, e.g. tracing is disabled
        span.set_attribute("thread_id", thread_id)

    return [agent.chat(user_input, thread_id) for user_input in conversation]

# Run each scripted thread through the agent and keep one flattened row per
# conversation: the user prompts and the agent responses, newline-joined.
inputs = []
conversation_results = []
for thread_number, thread_prompts in enumerate(test_threads, start=1):
    print(f"Running conversation {thread_number} with {len(thread_prompts)} messages")
    thread_responses = run_conversation(
        thread_prompts,
        thread_id=f"thread_{thread_number}",
    )
    inputs += ["\n".join(thread_prompts)]
    conversation_results += ["\n".join(thread_responses)]

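# NOTE: superseded by the next cell, which builds `eval_data` with the columns
# "inputs" and "agent_response". Kept for reference only.
# eval_data = pd.DataFrame({
#     "input": inputs,
#     "output": conversation_results  # earlier "input"/"output" column naming; the next cell uses "inputs"/"agent_response"
# })
#
# print(f"✅ Created evaluation dataset with {len(eval_data)} examples")
# display(eval_data)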

🧪 Evaluating test scenarios

Running conversation 1 with 3 messages
Debug: Calling MCP tool search_restaurants with args: {'placeName': 'Laguna Beach, CA', 'cuisineTypes': ['Italian'], 'mood': 'romantic', 'event': 'dating'}
Debug: Calling MCP tool check_availability with args: {'placeId': 'ChIJ1ej6G7Xl3IAR0UsC5rutZLg', 'dateTime': '2025-08-04T19:00:00', 'partySize': 2.0}
Debug: Calling MCP tool check_availability with args: {'placeId': 'ChIJX5-mVDXk3IAR9rCbdYCI56o', 'dateTime': '2025-08-04T19:00:00', 'partySize': 2.0}
Debug: Calling MCP tool get_booking_instructions with args: {'placeId': 'ChIJ1ej6G7Xl3IAR0UsC5rutZLg'}
Debug: Calling MCP tool get_booking_instructions with args: {'placeId': 'ChIJX5-mVDXk3IAR9rCbdYCI56o'}
Debug: Calling MCP tool make_reservation with args: {'placeId': 'ChIJX5-mVDXk3IAR9rCbdYCI56o', 'dateTime': '2025-08-04T19:00:00', 'partySize': 2.0, 'contactName': 'Arthur Nemo', 'contactPhone': '123-456-7890'}
Running conversation 2 with 4 messages
Debug: Calling MCP to

In [12]:
# Assemble the evaluation table: one row per conversation, with the joined user
# prompts in "inputs" and the joined agent replies in "agent_response".
eval_columns = {
    "inputs": inputs,
    "agent_response": conversation_results,
}
eval_data = pd.DataFrame(data=eval_columns)

display(eval_data)

Unnamed: 0,inputs,agent_response
0,Find me romantic Italian restaurants near Lagu...,I've found several romantic restaurants in Lag...
1,I need casual Japanese restaurants near Irvine...,I've found three great casual Japanese restaur...
2,What do you think about Taiwanese street food?...,"I apologize, but I should mention that I'm pri..."


In [13]:
# Configure the LLM-as-a-judge evaluator: answer relevance, judged by the
# model addressed as "openai:/gpt-4o".
answer_relevance = mlflow.metrics.genai.answer_relevance(model="openai:/gpt-4o")

# End any run left open by an earlier cell so the evaluation gets its own run.
if mlflow.active_run() is not None:
    mlflow.end_run()

with mlflow.start_run() as run:
    mlflow.set_tag("run_type", "evaluation")
    # Keep the returned result: discarding it leaves the scores out of the
    # notebook entirely.
    eval_results = mlflow.evaluate(
        data=eval_data,
        predictions="agent_response",
        model_type="text",
        extra_metrics=[answer_relevance],
    )

# Show the aggregate metrics and, when available, the per-row results table.
print(f"Evaluation run ID: {run.info.run_id}")
display(eval_results.metrics)
per_row_results = eval_results.tables.get("eval_results_table")
if per_row_results is not None:
    display(per_row_results)

2025/08/04 22:28:17 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|██████████| 1/1 [00:02<00:00,  2.41s/it]
100%|██████████| 3/3 [00:02<00:00,  1.06it/s]
