<table align="left">
  <tr>
    <td><img src="fleet.png" alt="fleet of icecream trucks" width="120"/></td>
    <td align="left"><h1>Lesson 4: Monitoring and Evaluating your Agent</h1></td>
  </tr>
</table>


<div style="background-color:#fff6ff; padding:13px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px">
<p> 💻 &nbsp; <b>Access <code>requirements.txt</code> file:</b> 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Open"</em>.</p>

<p> ⬇ &nbsp; <b>Download Notebooks:</b> 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Download as"</em> and select <em>"Notebook (.ipynb)"</em>.</p>

<p> 📒 &nbsp; For more help, please see the <em>"Appendix – Tips, Help, and Download"</em> Lesson.</p>

</div>

<p style="background-color:#f7fff8; padding:15px; border-width:3px; border-color:#e0f0e0; border-style:solid; border-radius:6px"> 🚨
&nbsp; <b>Different Run Results:</b> The output generated by AI chat models can vary with each execution due to their dynamic, probabilistic nature. Don't be surprised if your results differ from those shown in the video.</p>

## Setup Tracing

In [3]:
# Name of the Phoenix project; used both when registering the tracer and when reading the spans back.
PROJECT_NAME = "Customer-Success"

In [4]:
import os
from dotenv import load_dotenv, find_dotenv
from phoenix.otel import register
from openinference.instrumentation.smolagents import SmolagentsInstrumentor

# Point the notebook at the Phoenix collector running on this machine.
os.environ['DLAI_LOCAL_URL'] = 'http://localhost:6006/'

# `.format(port=...)` is a no-op for the URL set above (it has no `{port}` placeholder);
# the call is kept so a templated URL would still be filled in.
traces_endpoint = os.getenv('DLAI_LOCAL_URL').format(port='6006') + "v1/traces"

tracer_provider = register(project_name=PROJECT_NAME, endpoint=traces_endpoint)

# Hook smolagents into the tracer provider registered above.
SmolagentsInstrumentor().instrument(tracer_provider=tracer_provider)

🔭 OpenTelemetry Tracing Details 🔭
|  Phoenix Project: Customer-Success
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: http://localhost:6006/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



### The following applies to the DeepLearning.AI environment

In [5]:
# Disabled cell: the setup code below is wrapped in a string literal, so none of it
# executes; the notebook only echoes the string back as this cell's output.
"""
import os
from dotenv import load_dotenv, find_dotenv
from phoenix.otel import register
from openinference.instrumentation.smolagents import SmolagentsInstrumentor

tracer_provider = register(
    project_name=PROJECT_NAME,
    #endpoint= get_phoenix_endpoint() + "v1/traces"
    endpoint = os.getenv('DLAI_LOCAL_URL').format(port='6006') + "v1/traces"
)
SmolagentsInstrumentor().instrument(tracer_provider=tracer_provider)
"""

'\nimport os\nfrom dotenv import load_dotenv, find_dotenv\nfrom phoenix.otel import register\nfrom openinference.instrumentation.smolagents import SmolagentsInstrumentor\n\ntracer_provider = register(\n    project_name=PROJECT_NAME,\n    #endpoint= get_phoenix_endpoint() + "v1/traces"\n    endpoint = os.getenv(\'DLAI_LOCAL_URL\').format(port=\'6006\') + "v1/traces"\n)\nSmolagentsInstrumentor().instrument(tracer_provider=tracer_provider)\n'

In [6]:
# Show the collector base URL currently stored in the environment.
print(os.environ.get('DLAI_LOCAL_URL'))

http://localhost:6006/


In [7]:
from dotenv import load_dotenv, find_dotenv
load_dotenv() # load variables from local .env file

from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the HF_API_KEY environment variable
# (os.getenv returns None when it is unset).
# NOTE(review): confirm how login() behaves when it is handed None.
login(os.getenv('HF_API_KEY'))

In [12]:
# Disabled cell: the HfApiModel example below is wrapped in a string literal, so it
# does not run; the notebook only echoes the string back as this cell's output.
"""
from smolagents import HfApiModel

model=HfApiModel("Qwen/Qwen2.5-Coder-32B-Instruct", provider="together")

model([{"role": "user", "content": "Hello!"}])
"""

'\nfrom smolagents import HfApiModel\n\nmodel=HfApiModel("Qwen/Qwen2.5-Coder-32B-Instruct", provider="together")\n\nmodel([{"role": "user", "content": "Hello!"}])\n'

In [13]:
# Import ChatMessage
from smolagents.models import LiteLLMModel, ChatMessage
from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel, VisitWebpageTool, ToolCallingAgent, ToolCollection

openai_api_key = os.getenv("OPENAI_API_KEY")

# LiteLLM-backed chat model; swap model_id for "openai/gpt-4" to try the larger model.
model = LiteLLMModel(model_id="openai/gpt-4o-mini", max_tokens=2048)

# Smoke test: the model is called with a list of ChatMessage objects.
messages = [ChatMessage(role="user", content="Hello!")]
response = model(messages)

print(response)

ChatMessage(role='assistant', content='Hello! How can I assist you today?', tool_calls=None, raw=ModelResponse(id='chatcmpl-BrctATfhFzqrodYek55PkcCAJaeMI', created=1752120572, model='gpt-4o-mini-2024-07-18', object='chat.completion', system_fingerprint='fp_34a54ae93c', choices=[Choices(finish_reason='stop', index=0, message=Message(content='Hello! How can I assist you today?', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, annotations=[]), provider_specific_fields={})], usage=Usage(completion_tokens=9, prompt_tokens=9, total_tokens=18, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0, text_tokens=None), prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=0, cached_tokens=0, text_tokens=None, image_tokens=None)), service_tier='default'), token_usage=TokenUsage(input_tokens=9, output_tokens=9, total_tokens=18))


In [14]:
# Base URL of the local tracing display -- open it to inspect the traces:
print(os.getenv('DLAI_LOCAL_URL').format(port='6006'))

http://localhost:6006/


## Trace an agent run

In [16]:
# A CodeAgent with no user-supplied tools, driven by the model configured earlier.
agent = CodeAgent(tools=[], model=model)

> **Note:** the following cell will sometimes hit a timeout when talking to the tracing package over its network interface. If this happens, run the cell again.


In [17]:
# Ask the tool-less agent a question; the value returned by run() is shown as the cell output.
agent.run("What is the 100th Fibonacci number?")

354224848179261915075

In [18]:
# Base URL of the local tracing display -- open it to inspect the traces:
print(os.getenv('DLAI_LOCAL_URL').format(port='6006'))

http://localhost:6006/


## Setup ice cream production system

In [19]:
from smolagents import tool
from typing import Dict

# Menu: item name -> unit price in dollars.
menu_prices = {
    "crepe nutella": 1.50,
    "vanilla ice cream": 2,
    "maple pancake": 1.,
}

# Pre-orders keyed by client session id; filled in by `place_order`.
ORDER_BOOK = {}

@tool
def place_order(quantities: Dict[str, int], session_id: int) -> None:
    """Places a pre-order of snacks.

    Args:
        quantities: a dictionary with names as keys and quantities as values
        session_id: the id for the client session
    """
    # NOTE: the docstring above is intentionally left unchanged -- `@tool` uses it
    # (together with the type hints) to build the tool description given to the agent.
    global ORDER_BOOK
    assert isinstance(quantities, dict), "Incorrect type for the input dictionary!"
    # `all(...)` is essential: asserting on the bare list was truthy for every
    # non-empty order, so unknown item names were never rejected.
    assert all(key in menu_prices for key in quantities), f"All food names should be within {menu_prices.keys()}"
    # Store the order under the session id; a later order for the same session
    # replaces the earlier one.
    ORDER_BOOK[session_id] = quantities

@tool
def get_prices(quantities: Dict[str, int]) -> str:
    """Gets price for certain quantities of ice cream.

    Args:
        quantities: a dictionary with names as keys and quantities as values
    """
    # NOTE: the docstring above is intentionally left unchanged -- `@tool` uses it
    # (together with the type hints) to build the tool description given to the agent.
    assert isinstance(quantities, dict), "Incorrect type for the input dictionary!"
    # `all(...)` is essential: asserting on the bare list was truthy for every
    # non-empty order, so an unknown item slipped through and surfaced as a raw
    # KeyError in the price lookup below instead of this explanatory message.
    assert all(key in menu_prices for key in quantities), f"All food names should be within {menu_prices.keys()}"
    # Unit price times requested quantity, summed over every item in the order.
    total_price = sum(menu_prices[key] * value for key, value in quantities.items())
    return (
        f"Given the current menu prices:\n{menu_prices}\nThe total price for your order would be: ${total_price}"
    )

In [20]:
# Agent for the ice cream shop: it can quote prices and record pre-orders.
order_agent = CodeAgent(model=model, tools=[place_order, get_prices])

In [21]:
# The session id is not part of the request text, so it is handed over through
# additional_args; place_order requires it as its `session_id` argument.
# NOTE(review): presumably additional_args entries are made available to the agent's
# generated code as variables -- confirm against the smolagents documentation.
order_agent.run(
    "Could I come and collect one crepe nutella?",
    additional_args={"session_id": 192}
)

'Order for one crepe Nutella has been successfully placed.'

### Try multiple orders

In [22]:
# Evaluation set: each entry is (request text, expected tool name).
# An expected tool of None means the request should be answered without calling
# any tool other than `final_answer` (see how the scoring cell treats None).
client_requests = [
    ("Could I come and collect one crepe nutella?", "place_order"),
    ("What would be the price for 1 crêpe nutella + 2 pancakes?", "get_prices"),
    ("How did you start your ice-cream business?", None),
    ("What's the weather at the Louvre right now?", None),
    ("I'm not sure if I should order. I want a vanilla ice cream. but if it's more expensive than $1, I don't want it. If it's below, I'll order it, please.", "place_order")
]

In [23]:
# Run every evaluation request through the order agent; only the request text is
# needed here, the expected tool is used later when scoring.
for request_text, _expected_tool in client_requests:
    order_agent.run(
        request_text,
        additional_args={"session_id": 0, "menu_prices": menu_prices},
    )

In [24]:
import phoenix as px

# Fetch the spans recorded for this project from Phoenix as a DataFrame.
phoenix_client = px.Client()
spans = phoenix_client.get_spans_dataframe(project_name=PROJECT_NAME)
spans.head(20)

Unnamed: 0_level_0,name,span_kind,parent_id,start_time,end_time,status_code,status_message,events,context.span_id,context.trace_id,...,attributes.llm.output_messages,attributes.llm.token_count.completion,attributes.llm.model_name,attributes.output.value,attributes.output.mime_type,attributes.llm.token_count.total,attributes.tool.name,attributes.tool.description,attributes.tool.parameters,attributes.smolagents
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0411dc58057e02c9,LiteLLMModel.__call__,LLM,9e3bfac2a351d3d0,2025-07-10 04:00:29.332314+00:00,2025-07-10 04:00:29.334020+00:00,ERROR,AttributeError: 'dict' object has no attribute...,"[{'name': 'exception', 'timestamp': '2025-07-1...",0411dc58057e02c9,780c4c33db16220f0881d0a9ae16f7da,...,,,,,,,,,,
9e3bfac2a351d3d0,LiteLLMModel.__call__,LLM,2e572cad2e907193,2025-07-10 04:00:29.332140+00:00,2025-07-10 04:00:29.349598+00:00,ERROR,AttributeError: 'dict' object has no attribute...,"[{'name': 'exception', 'timestamp': '2025-07-1...",9e3bfac2a351d3d0,780c4c33db16220f0881d0a9ae16f7da,...,,,,,,,,,,
2e572cad2e907193,LiteLLMModel.__call__,LLM,,2025-07-10 04:00:29.332049+00:00,2025-07-10 04:00:29.351971+00:00,ERROR,AttributeError: 'dict' object has no attribute...,"[{'name': 'exception', 'timestamp': '2025-07-1...",2e572cad2e907193,780c4c33db16220f0881d0a9ae16f7da,...,,,,,,,,,,
ab952faeadbaf143,LiteLLMModel.__call__,LLM,9965b16537e59a23,2025-07-10 04:09:11.655406+00:00,2025-07-10 04:09:13.161039+00:00,OK,,[],ab952faeadbaf143,9b1e7caf1300743f0dd29390fda32328,...,"[{'message.role': 'assistant', 'message.conten...",9.0,openai/gpt-4o-mini,"{""role"": ""assistant"", ""content"": ""Hello! How c...",application/json,18.0,,,,
9965b16537e59a23,LiteLLMModel.__call__,LLM,6e61b0b1f46c026a,2025-07-10 04:09:11.655190+00:00,2025-07-10 04:09:13.169922+00:00,OK,,[],9965b16537e59a23,9b1e7caf1300743f0dd29390fda32328,...,"[{'message.role': 'assistant', 'message.conten...",9.0,openai/gpt-4o-mini,"{""role"": ""assistant"", ""content"": ""Hello! How c...",application/json,18.0,,,,
6e61b0b1f46c026a,LiteLLMModel.__call__,LLM,,2025-07-10 04:09:11.654865+00:00,2025-07-10 04:09:13.174450+00:00,OK,,[],6e61b0b1f46c026a,9b1e7caf1300743f0dd29390fda32328,...,"[{'message.role': 'assistant', 'message.conten...",9.0,openai/gpt-4o-mini,"{""role"": ""assistant"", ""content"": ""Hello! How c...",application/json,18.0,,,,
2958737f9f741984,LiteLLMModel.__call__,LLM,4385b5c60152f7e5,2025-07-10 04:09:31.496080+00:00,2025-07-10 04:09:32.702561+00:00,OK,,[],2958737f9f741984,953f096420a3bca76131fad599226e2a,...,"[{'message.role': 'assistant', 'message.conten...",9.0,openai/gpt-4o-mini,"{""role"": ""assistant"", ""content"": ""Hello! How c...",application/json,18.0,,,,
4385b5c60152f7e5,LiteLLMModel.__call__,LLM,0164c2160cb39c63,2025-07-10 04:09:31.495989+00:00,2025-07-10 04:09:32.722188+00:00,OK,,[],4385b5c60152f7e5,953f096420a3bca76131fad599226e2a,...,"[{'message.role': 'assistant', 'message.conten...",9.0,openai/gpt-4o-mini,"{""role"": ""assistant"", ""content"": ""Hello! How c...",application/json,18.0,,,,
0164c2160cb39c63,LiteLLMModel.__call__,LLM,,2025-07-10 04:09:31.495796+00:00,2025-07-10 04:09:32.729020+00:00,OK,,[],0164c2160cb39c63,953f096420a3bca76131fad599226e2a,...,"[{'message.role': 'assistant', 'message.conten...",9.0,openai/gpt-4o-mini,"{""role"": ""assistant"", ""content"": ""Hello! How c...",application/json,18.0,,,,
1b0c0d902af264a5,FinalAnswerTool,TOOL,74cafc55e316d1cb,2025-07-10 04:10:52.644505+00:00,2025-07-10 04:10:52.644680+00:00,OK,,[],1b0c0d902af264a5,7dc01dfdbdf1ac814f5bb0876cc66ea9,...,,,,,,,final_answer,Provides a final answer to the given problem.,"{'answer': {'type': 'any', 'description': 'The...",


### Add processing to extract desired information

In [25]:
import pandas as pd
import json

def _task_from_input(raw_input):
    """Return the 'task' field of a JSON-encoded span input, or None for non-string values."""
    if not isinstance(raw_input, str):
        return None
    return json.loads(raw_input).get('task')


# Agent-level spans, each annotated with the task text found in its input payload.
agents = spans[spans['span_kind'] == 'AGENT'].copy()
agents['task'] = agents['attributes.input.value'].apply(_task_from_input)

# Tool-level spans, trimmed to the columns needed to attribute them to a trace.
tool_columns = ["attributes.tool.name", "attributes.input.value", "context.trace_id"]
tools = spans.loc[spans['span_kind'] == 'TOOL', tool_columns].copy()

# One row per (agent run, tool call); the left join keeps agent runs even when
# no tool span shares their trace id.
agent_columns = ["name", "start_time", "task", "context.trace_id"]
tools_per_task = pd.merge(
    agents[agent_columns],
    tools,
    how="left",
    on="context.trace_id",
)
tools_per_task.head()

Unnamed: 0,name,start_time,task,context.trace_id,attributes.tool.name,attributes.input.value
0,CodeAgent.run,2025-07-10 04:10:48.093525+00:00,What is the 100th Fibonacci number?,7dc01dfdbdf1ac814f5bb0876cc66ea9,final_answer,"{""args"": [354224848179261915075], ""sanitize_in..."
1,CodeAgent.run,2025-07-10 04:12:32.993421+00:00,Could I come and collect one crepe nutella?,328284bed94dfc51cf6871a00f694440,place_order,"{""args"": [], ""sanitize_inputs_outputs"": false,..."
2,CodeAgent.run,2025-07-10 04:12:32.993421+00:00,Could I come and collect one crepe nutella?,328284bed94dfc51cf6871a00f694440,final_answer,"{""args"": [""Order for one crepe Nutella has bee..."
3,CodeAgent.run,2025-07-10 04:13:02.418611+00:00,Could I come and collect one crepe nutella?,0e051005f5d390f92fc4770274c324f1,place_order,"{""args"": [], ""sanitize_inputs_outputs"": false,..."
4,CodeAgent.run,2025-07-10 04:13:02.418611+00:00,Could I come and collect one crepe nutella?,0e051005f5d390f92fc4770274c324f1,final_answer,"{""args"": [""Your order for one crepe nutella ha..."


### Now, compare tool calls with expected tool calls

In [26]:
from typing import Iterable, Optional


def score_request(expected_tool: Optional[str], tool_calls: Iterable[str]) -> bool:
    """Decide whether the tools an agent called match what a request expected.

    Args:
        expected_tool: name of the tool the request should trigger, or None when
            the request should be answered without any tool besides `final_answer`.
        tool_calls: names of the tools that were called; any iterable is accepted
            (list, set, ...) and duplicates are ignored.

    Returns:
        True when `expected_tool` is None and `final_answer` is the only tool
        called, or when `expected_tool` is among the tools called; False otherwise.
    """
    # Normalise to a set so the comparison below also works for lists: a list
    # never compares equal to a set, which made every list input score False.
    called = set(tool_calls)
    if expected_tool is None:
        return called == {"final_answer"}
    return expected_tool in called

# Score each evaluation request against the tools that were actually called for it.
results = []
for request, expected_tool in client_requests:
    # Collect the distinct tool names over every traced run of this exact request text.
    rows_for_request = tools_per_task["task"] == request
    tool_calls = set(tools_per_task.loc[rows_for_request, "attributes.tool.name"].tolist())
    results.append(
        dict(
            request=request,
            tool_calls_performed=tool_calls,
            is_correct=score_request(expected_tool, tool_calls),
        )
    )
pd.DataFrame(results)

Unnamed: 0,request,tool_calls_performed,is_correct
0,Could I come and collect one crepe nutella?,"{final_answer, place_order}",True
1,What would be the price for 1 crêpe nutella + ...,{final_answer},False
2,How did you start your ice-cream business?,"{get_prices, final_answer, place_order}",False
3,What's the weather at the Louvre right now?,{final_answer},True
4,I'm not sure if I should order. I want a vanil...,{final_answer},False
