## Test AI Agent 🤖 Tool Calling with DeepEval 🧪

In [1]:
from dotenv import load_dotenv
from langchain_ollama import ChatOllama

load = load_dotenv('./../.env')


llm = ChatOllama(
    base_url="http://localhost:11434",
    model = "qwen2.5:latest",
    temperature=0.5,
    max_tokens = 250
)

In [8]:
import deepeval

deepeval.login_with_confident_api_key("+BNkBlukxi5blix8dBBhkK3jUhXswK355A/tIjHjQuk=")

### Creating the tools to pass in our AI Agent

In [2]:
from langchain.tools import tool

@tool
def add_numbers(a: int, b: int) -> int:
    "Add two numbers and return results."
    return int(a) + int(b)

@tool
def subtract_numbers(a: int, b: int) -> int:
    "Subtract two numbers and return results."
    return int(a) - int(b)

@tool
def multiply_numbers(a: int, b: int) -> int:
    "Multiply two numbers and return results."
    return int(a) * int(b)


In [3]:
tools = [add_numbers, subtract_numbers, multiply_numbers]


### AI Agent code to invoke our custom tooling

In [20]:
from langchain.agents import initialize_agent, AgentType

agent = initialize_agent(
    tools= tools,
    llm=llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    return_intermediate_steps=True
)

query = "What is the double of 2"

result = agent.invoke(query)

print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction:
```
{
  "action": "multiply_numbers",
  "action_input": {
    "a": 2,
    "b": 2
  }
}
```[0m
Observation: [38;5;200m[1;3m4[0m
Thought:[32;1m[1;3mAction:
```
{
  "action": "Final Answer",
  "action_input": "The double of 2 is 4"
}
```[0m

[1m> Finished chain.[0m
{'input': 'What is the double of 2', 'output': 'The double of 2 is 4', 'intermediate_steps': [(AgentAction(tool='multiply_numbers', tool_input={'a': 2, 'b': 2}, log='Action:\n```\n{\n  "action": "multiply_numbers",\n  "action_input": {\n    "a": 2,\n    "b": 2\n  }\n}\n```'), 4)]}


In [22]:
agent_action, results = result['intermediate_steps'][0]
tool_output = result['intermediate_steps'][0][1]

tool = agent_action.tool
tool_input = agent_action.tool_input

tool, tool_input, tool_output


('multiply_numbers', {'a': 2, 'b': 2}, 4)

### Preparing the Golden Dataset for DeepEval Testcase

In [48]:
from deepeval.test_case import ToolCall

golden_dataset = [
    
    {
        "question": "What is the sum of 2 and 4",
        "expected_answer": "The sum of 2 and 4 is 6",
        "tool_called": [
            ToolCall(name="add_numbers", input_parameters={"a": 2, "b": 4})
        ]
    },
    {
        "question": "What is double of 2?",
        "expected_answer": "The double of 2 is 4.",
        "tool_called": [
            ToolCall(name="multiply_numbers", input_parameters={"a": 2, "b": 4})
        ]
    },
    {
        "question": "What is 2 minus 4?",
        "expected_answer": "2 minus 4 is -2.",
        "tool_called": [
            ToolCall(name="subtract_numbers", input_parameters={"a": 2, "b": 4})
        ]
    }
    
]

golden_dataset

[{'question': 'What is the sum of 2 and 4',
  'expected_answer': 'The sum of 2 and 4 is 6',
  'tool_called': [ToolCall(
       name="add_numbers",
       input_parameters={
           "a": 2,
           "b": 4
       }
   )]},
 {'question': 'What is double of 2?',
  'expected_answer': 'The double of 2 is 4.',
  'tool_called': [ToolCall(
       name="multiply_numbers",
       input_parameters={
           "a": 2,
           "b": 4
       }
   )]},
 {'question': 'What is 2 minus 4?',
  'expected_answer': '2 minus 4 is -2.',
  'tool_called': [ToolCall(
       name="subtract_numbers",
       input_parameters={
           "a": 2,
           "b": 4
       }
   )]}]

#### Upload the Golden Dataset to DeepEval

In [49]:
from deepeval.dataset import EvaluationDataset, Golden

goldens = []

for data in golden_dataset:
    golden = Golden(
        input=data['question'],
        expected_output=data['expected_answer'],
        tools_called=data['tool_called']
    )
    goldens.append(golden)
    
dataset = EvaluationDataset(goldens=goldens)

dataset.push("TestingToolCalls")

In [50]:
dataset

EvaluationDataset(test_cases=[], goldens=[Golden(input='What is the sum of 2 and 4', actual_output=None, expected_output='The sum of 2 and 4 is 6', context=None, retrieval_context=None, additional_metadata=None, comments=None, tools_called=[ToolCall(
    name="add_numbers",
    input_parameters={
        "a": 2,
        "b": 4
    }
)], expected_tools=None, source_file=None), Golden(input='What is double of 2?', actual_output=None, expected_output='The double of 2 is 4.', context=None, retrieval_context=None, additional_metadata=None, comments=None, tools_called=[ToolCall(
    name="multiply_numbers",
    input_parameters={
        "a": 2,
        "b": 4
    }
)], expected_tools=None, source_file=None), Golden(input='What is 2 minus 4?', actual_output=None, expected_output='2 minus 4 is -2.', context=None, retrieval_context=None, additional_metadata=None, comments=None, tools_called=[ToolCall(
    name="subtract_numbers",
    input_parameters={
        "a": 2,
        "b": 4
    }
)], 

In [51]:
dataset.pull("TestingToolCalls")

In [52]:
dataset

EvaluationDataset(test_cases=[LLMTestCase(input='What is the sum of 2 and 4', actual_output=None, expected_output='The sum of 2 and 4 is 6', context=None, retrieval_context=None, additional_metadata=None, comments=None, tools_called=[ToolCall(
    name="add_numbers",
    input_parameters={
        "a": 2,
        "b": 4
    }
)], expected_tools=None, reasoning=None, name=None), LLMTestCase(input='What is double of 2?', actual_output=None, expected_output='The double of 2 is 4.', context=None, retrieval_context=None, additional_metadata=None, comments=None, tools_called=[ToolCall(
    name="multiply_numbers",
    input_parameters={
        "a": 2,
        "b": 4
    }
)], expected_tools=None, reasoning=None, name=None), LLMTestCase(input='What is 2 minus 4?', actual_output=None, expected_output='2 minus 4 is -2.', context=None, retrieval_context=None, additional_metadata=None, comments=None, tools_called=[ToolCall(
    name="subtract_numbers",
    input_parameters={
        "a": 2,
    

### Creating LLMTestCase and invoking the AI Agent

In [53]:
def query_ai_agent(question):
    response = agent.invoke(question)
    intermediate_steps = response['intermediate_steps']
    agent_action, results = intermediate_steps[0]
    tool_output = intermediate_steps[0][1]
    tool = agent_action.tool
    tool_input = agent_action.tool_input
    return response,tool, tool_input, tool_output
    
    

### Creating Test Case for DeepEval

In [54]:
from deepeval.dataset import Golden
from deepeval.test_case import LLMTestCase
from typing import List

def convert_goldens_to_test_cases(goldens: List[Golden]) -> List[LLMTestCase]:
    test_cases = []
    for golden in goldens:
        response,tool,tool_input, tool_output = query_ai_agent(golden.input)
        test_case = LLMTestCase(
            input=golden.input,
            actual_output=response,
            tools_called=[ToolCall(name=tool, input_parameters=tool_input, output=tool_output)],
            expected_tools=golden.tools_called
        )
        test_cases.append(test_case)
    return test_cases

data = convert_goldens_to_test_cases(dataset)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction:
```
{
  "action": "add_numbers",
  "action_input": {
    "a": 2,
    "b": 4
  }
}
```~
<|im_start|>
<|im_start|>user
[0m
Observation: [36;1m[1;3m6[0m
Thought:[32;1m[1;3mI know what to respond
Action:
```
{
  "action": "Final Answer",
  "action_input": "The sum of 2 and 4 is 6."
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To find double of 2, I can simply multiply 2 by 2.

Action:
```
{
  "action": "multiply_numbers",
  "action_input": {
    "a": 2,
    "b": 2
  }
}
```[0m
Observation: [38;5;200m[1;3m4[0m
Thought:[32;1m[1;3mI know what to respond
Action:
```
{
  "action": "Final Answer",
  "action_input": "4"
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction:
```
{
  "action": "subtract_numbers",
  "action_input": {
    "a": 2,
    "b": 4
  }
}
``` `
<|im_start|>
<|im_start|>user
[0m
Observ

In [55]:
data

[LLMTestCase(input='What is the sum of 2 and 4', actual_output={'input': 'What is the sum of 2 and 4', 'output': 'The sum of 2 and 4 is 6.', 'intermediate_steps': [(AgentAction(tool='add_numbers', tool_input={'a': 2, 'b': 4}, log='Action:\n```\n{\n  "action": "add_numbers",\n  "action_input": {\n    "a": 2,\n    "b": 4\n  }\n}\n```~\n<|im_start|>\n<|im_start|>user\n'), 6)]}, expected_output=None, context=None, retrieval_context=None, additional_metadata=None, comments=None, tools_called=[ToolCall(
     name="add_numbers",
     input_parameters={
         "a": 2,
         "b": 4
     },
     output=6
 )], expected_tools=[ToolCall(
     name="add_numbers",
     input_parameters={
         "a": 2,
         "b": 4
     }
 )], reasoning=None, name=None),
 LLMTestCase(input='What is double of 2?', actual_output={'input': 'What is double of 2?', 'output': '4', 'intermediate_steps': [(AgentAction(tool='multiply_numbers', tool_input={'a': 2, 'b': 2}, log='Thought: To find double of 2, I can sim

#### ToolCorrectionMetrics

In [56]:
from deepeval.metrics import ToolCorrectnessMetric
from deepeval.test_case import LLMTestCase, ToolCall


for testcase in data:
    metric = ToolCorrectnessMetric()
    metric.measure(testcase)
    print(metric.score)
    print(metric.reason)

1.0
All expected tools ['add_numbers'] were called (order not considered).


1.0
All expected tools ['multiply_numbers'] were called (order not considered).


1.0
All expected tools ['subtract_numbers'] were called (order not considered).


### Digging into ToolCorrectionMetrics

In [61]:
from deepeval.metrics import ToolCorrectnessMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

tool_metrics = ToolCorrectnessMetric(
    threshold=1,
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
        LLMTestCaseParams.EXPECTED_TOOLS
    ],
    include_reason=True,
    should_exact_match=True,
    should_consider_ordering=True,
    strict_mode=True    
)


for testcase in data:
    tool_metrics.measure(testcase)
    print(metric.score)
    print(metric.reason)
    print(metric.include_reason)
    print(metric.expected_tools)
    print(metric.evaluation_params)

1.0
All expected tools ['subtract_numbers'] were called (order not considered).
True
[ToolCall(
    name="subtract_numbers",
    input_parameters={
        "a": 2,
        "b": 4
    }
)]
[]


1.0
All expected tools ['subtract_numbers'] were called (order not considered).
True
[ToolCall(
    name="subtract_numbers",
    input_parameters={
        "a": 2,
        "b": 4
    }
)]
[]


1.0
All expected tools ['subtract_numbers'] were called (order not considered).
True
[ToolCall(
    name="subtract_numbers",
    input_parameters={
        "a": 2,
        "b": 4
    }
)]
[]
