In [7]:

import importlib
from pathlib import Path
import sys

# Make the parent of the notebook's working directory importable so that the
# `benchmarks` package can be resolved. The membership check keeps repeated
# executions of this cell from piling duplicate entries onto sys.path.
notebook_path = Path().absolute()
project_root = str(notebook_path.parent)
if project_root not in sys.path:
    sys.path.append(project_root)

# Import the setup module and reload it, so that edits made to the module on
# disk are picked up without restarting the kernel.
import benchmarks.setup_benchmark_env
importlib.reload(benchmarks.setup_benchmark_env)
from benchmarks.setup_benchmark_env import setup_benchmark_environment

# Set up the benchmark environment
config = setup_benchmark_environment()

# Print selected fields one by one rather than dumping the whole config object.
print("*** Benchmark environment setup ***")
print(f"Specialist ID: {config['specialist_id']}")
print(f"Model: {config['model']}")
print(f"Runs per test: {config['runs_per_test']}")
print(f"User ID: {config['user_id']}")


*** Benchmark environment setup ***
Specialist ID: hubspot
Model: gpt-4o
Runs per test: 1
User ID: benchmark_test1


In [8]:
import json

# Load the benchmark test cases. The encoding is passed explicitly because
# open() otherwise falls back to a platform-dependent default, which can
# mis-decode non-ASCII characters in a UTF-8 JSON file.
with open("hubspot_benchmark.json", encoding="utf-8") as f:
    bench_data = json.load(f)

# Show the loaded cases as the cell output.
bench_data

[{'intent': 'List latest contacts.',
  'expected_tool_calls': [{'tool_name': 'hubspot__list-or-search-contacts__ListOrSearchContacts',
    'tool_input': '{"limit":10,"sorts":"createdate:DESCENDING"}',
    'optional_tool_input_keys': ['limit'],
    'tool_output': {'status': 'completed'}}]},
 {'intent': 'Megan needs to create a contact record for David Clarkson, david.clarkson@innovatech.com, +44 20 7946 0123, innovatech.com so the sales team can track and engage with him.',
  'expected_tool_calls': [{'tool_name': 'hubspot__create-contact__CreateContact',
    'tool_input': '{"email":"david.clarkson@innovatech.com","phone":"+44 20 7946 0123","website":"innovatech.com","firstName":"David","lastName":"Clarkson"}',
    'optional_tool_input_keys': ['website', 'company'],
    'tool_output': {'status': 'completed'}}]},
 {'intent': "Greg needs to search for the deal 'Corporate Financial Overhaul' to check its status and recent activity.",
  'expected_tool_calls': [{'tool_name': 'hubspot__list-or

In [9]:
import json
import sys
import os
import importlib
from pathlib import Path

# Make the parent of the notebook's working directory importable so that the
# `benchmarks` package can be resolved. The membership check keeps repeated
# executions of this cell from piling duplicate entries onto sys.path.
notebook_path = Path().absolute()
project_root = str(notebook_path.parent)
if project_root not in sys.path:
    sys.path.append(project_root)

# Reload the helper module so edits on disk take effect without a kernel restart.
import benchmarks.superface_client
importlib.reload(benchmarks.superface_client)
from benchmarks.superface_client import get_configuration_url

# Import the Superface client
import superface.client
importlib.reload(superface.client)
from superface.client import Superface

# Initialize Superface.
# NOTE: this rebinds the name `superface` from the imported package to the
# client instance. Re-running the cell still works because the
# `import superface.client` statement above binds the package name again first.
superface = Superface(
    api_key=config['superface_api_key']
)

# The user id is taken from the benchmark config; an empty or missing value
# aborts the cell before any remote call is made.
user_id = config['user_id']
if not user_id:
    raise ValueError("SUPERFACE_BENCHMARK_USER_ID environment variable is required")

# Collect the distinct, non-empty tool names referenced by the benchmark cases.
# `or []` also covers a case whose 'expected_tool_calls' is present but null.
unique_tools = {
    expected_call.get('tool_name')
    for test_case in bench_data
    for expected_call in (test_case.get('expected_tool_calls') or [])
    if expected_call.get('tool_name')
}

print(f"Found {len(unique_tools)} unique tools in benchmark data:")
# A set of strings iterates in a hash-dependent order that can differ between
# kernel sessions; sorting keeps the printed list stable across runs.
for tool in sorted(unique_tools):
    print(f"- {tool}")

# Probe the connection state of every required tool and record the outcome
# under the tool's name.
tool_connection_status = {}

for tool in unique_tools:
    try:
        connection_info = superface.is_tool_connected(user_id=user_id, tool_name=tool)
        is_connected = connection_info.get("connected", False)
        tool_connection_status[tool] = dict(
            provider=connection_info.get("provider"),
            connected=is_connected,
        )
        if not is_connected:
            print(f"ERROR: Tool {tool} is not connected!")
    except Exception as e:
        # A failed check is reported and stored as "not connected" together
        # with the error text, so one bad tool does not abort the whole loop.
        print(f"Error checking connection for {tool}: {str(e)}")
        tool_connection_status[tool] = dict(
            provider=None,
            connected=False,
            error=str(e),
        )

# The benchmark may proceed only when every recorded entry is connected;
# entries created by the error path above always count as disconnected.
all_tools_connected = all(
    status["connected"] for status in tool_connection_status.values()
)

# Stop execution if any tool is not connected
if not all_tools_connected:
    print("\nBenchmark cannot proceed: Some required tools are not connected.")

    # Default to None so the failure report below can still be built when the
    # configuration URL lookup itself raises; without this the dict literal
    # would fail with a NameError instead of reaching the intended SystemExit.
    config_url = None
    try:
        config_url = get_configuration_url(
            user_id=user_id
        )
        print("\nPlease visit the following URL to configure your tools:")
        print(f"{config_url}")
    except Exception as e:
        print(f"Error getting configuration URL: {str(e)}")

    # Record why the benchmark did not run before halting the cell.
    results = {
        "status": "failed",
        "reason": "Some required tools are not connected, use the configuration URL to connect them",
        "tool_connection_status": tool_connection_status,
        "configuration_url": config_url
    }
    raise SystemExit("Benchmark stopped due to disconnected tools")

# Reaching this point means every required tool reported as connected.
print("\nAll required tools are connected. Proceeding with benchmark.")

# Bundle the readiness state together with the loaded benchmark cases.
results = dict(
    status="ready",
    benchmark_data=bench_data,
    tool_connection_status=tool_connection_status,
)

Found 5 unique tools in benchmark data:
- hubspot__list-or-search-companies__ListOrSearchCompanies
- hubspot__list-or-search-engagements__ListOrSearchEngagements
- hubspot__list-or-search-deals__ListOrSearchDeals
- hubspot__list-or-search-contacts__ListOrSearchContacts
- hubspot__create-contact__CreateContact

All required tools are connected. Proceeding with benchmark.


In [10]:
# Run the benchmark tests
import time
import json
import statistics
from datetime import datetime

# Reload the agent runner so edits to the module on disk take effect without
# restarting the kernel. `importlib` is not imported in this cell; it is
# expected to be in scope from an earlier cell.
import benchmarks.run_agent
importlib.reload(benchmarks.run_agent)
from benchmarks.run_agent import run

# Same reload pattern for the tool-call comparison helper.
import benchmarks.compare_tool_calls
importlib.reload(benchmarks.compare_tool_calls)
from benchmarks.compare_tool_calls import compare_tool_calls

# Get number of runs per test from environment variable, default to 3.
# `os` is likewise expected to be in scope from an earlier cell. The value is
# read directly from the environment here, not from config['runs_per_test'].
NUM_RUNS_PER_TEST = int(os.getenv("BENCHMARK_RUNS_PER_TEST", 3))

# Function to run a single benchmark test
def run_benchmark_test(test_case, user_id):
    """Run one benchmark case NUM_RUNS_PER_TEST times and summarise the runs.

    Each run calls ``run(...)`` with the case's intent, reads ``'tool_calls'``
    from the returned mapping and compares them with the expected calls via
    ``compare_tool_calls``. An exception raised during a run is caught and
    recorded as a failed, non-matching run; it does not abort the test.

    Args:
        test_case: Mapping with an ``'intent'`` prompt and an optional
            ``'expected_tool_calls'`` list.
        user_id: User identifier forwarded to ``run``.

    Returns:
        A dict with the intent, the expected tool calls, the per-run records
        under ``'runs'``, aggregate figures under ``'stats'``, and top-level
        ``'success'``, ``'tool_calls_match'``, ``'duration_seconds'``,
        ``'result'`` (result of the last run, only when that run succeeded)
        and ``'error'`` (message of the most recent failed run, or ``None``
        when every run succeeded).
    """
    intent = test_case.get('intent')
    expected_tool_calls = test_case.get('expected_tool_calls') or []

    print(f"Running test: {intent}")
    print(f"Expected tool calls: {json.dumps(expected_tool_calls, indent=2, default=str)}")

    # Run the test multiple times
    run_results = []
    test_start_time = time.time()

    for run_num in range(NUM_RUNS_PER_TEST):
        print(f"\n--- Run {run_num + 1}/{NUM_RUNS_PER_TEST} ---")

        start_time = time.time()
        try:
            result = run(
                prompt=intent,
                specialist_id=config['specialist_id'],
                user_id=user_id,
                model=config['model']
            )

            # Extract actual tool calls from the result
            actual_tool_calls = result.get('tool_calls', [])

            # default=str keeps this log line from raising on values that are
            # not JSON-serializable, which would otherwise be recorded as a
            # failed run even though the agent call itself succeeded.
            print(f"Extracted tool calls: {json.dumps(actual_tool_calls, indent=2, default=str)}")

            # Compare expected and actual tool calls
            tool_calls_match, mismatch_reason = compare_tool_calls(expected_tool_calls, actual_tool_calls)
            success = True
            error = None

        except Exception as e:
            result = None
            success = False
            # Fall back to repr() for exceptions that carry no message so the
            # failure is still identifiable in the report.
            error = str(e) or repr(e)
            tool_calls_match = False
            mismatch_reason = f"Exception occurred: {error}"
            actual_tool_calls = []

        duration = time.time() - start_time

        run_results.append({
            "run_number": run_num + 1,
            "actual_tool_calls": actual_tool_calls,
            "tool_calls_match": tool_calls_match,
            "mismatch_reason": mismatch_reason if not tool_calls_match else None,
            "result": result,
            "success": success,
            "error": error,
            "duration_seconds": duration
        })

        print(f"Run completed in {duration:.2f} seconds")
        print(f"Success: {success}")
        if not success:
            print(f"Error: {error}")

        # Add a small delay between runs
        if run_num < NUM_RUNS_PER_TEST - 1:
            time.sleep(2)

    # Calculate statistics across runs. Rates are based on the runs actually
    # executed, so a non-positive NUM_RUNS_PER_TEST yields 0% instead of a
    # ZeroDivisionError.
    total_runs = len(run_results)
    successful_runs = sum(1 for r in run_results if r['success'])
    matching_runs = sum(1 for r in run_results if r['tool_calls_match'])
    success_rate = successful_runs / total_runs * 100 if total_runs else 0.0
    tool_call_match_rate = matching_runs / total_runs * 100 if total_runs else 0.0
    durations = [r['duration_seconds'] for r in run_results]
    avg_duration = statistics.mean(durations) if durations else 0
    min_duration = min(durations) if durations else 0
    max_duration = max(durations) if durations else 0
    std_dev_duration = statistics.stdev(durations) if len(durations) > 1 else 0
    test_duration = time.time() - test_start_time

    # A test passes only if at least one run was executed and all of them passed.
    overall_success = total_runs > 0 and successful_runs == total_runs
    overall_tool_calls_match = total_runs > 0 and matching_runs == total_runs

    # Collect mismatch reasons and count how often each one occurred
    mismatch_summary = {}
    for r in run_results:
        reason = r['mismatch_reason']
        if not r['tool_calls_match'] and reason:
            mismatch_summary[reason] = mismatch_summary.get(reason, 0) + 1

    print(f"\n--- Test Statistics ---")
    print(f"Success rate: {success_rate:.2f}%")
    print(f"Tool call match rate: {tool_call_match_rate:.2f}%")
    print(f"Average duration: {avg_duration:.2f} seconds")
    print(f"Min duration: {min_duration:.2f} seconds")
    print(f"Max duration: {max_duration:.2f} seconds")
    print(f"Standard deviation: {std_dev_duration:.2f} seconds")

    if mismatch_summary:
        print("\nMismatch reasons:")
        for reason, count in mismatch_summary.items():
            print(f"- {reason} (occurred {count} times)")

    # Report the most recent failure. Looking only at the last run would give
    # None whenever an earlier run failed but the last one succeeded.
    if overall_success:
        final_error = None
    elif not run_results:
        final_error = "No runs were executed"
    else:
        final_error = next(
            (r['error'] for r in reversed(run_results) if r['error']),
            "Unknown error"
        )

    return {
        "intent": intent,
        "expected_tool_calls": expected_tool_calls,
        "runs": run_results,
        "stats": {
            "success_rate": success_rate,
            "tool_call_match_rate": tool_call_match_rate,
            "avg_duration": avg_duration,
            "min_duration": min_duration,
            "max_duration": max_duration,
            "std_dev_duration": std_dev_duration,
            "num_runs": NUM_RUNS_PER_TEST,
            "mismatch_summary": mismatch_summary
        },
        # Add top-level properties for easier access
        "success": overall_success,
        "tool_calls_match": overall_tool_calls_match,
        "duration_seconds": test_duration,
        "result": run_results[-1]["result"] if run_results and run_results[-1]["success"] else None,
        "error": final_error
    }

# Execute every benchmark case in order and gather the per-case summaries.
benchmark_results = []
total_tests = len(bench_data)
for test_number, test_case in enumerate(bench_data, start=1):
    print(f"\n=== Test {test_number}/{total_tests} ===")
    result = run_benchmark_test(test_case, user_id)
    benchmark_results.append(result)
    print(f"Test completed in {result['duration_seconds']:.2f} seconds")
    print(f"Success: {result['success']}")
    if result['success']:
        # Stringify the result (it may be a CrewOutput) and cap the preview
        # at 200 characters.
        result_str = str(result['result'])
        preview = result_str if len(result_str) <= 200 else f"{result_str[:200]}..."
        print(f"Result: {preview}")
    else:
        print(f"Error: {result['error']}")

    # Pause briefly between consecutive tests, but not after the last one.
    if test_number < total_tests:
        time.sleep(2)

# Load the reporting helpers, reloading the module so edits on disk are used.
import benchmarks.benchmark_results
importlib.reload(benchmarks.benchmark_results)
from benchmarks.benchmark_results import (
    compile_benchmark_results,
    save_benchmark_results,
    print_benchmark_summary,
)

# Aggregate the per-case summaries into the final report structure.
final_results = compile_benchmark_results(
    benchmark_results=benchmark_results,
    bench_data=bench_data,
    num_runs_per_test=NUM_RUNS_PER_TEST,
    benchmark_name="HubSpot Superface Specialist",
)

# Show the summary first, then write the report to a file.
print_benchmark_summary(final_results)

results_filename = save_benchmark_results(final_results, prefix="hubspot_benchmark_results")
print(f"\nResults saved to {results_filename}")


=== Test 1/5 ===
Running test: List latest contacts.
Expected tool calls: [
  {
    "tool_name": "hubspot__list-or-search-contacts__ListOrSearchContacts",
    "tool_input": "{\"limit\":10,\"sorts\":\"createdate:DESCENDING\"}",
    "optional_tool_input_keys": [
      "limit"
    ],
    "tool_output": {
      "status": "completed"
    }
  }
]

--- Run 1/1 ---


Overriding of current TracerProvider is not allowed


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Task:[00m [92mList latest contacts.[00m


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Using tool:[00m [92mhubspot[00m
[95m## Tool Input:[00m [92m
"{\"intent\": \"list contacts\", \"data\": {\"order\": \"newest\"}}"[00m
[95m## Tool Output:[00m [92m
{'kind': 'content', 'content': 'Here are the most recently created contacts:\n\n1. **John Doe**\n   - Email: john.doe@acme.com\n   - Created At: March 21, 2025, 11:07 AM\n\n2. **Clara Reynolds**\n   - Email: clara.raynolds@acme.com\n   - Created At: March 21, 2025, 10:21 AM\n\n3. **David Clarkson**\n   - Email: david.clarkson@innovatech.com\n   - Created At: March 21, 2025, 10:02 AM\n\nIf you need more details or further actions, feel free to ask!', 'executions': [{'id': 'call_JEYxBexdSxhIaJqjB1SSM44v', 'groupId': 'pNWLxCx', 'toolCall': {'kind': 'function', 'id': 'call_JEYxBexdSxhIaJqjB1SSM44v', 'groupId': 'W8x5a8L', 'name': 'hubspot__list-or-search-contact

Overriding of current TracerProvider is not allowed


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Task:[00m [92mMegan needs to create a contact record for David Clarkson, david.clarkson@innovatech.com, +44 20 7946 0123, innovatech.com so the sales team can track and engage with him.[00m


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Using tool:[00m [92mhubspot[00m
[95m## Tool Input:[00m [92m
"{\"intent\": \"create_contact\", \"data\": {\"first_name\": \"David\", \"last_name\": \"Clarkson\", \"email\": \"david.clarkson@innovatech.com\", \"phone\": \"+44 20 7946 0123\", \"company\": \"innovatech.com\"}}"[00m
[95m## Tool Output:[00m [92m
{'kind': 'content', 'content': 'The contact "David Clarkson" with the email "david.clarkson@innovatech.com" already exists in the system with the ID 107757103005. If you need to update this contact\'s information, please let me know!', 'executions': [{'id': 'call_HhNVAJ2IUBSRMIPpbZZ3Cm6w', 'groupId': 'aQd8Xkh', 'toolCall': {'kind': 'function', 'id': 'call_HhNVAJ2IUBS

Overriding of current TracerProvider is not allowed


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Task:[00m [92mGreg needs to search for the deal 'Corporate Financial Overhaul' to check its status and recent activity.[00m


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Thought:[00m [92mTo find the status and recent activity of the deal named "Corporate Financial Overhaul," I will utilize our HubSpot CRM capabilities to search for the deal.[00m
[95m## Using tool:[00m [92mhubspot[00m
[95m## Tool Input:[00m [92m
"{\"intent\": \"search deal\", \"data\": {\"deal_name\": \"Corporate Financial Overhaul\"}}"[00m
[95m## Tool Output:[00m [92m
{'kind': 'content', 'content': 'I found a deal with the name "Corporate Financial Overhaul." Here are the details:\n\n- **Deal Name:** New Deal for ACME Ltd\n- **Amount:** $10,000\n- **Deal Stage:** Presentation Scheduled\n- **Pipeline:** Default\n- **Created At:** March 21, 2025\n- **Last Modified:** March 21, 2025\n\nIf you need more information or further actions

Overriding of current TracerProvider is not allowed


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Task:[00m [92mJohn needs to retrieve past engagements (calls, emails, meetings) related to Clara Reynolds to prepare the CEO for an important meeting.[00m


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Using tool:[00m [92mhubspot[00m
[95m## Tool Input:[00m [92m
"{\"intent\": \"find_contact\", \"data\": {\"fullName\": \"Clara Reynolds\"}}"[00m
[95m## Tool Output:[00m [92m
{'kind': 'content', 'content': 'I found the contact "Clara Reynolds" in the system. Here are the details:\n\n- **First Name:** Clara\n- **Last Name:** Reynolds\n- **Email:** clara.raynolds@acme.com\n- **Contact ID:** 107761446931\n- **Created At:** March 21, 2025\n- **Last Modified:** March 21, 2025\n\nIf you need further information or actions related to this contact, please let me know!', 'executions': [{'id': 'call_KroD4oXuIDy2Xrkgw72sCnD0', 'groupId': 'XE5EmaE', 'toolCall': {'kind': 'function', 'id': 'call_KroD4oXuIDy2Xrkgw72sCnD0

Overriding of current TracerProvider is not allowed


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Task:[00m [92mCreate a new lead, John Doe (john.doe@acme.com), and the company ACME Ltd (acme.com). Check for duplicate companies by name. Associate the lead with the company.[00m


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Using tool:[00m [92mhubspot[00m
[95m## Tool Input:[00m [92m
"{\"intent\": \"find_company\", \"data\": {\"name\": \"ACME Ltd\"}}"[00m
[95m## Tool Output:[00m [92m
{'kind': 'content', 'content': 'I found the company "ACME Ltd" in the CRM. Here are the details:\n\n- **Name:** ACME Ltd\n- **Domain:** acme.com\n- **Created At:** March 21, 2025\n- **Last Modified At:** March 21, 2025\n- **Company ID:** 31253238432\n\nIf you need further information or actions related to this company, please let me know!', 'executions': [{'id': 'call_R06kvJarEqnnJiZvmjQrM1wC', 'groupId': 'jdgFo6K', 'toolCall': {'kind': 'function', 'id': 'call_R06kvJarEqnnJiZvmjQrM1wC', 'groupId': 'q7FQ1fF', 'name': 'hu