In [1]:
import json

# Load the benchmark definition: a list of test cases, each with an `intent`
# and the `expected_tool_calls` the agent should produce for it.
# The encoding is given explicitly because JSON files are UTF-8, while open()
# would otherwise fall back to the platform's locale encoding.
with open("hubspot_benchmark.json", encoding="utf-8") as f:
    bench_data = json.load(f)

# Display the loaded test cases as the cell output.
bench_data

[{'intent': 'List latest contacts.',
  'expected_tool_calls': [{'tool_name': 'hubspot__list-or-search-contacts__ListOrSearchContacts',
    'tool_input': '{"limit":10,"sorts":"createdate:DESCENDING"}',
    'tool_output': {'status': 'completed'}}]},
 {'intent': 'Megan needs to create a contact record for David Clarkson, david.clarkson@innovatech.com, +44 20 7946 0123, innovatech.com so the sales team can track and engage with him.',
  'expected_tool_calls': [{'tool_name': 'hubspot__create-contact__CreateContact',
    'tool_input': '{"email":"david.clarkson@innovatech.com","phone":"+44 20 7946 0123","website":"innovatech.com","firstName":"David","lastName":"Clarkson"}',
    'tool_output': {'status': 'completed'}}]},
 {'intent': "Greg needs to search for the deal 'Corporate Financial Overhaul' to check its status and recent activity.",
  'expected_tool_calls': [{'tool_name': 'hubspot__list-or-search-deals__ListOrSearchDeals',
    'tool_input': '{"query":"Corporate Financial Overhaul"}',
  

In [2]:
import json
import sys
import os
import importlib
from pathlib import Path
from dotenv import load_dotenv

# Put the PARENT of the current working directory on sys.path so that the
# `benchmarks` package can be imported below.
# NOTE(review): presumably the parent directory is the repository root - confirm.
notebook_path = Path().absolute()
sys.path.append(str(notebook_path.parent))

# Import, then reload, so that edits made to the module on disk are picked up
# when this cell is re-run without restarting the kernel.
import benchmarks.superface_client
importlib.reload(benchmarks.superface_client)
from benchmarks.superface_client import get_configuration_url

# Load environment variables from .env file
load_dotenv()

# Import the Superface client (same import-then-reload pattern as above)
import superface.client
importlib.reload(superface.client)
from superface.client import Superface

# Initialize Superface.
# Note: this rebinds the name `superface` from the imported package to the
# client instance; re-running the cell re-imports the package first, so the
# reload above still works.
# os.getenv returns None when SUPERFACE_API_KEY is unset; it is not validated here.
superface = Superface(
    api_key=os.getenv("SUPERFACE_API_KEY")
)

# Get user ID from environment variables; fail fast when it is missing or empty.
user_id = os.getenv("SUPERFACE_BENCHMARK_USER_ID")
if not user_id:
    raise ValueError("SUPERFACE_BENCHMARK_USER_ID environment variable is required")

# Collect the distinct, non-empty tool names referenced by any expected tool
# call in the benchmark data.
unique_tools = {
    call.get('tool_name')
    for case in bench_data
    for call in case.get('expected_tool_calls', [])
    if call.get('tool_name')
}

# Report what was found, one tool per line.
print(f"Found {len(unique_tools)} unique tools in benchmark data:")
for name in unique_tools:
    print(f"- {name}")

# Ask Superface, tool by tool, whether the benchmark user has it connected.
# Every tool gets an entry in `tool_connection_status`; a lookup that raises is
# recorded as "not connected" together with the error text.
tool_connection_status = {}

for tool in unique_tools:
    try:
        info = superface.is_tool_connected(user_id=user_id, tool_name=tool)
        status = {
            "provider": info.get("provider"),
            "connected": info.get("connected", False)
        }
    except Exception as exc:
        print(f"Error checking connection for {tool}: {str(exc)}")
        status = {
            "provider": None,
            "connected": False,
            "error": str(exc)
        }
    else:
        if not status["connected"]:
            print(f"ERROR: Tool {tool} is not connected!")

    tool_connection_status[tool] = status

# The benchmark may only proceed when every single tool is connected.
all_tools_connected = all(
    entry["connected"] for entry in tool_connection_status.values()
)

# Stop execution if any tool is not connected
if not all_tools_connected:
    print("\nBenchmark cannot proceed: Some required tools are not connected.")

    # Get configuration url.
    # Bind the name up front: if the lookup below raises, `config_url` must
    # still exist (as None) when the failure summary is built, instead of
    # raising NameError or silently reusing a value from an earlier run.
    config_url = None
    try:
        config_url = get_configuration_url(
            user_id=user_id
        )
        print(f"\nPlease visit the following URL to configure your tools:")
        print(f"{config_url}")
    except Exception as e:
        print(f"Error getting configuration URL: {str(e)}")

    # Leave a machine-readable failure summary in `results` before aborting.
    results = {
        "status": "failed",
        "reason": "Some required tools are not connected, use the configuration URL to connect them",
        "tool_connection_status": tool_connection_status,
        "configuration_url": config_url
    }
    raise SystemExit("Benchmark stopped due to disconnected tools")

# If we get here, all tools are connected
print("\nAll required tools are connected. Proceeding with benchmark.")

# Compile results
results = {
    "status": "ready",
    "benchmark_data": bench_data,
    "tool_connection_status": tool_connection_status
}

Found 3 unique tools in benchmark data:
- hubspot__create-contact__CreateContact
- hubspot__list-or-search-deals__ListOrSearchDeals
- hubspot__list-or-search-contacts__ListOrSearchContacts

All required tools are connected. Proceeding with benchmark.


In [4]:
# Run the benchmark tests
import time
import json
import statistics
from datetime import datetime

import benchmarks.run_agent
importlib.reload(benchmarks.run_agent)
from benchmarks.run_agent import run

# How many times each test case is executed; taken from the
# BENCHMARK_RUNS_PER_TEST environment variable, falling back to 3 when unset.
NUM_RUNS_PER_TEST = int(os.environ.get("BENCHMARK_RUNS_PER_TEST", "3"))

# Function to compare expected and actual tool calls
def compare_tool_calls(expected_calls, actual_calls):
    """Compare the agent's tool calls against the benchmark's expected calls.

    Calls are compared pairwise and in order. For each pair:

    * the tool names must be equal;
    * every key of the expected input must be present in the actual input
      (values are NOT compared, and extra keys in the actual input are allowed);
    * the ``tool_output`` status values must be equal.

    A malformed *actual* call is reported as a mismatch rather than raising:
    a missing ``tool_name`` or ``tool_output`` compares as ``None``, a missing
    or ``None`` ``tool_input`` is treated as an empty input, and a
    ``tool_input`` that is not valid JSON or not a JSON object is a mismatch.
    Expected calls are assumed to be well formed.

    Args:
        expected_calls: List of dicts with ``tool_name``, ``tool_input`` (a dict
            or a JSON string) and ``tool_output`` (a dict with ``status``).
        actual_calls: List of dicts of the same shape, as produced by the agent.

    Returns:
        A ``(matches, reason)`` tuple. ``reason`` describes the first mismatch
        found, or is ``"All tool calls match"`` when everything lines up.
    """
    def mismatch(reason):
        # Every mismatch is both logged and returned to the caller.
        print(reason)
        return False, reason

    if len(expected_calls) != len(actual_calls):
        return mismatch(
            f"Tool call count mismatch: Expected {len(expected_calls)}, got {len(actual_calls)}"
        )

    for i, (expected, actual) in enumerate(zip(expected_calls, actual_calls)):
        print(f"\nComparing tool call {i+1}:")
        # default=str keeps this debug output from raising on values that
        # json cannot serialize natively.
        print(f"Expected: {json.dumps(expected, indent=2, default=str)}")
        print(f"Actual: {json.dumps(actual, indent=2, default=str)}")

        if not isinstance(actual, dict):
            return mismatch(
                f"Malformed actual tool call {i+1}: expected an object, got {type(actual).__name__}"
            )

        # Compare tool name
        actual_name = actual.get('tool_name')
        if expected['tool_name'] != actual_name:
            return mismatch(
                f"Tool name mismatch: Expected {expected['tool_name']}, got {actual_name}"
            )

        # Compare tool input; either side may be a dict or a JSON string.
        expected_input = expected['tool_input']
        if isinstance(expected_input, str):
            expected_input = json.loads(expected_input)

        actual_input = actual.get('tool_input')
        if isinstance(actual_input, str):
            try:
                actual_input = json.loads(actual_input)
            except ValueError as exc:  # json.JSONDecodeError subclasses ValueError
                return mismatch(f"Invalid JSON in actual tool input: {exc}")
        if actual_input is None:
            # A call without arguments is an empty input.
            actual_input = {}
        if not isinstance(actual_input, dict):
            return mismatch(
                f"Actual tool input is not a JSON object: {type(actual_input).__name__}"
            )

        # Check if all expected keys are in actual input (presence only).
        for key in expected_input:
            if key not in actual_input:
                return mismatch(f"Missing key in tool input: {key}")

        # Compare tool output status
        expected_status = expected['tool_output']['status']
        actual_output = actual.get('tool_output')
        actual_status = actual_output.get('status') if isinstance(actual_output, dict) else None
        if expected_status != actual_status:
            return mismatch(
                f"Tool output status mismatch: Expected {expected_status}, got {actual_status}"
            )

    print("All tool calls match!")
    return True, "All tool calls match"

# Function to run a single benchmark test
def run_benchmark_test(test_case, user_id, num_runs=None, specialist_id="hubspot", model="gpt-4o"):
    """Run one benchmark test case several times and summarize the outcome.

    Each run sends the test's intent to the agent via ``run(...)``, extracts
    the tool calls from the returned result and compares them with the
    expected calls using ``compare_tool_calls``. An exception raised during a
    run is recorded as a failed run; it does not abort the remaining runs.

    Args:
        test_case: Dict with an ``intent`` string and an optional
            ``expected_tool_calls`` list.
        user_id: User id forwarded to the agent.
        num_runs: Number of repetitions; defaults to ``NUM_RUNS_PER_TEST``.
        specialist_id: Specialist forwarded to the agent (default ``"hubspot"``).
        model: Model name forwarded to the agent (default ``"gpt-4o"``).

    Returns:
        Dict with the ``intent``, ``expected_tool_calls``, the per-run records
        under ``runs``, aggregate ``stats``, and top-level convenience fields:
        ``success`` (every run completed without an exception),
        ``tool_calls_match`` (every run matched), ``duration_seconds`` (wall
        time of the whole test), ``result`` (the last run's result if that run
        succeeded, else None) and ``error`` (the most recent failed run's
        error, or None when all runs succeeded).
    """
    if num_runs is None:
        num_runs = NUM_RUNS_PER_TEST

    intent = test_case.get('intent')
    expected_tool_calls = test_case.get('expected_tool_calls', [])

    print(f"Running test: {intent}")
    print(f"Expected tool calls: {json.dumps(expected_tool_calls, indent=2)}")

    # Run the test multiple times
    run_results = []
    test_start_time = time.time()

    for run_num in range(num_runs):
        print(f"\n--- Run {run_num + 1}/{num_runs} ---")

        start_time = time.time()
        try:
            result = run(
                prompt=intent,
                specialist_id=specialist_id,
                user_id=user_id,
                model=model
            )

            # Extract actual tool calls from the result
            actual_tool_calls = result.get('tool_calls', [])

            # default=str: a non-serializable value in the debug print must
            # not turn an otherwise good run into a failure.
            print(f"Extracted tool calls: {json.dumps(actual_tool_calls, indent=2, default=str)}")

            # Compare expected and actual tool calls
            tool_calls_match, mismatch_reason = compare_tool_calls(expected_tool_calls, actual_tool_calls)
            success = True
            error = None

        except Exception as e:
            result = None
            success = False
            error = str(e)
            tool_calls_match = False
            mismatch_reason = f"Exception occurred: {str(e)}"
            actual_tool_calls = []

        duration = time.time() - start_time

        run_results.append({
            "run_number": run_num + 1,
            "actual_tool_calls": actual_tool_calls,
            "tool_calls_match": tool_calls_match,
            "mismatch_reason": mismatch_reason if not tool_calls_match else None,
            "result": result,
            "success": success,
            "error": error,
            "duration_seconds": duration
        })

        print(f"Run completed in {duration:.2f} seconds")
        print(f"Success: {success}")
        if not success:
            print(f"Error: {error}")

        # Add a small delay between runs
        if run_num < num_runs - 1:
            time.sleep(2)

    # Calculate statistics across runs. Rates are relative to the runs that
    # were actually executed, and are 0 when there were none (num_runs <= 0).
    completed_runs = len(run_results)
    successful_runs = sum(1 for r in run_results if r['success'])
    matching_runs = sum(1 for r in run_results if r['tool_calls_match'])
    success_rate = successful_runs / completed_runs * 100 if completed_runs else 0.0
    tool_call_match_rate = matching_runs / completed_runs * 100 if completed_runs else 0.0
    durations = [r['duration_seconds'] for r in run_results]
    avg_duration = statistics.mean(durations) if durations else 0
    min_duration = min(durations) if durations else 0
    max_duration = max(durations) if durations else 0
    # stdev needs at least two data points.
    std_dev_duration = statistics.stdev(durations) if len(durations) > 1 else 0
    test_duration = time.time() - test_start_time

    # Collect mismatch reasons, counting how often each one occurred
    mismatch_summary = {}
    for r in run_results:
        reason = r['mismatch_reason']
        if not r['tool_calls_match'] and reason:
            mismatch_summary[reason] = mismatch_summary.get(reason, 0) + 1

    print(f"\n--- Test Statistics ---")
    print(f"Success rate: {success_rate:.2f}%")
    print(f"Tool call match rate: {tool_call_match_rate:.2f}%")
    print(f"Average duration: {avg_duration:.2f} seconds")
    print(f"Min duration: {min_duration:.2f} seconds")
    print(f"Max duration: {max_duration:.2f} seconds")
    print(f"Standard deviation: {std_dev_duration:.2f} seconds")

    if mismatch_summary:
        print("\nMismatch reasons:")
        for reason, count in mismatch_summary.items():
            print(f"- {reason} (occurred {count} times)")

    # Overall flags: the test only counts as successful / matching when every
    # run was.
    failed_runs = [r for r in run_results if not r['success']]
    overall_success = not failed_runs
    overall_tool_calls_match = all(r['tool_calls_match'] for r in run_results)

    # Report the error of the most recent FAILED run. Looking only at the last
    # run would yield None whenever an earlier run failed and the last passed.
    overall_error = (failed_runs[-1]['error'] or "Unknown error") if failed_runs else None

    return {
        "intent": intent,
        "expected_tool_calls": expected_tool_calls,
        "runs": run_results,
        "stats": {
            "success_rate": success_rate,
            "tool_call_match_rate": tool_call_match_rate,
            "avg_duration": avg_duration,
            "min_duration": min_duration,
            "max_duration": max_duration,
            "std_dev_duration": std_dev_duration,
            "num_runs": num_runs,
            "mismatch_summary": mismatch_summary
        },
        # Add top-level properties for easier access
        "success": overall_success,
        "tool_calls_match": overall_tool_calls_match,
        "duration_seconds": test_duration,
        "result": run_results[-1]["result"] if run_results and run_results[-1]["success"] else None,
        "error": overall_error
    }

# Execute every test case in order, collecting one summary dict per test and
# echoing a short status line for each.
benchmark_results = []
total_tests = len(bench_data)

for i, test_case in enumerate(bench_data):
    print(f"\n=== Test {i+1}/{total_tests} ===")
    result = run_benchmark_test(test_case, user_id)
    benchmark_results.append(result)

    print(f"Test completed in {result['duration_seconds']:.2f} seconds")
    print(f"Success: {result['success']}")

    if result['success']:
        # The result may be a rich object, so stringify it first and show at
        # most the first 200 characters.
        result_str = str(result['result'])
        preview = f"{result_str[:200]}..." if len(result_str) > 200 else result_str
        print(f"Result: {preview}")
    else:
        print(f"Error: {result['error']}")

    # Pause briefly before the next test; no pause after the final one.
    if i + 1 < total_tests:
        time.sleep(2)

# Compile final results
run_timestamp = datetime.now()  # one timestamp shared by the report and the file name
total_tests = len(bench_data)
successful_tests = sum(1 for r in benchmark_results if r['success'])
tool_calls_match_count = sum(1 for r in benchmark_results if r.get('tool_calls_match', False))

final_results = {
    "timestamp": run_timestamp.isoformat(),
    "benchmark_name": "HubSpot Superface Specialist",
    "total_tests": total_tests,
    "successful_tests": successful_tests,
    "failed_tests": sum(1 for r in benchmark_results if not r['success']),
    "tool_calls_match_count": tool_calls_match_count,
    "results": benchmark_results
}

# For JSON serialization, build a deep, JSON-safe copy of the results: any
# value json cannot encode natively (e.g. an agent output object) is replaced
# by its str() form, including the `result` stored inside every per-run record.
serializable_results = json.loads(json.dumps(benchmark_results, default=str))
for serializable_result in serializable_results:
    # Make sure actual_tool_calls is included
    serializable_result.setdefault('actual_tool_calls', [])

# Percentages are reported as 0 for an empty benchmark instead of dividing by zero.
success_pct = successful_tests / total_tests * 100 if total_tests else 0.0
tool_call_match_pct = tool_calls_match_count / total_tests * 100 if total_tests else 0.0

print("\n=== Benchmark Summary ===")
print(f"Total tests: {final_results['total_tests']}")
print(f"Successful tests: {final_results['successful_tests']}")
print(f"Failed tests: {final_results['failed_tests']}")
print(f"Tests with matching tool calls: {final_results['tool_calls_match_count']}")
print(f"Success rate: {success_pct:.2f}%")
print(f"Tool call match rate: {tool_call_match_pct:.2f}%")


# Save results to file. The output directory is created on demand, and the
# JSON-safe copy of the results is what gets written, so the dump cannot fail
# half-way on a non-serializable object.
results_dir = "./results"
os.makedirs(results_dir, exist_ok=True)
results_filename = f"{results_dir}/hubspot_benchmark_results_{run_timestamp.strftime('%Y%m%d_%H%M%S')}.json"
with open(results_filename, "w", encoding="utf-8") as f:
    json.dump({**final_results, "results": serializable_results}, f, indent=2, default=str)

print(f"\nResults saved to {results_filename}")


=== Test 1/3 ===
Running test: List latest contacts.
Expected tool calls: [
  {
    "tool_name": "hubspot__list-or-search-contacts__ListOrSearchContacts",
    "tool_input": "{\"limit\":10,\"sorts\":\"createdate:DESCENDING\"}",
    "tool_output": {
      "status": "completed"
    }
  }
]

--- Run 1/3 ---


Overriding of current TracerProvider is not allowed


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Task:[00m [92mList latest contacts.[00m


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Thought:[00m [92mI need to list the latest contacts from HubSpot. I will use the HubSpot tool to retrieve this information.[00m
[95m## Using tool:[00m [92mhubspot[00m
[95m## Tool Input:[00m [92m
"{\"intent\": \"list contacts\", \"data\": {\"limit\": 5, \"sort\": \"createdate\"}}"[00m
[95m## Tool Output:[00m [92m
{'kind': 'content', 'content': 'Here are the 5 most recently created contacts:\n\n1. **Email:** david.clarkson@innovatech.com\n   - **Created Date:** 2025-03-19\n   - **First Name:** Not provided\n   - **Last Name:** Not provided\n\n2. **Email:** mark.levin@growthdynamics.com\n   - **Created Date:** 2025-03-19\n   - **First Name:** Mark\n   - **Last Name:** Levin\n\n3. **Email:** john.doe@acme.com\n   - **Created Date:** 2025-03-07\n   - **First Name:** John\n   - **Last Name:** Doe\n\n4. **Email:** bh@h

Overriding of current TracerProvider is not allowed


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Task:[00m [92mList latest contacts.[00m


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Thought:[00m [92mI need to list the latest contacts in HubSpot.[00m
[95m## Using tool:[00m [92mhubspot[00m
[95m## Tool Input:[00m [92m
"{\"intent\": \"list latest contacts\", \"data\": {}}"[00m
[95m## Tool Output:[00m [92m
{'kind': 'content', 'content': 'Here are the latest contacts:\n\n1. **Email:** david.clarkson@innovatech.com\n   - **Created At:** March 19, 2025\n   - **Last Modified:** March 19, 2025\n\n2. **Name:** Mark Levin\n   - **Email:** mark.levin@growthdynamics.com\n   - **Created At:** March 19, 2025\n   - **Last Modified:** March 19, 2025\n\n3. **Name:** John Doe\n   - **Email:** john.doe@acme.com\n   - **Created At:** March 7, 2025\n   - **Last Modified:** March 11, 2025\n\n4. **Name:** Brian Halligan (Sample Contact)\n   - **Email:** bh@hubspot.com\n   - **Created At:** February 24, 2025\n   - *

Overriding of current TracerProvider is not allowed



--- Run 3/3 ---
[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Task:[00m [92mList latest contacts.[00m


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Thought:[00m [92mI need to retrieve the latest contacts from Hubspot. I will use the hubspot tool to get this information.[00m
[95m## Using tool:[00m [92mhubspot[00m
[95m## Tool Input:[00m [92m
"{\"intent\": \"list latest contacts\", \"data\": {}}"[00m
[95m## Tool Output:[00m [92m
{'kind': 'content', 'content': 'Here are the latest contacts:\n\n1. **Email:** david.clarkson@innovatech.com\n   - **Created At:** March 19, 2025\n   - **Last Modified:** March 19, 2025\n\n2. **Name:** Mark Levin\n   - **Email:** mark.levin@growthdynamics.com\n   - **Created At:** March 19, 2025\n   - **Last Modified:** March 19, 2025\n\n3. **Name:** John Doe\n   - **Email:** john.doe@acme.com\n   - **Created At:** March 7, 2025\n   - **Last Modified:** March 11, 2025\n\n4. **Name:** Brian Halligan (Sample Contact)\n  

Overriding of current TracerProvider is not allowed



=== Test 2/3 ===
Running test: Megan needs to create a contact record for David Clarkson, david.clarkson@innovatech.com, +44 20 7946 0123, innovatech.com so the sales team can track and engage with him.
Expected tool calls: [
  {
    "tool_name": "hubspot__create-contact__CreateContact",
    "tool_input": "{\"email\":\"david.clarkson@innovatech.com\",\"phone\":\"+44 20 7946 0123\",\"website\":\"innovatech.com\",\"firstName\":\"David\",\"lastName\":\"Clarkson\"}",
    "tool_output": {
      "status": "completed"
    }
  }
]

--- Run 1/3 ---
[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Task:[00m [92mMegan needs to create a contact record for David Clarkson, david.clarkson@innovatech.com, +44 20 7946 0123, innovatech.com so the sales team can track and engage with him.[00m


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Using tool:[00m [92mhubspot[00m
[95m## Tool Input:[00m [92m
"{\"intent\": \"create_contact\", \"data\": {\"email\": \"david.clarkson@i

Overriding of current TracerProvider is not allowed



--- Run 2/3 ---
[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Task:[00m [92mMegan needs to create a contact record for David Clarkson, david.clarkson@innovatech.com, +44 20 7946 0123, innovatech.com so the sales team can track and engage with him.[00m


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Thought:[00m [92mI need to create a contact record for David Clarkson in HubSpot using the provided details. I will use the hubspot tool to create the contact.[00m
[95m## Using tool:[00m [92mhubspot[00m
[95m## Tool Input:[00m [92m
"{\"intent\": \"create_contact\", \"data\": {\"email\": \"david.clarkson@innovatech.com\", \"first_name\": \"David\", \"last_name\": \"Clarkson\", \"phone\": \"+44 20 7946 0123\", \"website\": \"innovatech.com\"}}"[00m
[95m## Tool Output:[00m [92m
{'kind': 'content', 'content': 'The contact with the email "david.clarkson@innovatech.com" already exists in the system with the ID 107387310699. If you need to update this cont

Overriding of current TracerProvider is not allowed


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Task:[00m [92mMegan needs to create a contact record for David Clarkson, david.clarkson@innovatech.com, +44 20 7946 0123, innovatech.com so the sales team can track and engage with him.[00m


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Thought:[00m [92mI need to create a contact record for David Clarkson with the information provided.[00m
[95m## Using tool:[00m [92mhubspot[00m
[95m## Tool Input:[00m [92m
"{\"intent\": \"create_contact\", \"data\": {\"properties\": {\"email\": \"david.clarkson@innovatech.com\", \"firstname\": \"David\", \"lastname\": \"Clarkson\", \"phone\": \"+44 20 7946 0123\", \"website\": \"innovatech.com\"}}}"[00m
[95m## Tool Output:[00m [92m
{'kind': 'content', 'content': 'The contact with the email "david.clarkson@innovatech.com" already exists in the system with the ID "107387310699". If you need to update this contact\'s information, please provide the details you wish to

Overriding of current TracerProvider is not allowed


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Task:[00m [92mGreg needs to search for the deal 'Corporate Financial Overhaul' to check its status and recent activity.[00m


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Thought:[00m [92mI need to search for the deal named 'Corporate Financial Overhaul' to check its status and recent activity in HubSpot.[00m
[95m## Using tool:[00m [92mhubspot[00m
[95m## Tool Input:[00m [92m
"{\"intent\": \"find deal\", \"data\": {\"deal_name\": \"Corporate Financial Overhaul\"}}"[00m
[95m## Tool Output:[00m [92m
{'kind': 'content', 'content': 'I found a deal with the name "Corporate Financial Overhaul." Here are the details:\n\n- **Deal Name:** New Deal with John Doe and Acme ltd\n- **Deal ID:** 34501834581\n- **Amount:** 0\n- **Deal Stage:** Appointment Scheduled\n- **Pipeline:** Default\n- **Created At:** March 11, 2025\n- **Last Modified:** March 19, 2025\n\nIf you need more information or further actions, ple

Overriding of current TracerProvider is not allowed


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Task:[00m [92mGreg needs to search for the deal 'Corporate Financial Overhaul' to check its status and recent activity.[00m


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Thought:[00m [92mI need to search for the deal named 'Corporate Financial Overhaul' to check its status and recent activity. To do so, I'll use the hubspot tool to find and retrieve details about the specific deal.[00m
[95m## Using tool:[00m [92mhubspot[00m
[95m## Tool Input:[00m [92m
"{\"intent\": \"find_deal\", \"data\": {\"deal_name\": \"Corporate Financial Overhaul\"}}"[00m
[95m## Tool Output:[00m [92m
{'kind': 'content', 'content': 'The deal named "Corporate Financial Overhaul" was not found in the system. If you have any other details or need further assistance, please let me know!', 'executions': [{'id': 'call_jsXHSSPklovzfJlq5991dJuc', 'groupId': 'WMt2JP3', 'toolCall': {'kind': 'function', 'id': 'call_jsXHSSPklovzfJlq5991

Overriding of current TracerProvider is not allowed



--- Run 3/3 ---
[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Task:[00m [92mGreg needs to search for the deal 'Corporate Financial Overhaul' to check its status and recent activity.[00m


[1m[95m# Agent:[00m [1m[92mHubspot Agent[00m
[95m## Using tool:[00m [92mhubspot[00m
[95m## Tool Input:[00m [92m
"{\"intent\": \"search for a deal\", \"data\": {\"deal_name\": \"Corporate Financial Overhaul\"}}"[00m
[95m## Tool Output:[00m [92m
{'kind': 'content', 'content': 'I found a deal related to your search for "Corporate Financial Overhaul":\n\n- **Deal Name:** New Deal with John Doe and Acme ltd\n- **Deal ID:** 34501834581\n- **Amount:** 0\n- **Deal Stage:** Appointment Scheduled\n- **Pipeline:** Default\n- **Created At:** March 11, 2025\n- **Last Modified:** March 19, 2025\n\nIf you need more details or further assistance, feel free to ask!', 'executions': [{'id': 'call_Aur8l2Rqk277sgPOtFkv2nQT', 'groupId': '9HlLOyf', 'toolCall': {'kind': 'function', 'id': 'ca