# ComputerAgent HUD Integration for OSWorld

This notebook demonstrates how to use the ComputerAgent with HUD for OSWorld benchmarking.
The ComputerAgent integration provides the same interface as OperatorAgent but works with both Claude and OpenAI models.

In [None]:
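# Install dependencies if needed.
# NOTE: each `!` command runs in its own subshell, so `!source .venv/bin/activate`
# does not change the environment of this kernel. Prefer running these commands in a
# terminal and then selecting the resulting .venv as the notebook kernel.
# !uv venv
# !source .venv/bin/activate
# !uv sync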

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from dotenv import load_dotenv

# Read API credentials from the .env file one directory above the working directory.
load_dotenv("../.env")

# Environment variables the cells below rely on:
#   HUD_API_KEY        - HUD access
#   ANTHROPIC_API_KEY  - Claude models
#   OPENAI_API_KEY     - OpenAI models

from pprint import pprint

## Quick single-task smoke test on OSWorld-Verified

The ComputerAgent integration can use Claude, OpenAI, UI-TARS, or composed models just like the original ComputerAgent:

In [None]:
from agent.integrations.hud import run_single_task

# Quick single-task smoke test on OSWorld-Verified
# You can swap "hud-evals/OSWorld-Verified" -> "hud-evals/SheetBench-V2" to test SheetBench.
await run_single_task(
    dataset="hud-evals/OSWorld-Verified",
    model="openai/computer-use-preview+openai/gpt-5-nano",  # or any supported model string
    task_id=155 # open last tab task (easy)
)

## Run OSWorld-Verified in parallel

In [None]:
import uuid
from agent.integrations.hud import run_full_dataset

# Full dataset evaluation (runs via HUD's run_dataset under the hood)
job_name = f"osworld-test-{str(uuid.uuid4())[:4]}"

results = await run_full_dataset(
    dataset="hud-evals/OSWorld-Verified",          # You can also pass a Dataset or a list[dict]
    job_name=job_name,                   # Optional; defaults to a timestamp for custom datasets
    model="openai/computer-use-preview", # Or any supported model string
    max_concurrent=20,                   # Tune to your infra
    max_steps=50,                        # Safety cap per task
    split="train[:3]"                    # Limit to just 3 tasks
)

# results is a list from hud.datasets.run_dataset; inspect/aggregate as needed
print(f"Job: {job_name}")
print(f"Total results: {len(results)}")
pprint(results[:3])  # preview

## Benchmark Composed Agents

In [None]:
import uuid
from agent.integrations.hud import run_full_dataset

models_to_test = [
    "openai/computer-use-preview+anthropic/claude-opus-4-20250514",
]
 

for model in models_to_test:
    # Full dataset evaluation (runs via HUD's run_dataset under the hood)
    job_uuid = str(uuid.uuid4())[:6]
    job_name = f"osworld {job_uuid} {model}"

    results = await run_full_dataset(
        dataset="hud-evals/OSWorld-Verified",
        job_name=job_name,                 
        model=model,
        max_concurrent=20,                   
        max_steps=75,
        trajectory_dir=f"trajectories/osworld_{job_uuid}",
        only_n_most_recent_images=3
    )

    # results is a list from hud.datasets.run_dataset; inspect/aggregate as needed
    print(f"Job: {job_name}")
    print(f"Total results: {len(results)}")