# AWorld MAS Task Execution

This notebook demonstrates transparent, step-by-step agent execution for a GAIA benchmark task.


In [None]:
# Task Information
# Static description of the single benchmark task this notebook runs.
# These names are read by later cells (printing, execution, and validation).

# Identifier of the task; printed below as "Task ID".
task_id = "ec09fa32-d03f-4bf8-84b0-1f16922c3ae4"
# Printed below as "Difficulty Level".
level = 1
# Full question text, printed verbatim below. Line breaks inside the
# triple-quoted string are part of the prompt.
question = """Here's a fun riddle that I think you'll enjoy.

You have been selected to play the final round of the hit new game show "Pick That Ping-Pong". In this round, you will be competing for a large cash prize. Your job will be to pick one of several different numbered ping-pong balls, and then the game will commence. The host describes how the game works.

A device consisting of a winding clear ramp and a series of pistons controls the outcome of the game. The ramp feeds balls onto a platform. The platform has room for three ping-pong balls at a time. The three balls on the platform are each aligned with one of three pistons. At each stage of the game, one of the three pistons will randomly fire, ejecting the ball it strikes. If the piston ejects the ball in the first position on the platform the balls in the second and third position on the platform each advance one space, and the next ball on the ramp advances to the third position. If the piston ejects the ball in the second position, the ball in the first position is released and rolls away, the ball in the third position advances two spaces to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform. If the piston ejects the ball in the third position, the ball in the first position is released and rolls away, the ball in the second position advances one space to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform.

The ramp begins with 100 numbered ping-pong balls, arranged in ascending order from 1 to 100. The host activates the machine and the first three balls, numbered 1, 2, and 3, advance to the platform. Before the random firing of the pistons begins, you are asked which of the 100 balls you would like to pick. If your pick is ejected by one of the pistons, you win the grand prize, $10,000.

Which ball should you choose to maximize your odds of winning the big prize? Please provide your answer as the number of the ball selected."""
# Expected answer, kept as a string; printed below as "GROUND TRUTH ANSWER".
ground_truth = """3"""
# Attachment file name; the empty string is falsy, so the summary below
# reports "Has File Attachment: False".
file_name = ""
# NOTE: this is the *string* "None", not the Python None object, so it is
# truthy and is printed as-is by the summary below.
annotator_tools = """None"""

# Render the task summary as a single report: header, metadata lines
# (the file line only when an attachment is set), the question text, and
# the ground-truth answer, each section separated by rules.
print(
    "\n".join(
        [
            "=" * 80,
            "TASK DETAILS",
            "=" * 80,
            f"Task ID: {task_id}",
            f"Difficulty Level: {level}",
            f"Has File Attachment: {bool(file_name)}",
            # Contributes one line when file_name is non-empty, none otherwise.
            *([f"  File: {file_name}"] if file_name else []),
            f"Annotator Tools Used: {annotator_tools or 'None'}",
            "",
            "QUESTION:",
            "-" * 80,
            str(question),
            "",
            "GROUND TRUTH ANSWER:",
            "-" * 80,
            str(ground_truth),
            "=" * 80,
        ]
    )
)


## Setup & Configuration

Initialize the AWorld MAS framework with robust path detection.

In [None]:
# Setup: Path detection and imports
import sys
import os
import json
import logging
from pathlib import Path

# Defaults for the objects the rest of this cell fills in: no agent config
# yet, an empty MCP configuration, and no MCP servers.
agent_config, mcp_config, available_servers = None, {}, []

# Working directory and its parent; both are used as search roots for
# the .env and mcp.json lookups further down.
current_dir = Path.cwd()
parent_dir = current_dir.parent

# Section banner.
print("=" * 80, "ENVIRONMENT SETUP", "=" * 80, sep="\n")

# Import the AWorld classes used by the later cells. An ImportError is
# reported with installation hints and then re-raised, so the cell stops.
try:
    from aworld.agents.llm_agent import Agent
    from aworld.config.conf import AgentConfig, TaskConfig
    from aworld.core.task import Task
    from aworld.runner import Runners
except ImportError as import_error:
    print(f"✗ ERROR importing AWorld modules: {import_error}")
    print("  Make sure AWorld is installed: pip install aworld")
    print("  Or from GitHub: pip install git+https://github.com/inclusionAI/AWorld.git")
    raise
else:
    print("✓ AWorld modules imported successfully")

# Load environment variables
# Optional step: when python-dotenv is available, load the first .env file
# found (current dir, parent dir, home dir). Values from that file
# override variables already present in the process environment.
try:
    from dotenv import load_dotenv

    # Candidate locations, in priority order.
    possible_env_paths = [
        base / ".env" for base in (current_dir, parent_dir, Path.home())
    ]

    env_loaded = False
    for env_path in possible_env_paths:
        if not env_path.exists():
            continue
        load_dotenv(env_path, override=True)
        print(f"✓ Loaded environment from: {env_path}")
        env_loaded = True
        break
    else:
        # Loop finished without a break: none of the candidates exist.
        print("⚠ No .env file found, using system environment variables")

except ImportError:
    print("⚠ python-dotenv not installed, using system environment variables")

# Load MCP configuration
# Best-effort: use the first mcp.json found (current dir, parent dir, then
# parent/examples/gaia). The file is validated before mcp_config and
# available_servers are updated, and any failure resets both to their
# empty defaults so the "run without MCP servers" message is accurate.
try:
    possible_mcp_paths = [
        current_dir / "mcp.json",
        parent_dir / "mcp.json",
        parent_dir / "examples" / "gaia" / "mcp.json",
    ]

    mcp_loaded = False
    for mcp_path in possible_mcp_paths:
        if not mcp_path.exists():
            continue

        with open(mcp_path, "r", encoding="utf-8") as f:
            loaded_config = json.load(f)

        # The config must be a JSON object; anything else cannot carry
        # an "mcpServers" table.
        if not isinstance(loaded_config, dict):
            raise ValueError(
                f"expected a JSON object in {mcp_path}, "
                f"got {type(loaded_config).__name__}"
            )

        # A missing or null "mcpServers" entry means "no servers".
        server_table = loaded_config.get("mcpServers") or {}
        if not isinstance(server_table, dict):
            raise ValueError(
                f'"mcpServers" in {mcp_path} must be a JSON object, '
                f"got {type(server_table).__name__}"
            )

        # Publish only after validation so a bad file never leaks out.
        mcp_config = loaded_config
        available_servers = list(server_table)
        print(f"✓ Loaded MCP config from: {mcp_path}")
        print(f"  Available MCP servers: {available_servers}")
        mcp_loaded = True
        break

    if not mcp_loaded:
        print("⚠ No mcp.json found, agent will run without MCP servers")

except Exception as e:
    # Deliberately broad: this step is optional and must not abort setup.
    # Reset state so later cells really do run without MCP servers.
    mcp_config = {}
    available_servers = []
    print(f"⚠ Error loading MCP config: {e}")
    print("  Agent will run without MCP servers")

# Create agent configuration
# LLM settings are read from environment variables. Provider, model and
# temperature fall back to "openai", "gpt-4o" and "0.0"; the base URL and
# API key have no default and are None when unset. Any failure is
# reported and re-raised.
try:
    agent_config = AgentConfig(
        llm_provider=os.environ.get("LLM_PROVIDER", "openai"),
        llm_model_name=os.environ.get("LLM_MODEL_NAME", "gpt-4o"),
        llm_base_url=os.environ.get("LLM_BASE_URL"),
        llm_api_key=os.environ.get("LLM_API_KEY"),
        llm_temperature=float(os.environ.get("LLM_TEMPERATURE", "0.0")),
    )
    print("✓ Agent configuration created")
    # Echo the effective settings as stored on the config object.
    for label, field in (
        ("Provider", "llm_provider"),
        ("Model", "llm_model_name"),
        ("Temperature", "llm_temperature"),
    ):
        print(f"  {label}: {getattr(agent_config.llm_config, field)}")
except Exception as config_error:
    print(f"✗ ERROR creating agent config: {config_error}")
    raise

print("=" * 80)


## Agent Initialization

Create the GAIA super agent with MCP servers for tool execution.

In [None]:
# Create GAIA super agent
# System prompt handed to the agent; it asks for the final answer to be
# wrapped in <answer>...</answer> tags.
system_prompt = """You are a helpful AI assistant tasked with answering questions from the GAIA benchmark.

Your goal is to provide accurate, well-reasoned answers to complex questions that may require:
- Web searches and browsing
- File reading and analysis (PDF, Excel, images, code, etc.)
- Mathematical computations
- Multi-step reasoning
- Tool usage

When you have determined the final answer, provide it in this format:
<answer>your answer here</answer>

Be thorough, use available tools when needed, and show your reasoning."""

# Build the agent from the configuration and MCP settings prepared during
# setup. Failures are reported with a traceback but not re-raised.
try:
    super_agent = Agent(
        conf=agent_config,
        name="gaia_super_agent",
        system_prompt=system_prompt,
        mcp_config=mcp_config,
        mcp_servers=available_servers,
        feedback_tool_result=True,
    )
    print("✓ GAIA super agent created successfully")
    print(f"  Agent name: {super_agent.name}")
    # An empty or missing server list is shown as the literal text 'None'.
    print(f"  MCP servers: {super_agent.mcp_servers or 'None'}")

except Exception as agent_error:
    print(f"✗ ERROR creating agent: {agent_error}")
    import traceback
    traceback.print_exc()


## Task Execution

Run the task with the agent and capture the full execution trajectory.

In [None]:
# Execute the task
# Builds the prompt (adding the attachment path when the task has a
# file), runs the agent once, and prints a summary. task_result and
# task_response stay None when execution fails, which the later cells
# check for.
import os
import time

# Prepare question with file path if needed
question_with_files = question
# Dataset root. Overridable through the GAIA_DATASET_PATH environment
# variable so the notebook is not tied to one machine; the original
# location is kept as the default for backward compatibility.
dataset_path = os.getenv(
    "GAIA_DATASET_PATH",
    "/Users/kirito4499/Desktop/Projects/Python/aworld-notebooks/gaia_dataset/2023",
)
split = "validation"

if file_name:
    file_path = Path(dataset_path) / split / file_name
    # Tell the agent where the attachment is. This line used to append an
    # empty string, so the path never reached the prompt.
    question_with_files += (
        f"\n\nHere is the necessary file for this task: {file_path}"
    )
    print(f"Task includes file attachment: {file_path}")
    print(f"File exists: {file_path.exists()}")
    print()

print("=" * 80)
print("EXECUTING TASK")
print("=" * 80)
print("Starting agent execution...")
print()

# Create and run task
start_time = time.time()
task_result = None
task_response = None

try:
    task_obj = Task(
        input=question_with_files,
        agent=super_agent,
        conf=TaskConfig()
    )

    print(f"Task created with ID: {task_obj.id}")
    print("Running agent...")
    print()

    # Execute task; the result is indexed by the task's own id.
    task_result = Runners.sync_run_task(task=task_obj)
    task_response = task_result[task_obj.id]

    end_time = time.time()
    execution_time = end_time - start_time

    print("=" * 80)
    print("EXECUTION COMPLETE")
    print("=" * 80)
    print(f"✓ Status: {'Success' if task_response.success else 'Failed'}")
    print(f"✓ Execution time: {execution_time:.2f} seconds")
    print(f"✓ Steps taken: {len(task_response.trajectory) if task_response.trajectory else 'N/A'}")
    # Token usage is optional on the response object.
    if hasattr(task_response, 'usage') and task_response.usage:
        print(f"✓ Token usage: {task_response.usage}")
    print()
    print("AGENT ANSWER:")
    print("-" * 80)
    print(task_response.answer)
    print("=" * 80)

except Exception as e:
    # Top-level boundary for the cell: report and keep the notebook alive.
    print(f"✗ ERROR during task execution: {e}")
    import traceback
    traceback.print_exc()


## Execution Trajectory

Detailed step-by-step breakdown of agent actions.

In [None]:
# Display execution trajectory
# Resolve the trajectory once; a missing response, a missing attribute
# and an empty trajectory all take the "no data" path.
trajectory = getattr(task_response, 'trajectory', None) if task_response else None

if not trajectory:
    print("No trajectory data available")
    if task_response:
        # Debugging aid: show what the response object does expose.
        print(f"Task response type: {type(task_response)}")
        print(f"Available attributes: {dir(task_response)}")
else:
    total_steps = len(trajectory)
    heavy_rule = "=" * 80

    print(heavy_rule)
    print(f"TRAJECTORY: {total_steps} STEPS")
    print(heavy_rule)
    print()

    for step_idx, step in enumerate(trajectory, 1):
        print(heavy_rule)
        print(f"STEP {step_idx}/{total_steps}")
        print(heavy_rule)

        # Dict steps are listed field by field; anything else is printed whole.
        if not isinstance(step, dict):
            print(f"Step data: {step}")
        else:
            for key, value in step.items():
                print(f"{key}: {value}")

        print()


## MCP Tool Calls

Detailed view of all tool executions during the task.

In [None]:
# Extract and display tool calls
# A trajectory step counts as a tool call when it is a dict containing a
# 'tool_name' or 'action_name' key. For each one, the tool, the action,
# the parameters (each value cut at 200 characters) and the result (cut
# at 500 characters) are printed.
trajectory = getattr(task_response, 'trajectory', None) if task_response else None

if trajectory:
    # Extract tool calls from trajectory, remembering the 1-based step index.
    tool_calls = [
        {'step': step_idx, 'data': step}
        for step_idx, step in enumerate(trajectory, 1)
        if isinstance(step, dict)
        and ('tool_name' in step or 'action_name' in step)
    ]

    if tool_calls:
        print("=" * 80)
        print(f"TOOL CALLS: {len(tool_calls)} total")
        print("=" * 80)
        print()

        for call in tool_calls:
            step_num = call['step']
            data = call['data']

            print(f"{'─'*80}")
            print(f"Tool Call at Step {step_num}")
            print(f"{'─'*80}")

            tool_name = data.get('tool_name', 'Unknown')
            action_name = data.get('action_name', 'Unknown')
            # 'params' may be present but None (a call with no arguments),
            # so `or {}` is used instead of a .get() default.
            params = data.get('params') or {}
            result = data.get('result', 'No result captured')

            print(f"Tool: {tool_name}")
            print(f"Action: {action_name}")
            print("\nParameters:")
            if isinstance(params, dict):
                for key, value in params.items():
                    value_str = str(value)
                    if len(value_str) > 200:
                        value_str = value_str[:200] + "..."
                    print(f"  {key}: {value_str}")
            else:
                # Non-mapping parameters (e.g. a raw string or list) are
                # shown as a single truncated value instead of crashing.
                value_str = str(params)
                if len(value_str) > 200:
                    value_str = value_str[:200] + "..."
                print(f"  {value_str}")

            print("\nResult:")
            result_str = str(result)
            if len(result_str) > 500:
                result_str = result_str[:500] + "..."
            print(f"  {result_str}")
            print()
    else:
        print("No tool calls found in trajectory")
else:
    print("No trajectory available to extract tool calls")


## Agent Messages & LLM Interactions

Detailed view of all agent communications and LLM calls.

In [None]:
# Extract and display agent messages
# Walk the trajectory and print every step: message-like dicts show role
# and content (cut at 1000 characters), other dicts show each field (cut
# at 300 characters), and non-dict steps show their type and string form
# (cut at 500 characters).
trajectory = getattr(task_response, 'trajectory', None) if task_response else None

if not trajectory:
    print("No trajectory available")
else:
    print("=" * 80)
    print("AGENT MESSAGES & LLM CALLS")
    print("=" * 80)
    print()

    light_rule = '─' * 80
    for step_idx, step in enumerate(trajectory, 1):
        print(light_rule)
        print(f"Step {step_idx}: Message Details")
        print(light_rule)

        if not isinstance(step, dict):
            print(f"Step type: {type(step)}")
            step_str = str(step)
            print(f"{step_str[:500]}..." if len(step_str) > 500 else step_str)
        elif any(field in step for field in ('role', 'content', 'message')):
            # Message-like step: 'content' wins over 'message' when both exist.
            role = step.get('role', 'unknown')
            content = step.get('content', step.get('message', ''))

            print(f"Role: {role}")
            print("Content:")
            content_str = str(content)
            overflow = len(content_str) - 1000
            if overflow > 0:
                print(f"  {content_str[:1000]}...")
                print(f"  ... ({overflow} more characters)")
            else:
                print(f"  {content_str}")
        else:
            # Display all step data
            for key, value in step.items():
                value_str = str(value)
                if len(value_str) > 300:
                    value_str = value_str[:300] + "..."
                print(f"{key}: {value_str}")

        print()


## Answer Validation

Extract the agent's answer and compare with ground truth.

In [None]:
# Extract and validate answer
import re
import string

def normalize_str(input_str, remove_punct=True):
    """Return *input_str* lower-cased with all whitespace removed.

    When *remove_punct* is true (the default), every character in
    ``string.punctuation`` is stripped as well.
    """
    compact = re.sub(r"\s", "", input_str).lower()
    if not remove_punct:
        return compact
    return compact.translate(str.maketrans("", "", string.punctuation))

def normalize_number_str(number_str):
    """Parse *number_str* as a float after dropping ``$``, ``%`` and ``,``.

    Returns ``float("inf")`` when the cleaned text is not a valid float.
    """
    cleaned = number_str.translate(str.maketrans("", "", "$%,"))
    try:
        parsed = float(cleaned)
    except ValueError:
        # Unparseable input is signalled with infinity rather than an error.
        parsed = float("inf")
    return parsed

def is_float(element):
    """Return True when ``float(element)`` succeeds, False otherwise.

    Both ``ValueError`` (bad text) and ``TypeError`` (unsupported type,
    e.g. ``None``) count as "not a float".
    """
    try:
        float(element)
    except (ValueError, TypeError):
        return False
    return True

def question_scorer(model_answer, ground_truth):
    """Return True when *model_answer* matches *ground_truth*.

    The comparison mode is chosen from the ground truth:

    * float-like ground truth: numeric equality after
      ``normalize_number_str`` on the answer;
    * ground truth containing ``,`` or ``;``: both sides are split on
      those separators and compared element by element (numerically
      where the expected element is float-like, otherwise as normalized
      strings with punctuation kept); differing lengths never match;
    * anything else: equality of the fully normalized strings.

    Any exception raised while comparing is printed and scored as False.
    """

    def _element_matches(answer_part, expected_part):
        # Numeric element when the expected side parses as a float.
        if is_float(expected_part):
            return normalize_number_str(answer_part) == float(expected_part)
        return normalize_str(answer_part, remove_punct=False) == normalize_str(
            expected_part, remove_punct=False
        )

    try:
        # Numeric comparison
        if is_float(ground_truth):
            return normalize_number_str(model_answer) == float(ground_truth)

        # String comparison (no list separators present)
        if "," not in ground_truth and ";" not in ground_truth:
            return normalize_str(model_answer) == normalize_str(ground_truth)

        # List comparison
        expected_parts = re.split(r"[,;]", ground_truth)
        answer_parts = re.split(r"[,;]", model_answer)
        if len(expected_parts) != len(answer_parts):
            return False

        # A list (not a generator) so every element is evaluated.
        return all(
            [
                _element_matches(answer_part, expected_part)
                for answer_part, expected_part in zip(answer_parts, expected_parts)
            ]
        )
    except Exception as e:
        print(f"Error during validation: {e}")
        return False

# Extract answer
# Pull the final answer out of the agent's response (preferring the
# content of the first <answer>...</answer> pair, otherwise the whole
# response), print it next to the ground truth, and score it with
# question_scorer.
extracted_answer = None
if task_response:
    # The answer can be missing, None or a non-string when the run failed.
    # Coerce it to str so the regex search and .strip() below cannot raise.
    raw_answer = getattr(task_response, "answer", None)
    agent_response = "" if raw_answer is None else str(raw_answer)

    # Try to extract answer from <answer> tags (DOTALL: answers may span lines)
    match = re.search(r"<answer>(.*?)</answer>", agent_response, re.DOTALL)
    if match:
        extracted_answer = match.group(1).strip()
        print("✓ Extracted answer from <answer> tags")
    else:
        # Fallback: use full response
        extracted_answer = agent_response.strip()
        print("⚠ No <answer> tags found, using full response")

    print()
    print("=" * 80)
    print("ANSWER EXTRACTION")
    print("=" * 80)
    print("Extracted Answer:")
    print("-" * 80)
    print(extracted_answer)
    print()
    print("Ground Truth:")
    print("-" * 80)
    print(ground_truth)
    print("=" * 80)
    print()

    # Validate
    is_correct = question_scorer(extracted_answer, ground_truth)

    print("=" * 80)
    print("VALIDATION RESULT")
    print("=" * 80)
    if is_correct:
        print("✅ PASS - Answer matches ground truth!")
    else:
        print("❌ FAIL - Answer does not match ground truth")
    print("=" * 80)

    # Display comparison details
    print()
    print("Comparison Details:")
    print(f"  Task ID: {task_id}")
    print(f"  Level: {level}")
    print(f"  Correct: {is_correct}")
else:
    print("✗ No task response available for validation")
