# GAIA Agent Builder
## Design and test agent architecture

**Objective:** Build a 4-agent system with proper GAIA formatting  
**Output:** Working agents with routing and format compliance

---

# Section 1: Dependency Check & Import Validation

In [None]:
# Import dependency verification and system check
import sys
import importlib
import pkg_resources
from pathlib import Path
import os

# Required dependencies with version checking
# Keys are distribution names; values are ">=" lower-bound specifiers.
# check_dependencies() iterates this mapping.
REQUIRED_PACKAGES = {
    'langchain': '>=0.1.0',
    'langchain-groq': '>=0.1.0',
    'langchain-google-genai': '>=0.1.0',
    'langchain-openai': '>=0.1.0',
    'langgraph': '>=0.1.0',
    'smolagents': '>=0.1.0',
    'pandas': '>=1.5.0',
    'numpy': '>=1.20.0',
    'openai': '>=1.0.0',
    'backoff': '>=2.0.0',
    'rich': '>=10.0.0'
}

def check_dependencies(packages=None):
    """Check that required packages are importable and recent enough.

    Args:
        packages: Optional mapping of distribution name to a version
            specifier such as ``'>=1.5.0'``. Comma-separated clauses
            (``'>=2.0.0,<3.0.0'``) are supported. Defaults to
            ``REQUIRED_PACKAGES``.

    Returns:
        A ``(missing_packages, version_issues)`` tuple.
        ``missing_packages`` lists the names whose module could not be
        imported. ``version_issues`` lists
        ``(name, installed_version, specifier)`` tuples for packages that
        import fine but do not satisfy their specifier.
    """
    import operator
    import re
    from importlib import metadata

    if packages is None:
        packages = REQUIRED_PACKAGES

    comparators = {
        '>=': operator.ge,
        '<=': operator.le,
        '==': operator.eq,
        '!=': operator.ne,
        '>': operator.gt,
        '<': operator.lt,
    }

    def release_tuple(text):
        # Only the leading dotted-number release is compared; pre-release,
        # post-release and local suffixes are ignored.
        match = re.match(r'\s*v?(\d+(?:\.\d+)*)', text)
        if match is None:
            return None
        return tuple(int(part) for part in match.group(1).split('.'))

    def satisfies(installed, specifier):
        have = release_tuple(installed)
        if have is None:
            # An unparseable installed version is not reported as an issue.
            return True
        for clause in specifier.split(','):
            parsed = re.match(r'\s*(>=|<=|==|!=|>|<)\s*(.+)', clause)
            if parsed is None:
                continue
            want = release_tuple(parsed.group(2))
            if want is None:
                continue
            # Pad to equal length so that 1.2 compares equal to 1.2.0.
            width = max(len(have), len(want))
            left = have + (0,) * (width - len(have))
            right = want + (0,) * (width - len(want))
            if not comparators[parsed.group(1)](left, right):
                return False
        return True

    missing_packages = []
    version_issues = []

    for package, min_version in packages.items():
        try:
            importlib.import_module(package.replace('-', '_'))
        except ImportError:
            missing_packages.append(package)
            continue

        if not min_version:
            continue

        try:
            installed_version = metadata.version(package)
        except metadata.PackageNotFoundError:
            # Importable, but no distribution metadata under this name;
            # there is no version to compare, so do not fail the check.
            continue

        if not satisfies(installed_version, min_version):
            version_issues.append((package, installed_version, min_version))

    return missing_packages, version_issues

# Check system requirements
missing, version_issues = check_dependencies()
if missing:
    print(f"‚ùå Missing packages: {missing}")
    print("Install with: pip install " + " ".join(missing))
if version_issues:
    # Surface version problems instead of discarding them; without this the
    # success message would be printed even when a package is too old.
    print(f"‚ö†Ô∏è Version issues: {version_issues}")
if not missing and not version_issues:
    print("‚úÖ All dependencies available")

# Files looked up relative to the current working directory
required_files = [
    "gaia_embeddings.csv",
    "metadata.jsonl",
    "agent_logic.py",
    "agent_interface.py",
    "agent_logging.py",
    "agent_testing.py",
    "dev_retriever.py",
]

print("\nüìÅ File System Check:")
for file_path in required_files:
    # One line per file: found, or flagged as missing
    print(f"‚úÖ {file_path}" if Path(file_path).exists() else f"‚ùå {file_path} - Missing")

# Provider API keys read from the process environment
required_env_vars = [
    "GROQ_API_KEY",
    "GOOGLE_API_KEY",
    "OPENROUTER_API_KEY",
    # "OLLAMA_HOST" - optional
]

print("\nüîë Environment Variables:")
for var in required_env_vars:
    # An unset or empty variable only produces a warning; nothing is raised
    if not os.getenv(var):
        print(f"‚ö†Ô∏è {var} - Not set (provider will be unavailable)")
        continue
    print(f"‚úÖ {var} - Set")

# Import all core modules
# These are project-local modules (their .py files are listed in
# required_files above). An ImportError is printed rather than re-raised, so
# the imported names may be undefined in later cells if this step fails.
try:
    from agent_logic import GAIAAgent, GAIAConfig
    from agent_interface import create_gaia_agent, get_groq_config
    from agent_logging import AgentLoggingSetup
    from agent_testing import run_quick_gaia_test
    from dev_retriever import load_gaia_retriever
    print("\n‚úÖ All core modules imported successfully")
except ImportError as e:
    print(f"\n‚ùå Import error: {e}")

In [None]:
# Import dependency verification and system check based on pyproject.toml
import sys
import importlib
import pkg_resources
from pathlib import Path
import os
import subprocess

# Core dependencies, keyed by distribution name.
# Values are version constraints copied as written: most use the Poetry-style
# caret form ("^X.Y.Z"), a few use plain comparison ranges.
# NOTE(review): within this cell the constraints are only printed; they are
# never compared against the installed version.
CORE_DEPENDENCIES = {
    # From pyproject.toml - Core application
    'gradio': '^5.13.2',
    'requests': '^2.32.3',
    'pandas': '^2.0.0',
    'python-dotenv': '^1.0.0',

    # From pyproject.toml - AI/ML framework
    'smolagents': '^1.15.0',
    'transformers': '^4.40.0',
    'huggingface-hub': '^0.30.0',
    'torch': '^2.1.0',
    'datasets': '^2.14.0',

    # From pyproject.toml - LangChain ecosystem
    'langgraph': '^0.4.8',
    'langchain-openai': '^0.3.19',
    'langchain-ollama': '^0.3.3',
    'langchain-groq': '^0.3.2',
    'langchain-google-genai': '^2.1.5',
    'langchain-community': '^0.3.24',
    'langchain-huggingface': '^0.2.0',
    'langchain-weaviate': '^0.0.5',

    # From pyproject.toml - Scientific computing
    'numpy': '>=2.0.0,<3.0.0',
    'matplotlib': '^3.7.0',
    'scipy': '^1.11.0',
    'seaborn': '^0.13.2',

    # From pyproject.toml - File handling
    'beautifulsoup4': '^4.12.0',
    'pillow': '^11.0.0',
    'openpyxl': '^3.1.0',
    'pypdf2': '^3.0.0',
    'python-docx': '^1.1.0',

    # From pyproject.toml - Audio processing
    'librosa': '^0.10.0',
    'soundfile': '^0.12.0',

    # From pyproject.toml - Additional tools
    'sentence-transformers': '^4.1.0',
    'weaviate-client': '^4.14.4',
    'duckduckgo-search': '^8.0.2',
    'docling': '^2.36.1',
    'backoff': '^2.2.1',

    # Extra packages flagged as required by this system. Per the notes below,
    # litellm and rich are missing from pyproject.toml while lxml is listed there.
    'litellm': '^1.72.1',  # Critical for the smolagents LiteLLMModel
    'rich': '^10.0.0',     # Used in agent_logging.py
    'lxml': '>=5.3.0',     # From pyproject.toml but important for web scraping
}

# Tool-specific dependencies that our system uses, grouped by capability.
# Several entries repeat packages already present in CORE_DEPENDENCIES.
# NOTE(review): selenium is marked "(optional)" here and is also listed in
# OPTIONAL_DEPENDENCIES, yet a missing entry from this dict is reported as a
# missing tool - confirm which classification is intended.
TOOL_DEPENDENCIES = {
    # Document processing tools (used by ContentRetrieverTool)
    'docling': '^2.36.1',           # PDF, DOCX, HTML parsing
    'docling-core': '^2.33.1',      # Docling core functionality
    'docling-parse': '^4.0.3',      # PDF parsing
    'pypdf2': '^3.0.0',             # PDF fallback
    'python-docx': '^1.1.0',        # Word documents
    'openpyxl': '^3.1.0',           # Excel files
    'beautifulsoup4': '^4.12.0',    # HTML parsing

    # Web scraping tools (used by web researchers)
    'duckduckgo-search': '^8.0.2',  # Search functionality
    'requests': '^2.32.3',          # HTTP requests
    'lxml': '>=5.3.0',              # XML/HTML parsing
    'selenium': '^4.15.0',          # Browser automation (optional)

    # Audio processing tools
    'librosa': '^0.10.0',           # Audio analysis
    'soundfile': '^0.12.0',         # Audio I/O
    'pydub': '^0.25.1',             # Audio manipulation

    # Image processing tools
    'pillow': '^11.0.0',            # Image processing
    'opencv-python-headless': '^4.11.0',  # Computer vision
    'easyocr': '^1.7.2',            # OCR functionality

    # Embedding and retrieval tools
    'sentence-transformers': '^4.1.0',  # Embeddings
    'weaviate-client': '^4.14.4',       # Vector database
    'langchain-weaviate': '^0.0.5',     # LangChain-Weaviate integration

    # Data processing tools
    'numpy': '>=2.0.0,<3.0.0',      # Numerical computing
    'pandas': '^2.0.0',             # Data manipulation
    'scipy': '^1.11.0',             # Scientific computing
}

# Optional dependencies that enhance functionality.
# Missing entries from this dict are reported as warnings and are not added
# to the list of packages to install.
OPTIONAL_DEPENDENCIES = {
    'openai': '^1.0.0',           # From pyproject.toml extras
    'anthropic': '^0.7.0',        # From pyproject.toml extras
    'selenium': '^4.15.0',        # From pyproject.toml extras
    'groq': '^0.26.0',            # Groq API client
    'ollama': '^0.5.1',           # Ollama API client
}

def get_installed_packages():
    """Return the distributions installed for the running interpreter.

    Distribution metadata is read directly through ``importlib.metadata``
    rather than by parsing the text table printed by ``pip list``. This keeps
    working when pip itself is not installed in the environment and does not
    depend on pip's output layout or exit status.

    Returns:
        A dict mapping the lower-cased distribution name to its version.
        An empty dict is returned (after printing a warning) if the scan
        fails.
    """
    from importlib import metadata

    try:
        installed = {}
        for dist in metadata.distributions():
            name = dist.metadata['Name']
            if not name:
                # Skip metadata directories that carry no usable name.
                continue
            # Keep the first entry seen when a name shows up more than once.
            installed.setdefault(name.lower(), dist.version)
        return installed
    except Exception as e:
        print(f"‚ö†Ô∏è Could not get installed packages: {e}")
        return {}

def check_package_availability(package_name, installed_packages=None):
    """Report whether a package is available and which version is installed.

    Lookup order:
      1. ``installed_packages`` (exact lower-cased name, then the ``-``/``_``
         spelling variants, then a comparison that treats runs of ``-``,
         ``_`` and ``.`` as equivalent).
      2. Distribution metadata of the running interpreter.
      3. An import attempt, using the real module name for distributions
         whose import name differs from the distribution name.

    Args:
        package_name: Distribution name, e.g. ``'python-docx'``.
        installed_packages: Optional mapping of lower-cased distribution
            name to version, as produced by ``get_installed_packages()``.

    Returns:
        ``(True, version)`` when found, ``(True, "unknown")`` when the module
        imports but no distribution version is recorded, and
        ``(False, None)`` when the package is not available.
    """
    import re
    from importlib import metadata

    if installed_packages is None:
        installed_packages = {}

    def canonical(name):
        # Collapse separator runs so "foo.bar", "foo_bar" and "Foo-Bar" match.
        return re.sub(r'[-_.]+', '-', name).lower()

    # Distributions whose importable module is not "<name with underscores>".
    import_name_overrides = {
        'pillow': 'PIL',
        'beautifulsoup4': 'bs4',
        'python-docx': 'docx',
        'python-dotenv': 'dotenv',
        'opencv-python-headless': 'cv2',
        'pypdf2': 'PyPDF2',
        'weaviate-client': 'weaviate',
    }

    # Exact name first, then the spelling variants.
    candidates = (
        package_name,
        package_name.replace('-', '_'),
        package_name.replace('_', '-'),
        package_name.replace('-', ''),
    )
    for candidate in candidates:
        key = candidate.lower()
        if key in installed_packages:
            return True, installed_packages[key]

    target = canonical(package_name)
    for name, version in installed_packages.items():
        if canonical(name) == target:
            return True, version

    # The metadata lookup avoids importing heavy modules just to learn a version.
    try:
        return True, metadata.version(package_name)
    except metadata.PackageNotFoundError:
        pass

    # Last resort: the module may be importable without distribution metadata.
    import_name = import_name_overrides.get(target, package_name.replace('-', '_'))
    try:
        importlib.import_module(import_name)
    except ImportError:
        return False, None
    return True, "unknown"

print("üîç DEPENDENCY ANALYSIS (Based on pyproject.toml + Actual Installation)")
print("=" * 70)

# Take one snapshot of the installed distributions; every lookup below reuses it
print("üì¶ Scanning installed packages...")
installed_packages = get_installed_packages()
print(f"‚úÖ Found {len(installed_packages)} installed packages")

# --- Core dependencies: also record the version found for each package ---
available_core, missing_core = [], []
version_info = {}

print(f"\nüì¶ CORE DEPENDENCIES ({len(CORE_DEPENDENCIES)} packages):")
for package, version_spec in CORE_DEPENDENCIES.items():
    is_available, installed_version = check_package_availability(package, installed_packages)

    if not is_available:
        missing_core.append(package)
        print(f"‚ùå {package:<25} NOT INSTALLED")
        continue

    available_core.append(package)
    version_info[package] = installed_version
    print(f"‚úÖ {package:<25} {installed_version}")

# --- Tool-specific dependencies ---
available_tools, missing_tools = [], []

print(f"\nüîß TOOL-SPECIFIC DEPENDENCIES ({len(TOOL_DEPENDENCIES)} packages):")
for package, version_spec in TOOL_DEPENDENCIES.items():
    is_available, installed_version = check_package_availability(package, installed_packages)

    if not is_available:
        missing_tools.append(package)
        print(f"‚ùå {package:<30} NOT INSTALLED")
        continue

    available_tools.append(package)
    print(f"‚úÖ {package:<30} {installed_version}")

# --- Optional dependencies: absence is only a warning ---
available_optional, missing_optional = [], []

print(f"\nüîß OPTIONAL DEPENDENCIES ({len(OPTIONAL_DEPENDENCIES)} packages):")
for package, version_spec in OPTIONAL_DEPENDENCIES.items():
    is_available, installed_version = check_package_availability(package, installed_packages)

    if not is_available:
        missing_optional.append(package)
        print(f"‚ö†Ô∏è {package:<25} NOT INSTALLED (optional)")
        continue

    available_optional.append(package)
    print(f"‚úÖ {package:<25} {installed_version}")

# --- Totals per category ---
print(f"\nüìä DEPENDENCY SUMMARY:")
print(f"‚úÖ Core available: {len(available_core)}/{len(CORE_DEPENDENCIES)}")
print(f"‚ùå Core missing: {len(missing_core)}")
print(f"üîß Tools available: {len(available_tools)}/{len(TOOL_DEPENDENCIES)}")
print(f"‚ùå Tools missing: {len(missing_tools)}")
print(f"‚öôÔ∏è Optional available: {len(available_optional)}/{len(OPTIONAL_DEPENDENCIES)}")

# Critical missing packages
# A package can be listed in both CORE_DEPENDENCIES and TOOL_DEPENDENCIES, so
# de-duplicate while keeping first-seen order (dict keys preserve insertion order).
all_missing = list(dict.fromkeys(missing_core + missing_tools))
if all_missing:
    print(f"\n‚ùå MISSING PACKAGES:")
    for package in all_missing:
        print(f"   ‚Ä¢ {package}")

    print(f"\nüí° Install missing packages:")
    print(f"pip install {' '.join(all_missing)}")

# Per-capability roll-up: how many of each group's packages were found
print(f"\nüõ†Ô∏è TOOLS CAPABILITY ASSESSMENT:")
print("=" * 40)

# Documents
doc_tools = ['docling', 'docling-core', 'pypdf2', 'python-docx', 'openpyxl', 'beautifulsoup4']
doc_available = sum(tool in available_tools for tool in doc_tools)
print(f"üìÑ Document Processing: {doc_available}/{len(doc_tools)} tools available")

# Web research; Selenium is looked up among the optional packages
web_tools = ['duckduckgo-search', 'requests', 'lxml']
web_selenium = 'selenium' in available_optional
web_available = sum(tool in available_tools for tool in web_tools)
print(f"üåê Web Research: {web_available}/{len(web_tools)} tools available {'+ Selenium' if web_selenium else ''}")

# Audio
audio_tools = ['librosa', 'soundfile', 'pydub']
audio_available = sum(tool in available_tools for tool in audio_tools)
print(f"üéµ Audio Processing: {audio_available}/{len(audio_tools)} tools available")

# Images
image_tools = ['pillow', 'opencv-python-headless', 'easyocr']
image_available = sum(tool in available_tools for tool in image_tools)
print(f"üñºÔ∏è Image Processing: {image_available}/{len(image_tools)} tools available")

# Embeddings and retrieval
embed_tools = ['sentence-transformers', 'weaviate-client', 'langchain-weaviate']
embed_available = sum(tool in available_tools for tool in embed_tools)
print(f"üß† Embeddings/Retrieval: {embed_available}/{len(embed_tools)} tools available")

# Docker/HF Spaces shopping list
def _caret_to_pep440(spec):
    """Translate a Poetry caret constraint into a pip-compatible range.

    ``^1.2.3`` becomes ``>=1.2.3,<2.0.0``, ``^0.3.19`` becomes
    ``>=0.3.19,<0.4.0`` and ``^0.0.5`` becomes ``>=0.0.5,<0.0.6``: the upper
    bound bumps the left-most non-zero component. Specifiers that do not
    start with ``^`` are returned unchanged.
    """
    if not spec.startswith('^'):
        return spec

    base = spec[1:]
    try:
        numbers = [int(part) for part in base.split('.')]
    except ValueError:
        # Not purely numeric: fall back to a plain lower bound.
        return f">={base}"

    # Index of the left-most non-zero component (last one if all are zero).
    bump_at = next((i for i, number in enumerate(numbers) if number != 0), len(numbers) - 1)
    upper = numbers[:bump_at] + [numbers[bump_at] + 1] + [0] * (len(numbers) - bump_at - 1)
    return f">={base},<{'.'.join(str(number) for number in upper)}"


print(f"\nüê≥ DOCKER/HF SPACES INSTALLATION LIST:")
print("=" * 50)
print("# Add these to your requirements.txt or Dockerfile:")
print()

# Caret constraints are Poetry syntax and are rejected by pip, so each
# constraint is converted before it is printed as a requirements.txt line.
all_packages_dict = {**CORE_DEPENDENCIES, **TOOL_DEPENDENCIES, **OPTIONAL_DEPENDENCIES}
for package in sorted(all_packages_dict.keys()):
    if package in CORE_DEPENDENCIES:
        version = _caret_to_pep440(CORE_DEPENDENCIES[package])
        print(f"{package}{version}  # Core - Required")
    elif package in TOOL_DEPENDENCIES:
        version = _caret_to_pep440(TOOL_DEPENDENCIES[package])
        print(f"{package}{version}  # Tool - Required for functionality")
    else:
        version = _caret_to_pep440(OPTIONAL_DEPENDENCIES[package])
        print(f"{package}{version}  # Optional - Enhances functionality")

print()
print("# Additional system dependencies for Docker:")
print("# RUN apt-get update && apt-get install -y \\")
print("#     libsndfile1 \\          # For librosa/soundfile")
print("#     ffmpeg \\              # For audio processing")
print("#     poppler-utils \\       # For PDF processing")
print("#     libxml2-dev \\         # For lxml")
print("#     libxslt1-dev \\        # For lxml")
print("#     tesseract-ocr \\       # For easyocr")
print("#     libglib2.0-0 \\       # For opencv")
print("#     libsm6 \\             # For opencv")
print("#     libxext6 \\           # For opencv")
print("#     libxrender-dev \\     # For opencv")
print("#     libgomp1 \\           # For scientific computing")
print("#     && rm -rf /var/lib/apt/lists/*")

print(f"\nüîë ENVIRONMENT VARIABLES CHECK:")
required_env_vars = [
    "GROQ_API_KEY",
    "GOOGLE_API_KEY",
    "OPENROUTER_API_KEY",
    "OPENAI_API_KEY",        # For OpenAI provider
    "ANTHROPIC_API_KEY",     # For Anthropic provider
    "HF_TOKEN",              # For HuggingFace
    # "OLLAMA_HOST",         # Optional for local Ollama
]

# (substring, label) pairs tried in order; the first substring found in the
# variable name decides the label, and no match leaves the label empty.
provider_labels = (
    ("GROQ", "(Groq provider)"),
    ("GOOGLE", "(Google provider)"),
    ("OPENROUTER", "(OpenRouter provider)"),
    ("OPENAI", "(OpenAI provider)"),
    ("ANTHROPIC", "(Anthropic provider)"),
    ("HF_TOKEN", "(HuggingFace)"),
)

env_status = {}
for var in required_env_vars:
    # Unset and empty values both count as "not set"
    is_set = bool(os.getenv(var))
    env_status[var] = is_set
    status = "‚úÖ Set" if is_set else "‚ö†Ô∏è Not set"
    provider = next((label for needle, label in provider_labels if needle in var), "")

    print(f"{status:<15} {var:<20} {provider}")

print(f"\nüìÅ FILE SYSTEM CHECK:")
required_files = [
    "gaia_embeddings.csv",
    "metadata.jsonl",
    "agent_logic.py",
    "agent_interface.py",
    "agent_logging.py",
    "agent_testing.py",
    "dev_retriever.py",
]

# Probe every path first, then report in the same order
file_status = {candidate: Path(candidate).exists() for candidate in required_files}
for file_path, exists in file_status.items():
    status = "‚úÖ" if exists else "‚ùå"
    print(f"{status} {file_path}")

# Import each critical module and record whether the import succeeded.
# NOTE(review): only the module import is attempted; the component names in
# the second tuple field are used for display and are not looked up.
print(f"\nüß™ CRITICAL IMPORT TEST:")
critical_imports = [
    ("agent_logic", "GAIAAgent, GAIAConfig"),
    ("agent_interface", "create_gaia_agent, get_groq_config"),
    ("agent_logging", "AgentLoggingSetup"),
    ("agent_testing", "run_quick_gaia_test"),
    ("dev_retriever", "load_gaia_retriever"),
    ("smolagents", "LiteLLMModel, ToolCallingAgent"),
    ("langchain_groq", "ChatGroq"),
    ("langchain_google_genai", "ChatGoogleGenerativeAI"),
    ("langgraph.graph", "StateGraph"),
    ("docling", "DocumentConverter"),  # Key for document processing
    ("litellm", "completion"),         # Key for LLM providers
]

import_status = {}
for module, components in critical_imports:
    try:
        importlib.import_module(module.replace('-', '_'))
    except ImportError as e:
        import_status[module] = False
        print(f"‚ùå {module:<25} FAILED: {str(e)}")
    else:
        import_status[module] = True
        print(f"‚úÖ {module:<25} ({components})")

# Import the modules behind each tool capability and record the outcome
print(f"\nüîß TOOL-SPECIFIC IMPORT TEST:")
tool_imports = [
    ("docling", "Document processing"),
    ("sentence_transformers", "Embeddings"),
    ("weaviate", "Vector database"),
    ("duckduckgo_search", "Web search"),
    ("librosa", "Audio processing"),
    ("cv2", "Computer vision"),  # opencv-python-headless
    ("easyocr", "OCR"),
]

tool_import_status = {}
for module, description in tool_imports:
    try:
        importlib.import_module(module)
    except ImportError:
        # The failure line shows the capability description, not the error text
        tool_import_status[module] = False
        print(f"‚ùå {module:<20} FAILED: {description}")
    else:
        tool_import_status[module] = True
        print(f"‚úÖ {module:<20} ({description})")

# Final readiness assessment
# The score is computed once, from the results gathered earlier in this cell,
# so that readiness_score and dependency_info stay consistent with the
# tool-dependency checks.
print(f"\nüèÜ SYSTEM READINESS ASSESSMENT:")
print("=" * 40)

core_deps_ready = len(missing_core) == 0
tools_deps_ready = len(missing_tools) <= 2  # Allow 2 missing tool deps
files_ready = all(file_status.values())
imports_ready = all(import_status.values())
# Only the Groq key is checked here, as the printed label below states.
min_env_ready = env_status.get("GROQ_API_KEY", False)

# Each flag is a bool, so multiplying by its weight yields 0 or the weight.
# The weights sum to 100.
readiness_score = sum([
    core_deps_ready * 30,      # Dependencies (30%)
    tools_deps_ready * 30,     # Tool dependencies (30%)
    files_ready * 20,          # Files (20%)
    imports_ready * 15,        # Imports (15%)
    min_env_ready * 5          # Environment (5%)
])

print(f"üì¶ Core Dependencies: {'‚úÖ' if core_deps_ready else '‚ùå'} ({len(available_core)}/{len(CORE_DEPENDENCIES)})")
print(f"üîß Tool Dependencies: {'‚úÖ' if tools_deps_ready else '‚ùå'} ({len(available_tools)}/{len(TOOL_DEPENDENCIES)})")
print(f"üìÅ Required Files: {'‚úÖ' if files_ready else '‚ùå'} ({sum(file_status.values())}/{len(required_files)})")
print(f"üß™ Critical Imports: {'‚úÖ' if imports_ready else '‚ùå'} ({sum(import_status.values())}/{len(critical_imports)})")
print(f"üîë Min Environment: {'‚úÖ' if min_env_ready else '‚ùå'} (At least Groq API key)")

print(f"\nüéØ Readiness Score: {readiness_score}/100")

if readiness_score >= 90:
    print("üöÄ SYSTEM READY - Proceed to component testing")
elif readiness_score >= 70:
    print("‚ö†Ô∏è MOSTLY READY - Address missing items before proceeding")
elif readiness_score >= 50:
    print("üîß NEEDS WORK - Significant setup required")
else:
    print("‚ùå NOT READY - Major setup required")

# Save dependency info for later use
dependency_info = {
    'core_available': available_core,
    'core_missing': missing_core,
    'tools_available': available_tools,
    'tools_missing': missing_tools,
    'optional_available': available_optional,
    'optional_missing': missing_optional,
    'file_status': file_status,
    'env_status': env_status,
    'import_status': import_status,
    'tool_import_status': tool_import_status,
    'readiness_score': readiness_score,
    'installed_packages': installed_packages
}

print(f"\nüíæ Dependency info saved in 'dependency_info' variable for reference")
print(f"üîç Actual installed packages: {len(installed_packages)} found")

# Section 2: Model Initialization Testing

In [None]:
# Test all provider configurations systematically
# NOTE: create_gaia_agent, used by the test loop below, is not imported in
# this cell; it comes from the import step in Section 1.
from agent_interface import (
    get_groq_config, get_google_config, get_openrouter_config, get_ollama_config
)

# Define test matrix
# Each entry is (result key, model name shown in the output, zero-argument
# callable that returns the provider configuration).
# NOTE(review): the model name is only printed; for the entries that pass a
# bare get_*_config function it is assumed to match that function's default
# model - confirm against agent_interface.
providers_to_test = [
    ("groq", "qwen-qwq-32b", get_groq_config),
    ("groq_fast", "llama-3.3-70b-versatile", lambda: get_groq_config("llama-3.3-70b-versatile")),
    ("google", "gemini-2.0-flash-preview", get_google_config),
    ("google_pro", "gemini-1.5-pro-002", lambda: get_google_config("gemini-1.5-pro-002")),
    ("openrouter", "qwen/qwen-2.5-coder-32b-instruct:free", get_openrouter_config),
    ("ollama", "qwen2.5-coder:32b", get_ollama_config)  # if available
]

# Outcome per provider: {'status': 'success' | 'empty_result' | 'failed', ...}
model_test_results = {}

for provider_name, model_name, config_func in providers_to_test:
    print(f"\nüß™ Testing {provider_name} ({model_name})")
    print("-" * 50)

    # Reset so the finally clause can tell whether an agent was created.
    agent = None
    try:
        # Get configuration
        config = config_func()
        print(f"‚úÖ Configuration loaded")

        # Test agent creation
        agent = create_gaia_agent(config)
        print(f"‚úÖ Agent created successfully")

        # Test simple inference
        test_question = "What is 2 + 2?"
        result = agent.run_single_question(test_question, task_id=f"test_{provider_name}")

        if result and result.get('final_answer'):
            print(f"‚úÖ Inference successful: {result['final_answer']}")
            model_test_results[provider_name] = {
                'status': 'success',
                'answer': result['final_answer'],
                'steps': len(result.get('steps', [])),
                'complexity': result.get('complexity', 'unknown')
            }
        else:
            print(f"‚ö†Ô∏è Inference returned empty result")
            model_test_results[provider_name] = {'status': 'empty_result'}

    except Exception as e:
        # A failing provider must not stop the remaining providers.
        print(f"‚ùå Failed: {str(e)}")
        model_test_results[provider_name] = {'status': 'failed', 'error': str(e)}
    finally:
        # Release each agent before the next provider is tested, matching the
        # agent.close() call used after the single-question tests.
        if agent is not None:
            try:
                agent.close()
            except Exception as close_error:
                print(f"‚ö†Ô∏è Could not close {provider_name} agent: {close_error}")

# One summary line per provider, keyed on the recorded status
print(f"\nüìä MODEL TESTING SUMMARY")
print("=" * 40)
for provider, result in model_test_results.items():
    status = result['status']
    if status == 'failed':
        print(f"‚ùå {provider}: Failed - {result.get('error', 'Unknown')}")
    elif status == 'success':
        print(f"‚úÖ {provider}: Working ({result.get('answer', 'N/A')})")
    else:
        # Any other status (e.g. an empty result) is reported generically
        print(f"‚ö†Ô∏è {provider}: Issues detected")

In [None]:
print("\nüîç Testing Single Question Execution...")

# Test questions of different complexity levels
# Each case holds a display label ("complexity"), the prompt sent to the
# agent ("question") and the value that the result's 'selected_strategy'
# field is compared against ("expected_strategy").
test_questions = [
    {
        "complexity": "Simple",
        "question": "What is 25 + 17?",
        "expected_strategy": "direct_llm"
    },
    {
        "complexity": "Moderate",
        "question": "Calculate the compound interest on $1000 at 5% annually for 3 years",
        "expected_strategy": "smolag_agent"
    },
    {
        "complexity": "Complex",
        "question": "Analyze the correlation between these datasets: [1,2,3,4,5] and [2,4,6,8,10]",
        "expected_strategy": "smolag_agent"
    }
]

# Initialize agent for testing
# NOTE(review): a string is passed here, whereas other cells pass a config
# object returned by a get_*_config() function - confirm that
# create_gaia_agent accepts both.
agent = create_gaia_agent("qwen3_32b")

# One record per test case: full details on success, or just the complexity
# label and the error text on failure.
test_results = []

for test_case in test_questions:
    print(f"\nüîç Testing {test_case['complexity']} Question:")
    print(f"Q: {test_case['question']}")

    try:
        result = agent.run_single_question(test_case['question'])

        # 'final_answer' and 'selected_strategy' are read with [] and so are
        # required; a missing key lands in the except branch below.
        print(f"A: {result['final_answer']}")
        print(f"Strategy: {result['selected_strategy']}")
        print(f"Agent: {result.get('selected_agent', 'N/A')}")
        print(f"Time: {result.get('execution_time', 0):.2f}s")

        # Check if strategy matches expectation
        strategy_match = result['selected_strategy'] == test_case['expected_strategy']
        strategy_status = "‚úÖ" if strategy_match else "‚ö†Ô∏è"
        print(f"Expected Strategy: {test_case['expected_strategy']} {strategy_status}")

        test_results.append({
            "complexity": test_case['complexity'],
            "question": test_case['question'],
            "answer": result['final_answer'],
            "strategy_used": result['selected_strategy'],
            "expected_strategy": test_case['expected_strategy'],
            "strategy_correct": strategy_match,
            "execution_time": result.get('execution_time', 0),
            "similar_examples": len(result.get('similar_examples', []))
        })

    except Exception as e:
        # Record the failure and continue with the next question
        print(f"‚ùå Error: {e}")
        test_results.append({
            "complexity": test_case['complexity'],
            "error": str(e)
        })

# Summary of single question tests
print(f"\nüìä Single Question Test Summary:")
# A record without an 'error' key is a test that ran to completion
successful_tests = [r for r in test_results if 'error' not in r]
print(f"Successful tests: {len(successful_tests)}/{len(test_questions)}")

if successful_tests:
    strategy_accuracy = sum(r['strategy_correct'] for r in successful_tests) / len(successful_tests)
    # Plain arithmetic mean: no "import numpy as np" is visible in the
    # preceding cells, so np.mean would raise NameError here.
    avg_time = sum(r['execution_time'] for r in successful_tests) / len(successful_tests)
    print(f"Strategy selection accuracy: {strategy_accuracy:.2f}")
    print(f"Average execution time: {avg_time:.2f}s")

# Release the agent created for this cell's tests
agent.close()

# Section 3: Component Isolation Tests

In [None]:
# Exercise each component on its own so a failure can be pinned down

# 3.1 Retriever
print("üîç Testing RAG Retriever System")
print("-" * 30)

try:
    retriever = load_gaia_retriever("gaia_embeddings.csv")

    if not (retriever and retriever.is_ready()):
        print("‚ùå Retriever failed to initialize")
    else:
        print("‚úÖ Retriever loaded successfully")

        # Sample queries used to probe the retriever
        test_queries = [
            "calculate percentage",
            "analyze spreadsheet data",
            "image processing question",
            "complex reasoning task",
        ]

        for query in test_queries:
            similar_docs = retriever.search(query, k=3)
            print(f"üìö '{query}': Found {len(similar_docs)} similar examples")

            if not similar_docs:
                continue

            # Preview the first hit, cut to 100 characters
            example_content = similar_docs[0].page_content[:100] + "..."
            print(f"    Example: {example_content}")

except Exception as e:
    print(f"‚ùå Retriever error: {e}")

# 3.2 Logging System Testing
print(f"\nüìù Testing Logging System")
print("-" * 30)

# Smoke test: call each logging method once with placeholder values. Any
# exception raised along the way is reported by the handler below.
try:
    logging_setup = AgentLoggingSetup(debug_mode=True)
    print("‚úÖ Logging system initialized")

    # Test logging methods
    logging_setup.start_task("test_task_123", complexity="simple")
    logging_setup.set_routing_path("one_shot_llm")
    logging_setup.set_similar_examples_count(3)
    logging_setup.log_question_result(
        task_id="test_task_123",
        question="Test question",
        final_answer="Test answer",
        total_steps=2,
        success=True
    )

    print("‚úÖ All logging methods working")
    print(f"üìÅ Log files: {logging_setup.current_log_files}")

except Exception as e:
    print(f"‚ùå Logging error: {e}")

# 3.3 Tool Loading Testing
print(f"\nüîß Testing Tool Integration")
print("-" * 30)

# Only ImportError is handled here: a missing tools module is reported as a
# warning, while an error raised by a tool constructor would propagate.
try:
    from tools import GetAttachmentTool, ContentRetrieverTool

    # Test tool creation
    attachment_tool = GetAttachmentTool()
    content_tool = ContentRetrieverTool()

    print("‚úÖ Custom GAIA tools loaded")
    print(f"    GetAttachmentTool: {attachment_tool.name}")
    print(f"    ContentRetrieverTool: {content_tool.name}")

except ImportError as e:
    print(f"‚ö†Ô∏è Custom tools not available: {e}")
    print("    This is expected if tools.py is not implemented yet")

# Test smolagents base tools
# The tools are only constructed, not called; any exception (import or
# construction) is reported as a warning.
try:
    from smolagents import GoogleSearchTool, VisitWebpageTool

    search_tool = GoogleSearchTool()
    web_tool = VisitWebpageTool()

    print("‚úÖ SmolagAgent web tools loaded")

except Exception as e:
    print(f"‚ö†Ô∏è Web tools issue: {e}")

# Section 4: LangGraph Workflow Validation

In [None]:
# Test workflow paths and routing decisions

# Create test agent for workflow testing
# workflow_agent is kept at cell level and reused by the specialist checks
# in Section 5.
workflow_config = get_ollama_config()
workflow_agent = create_gaia_agent(workflow_config)

# Test routing decision accuracy
# expected_complexity is compared with the result's 'complexity' field;
# expected_route is compared with a route derived from the result's 'steps'.
routing_test_cases = [
    # (question, expected_complexity, expected_route, description)
    ("What is 25% of 400?", "simple", "one_shot", "Simple arithmetic"),
    ("What are the primary colors?", "simple", "one_shot", "Simple factual"),
    ("What is the current population of Tokyo?", "complex", "manager", "Needs web search"),
    ("Analyze the data in the attached Excel file", "complex", "manager", "File processing"),
    ("Calculate the compound interest on $1000 at 5% for 3 years", "simple", "one_shot", "Math formula"),
    ("Research recent developments in AI and summarize trends", "complex", "manager", "Complex research")
]

print("üîÄ Testing Workflow Routing")
print("-" * 40)

# One dict per case: a full comparison record, or question/error/description
routing_results = []

for question, expected_complexity, expected_route, description in routing_test_cases:
    print(f"\nüìù Test: {description}")
    print(f"‚ùì Question: {question}")

    try:
        result = workflow_agent.run_single_question(question, task_id=f"routing_test_{len(routing_results)}")

        actual_complexity = result.get('complexity', 'unknown')
        actual_steps = result.get('steps', [])

        # Infer the route from the step text; 'one-shot' is tested before 'manager'
        actual_route = (
            "one_shot" if any('one-shot' in step.lower() for step in actual_steps)
            else "manager" if any('manager' in step.lower() for step in actual_steps)
            else "unknown"
        )

        # Compare against the expectations for this case
        complexity_correct = actual_complexity == expected_complexity
        route_correct = actual_route == expected_route

        print(f"üß† Complexity: {actual_complexity} (expected: {expected_complexity}) {'‚úÖ' if complexity_correct else '‚ùå'}")
        print(f"üîÄ Route: {actual_route} (expected: {expected_route}) {'‚úÖ' if route_correct else '‚ùå'}")
        print(f"üí¨ Answer: {result.get('final_answer', 'No answer')}")

        routing_results.append(dict(
            question=question,
            expected_complexity=expected_complexity,
            actual_complexity=actual_complexity,
            expected_route=expected_route,
            actual_route=actual_route,
            complexity_correct=complexity_correct,
            route_correct=route_correct,
            description=description,
        ))

    except Exception as e:
        print(f"‚ùå Workflow error: {e}")
        routing_results.append(dict(
            question=question,
            error=str(e),
            description=description,
        ))

# Routing accuracy summary
# Records of failed runs have no '*_correct' keys, so .get(..., False) skips them.
correct_complexity = sum(1 for r in routing_results if r.get('complexity_correct', False))
correct_routing = sum(1 for r in routing_results if r.get('route_correct', False))
# Only runs that completed without an exception count towards the denominator.
total_tests = len([r for r in routing_results if 'error' not in r])

print(f"\nüìä ROUTING ACCURACY SUMMARY")
print("=" * 40)

if total_tests > 0:
    complexity_rate = correct_complexity / total_tests
    routing_rate = correct_routing / total_tests

    print(f"Complexity Detection: {correct_complexity}/{total_tests} ({complexity_rate*100:.1f}%)")
    print(f"Route Selection: {correct_routing}/{total_tests} ({routing_rate*100:.1f}%)")

    # Below 80% accuracy the corresponding logic is flagged for rework.
    if complexity_rate < 0.8:
        print("‚ö†Ô∏è Complexity detection needs improvement")
    if routing_rate < 0.8:
        print("‚ö†Ô∏è Route selection logic needs adjustment")
else:
    # Every case raised (or none were run): there is nothing to divide by, so
    # report that instead of failing with ZeroDivisionError.
    print("No routing tests completed without errors; accuracy cannot be computed")

# Section 5: Agent Setup Verification

In [None]:
# Deep dive into specialist agent creation and coordination

print("ü§ñ Testing Specialist Agent Setup")
print("-" * 40)

# Test specialist creation individually: (key in workflow_agent.specialists, description)
specialist_configs = [
    ("data_analyst", "CodeAgent with Python tools"),
    ("web_researcher", "ToolCallingAgent with web tools"),
    ("document_processor", "ToolCallingAgent with file tools")
]

# Specialists whose tool names are listed, mapped to the label printed before the list.
tool_list_labels = {
    "web_researcher": "Web tools",
    "document_processor": "Document tools",
}

for specialist_name, description in specialist_configs:
    print(f"\nüîß Testing {specialist_name}")
    print(f"   {description}")

    try:
        # Access the specialist from workflow_agent
        if hasattr(workflow_agent, 'specialists') and specialist_name in workflow_agent.specialists:
            specialist = workflow_agent.specialists[specialist_name]

            print(f"‚úÖ {specialist_name} created successfully")
            print(f"   Type: {type(specialist).__name__}")
            print(f"   Tools: {len(getattr(specialist, 'tools', []))} tools available")

            if specialist_name == "data_analyst":
                # NOTE: nothing is executed here; this line only reports that the
                # specialist exists. A real check would call specialist.run() with code.
                print("   Code execution: Available")

            elif specialist_name in tool_list_labels:
                # List the tools by name, falling back to str() for entries
                # that have no 'name' attribute.
                specialist_tools = getattr(specialist, 'tools', [])
                tool_names = [tool.name if hasattr(tool, 'name') else str(tool) for tool in specialist_tools]
                print(f"   {tool_list_labels[specialist_name]}: {tool_names}")
        else:
            print(f"‚ùå {specialist_name} not found in agent")

    except Exception as e:
        # Report the failure and continue with the next specialist.
        print(f"‚ùå {specialist_name} setup error: {e}")

# Check that the workflow agent exposes a manager and report its basic shape.
print("\nüë®‚Äçüíº Testing Manager Agent")
print("-" * 30)

try:
    manager = workflow_agent.manager
    print("‚úÖ Manager agent available")

    manager_type_name = type(manager).__name__
    print(f"   Type: {manager_type_name}")

    # A missing 'managed_agents' attribute is reported as zero managed agents.
    managed_agents = getattr(manager, 'managed_agents', [])
    print(f"   Managed agents: {len(managed_agents)}")

    # Same fallback for the manager's own tools.
    manager_tools = getattr(manager, 'tools', [])
    print(f"   Manager tools: {len(manager_tools)}")

except Exception as e:
    # Any failure above (including a missing manager attribute) lands here.
    print(f"‚ùå Manager setup error: {e}")

# Section 6: GAIA File Type Testing

In [None]:
# Exercise the agent once per GAIA file type for which a sample file exists.

print("üìé Testing GAIA File Type Support")
print("-" * 40)

# Sample file expected for each extension (paths are relative to the working directory).
test_files = {
    '.xlsx': 'sample_spreadsheet.xlsx',
    '.csv': 'sample_data.csv',
    '.png': 'sample_image.png',
    '.pdf': 'sample_document.pdf',
    '.txt': 'sample_text.txt',
    '.json': 'sample_data.json'
}

# Outcome per extension; 'status' is one of 'success', 'empty', 'failed', 'no_file'.
file_processing_results = {}

for file_type, file_path in test_files.items():
    print(f"\nüìÑ Testing {file_type} processing")

    # Without a sample file there is nothing to run for this type.
    if not Path(file_path).exists():
        print(f"‚ö†Ô∏è Test file not available: {file_path}")
        file_processing_results[file_type] = {'status': 'no_file'}
        continue

    print(f"‚úÖ Test file available: {file_path}")

    # NOTE(review): only the question text is sent below; file_path itself is not
    # passed to run_single_question - confirm the agent finds the attachment some other way.
    test_question = f"Analyze the content in the attached {file_type} file and provide a brief summary."

    try:
        result = workflow_agent.run_single_question(
            question=test_question,
            task_id=f"file_test_{file_type.replace('.', '')}"
        )

        # A missing result or a falsy final answer counts as an empty run.
        if not (result and result.get('final_answer')):
            print(f"‚ö†Ô∏è Processing returned empty result")
            file_processing_results[file_type] = {'status': 'empty'}
        else:
            print(f"‚úÖ Processing successful")
            print(f"   Answer length: {len(result['final_answer'])} characters")
            print(f"   Steps: {len(result.get('steps', []))}")

            file_processing_results[file_type] = {
                'status': 'success',
                'answer_length': len(result['final_answer']),
                'steps': len(result.get('steps', []))
            }

    except Exception as e:
        # Record the failure and move on to the next file type.
        print(f"‚ùå Processing failed: {e}")
        file_processing_results[file_type] = {'status': 'failed', 'error': str(e)}

# Print one summary line per file type from the recorded outcomes.
print("\nüìä FILE PROCESSING SUMMARY")
print("=" * 40)
for file_type, result in file_processing_results.items():
    status = result['status']

    # Pick the line for this status; anything unrecognised (e.g. 'empty')
    # falls through to the generic warning.
    if status == 'success':
        summary_line = f"‚úÖ {file_type}: Processed successfully"
    elif status == 'failed':
        summary_line = f"‚ùå {file_type}: Failed - {result.get('error', 'Unknown')}"
    elif status == 'no_file':
        summary_line = f"‚ö†Ô∏è {file_type}: No test file available"
    else:
        summary_line = f"‚ö†Ô∏è {file_type}: Issues detected"

    print(summary_line)

# Section 7: Mock/Synthetic Data Testing

In [None]:
# Test with synthetic data before using real GAIA data

print("üé≠ Testing with Mock/Synthetic Data")
print("-" * 40)

# Create synthetic test cases that mirror GAIA patterns.
# Each case carries: task_id, question, expected_answer (always a string),
# level and category.
synthetic_test_cases = [
    {
        'task_id': 'synthetic_001',
        'question': 'Calculate the compound interest on $5000 at 3% annual rate for 2 years.',
        # The question asks for the interest earned, not the final balance:
        # 5000 * 1.03**2 - 5000 = 304.50 (the balance would be 5304.50).
        'expected_answer': '304.50',
        'level': 1,
        'category': 'mathematics'
    },
    {
        'task_id': 'synthetic_002',
        'question': 'What are the three primary colors in the RGB color model?',
        'expected_answer': 'red, green, blue',
        'level': 1,
        'category': 'knowledge'
    },
    {
        'task_id': 'synthetic_003',
        'question': 'If a dataset has values [10, 15, 20, 25, 30], what is the median?',
        'expected_answer': '20',
        'level': 1,
        'category': 'statistics'
    },
    {
        'task_id': 'synthetic_004',
        'question': 'List the chemical symbols for hydrogen, oxygen, and carbon.',
        'expected_answer': 'H, O, C',
        'level': 1,
        'category': 'chemistry'
    }
]

# One record per synthetic case, in execution order; runs that raised carry an
# 'error' key instead of the answer fields.
synthetic_results = []

for test_case in synthetic_test_cases:
    question = test_case['question']
    expected = test_case['expected_answer']

    print(f"\nüìù Testing: {test_case['category']}")
    print(f"‚ùì Question: {question}")
    print(f"üéØ Expected: {expected}")

    try:
        result = workflow_agent.run_single_question(
            question=question,
            task_id=test_case['task_id']
        )

        # The answer may be missing, None or a non-string (e.g. a number).
        # Coerce it to text so such a run is scored as an answer rather than
        # being misreported as an agent error by a failing .strip().
        raw_answer = result.get('final_answer')
        agent_answer = '' if raw_answer is None else str(raw_answer).strip()
        complexity = result.get('complexity', 'unknown')
        steps = len(result.get('steps', []))

        # Simple answer matching (normalize for comparison): case-insensitive,
        # with commas and dollar signs removed.
        agent_normalized = agent_answer.lower().replace(',', '').replace('$', '').strip()
        expected_normalized = expected.lower().replace(',', '').replace('$', '').strip()

        # Exact match, or the expected text appearing anywhere inside the answer.
        # The substring test is deliberately lenient (it also accepts e.g. '120'
        # for an expected '20').
        is_correct = agent_normalized == expected_normalized or expected_normalized in agent_normalized

        print(f"ü§ñ Agent: {agent_answer}")
        print(f"üéØ Match: {'‚úÖ' if is_correct else '‚ùå'}")
        print(f"üß† Complexity: {complexity}")
        print(f"üìä Steps: {steps}")

        synthetic_results.append({
            'test_case': test_case,
            'agent_answer': agent_answer,
            'is_correct': is_correct,
            'complexity': complexity,
            'steps': steps
        })

    except Exception as e:
        # A failing case must not stop the remaining cases from running.
        print(f"‚ùå Error: {e}")
        synthetic_results.append({
            'test_case': test_case,
            'error': str(e)
        })

# Synthetic testing summary
# Records of failed runs have no 'is_correct' key, so .get(..., False) skips them.
correct_answers = sum(1 for r in synthetic_results if r.get('is_correct', False))
# Only runs that completed without an exception count towards the denominator.
total_synthetic = len([r for r in synthetic_results if 'error' not in r])

# With no completed runs there is nothing to divide by; treat the rate as 0 so
# the summary still prints instead of failing with ZeroDivisionError.
synthetic_pass_rate = correct_answers / total_synthetic if total_synthetic > 0 else 0.0

print(f"\nüìä SYNTHETIC DATA TESTING SUMMARY")
print("=" * 40)
if total_synthetic > 0:
    print(f"Correct answers: {correct_answers}/{total_synthetic} ({synthetic_pass_rate*100:.1f}%)")
else:
    print("Correct answers: 0/0 (no synthetic tests completed without errors)")

# Per-case breakdown for the runs that completed.
for result in synthetic_results:
    if 'error' not in result:
        category = result['test_case']['category']
        correct = '‚úÖ' if result['is_correct'] else '‚ùå'
        complexity = result['complexity']
        print(f"{correct} {category}: {complexity} complexity")

# At least 80% correct is required before moving on to real data.
if synthetic_pass_rate >= 0.8:
    print("üéâ System ready for real GAIA data testing!")
else:
    print("‚ö†Ô∏è Address issues before proceeding to real data")

# Section 8: Development Summary & Next Steps

In [None]:
# Comprehensive summary of all tests

print("üìã DEVELOPMENT TESTING SUMMARY")
print("=" * 50)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is 0."""
    return numerator / denominator if denominator else 0


# Results produced by earlier cells. An empty dict stands in for a cell that was
# never run, in line with the locals() guards used for the other inputs below.
_model_results = locals().get('model_test_results', {})
_file_results = locals().get('file_processing_results', {})

# Collect all test results
summary_data = {
    'dependencies': len(missing) == 0 if 'missing' in locals() else False,
    'model_providers': len([r for r in _model_results.values() if r.get('status') == 'success']),
    'total_providers': len(_model_results),
    'routing_accuracy': correct_routing/total_tests if 'total_tests' in locals() and total_tests > 0 else 0,
    'synthetic_accuracy': correct_answers/total_synthetic if 'total_synthetic' in locals() and total_synthetic > 0 else 0,
    'file_types_working': len([r for r in _file_results.values() if r.get('status') == 'success']),
    'total_file_types': len(_file_results)
}

print(f"‚úÖ Dependencies: {'All OK' if summary_data['dependencies'] else 'Issues found'}")
print(f"ü§ñ Model Providers: {summary_data['model_providers']}/{summary_data['total_providers']} working")
print(f"üîÄ Routing Accuracy: {summary_data['routing_accuracy']*100:.1f}%")
print(f"üé≠ Synthetic Test Accuracy: {summary_data['synthetic_accuracy']*100:.1f}%")
print(f"üìé File Type Support: {summary_data['file_types_working']}/{summary_data['total_file_types']} working")

# Readiness assessment: five equally weighted components worth 20 points each.
# The provider and file-type ratios go through _safe_ratio so an empty result
# set scores 0 instead of raising ZeroDivisionError.
readiness_score = (
    summary_data['dependencies'] * 20 +
    _safe_ratio(summary_data['model_providers'], summary_data['total_providers']) * 20 +
    summary_data['routing_accuracy'] * 20 +
    summary_data['synthetic_accuracy'] * 20 +
    _safe_ratio(summary_data['file_types_working'], summary_data['total_file_types']) * 20
)

print(f"\nüèÜ Development Readiness Score: {readiness_score:.1f}/100")

if readiness_score >= 80:
    print("üöÄ System ready for production validation testing!")
elif readiness_score >= 60:
    print("‚ö†Ô∏è Minor issues to address before production testing")
else:
    print("‚ùå Significant issues need resolution")

# Next steps recommendations: one line per area that is below target.
print(f"\nüí° Next Steps:")
if summary_data['model_providers'] < summary_data['total_providers']:
    print("  1. Fix model provider connectivity issues")
if summary_data['routing_accuracy'] < 0.8:
    print("  2. Improve routing logic and complexity detection")
if summary_data['synthetic_accuracy'] < 0.8:
    print("  3. Debug answer formatting and processing logic")
if summary_data['file_types_working'] < summary_data['total_file_types']:
    print("  4. Implement or fix file processing capabilities")

print("  ‚Üí Proceed to Production Validator notebook when ready!")