# GAIA Validator
## Implementation plan, testing framework, and deployment

**Objective:** Complete implementation roadmap and evaluation system  
**Target:** 45-55% GAIA accuracy within $10 budget

---

### Tests for correct functioning of agent systems

In [None]:
# Proper imports for testing GAIAAgent
from agent_logic import GAIAAgent
from agent_interface import get_openrouter_config

# Instantiate the agent from the OpenRouter configuration.
agent = GAIAAgent(get_openrouter_config())

# Part 1: every context-aware tool, with its type and its `name` attribute.
print("üîç Context-aware tools analysis:")
for position, ctx_tool in enumerate(agent.context_aware_tools):
    print(f"  Tool {position}: {type(ctx_tool)}")
    print(f"    Name: {getattr(ctx_tool, 'name', 'NO NAME')}")
    print(f"    Has name attr: {hasattr(ctx_tool, 'name')}")

# Part 2: shared tools, listed under the key they are registered with.
print("\nüîç Shared tools:")
for registry_name, shared_tool in agent.shared_tools.items():
    print(f"  {registry_name}: {type(shared_tool)} - {getattr(shared_tool, 'name', 'NO NAME')}")

# Part 3: per specialist, count its tools and look for a get_attachment tool.
print("\nüîç Specialist tool check:")
for spec_name, specialist in agent.specialists.items():
    specialist_tools = getattr(specialist, 'tools', [])
    # Tools without a `name` are labelled by the first 30 chars of their repr.
    labels = [getattr(t, 'name', str(t)[:30]) for t in specialist_tools]
    attachment_found = False
    for label in labels:
        if 'get_attachment' in str(label).lower():
            attachment_found = True
            break
    print(f"  {spec_name}: {len(specialist_tools)} tools, has get_attachment: {attachment_found}")

In [None]:
# Smoke-test a file-backed question end to end. "some_file_id" is a placeholder
# task id - presumably it has to be replaced with a real one before any
# attachment lookup can succeed (TODO confirm against the agent's file handling).
result = agent.process_question("Analyze the data in this spreadsheet", task_id="some_file_id")

In [None]:
# Show whatever the last process_question() call stored in `result`.
print(result)

In [None]:
# For every specialist: how many tools it has, whether one of them is a
# get_attachment tool, and the full list of tool labels.
print("\nüîç Specialist tool check:")
for spec_name, specialist in agent.specialists.items():
    specialist_tools = getattr(specialist, 'tools', [])
    # Tools without a `name` are labelled by the first 30 chars of their repr.
    labels = [getattr(t, 'name', str(t)[:30]) for t in specialist_tools]
    attachment_found = False
    for label in labels:
        if 'get_attachment' in str(label).lower():
            attachment_found = True
            break
    print(f"  {spec_name}: {len(specialist_tools)} tools, has get_attachment: {attachment_found}")
    print(f"    Tool names: {labels}")

In [None]:
from agent_testing import run_quick_gaia_test

# Run the quick GAIA test against the OpenRouter configuration and check that
# the agent returns real answers rather than unfilled prompt templates.
print("üß™ Testing specialized agents...")
# Fall back to an empty dict so a run that returns nothing still produces a
# readable (all-zero) report instead of an AttributeError.
results = run_quick_gaia_test('openrouter') or {}

print(f"\nüìä Results with specialized agents:")
print(f"Overall accuracy: {results.get('overall_accuracy', 0)}%")
print(f"Successful executions: {results.get('successful_executions', 0)}")

# Check if we fixed the template response issue.
# The loop variable is deliberately not called `result`, so the answer stored
# by the earlier process_question() cell is not overwritten.
for entry in results.get('detailed_results') or []:
    # An answer may be missing, None or non-textual; normalise before the
    # substring checks, which only work on strings.
    answer = str(entry.get('agent_answer') or '')
    if '{' in answer or '[your answer]' in answer:
        print(f"‚ùå Still getting templates: {answer}")
    else:
        print(f"‚úÖ Real answer: {answer}")

## Section 1: Status Quo GAIA Benchmark Agent Testing Framework

### Import testing framework

In [None]:
import json
import pandas as pd
from typing import Dict, List, Optional
from datetime import datetime
from collections import defaultdict

# Load the testing framework. A missing module must not abort the notebook, so
# the outcome is recorded in TESTING_AVAILABLE for later cells to consult.
try:
    from agent_testing import (
        analyze_failure_patterns,
        compare_agent_configs,
        run_gaia_test,
        run_quick_gaia_test,
        run_smart_routing_test,
        validate_test_environment,
    )
except ImportError as e:
    TESTING_AVAILABLE = False
    print(f"‚ùå Could not import testing framework: {e}")
else:
    TESTING_AVAILABLE = True
    print("‚úÖ GAIA Testing Framework loaded successfully")

class GAIAValidator:
    """
    Notebook-facing helper for running GAIA agent tests and reviewing them.

    Intended workflow: run a test, read the insights, decide what to change.

    Attributes:
        last_result: Result of the most recent test run; None until a run
            stores one.
        history: List that collects one summary record per test run.
    """

    def __init__(self):
        """Set up empty state and report whether the test environment is usable.

        Prints a banner, then (only when the testing framework was imported)
        asks validate_test_environment() for a status dict and reports on its
        "all_dependencies_ready" and "context_bridge_functional" flags.
        A failing environment check is reported but never raised, because the
        validator may still be usable.
        """
        self.last_result = None
        self.history = []

        print("üéØ GAIA Production Validator (Notebook Edition)")

        if not TESTING_AVAILABLE:
            print("‚ùå Testing framework not available!")
            return

        # Quick environment check
        try:
            env_status = validate_test_environment()
            if env_status.get("all_dependencies_ready", False):
                print("‚úÖ Environment ready!")
                if env_status.get("context_bridge_functional", False):
                    print("üåâ Hybrid State + Context Bridge verified!")
            else:
                print("‚ö†Ô∏è  Some dependencies missing")
        except Exception:
            # Best effort only. `Exception` (not a bare `except`) so that
            # KeyboardInterrupt / SystemExit still interrupt the notebook.
            print("‚ö†Ô∏è  Environment check failed - but may still work")

### Declare test methods

In [None]:
def quick(self, config="groq", questions=5):
    """Run a small GAIA test, meant for fast feedback during development.

    NOTE(review): this is defined at module level with a `self` parameter;
    presumably it is meant to be a GAIAValidator method - confirm how it gets
    attached to the class.

    Args:
        config: Configuration name handed to run_quick_gaia_test().
        questions: Passed on as `num_questions`.

    Returns:
        Whatever run_quick_gaia_test() returned, or {"error": <message>} when
        anything inside the run raised.
    """
    print(f"üöÄ Quick Test: {config} ({questions} questions)")

    try:
        outcome = run_quick_gaia_test(config, num_questions=questions)
        self.last_result = outcome
        self._add_history("quick", config, outcome)

        # Nothing to summarise unless the run produced overall metrics.
        if not outcome or 'overall_performance' not in outcome:
            return outcome

        accuracy = outcome['overall_performance']['accuracy']
        print(f"‚úÖ Quick Test: {accuracy:.1%} accuracy")
        print("üí° Run validator.insights() for detailed analysis")
        return outcome

    except Exception as e:
        print(f"‚ùå Test failed: {e}")
        return {"error": str(e)}

def full(self, config="groq", questions=20):
    """Run a larger GAIA test, meant for validating a configuration.

    NOTE(review): this is defined at module level with a `self` parameter;
    presumably it is meant to be a GAIAValidator method - confirm how it gets
    attached to the class.

    Args:
        config: Configuration name handed to run_gaia_test().
        questions: Passed on as `max_questions`.

    Returns:
        Whatever run_gaia_test() returned, or {"error": <message>} when
        anything inside the run raised.
    """
    print(f"üéØ Full Test: {config} ({questions} questions)")

    try:
        outcome = run_gaia_test(config, max_questions=questions)
        self.last_result = outcome
        self._add_history("full", config, outcome)

        # Nothing to summarise unless the run produced overall metrics.
        if not outcome or 'overall_performance' not in outcome:
            return outcome

        summary = outcome['overall_performance']
        accuracy = summary['accuracy']
        n_correct = summary['correct_answers']
        n_total = summary['total_questions']
        # 0.45 is the accuracy the summary line treats as the GAIA target.
        target_label = '‚úÖ MET' if accuracy >= 0.45 else '‚ùå NOT MET'

        print(f"‚úÖ Full Test Complete:")
        print(f"   Accuracy: {accuracy:.1%} ({n_correct}/{n_total})")
        print(f"   GAIA Target: {target_label}")
        print("üí° Run validator.insights() for detailed analysis")
        return outcome

    except Exception as e:
        print(f"‚ùå Test failed: {e}")
        return {"error": str(e)}

def compare(self, configs=None, questions=10):
    """Run the same test against several configurations and print a ranking.

    NOTE(review): this is defined at module level with a `self` parameter;
    presumably it is meant to be a GAIAValidator method - confirm how it gets
    attached to the class.

    Args:
        configs: Configuration names to compare. Defaults to
            ["groq", "google"] when omitted or None.
        questions: Second positional argument of compare_agent_configs().

    Returns:
        Whatever compare_agent_configs() returned, or {"error": <message>}
        when anything inside the comparison raised.
    """
    # Resolve the default here instead of in the signature: a list default is
    # created once and would be shared by every call.
    if configs is None:
        configs = ["groq", "google"]
    print(f"üîÑ Comparing: {configs} ({questions} questions each)")

    try:
        result = compare_agent_configs(configs, questions)
        self.last_result = result
        self._add_history("compare", configs, result)

        comparison = result.get('comparison_results', {})
        if comparison:
            print(f"‚úÖ Comparison Complete:")
            # Quick ranking: only entries that report an accuracy take part.
            ranked = sorted(
                ((name, data['accuracy']) for name, data in comparison.items()
                 if 'accuracy' in data),
                key=lambda pair: pair[1],
                reverse=True,
            )
            medals = {1: "ü•á", 2: "ü•à", 3: "ü•â"}
            for place, (name, acc) in enumerate(ranked, 1):
                medal = medals.get(place, f"{place}.")
                print(f"   {medal} {name}: {acc:.1%}")
            print("üí° Run validator.insights() for detailed analysis")

        return result

    except Exception as e:
        print(f"‚ùå Comparison failed: {e}")
        return {"error": str(e)}

def routing(self, config="performance", questions=15):
    """Run the smart-routing test and print accuracy per routing strategy.

    NOTE(review): this is defined at module level with a `self` parameter;
    presumably it is meant to be a GAIAValidator method - confirm how it gets
    attached to the class.
    NOTE(review): `questions` only appears in the banner; it is not passed to
    run_smart_routing_test(), so it does not limit the run - confirm whether
    that function accepts a question count.

    Args:
        config: Configuration name handed to run_smart_routing_test().
        questions: Shown in the banner only (see note above).

    Returns:
        Whatever run_smart_routing_test() returned, or {"error": <message>}
        when anything inside the run raised.
    """
    print(f"üõ§Ô∏è Routing Test: {config} ({questions} questions)")

    try:
        outcome = run_smart_routing_test(config)
        self.last_result = outcome
        self._add_history("routing", config, outcome)

        if outcome and 'strategy_analysis' in outcome:
            per_strategy = outcome['strategy_analysis']
            print(f"‚úÖ Routing Test Complete:")
            for label, stats in per_strategy.items():
                acc = stats.get('accuracy', 0)
                count = stats.get('total_questions', 0)
                # Strategies that handled no question are left out.
                if count <= 0:
                    continue
                print(f"   {label}: {acc:.1%} ({count}q)")
            print("üí° Run validator.insights() for detailed analysis")

        return outcome

    except Exception as e:
        print(f"‚ùå Routing test failed: {e}")
        return {"error": str(e)}

### Declare Evaluation Insights Methods

In [None]:
    def insights(self, result=None):
        """Print an analysis of a test result followed by recommendations.

        Args:
            result: Result dict to analyse; when None, `self.last_result` is
                used instead.

        Returns:
            None. Everything is reported through print(). A missing result or
            a result carrying an "error" key is reported and not analysed.
        """
        target = self.last_result if result is None else result

        if not target:
            print("‚ùå No test results! Run a test first.")
            return

        if "error" in target:
            print(f"‚ùå Cannot analyze failed test: {target['error']}")
            return

        print("\n‚ú® ACTIONABLE INSIGHTS")
        print("=" * 50)

        # Pick the analyser from the shape of the result: comparison results
        # first, then routing results, otherwise a plain performance run.
        if 'comparison_results' in target:
            analyse = self._analyze_comparison
        elif self._is_routing_test(target):
            analyse = self._analyze_routing
        else:
            analyse = self._analyze_performance
        analyse(target)

        # Recommendations are generated for every result type.
        self._generate_recommendations(target)
    
    def _analyze_performance(self, result):
        """Analyze single agent performance"""
        overall = result.get('overall_performance', {})
        accuracy = overall.get('accuracy', 0)
        total = overall.get('total_questions', 0)
        correct = overall.get('correct_answers', 0)
        
        print(f"üìä PERFORMANCE ANALYSIS")
        print(f"   Overall: {accuracy:.1%} ({correct}/{total})")
        
        # GAIA benchmark assessment
        if accuracy >= 0.60:
            print(f"   üèÜ EXCELLENT - Competitive performance!")
        elif accuracy >= 0.45:
            print(f"   ‚úÖ GOOD - Above GAIA threshold")
        elif accuracy >= 0.30:
            print(f"   ‚ö†Ô∏è  FAIR - Below GAIA threshold")
        else:
            print(f"   ‚ùå POOR - Needs significant improvement")
        
        # Level breakdown
        level_perf = result.get('level_performance', {})
        if level_perf:
            print(f"\nüìà LEVEL BREAKDOWN:")
            for level in sorted(level_perf.keys()):
                perf = level_perf[level]
                acc = perf['accuracy']
                count = perf['total']
                print(f"   Level {level}: {acc:.1%} ({count} questions)")
                
                # Level-specific insights
                if level == '1' and acc < 0.7:
                    print(f"      ‚ö†Ô∏è  Level 1 should be >70%")
                elif level == '3' and acc > 0.3:
                    print(f"      üéØ Strong Level 3 performance!")
        
        # Strategy analysis
        strategy_perf = result.get('strategy_analysis', {})
        if strategy_perf:
            print(f"\nüéØ STRATEGY EFFECTIVENESS:")
            for strategy, stats in strategy_perf.items():
                acc = stats.get('accuracy', 0)
                count = stats.get('total_questions', 0)
                if count > 0:
                    print(f"   {strategy}: {acc:.1%} ({count}q)")
        
        # Hybrid metrics
        hybrid = result.get('hybrid_state_metrics', {})
        if hybrid:
            context_usage = hybrid.get('context_bridge_usage', {})
            if context_usage:
                usage = context_usage.get('usage_percentage', 0)
                print(f"\nüåâ HYBRID STATE:")
                print(f"   Context Bridge: {usage:.1%} usage")
                
                avg_time = hybrid.get('average_execution_time', 0)
                if avg_time > 0:
                    print(f"   Avg Time: {avg_time:.2f}s per question")
    
    def _analyze_comparison(self, result):
        """Analyze configuration comparison"""
        comparison = result.get('comparison_results', {})
        
        print(f"üîÑ CONFIGURATION COMPARISON")
        
        # Sort by accuracy
        ranked = sorted(
            [(name, data) for name, data in comparison.items() if 'accuracy' in data],
            key=lambda x: x[1]['accuracy'], reverse=True
        )
        
        print(f"   üìä RANKING:")
        for i, (config, data) in enumerate(ranked, 1):
            acc = data['accuracy']
            total = data.get('total_questions', 0)
            correct = data.get('correct_answers', 0)
            
            medal = "ü•á" if i == 1 else "ü•à" if i == 2 else "ü•â" if i == 3 else f"{i}."
            print(f"   {medal} {config}: {acc:.1%} ({correct}/{total})")
        
        # Winner analysis
        if ranked:
            winner, winner_data = ranked[0]
            print(f"\nüèÜ WINNER: {winner}")
            print(f"   Accuracy: {winner_data['accuracy']:.1%}")
            
            if len(ranked) > 1:
                gap = winner_data['accuracy'] - ranked[1][1]['accuracy']
                print(f"   Lead: {gap:.1%} ahead of 2nd place")
    
    def _analyze_routing(self, result):
        """Analyze smart routing"""
        strategies = result.get('strategy_analysis', {})
        
        print(f"üõ§Ô∏è SMART ROUTING ANALYSIS")
        
        # Group strategies
        one_shot = {k: v for k, v in strategies.items() if 'one_shot' in k.lower()}
        complex_strat = {k: v for k, v in strategies.items() if 'manager' in k.lower() or 'agent_' in k.lower()}
        
        if one_shot:
            print(f"   ‚ö° SIMPLE STRATEGIES:")
            for strategy, stats in one_shot.items():
                acc = stats.get('accuracy', 0)
                count = stats.get('total_questions', 0)
                print(f"      {strategy}: {acc:.1%} ({count}q)")
        
        if complex_strat:
            print(f"   üß† COMPLEX STRATEGIES:")
            for strategy, stats in complex_strat.items():
                acc = stats.get('accuracy', 0)
                count = stats.get('total_questions', 0)
                print(f"      {strategy}: {acc:.1%} ({count}q)")
        
        # Routing effectiveness
        if one_shot and complex_strat:
            simple_total = sum(s.get('total_questions', 0) for s in one_shot.values())
            complex_total = sum(s.get('total_questions', 0) for s in complex_strat.values())
            
            print(f"\nüìä ROUTING BALANCE:")
            print(f"   Simple: {simple_total} questions")
            print(f"   Complex: {complex_total} questions")
            
            if simple_total > complex_total:
                print(f"   ‚úÖ Good - more simple questions handled efficiently")
            else:
                print(f"   ‚ö†Ô∏è  Many complex questions - check routing logic")
    
    def _generate_recommendations(self, result):
        """Generate actionable recommendations"""
        print(f"\nüí° RECOMMENDATIONS")
        
        # Try to get detailed failure analysis
        try:
            failure_analysis = analyze_failure_patterns(result)
            recommendations = failure_analysis.get('recommendations', [])
            
            if recommendations:
                for i, rec in enumerate(recommendations, 1):
                    print(f"   {i}. {rec}")
            else:
                print("   üéâ No specific issues - performance looks good!")
                
        except:
            # Fallback simple recommendations
            overall = result.get('overall_performance', {})
            accuracy = overall.get('accuracy', 0)
            
            if accuracy < 0.45:
                print("   1. Focus on reaching GAIA 45% threshold")
                print("   2. Check Level 1 performance first")
                print("   3. Analyze specific failure cases")
            elif accuracy < 0.60:
                print("   1. Good performance - optimize consistency")
                print("   2. Focus on Level 2 improvements")
                print("   3. Consider smart routing tweaks")
            else:
                print("   1. Excellent! Document this configuration")
                print("   2. Test with larger question sets")
                print("   3. Monitor performance over time")
    
    # ========================================================================
    # UTILITY METHODS 🛠️
    # ========================================================================
    
    def _is_routing_test(self, result):
        """Check if this is a routing test"""
        strategies = result.get('strategy_analysis', {})
        return len(strategies) > 2  # Routing tests have multiple strategies
    
    def _add_history(self, test_type, config, result):
        """Add to test history"""
        self.history.append({
            "time": datetime.now().strftime("%H:%M:%S"),
            "type": test_type,
            "config": str(config),
            "accuracy": result.get('overall_performance', {}).get('accuracy', 0) if 'overall_performance' in result else 0,
            "success": "error" not in result
        })
    
    def status(self):
        """Print how many tests were run, framework availability and the
        three most recent history entries (oldest of the three first)."""
        print(f"üìä VALIDATOR STATUS")
        print(f"   Tests run: {len(self.history)}")
        framework_label = '‚úÖ Available' if TESTING_AVAILABLE else '‚ùå Not available'
        print(f"   Framework: {framework_label}")

        if not self.history:
            return

        print(f"   Recent tests:")
        for entry in self.history[-3:]:  # Last 3 tests
            outcome_mark = "‚úÖ" if entry['success'] else "‚ùå"
            print(f"      {entry['time']} {outcome_mark} {entry['type']}({entry['config']}) - {entry['accuracy']:.1%}")
    
    def show_history(self):
        """Return the test history as a pandas DataFrame, one row per entry.

        Returns:
            A DataFrame built from `self.history`, or None (after printing a
            notice) when the history is empty.
        """
        if self.history:
            return pd.DataFrame(self.history)

        print("No test history yet")
        return None
    
    def save(self, filename=None):
        """Write `self.last_result` to a JSON file (indent=2).

        Args:
            filename: Target path. When None, a name of the form
                gaia_result_<YYYYmmdd_HHMMSS>.json is generated from the
                current local time.

        Returns:
            None. Success, a missing result, and any failure while writing
            are all reported through print(); nothing is raised.
        """
        payload = self.last_result
        if not payload:
            print("‚ùå No result to save")
            return

        if filename is None:
            filename = f"gaia_result_{datetime.now():%Y%m%d_%H%M%S}.json"

        try:
            with open(filename, 'w') as handle:
                json.dump(payload, handle, indent=2)
        except Exception as e:
            print(f"‚ùå Save failed: {e}")
        else:
            print(f"üíæ Saved: {filename}")

# Build the notebook-wide validator instance, then list the available commands.
print("\nüöÄ Creating GAIA Validator...")
validator = GAIAValidator()

print("\n".join((
    "\nüí° READY TO USE!",
    "Quick commands:",
    "  validator.quick('groq')     # Quick test",
    "  validator.full('groq', 20)  # Full test",
    "  validator.compare(['groq', 'google'])  # Compare configs",
    "  validator.routing('performance')  # Test routing",
    "  validator.insights()        # Analyze results",
    "  validator.status()          # Show status",
)))


## Section 1.1: Create Small Test Batch (Data Layer)

In [None]:
from agent_logic import GAIAAgent, GAIAConfig

# Build an agent on an explicit OpenRouter model and ask it one question that
# needs no file attachment; the answer is kept in `result` for the next cell.
config = GAIAConfig(model_provider="openrouter", model_name="qwen/qwen3-30b-a3b")
agent = GAIAAgent(config)
result = agent.process_question("Is Musk still ceo of Tesla?")

In [None]:
# Show whatever the last process_question() call stored in `result`.
print(result)

In [None]:
# Nuclear option for stale module/cache state: kill the Python process.
# os._exit() ends the process immediately, without running cleanup handlers,
# so every variable and import from the cells above is lost and the cells
# below must import what they need again.
# NOTE(review): in Jupyter this presumably makes the kernel restart - confirm.
import os
os._exit(0)  # Hard exit with status 0

In [None]:
# Cell 1: Create Small Test Batch using Pure Data Layer
#
# Validates the dataset under ./tests/gaia_data, loads it, draws a batch of
# five questions and prints what the batch contains. Later cells read the
# notebook globals `manager` and `test_batch` set here.
print("üì¶ Creating Small Test Batch (5 questions)")
print("=" * 50)

from gaia_dataset_utils import GAIADatasetManager, quick_dataset_check

# Reset first, so a failed re-run cannot leave a stale manager or batch from
# an earlier execution behind for the following cells.
manager = None
test_batch = None

# Step 1: Validate dataset
print("üîç Step 1: Dataset Validation")
dataset_ready = quick_dataset_check("./tests/gaia_data")

if dataset_ready:
    # Step 2: Create dataset manager
    print("\nüìä Step 2: Loading Dataset Manager")
    manager = GAIADatasetManager("./tests/gaia_data")

    if manager.metadata:
        print(f"‚úÖ Dataset loaded: {len(manager.metadata)} total questions")
        print(f"üìÅ Questions with files: {len(manager.file_questions)}")

        # Step 3: Create small test batch
        print("\nüì¶ Step 3: Creating Small Test Batch")
        test_batch = manager.create_test_batch(5, "small_sample")

        if test_batch:
            print(f"‚úÖ Created test batch with {len(test_batch)} questions")

            # Show batch composition
            print(f"\nüìã Batch Composition:")
            levels = {}
            file_count = 0
            file_types = set()

            for i, question in enumerate(test_batch, 1):
                level = question.get('Level', 'Unknown')
                levels[level] = levels.get(level, 0) + 1

                file_name = ''
                has_file = question['task_id'] in manager.file_questions
                if has_file:
                    file_count += 1
                    file_name = manager.file_questions[question['task_id']].get('file_name', '')
                    # Only a name that contains a dot has an extension;
                    # otherwise the whole name would be listed as a file type.
                    if file_name and '.' in file_name:
                        file_types.add(file_name.rsplit('.', 1)[-1].lower())

                # Show question preview (first 60 characters)
                question_text = question.get('Question', '')
                preview = question_text[:60] + "..." if len(question_text) > 60 else question_text
                file_info = f" (File: {file_name})" if has_file else ""

                print(f"  {i}. Level {level}: {preview}{file_info}")

            print(f"\nüìä Summary:")
            print(f"  Level distribution: {dict(levels)}")
            print(f"  Questions with files: {file_count}/{len(test_batch)}")
            if file_types:
                print(f"  File types: {', '.join(sorted(file_types))}")

            # Verify blind testing - show what agent will see vs hidden
            print(f"\nüîí Blind Testing Verification:")
            sample_question = test_batch[0]

            print(f"  ü§ñ Agent will see:")
            visible_fields = ['task_id', 'Question', 'Level', 'file_name', 'file_path']
            for field in visible_fields:
                if field in sample_question:
                    value = sample_question[field]
                    if isinstance(value, str) and len(value) > 50:
                        value = value[:50] + "..."
                    print(f"    {field}: {value}")

            print(f"  üîí Agent will NOT see:")
            # Check what's hidden by looking at the full dataset record. A
            # failed lookup is treated as "no hidden fields" instead of
            # crashing on a None record.
            full_question = manager.get_question_by_id(sample_question['task_id']) or {}
            hidden_info = ['Final answer', 'Annotator Metadata']
            for field in hidden_info:
                if field in full_question:
                    print(f"    {field}: HIDDEN")

            print(f"\n‚úÖ Test batch ready for execution!")

        else:
            print("‚ùå Failed to create test batch")
    else:
        print("‚ùå Failed to load dataset")
else:
    print("‚ùå Dataset not ready - check your setup")

## Section 1.2: Execute Test Batch with provider

In [None]:
# Cell 2: Execute Test Batch using Pure Testing Layer
#
# Runs the batch from Cell 1 through a GAIATestExecutor for the "openrouter"
# configuration, after an interactive confirmation, and prints an execution
# summary. The outcome is left in the notebook global `execution_results`
# (None whenever nothing usable was produced).
print("ü§ñ Executing Test Batch with Openrouter")
print("=" * 50)

# Import testing layer components
from agent_testing import GAIATestExecutor, GAIATestEvaluator, get_agent_config
import time

# Reset first, so a failed or skipped run never leaves results or an executor
# from an earlier execution behind for the evaluation cell.
execution_results = None
executor = None

# Check if we have test batch from previous cell
if 'test_batch' not in locals() or not test_batch:
    print("‚ùå No test batch available. Run Cell 1 first.")
else:
    print(f"üìã Test batch loaded: {len(test_batch)} questions")

    # Step 1: Configure the OpenRouter agent
    print(f"\nüîß Step 1: Configure Openrouter Agent")

    try:
        # Loading the configuration up front surfaces setup problems before
        # any question is executed. The name `ollama_config` is historical.
        ollama_config = get_agent_config("openrouter")
        print(f"‚úÖ Openrouter config loaded")
        # NOTE(review): the model name below is hard-coded, not read from the
        # loaded configuration - confirm it matches what is actually used.
        print(f"   Model: google/gemini-2.5-flash")
        print(f"   Provider: OpenRouter")
        print(f"   Tool Support: ‚úÖ YES")

        # Create test executor
        executor = GAIATestExecutor("openrouter")
        print(f"‚úÖ Test executor created")

    except Exception as e:
        print(f"‚ùå Failed to create agent: {e}")
        executor = None

    if executor:
        # Step 2: Execute test batch (blind)
        print(f"\nüöÄ Step 2: Execute Test Batch (Blind)")
        print(f"‚ö†Ô∏è  This will possibly use provider API credits")

        proceed = input("Proceed with execution? (y/n): ")

        if proceed.strip().lower() in ['y', 'yes']:
            start_time = time.time()

            try:
                # Execute the batch
                execution_results = executor.execute_test_batch(test_batch)

                execution_time = time.time() - start_time

                if execution_results:
                    print(f"\n‚úÖ Execution completed!")
                    print(f"‚è±Ô∏è Total time: {execution_time:.1f}s")

                    # Analyze execution results
                    successful = sum(
                        1 for r in execution_results if r.get('execution_successful', False)
                    )
                    avg_time = sum(r.get('execution_time', 0) for r in execution_results) / len(execution_results)

                    print(f"üìä Execution Summary:")
                    print(f"  Total questions: {len(execution_results)}")
                    print(f"  Successful question executions: {successful}/{len(execution_results)} ({successful/len(execution_results):.1%})")
                    print(f"  Average time per question: {avg_time:.2f}s")

                    # Show sample results (without revealing correctness yet).
                    # The loop variable is not called `result`, so the global
                    # `result` of earlier cells is left alone.
                    print(f"\nü§ñ Sample Agent Responses:")
                    for i, sample in enumerate(execution_results[:3], 1):
                        answer = sample.get('agent_answer', 'No answer')
                        strategy = sample.get('strategy_used', 'unknown')
                        exec_time = sample.get('execution_time', 0)

                        print(f"  {i}. Answer: '{answer}' (Strategy: {strategy}, Time: {exec_time:.1f}s)")

                    if len(execution_results) > 3:
                        print(f"  ... and {len(execution_results) - 3} more")

                    print(f"\nüîí Note: Correctness not yet determined - evaluation needed!")

                else:
                    print(f"‚ùå Execution failed - no results returned")
                    execution_results = None

            except Exception as e:
                print(f"‚ùå Execution error: {e}")
                execution_results = None
        else:
            print("‚è≠Ô∏è Execution skipped")
            execution_results = None

## Section 1.3: Evaluate Results (Testing Layer + Data Layer)

In [None]:
# Cell 3: Evaluate Results against Ground Truth
#
# Scores the execution results from Cell 2 with a GAIATestEvaluator built on
# the dataset manager from Cell 1, then prints overall, per-level,
# per-strategy and per-question reports. The closing banner is printed only
# when an evaluation actually completed.
print("üéØ Evaluating Results against Ground Truth")
print("=" * 50)

evaluation_succeeded = False

# Check if we have execution results
if 'execution_results' not in locals() or not execution_results:
    print("‚ùå No execution results available. Run Cell 2 first.")
elif 'manager' not in locals() or not manager:
    print("‚ùå No dataset manager available. Run Cell 1 first.")
else:
    print(f"üìã Execution results loaded: {len(execution_results)} results")

    # Step 1: Create evaluator with dataset manager for ground truth access
    print(f"\nüîç Step 1: Initialize Evaluator")

    try:
        evaluator = GAIATestEvaluator(manager)
        print(f"‚úÖ Evaluator created with ground truth access")

    except Exception as e:
        print(f"‚ùå Failed to create evaluator: {e}")
        evaluator = None

    if evaluator:
        # Step 2: Evaluate execution results
        print(f"\nüéØ Step 2: Evaluate Against Ground Truth")

        try:
            evaluation_results = evaluator.evaluate_execution_results(execution_results)

            if evaluation_results and 'evaluation_metadata' in evaluation_results:
                metadata = evaluation_results['evaluation_metadata']
                analysis = evaluation_results.get('analysis', {})

                # Overall performance
                print(f"\nüìä Overall Performance:")
                print(f"  Total questions: {metadata.get('total_questions', 0)}")
                print(f"  Correct answers: {metadata.get('correct_answers', 0)}")
                print(f"  Overall accuracy: {metadata.get('overall_accuracy', 0):.1%}")

                gaia_target_met = metadata.get('overall_accuracy', 0) >= 0.45
                print(f"  GAIA target (45%): {'‚úÖ MET' if gaia_target_met else '‚ùå NOT MET'}")

                # Performance by level
                level_perf = analysis.get('level_performance', {})
                if level_perf:
                    print(f"\nüìà Performance by Level:")
                    for level_key, stats in level_perf.items():
                        # str() keeps a non-string level key from aborting
                        # the whole report.
                        level_num = str(level_key).replace('level_', '')
                        accuracy = stats.get('accuracy', 0)
                        total = stats.get('total_questions', 0)
                        correct = stats.get('correct_answers', 0)

                        print(f"  Level {level_num}: {accuracy:.1%} ({correct}/{total} correct)")

                # Strategy performance
                strategy_perf = analysis.get('strategy_performance', {})
                if strategy_perf:
                    print(f"\nüéØ Strategy Performance:")
                    for strategy, stats in strategy_perf.items():
                        accuracy = stats.get('accuracy', 0)
                        total = stats.get('total_questions', 0)
                        avg_time = stats.get('avg_execution_time', 0)

                        strategy_name = strategy.replace('_', ' ').title()
                        print(f"  {strategy_name}: {accuracy:.1%} ({total} questions, {avg_time:.1f}s avg)")

                # File attachment performance
                file_perf = analysis.get('file_attachment_performance', {})
                if file_perf:
                    print(f"\nüìé File Attachment Performance:")
                    for category, stats in file_perf.items():
                        accuracy = stats.get('accuracy', 0)
                        total = stats.get('total_questions', 0)
                        category_name = category.replace('_', ' ').title()
                        print(f"  {category_name}: {accuracy:.1%} ({total} questions)")

                # Show detailed results for each question. The loop variable
                # is not called `result`, so the global `result` of earlier
                # cells is left alone.
                print(f"\nüìù Detailed Question Results:")
                detailed_results = evaluation_results.get('detailed_results', [])

                for i, detail in enumerate(detailed_results, 1):
                    is_correct = detail.get('is_correct', False)
                    agent_answer = detail.get('agent_answer', '')
                    expected_answer = detail.get('expected_answer', '')
                    level = detail.get('level', 'Unknown')
                    strategy = detail.get('strategy_used', 'unknown')
                    exec_time = detail.get('execution_time', 0)

                    status = "‚úÖ CORRECT" if is_correct else "‚ùå INCORRECT"

                    print(f"  {i}. {status} (Level {level}, {strategy}, {exec_time:.1f}s)")
                    print(f"     Agent: '{agent_answer}'")
                    print(f"     Expected: '{expected_answer}'")

                    if detail.get('has_file'):
                        file_name = detail.get('file_name', 'unknown')
                        print(f"     File: {file_name}")
                    print()

                # Error analysis
                error_analysis = analysis.get('error_analysis', {})
                if error_analysis:
                    exec_errors = error_analysis.get('execution_errors', 0)
                    success_rate = error_analysis.get('execution_success_rate', 0)

                    print(f"üîß Execution Analysis:")
                    print(f"  Success rate: {success_rate:.1%}")
                    print(f"  Execution errors: {exec_errors}")

                    if exec_errors > 0:
                        sample_errors = error_analysis.get('sample_errors', [])
                        if sample_errors:
                            print(f"  Sample errors:")
                            for error in sample_errors[:2]:
                                print(f"    - {error}")

                print(f"\n‚úÖ Evaluation completed!")
                # NOTE(review): nothing in this cell writes that file;
                # presumably the evaluator does - confirm.
                print(f"üíæ Results saved to logs/gaia_evaluation_*.json")
                evaluation_succeeded = True

            else:
                print(f"‚ùå Evaluation failed - no results returned")

        except Exception as e:
            print(f"‚ùå Evaluation error: {e}")

# Announce success only when an evaluation really ran to completion, so a
# skipped or failed run is not reported as a finished test.
if evaluation_succeeded:
    print(f"\nüéâ Test completed!")
    print(f"üìä You've tested the clean architecture with:")
    print(f"  ‚úÖ Pure data layer (gaia_dataset_utils)")
    print(f"  ‚úÖ Pure testing layer (agent_testing)")
    print(f"  ‚úÖ OpenRouter free model")
    print(f"  ‚úÖ Small batch strategy")
    print(f"  ‚úÖ Blind testing methodology")

## Section 2: Performance Baseline Establishment

In [None]:
# Establish performance baselines across different question types

print("üìä Establishing Performance Baselines")
print("=" * 50)

# Baseline scenarios, one tuple each:
#   (key, display name, question cap, GAIA levels, include attachments)
# Every scenario switches files and images on or off together, so a single
# flag feeds both 'include_files' and 'include_images'.
baseline_configs = {
    scenario_key: {
        'name': display_name,
        'params': {
            'max_questions': question_cap,
            'target_levels': levels,
            'include_files': with_attachments,
            'include_images': with_attachments,
        },
    }
    for scenario_key, display_name, question_cap, levels, with_attachments in (
        ('level_1_only', 'Level 1 Questions Only', 15, [1], True),
        ('level_1_2_mix', 'Level 1 & 2 Mixed', 20, [1, 2], True),
        ('all_levels', 'All Difficulty Levels', 25, [1, 2, 3], True),
        ('text_only', 'Text-Only Questions', 15, [1, 2], False),
        ('with_files', 'File-Based Questions', 15, [1, 2], True),
    )
}

# Run every baseline scenario against one agent configuration.
# NOTE(review): this comment used to say "groq", but the value below selects
# "ollama" — confirm which provider the baselines are meant to use.
baseline_results = {}  # config_key -> metrics dict on success, {'name', 'error'} on failure
baseline_agent = "ollama"  # Use most reliable configuration

for config_key, config_data in baseline_configs.items():
    config_name = config_data['name']
    params = config_data['params']

    print(f"\nüß™ Running: {config_name}")
    print("-" * 40)

    try:
        # The scenario's params dict is forwarded unchanged as keyword arguments.
        result = run_gaia_test(
            agent_config_name=baseline_agent,
            **params
        )

        # A run only counts as successful when it carries evaluation metadata.
        if result and 'evaluation_metadata' in result:
            metadata = result['evaluation_metadata']
            analysis = result.get('analysis', {})

            # Extract key metrics
            baseline_results[config_key] = {
                'name': config_name,
                'total_questions': metadata.get('total_questions', 0),
                'correct_answers': metadata.get('correct_answers', 0),
                'accuracy': metadata.get('overall_accuracy', 0),
                'analysis': analysis,
                'full_result': result
            }

            # Print immediate summary
            accuracy = metadata.get('overall_accuracy', 0)
            total = metadata.get('total_questions', 0)
            correct = metadata.get('correct_answers', 0)

            print(f"‚úÖ Completed: {correct}/{total} correct ({accuracy:.1%})")

            # Check GAIA target (0.45 is the target; 0.35-0.45 is reported as "approaching")
            if accuracy >= 0.45:
                print("üèÜ GAIA target achieved (45%+)")
            elif accuracy >= 0.35:
                print("‚ö†Ô∏è Approaching GAIA target")
            else:
                print("‚ùå Below GAIA target")

        else:
            # Empty result or missing metadata: record a failure entry so the
            # scenario still appears in later summaries.
            print(f"‚ùå Test failed for {config_name}")
            baseline_results[config_key] = {'name': config_name, 'error': 'Test failed'}

    except Exception as e:
        # Any exception from the run is recorded as a failure entry and the loop
        # moves on to the next scenario.
        print(f"‚ùå Error in {config_name}: {e}")
        baseline_results[config_key] = {'name': config_name, 'error': str(e)}

# Baseline summary table
print(f"\nüìä BASELINE PERFORMANCE SUMMARY")
print("=" * 60)

# One table row per baseline run; runs that recorded an 'error' get
# placeholder cells instead of metrics.
baseline_df_data = []
for run in baseline_results.values():
    row = {'Test Configuration': run['name']}
    if 'error' in run:
        row.update({
            'Questions': 0,
            'Correct': 0,
            'Accuracy': "ERROR",
            'GAIA Target': "‚ùå",
        })
    else:
        run_accuracy = run['accuracy']
        row.update({
            'Questions': run['total_questions'],
            'Correct': run['correct_answers'],
            'Accuracy': f"{run_accuracy:.1%}",
            'GAIA Target': "‚úÖ" if run_accuracy >= 0.45 else "‚ùå",
        })
    baseline_df_data.append(row)

baseline_df = pd.DataFrame(baseline_df_data)
print(baseline_df.to_string(index=False))

# Identify best and worst performing configurations.
# Every run that finished without an error is ranked, including runs that
# scored 0%: a 0% configuration is exactly the one that "needs improvement",
# so it must not be filtered out before taking the minimum.
successful_results = [r for r in baseline_results.values() if 'error' not in r]
if successful_results:
    best_config = max(successful_results, key=lambda x: x['accuracy'])
    worst_config = min(successful_results, key=lambda x: x['accuracy'])

    print(f"\nüèÜ Best Performance: {best_config['name']} ({best_config['accuracy']:.1%})")
    print(f"‚ö†Ô∏è Needs Improvement: {worst_config['name']} ({worst_config['accuracy']:.1%})")

## Section 3: Routing Analysis

In [None]:
# Deep dive into routing effectiveness and optimization

print("üîÄ Smart Routing Analysis")
print("=" * 40)

# Agent configuration names the routing test is run against, in order.
routing_test_configs = ["groq", "performance"]

# config name -> routing/strategy/level stats plus overall accuracy.
# Only configurations whose test returned an 'analysis' section are stored.
routing_analysis_results = {}

for config in routing_test_configs:
    print(f"\nüß™ Testing routing with {config} configuration")
    print("-" * 50)

    try:
        routing_result = run_smart_routing_test(config)

        if routing_result and 'analysis' in routing_result:
            analysis = routing_result['analysis']
            routing_analysis = analysis.get('routing_analysis', {})
            strategy_performance = analysis.get('strategy_performance', {})
            level_performance = analysis.get('level_performance', {})

            # NOTE: the overall accuracy lookup is not defaulted; a result without
            # 'evaluation_metadata' raises KeyError, which the except below reports.
            routing_analysis_results[config] = {
                'routing_stats': routing_analysis,
                'strategy_stats': strategy_performance,
                'level_stats': level_performance,
                'overall_accuracy': routing_result['evaluation_metadata']['overall_accuracy']
            }

            # Print routing effectiveness
            one_shot_count = routing_analysis.get('one_shot_questions', 0)
            manager_count = routing_analysis.get('manager_questions', 0)
            routing_accuracy = routing_analysis.get('routing_accuracy', 0)

            print(f"üìä Question Distribution:")
            print(f"‚îú‚îÄ‚îÄ One-shot LLM: {one_shot_count} questions")
            print(f"‚îú‚îÄ‚îÄ Manager Coordination: {manager_count} questions")
            print(f"‚îî‚îÄ‚îÄ Routing Accuracy: {routing_accuracy:.1%}")

            print(f"\nüìà Strategy Performance:")
            for strategy, stats in strategy_performance.items():
                accuracy = stats.get('accuracy', 0)
                avg_time = stats.get('avg_execution_time', 0)
                total = stats.get('total_questions', 0)
                print(f"‚îú‚îÄ‚îÄ {strategy.replace('_', ' ').title()}: {accuracy:.1%} accuracy, {avg_time:.1f}s avg ({total} questions)")

            # Routing insights: >= 0.8 is reported as high, >= 0.6 as moderate
            print(f"\nüí° Routing Insights:")
            if routing_accuracy >= 0.8:
                print("‚úÖ Routing decisions are highly accurate")
            elif routing_accuracy >= 0.6:
                print("‚ö†Ô∏è Routing decisions are moderately accurate")
            else:
                print("‚ùå Routing decisions need improvement")

            # Strategy effectiveness analysis (only when both strategies reported stats)
            if 'one_shot_llm' in strategy_performance and 'manager_coordination' in strategy_performance:
                one_shot_acc = strategy_performance['one_shot_llm'].get('accuracy', 0)
                manager_acc = strategy_performance['manager_coordination'].get('accuracy', 0)
                one_shot_time = strategy_performance['one_shot_llm'].get('avg_execution_time', 0)
                manager_time = strategy_performance['manager_coordination'].get('avg_execution_time', 0)

                print(f"\n‚öñÔ∏è Strategy Comparison:")
                print(f"‚îú‚îÄ‚îÄ One-shot: {one_shot_acc:.1%} accuracy, {one_shot_time:.1f}s avg")
                print(f"‚îú‚îÄ‚îÄ Manager: {manager_acc:.1%} accuracy, {manager_time:.1f}s avg")

                # The manager must lead by more than 0.1 to count as "significantly
                # better"; a smaller lead (or a tie) falls into the last branch.
                if one_shot_acc > manager_acc:
                    print("‚îî‚îÄ‚îÄ üí° One-shot performing better - consider simpler routing")
                elif manager_acc > one_shot_acc + 0.1:
                    print("‚îî‚îÄ‚îÄ üí° Manager significantly better - routing working well")
                else:
                    print("‚îî‚îÄ‚îÄ üí° Similar performance - routing providing good balance")
        else:
            print(f"‚ùå Routing test failed for {config}")

    except Exception as e:
        # Failures are reported and the configuration is simply left out of
        # routing_analysis_results.
        print(f"‚ùå Routing test error for {config}: {e}")

# Compare routing across configurations (needs at least two to compare)
if len(routing_analysis_results) >= 2:
    print(f"\nüìä ROUTING COMPARISON ACROSS CONFIGURATIONS")
    print("=" * 60)

    # One row per configuration; any missing statistic falls back to 0.
    routing_comparison_data = [
        {
            'Configuration': cfg_name,
            'Overall Accuracy': f"{cfg_stats['overall_accuracy']:.1%}",
            'Routing Accuracy': f"{cfg_stats['routing_stats'].get('routing_accuracy', 0):.1%}",
            'One-shot Accuracy': f"{cfg_stats['strategy_stats'].get('one_shot_llm', {}).get('accuracy', 0):.1%}",
            'Manager Accuracy': f"{cfg_stats['strategy_stats'].get('manager_coordination', {}).get('accuracy', 0):.1%}",
            'One-shot Questions': cfg_stats['routing_stats'].get('one_shot_questions', 0),
            'Manager Questions': cfg_stats['routing_stats'].get('manager_questions', 0),
        }
        for cfg_name, cfg_stats in routing_analysis_results.items()
    ]

    routing_comparison_df = pd.DataFrame(routing_comparison_data)
    print(routing_comparison_df.to_string(index=False))

## Section 4: Provider-Specific Testing (Ollama, OpenRouter, Groq)

In [None]:
# Test specific providers independently as requested

print("üîå Provider-Specific Performance Testing")
print("=" * 50)

# Provider test matrix, one tuple each:
#   (result key, display name, agent config name, description)
provider_configs = {
    provider_key: {
        'name': display_name,
        'config': agent_config,
        'description': blurb,
    }
    for provider_key, display_name, agent_config, blurb in (
        ('groq_standard', 'Groq (QwQ-32B)', 'groq',
         'Standard Groq configuration with QwQ-32B model'),
        ('groq_fast', 'Groq (Llama-3.3-70B)', 'groq_fast',
         'Faster Groq model for speed comparison'),
        ('openrouter_free', 'OpenRouter (Free)', 'openrouter',
         'OpenRouter free tier model'),
        ('openrouter_premium', 'OpenRouter (Premium)', 'openrouter_premium',
         'OpenRouter premium model for accuracy'),
        ('ollama_local', 'Ollama (Local)', 'ollama',
         'Local Ollama deployment'),
    )
}

# Filled in by the provider loop: provider key -> metrics or error record
provider_test_results = {}

# Standard test parameters for all providers
test_params = dict(
    max_questions=15,
    target_levels=[1, 2],
    include_files=True,
    include_images=True,
)

# Run the same GAIA test against every provider and record either its metrics
# or an error entry in provider_test_results (keyed by provider_key).
for provider_key, provider_info in provider_configs.items():
    config_name = provider_info['config']
    provider_name = provider_info['name']
    description = provider_info['description']

    print(f"\nüß™ Testing {provider_name}")
    print(f"üìù {description}")
    print("-" * 50)

    try:
        # Test if provider is available first.
        # The returned config is not used afterwards; presumably the call raises
        # for an unknown/unavailable configuration so the except branch below
        # records it — TODO confirm against agent_interface.
        from agent_interface import get_agent_config
        test_config = get_agent_config(config_name)

        result = run_gaia_test(
            agent_config_name=config_name,
            **test_params
        )

        if result and 'evaluation_metadata' in result:
            metadata = result['evaluation_metadata']
            analysis = result.get('analysis', {})

            # Extract comprehensive metrics
            provider_test_results[provider_key] = {
                'name': provider_name,
                'config': config_name,
                'total_questions': metadata.get('total_questions', 0),
                'correct_answers': metadata.get('correct_answers', 0),
                'accuracy': metadata.get('overall_accuracy', 0),
                'strategy_performance': analysis.get('strategy_performance', {}),
                'level_performance': analysis.get('level_performance', {}),
                'execution_time': 0,  # Will calculate from strategy data
                # A missing success rate defaults to 1, i.e. an error rate of 0
                'error_rate': 1 - analysis.get('error_analysis', {}).get('execution_success_rate', 1),
                'full_result': result
            }

            # Calculate average execution time as the question-weighted mean of
            # the per-strategy averages
            strategy_perf = analysis.get('strategy_performance', {})
            if strategy_perf:
                total_time = sum(stats.get('avg_execution_time', 0) * stats.get('total_questions', 0) 
                               for stats in strategy_perf.values())
                total_questions = sum(stats.get('total_questions', 0) for stats in strategy_perf.values())
                avg_time = total_time / total_questions if total_questions > 0 else 0
                provider_test_results[provider_key]['execution_time'] = avg_time

            # Print immediate results
            accuracy = metadata.get('overall_accuracy', 0)
            total = metadata.get('total_questions', 0)
            correct = metadata.get('correct_answers', 0)

            print(f"‚úÖ {provider_name}: {correct}/{total} correct ({accuracy:.1%})")
            print(f"‚è±Ô∏è Avg execution time: {provider_test_results[provider_key]['execution_time']:.1f}s")

            # Provider-specific insights, selected by substring of the provider key
            if 'groq' in provider_key:
                print(f"üöÄ Groq performance: {'Excellent' if accuracy >= 0.5 else 'Good' if accuracy >= 0.4 else 'Needs improvement'}")
            elif 'openrouter' in provider_key:
                if 'free' in provider_key:
                    print(f"üí∞ Free tier performance: {'Good value' if accuracy >= 0.4 else 'Consider premium'}")
                else:
                    print(f"üíé Premium performance: {'Worth the cost' if accuracy >= 0.5 else 'Evaluate cost/benefit'}")
            elif 'ollama' in provider_key:
                print(f"üè† Local deployment: {'Viable alternative' if accuracy >= 0.4 else 'Cloud providers recommended'}")

        else:
            # The test returned nothing usable: keep an error entry so the
            # provider still shows up in the comparison table.
            print(f"‚ùå Test failed for {provider_name}")
            provider_test_results[provider_key] = {
                'name': provider_name,
                'config': config_name,
                'error': 'Test execution failed'
            }

    except Exception as e:
        # Every exception (import, config lookup, or the test run itself) is
        # reported as the provider being unavailable.
        print(f"‚ùå Provider {provider_name} unavailable: {e}")
        provider_test_results[provider_key] = {
            'name': provider_name,
            'config': config_name,
            'error': f'Provider unavailable: {str(e)}'
        }

# Provider comparison table
print(f"\nüìä PROVIDER PERFORMANCE COMPARISON")
print("=" * 70)

# One row per provider; providers that recorded an 'error' get placeholder
# cells so they still appear in the table.
provider_comparison_data = []
for outcome in provider_test_results.values():
    row = {'Provider': outcome['name']}
    if 'error' in outcome:
        row.update({
            'Questions': 'N/A',
            'Correct': 'N/A',
            'Accuracy': 'ERROR',
            'Avg Time (s)': 'N/A',
            'Error Rate': 'N/A',
            'GAIA Target': "‚ùå",
        })
    else:
        provider_accuracy = outcome['accuracy']
        row.update({
            'Questions': outcome['total_questions'],
            'Correct': outcome['correct_answers'],
            'Accuracy': f"{provider_accuracy:.1%}",
            'Avg Time (s)': f"{outcome['execution_time']:.1f}",
            'Error Rate': f"{outcome['error_rate']:.1%}",
            'GAIA Target': "‚úÖ" if provider_accuracy >= 0.45 else "‚ùå",
        })
    provider_comparison_data.append(row)

provider_comparison_df = pd.DataFrame(provider_comparison_data)
print(provider_comparison_df.to_string(index=False))

# Provider recommendations: only providers that ran without error and got at
# least one answer right are considered.
successful_providers = [
    entry
    for entry in provider_test_results.values()
    if 'error' not in entry and entry.get('accuracy', 0) > 0
]

if successful_providers:
    # One winner per criterion: highest accuracy, lowest average time,
    # lowest error rate.
    best_accuracy_provider = max(successful_providers, key=lambda p: p['accuracy'])
    fastest_provider = min(successful_providers, key=lambda p: p['execution_time'])
    most_reliable_provider = min(successful_providers, key=lambda p: p['error_rate'])

    print(f"\nüèÜ PROVIDER RECOMMENDATIONS")
    print("=" * 40)
    print(f"üéØ Best Accuracy: {best_accuracy_provider['name']} ({best_accuracy_provider['accuracy']:.1%})")
    print(f"‚ö° Fastest: {fastest_provider['name']} ({fastest_provider['execution_time']:.1f}s avg)")
    print(f"üõ°Ô∏è Most Reliable: {most_reliable_provider['name']} ({most_reliable_provider['error_rate']:.1%} error rate)")

    # Cost considerations, grouped by provider family (substring of the config name)
    print(f"\nüí∞ Cost Considerations:")
    groq_providers, openrouter_providers, ollama_providers = (
        [p for p in successful_providers if family in p['config'].lower()]
        for family in ('groq', 'openrouter', 'ollama')
    )

    if groq_providers:
        avg_groq_acc = sum(p['accuracy'] for p in groq_providers) / len(groq_providers)
        print(f"‚îú‚îÄ‚îÄ Groq: High performance, reasonable cost ({avg_groq_acc:.1%} avg accuracy)")

    if openrouter_providers:
        # Tiers are told apart by the display name ("Free" / "Premium")
        free_or = [p for p in openrouter_providers if 'free' in p['name'].lower()]
        premium_or = [p for p in openrouter_providers if 'premium' in p['name'].lower()]

        if free_or:
            print(f"‚îú‚îÄ‚îÄ OpenRouter Free: Budget option ({free_or[0]['accuracy']:.1%} accuracy)")
        if premium_or:
            print(f"‚îú‚îÄ‚îÄ OpenRouter Premium: Premium option ({premium_or[0]['accuracy']:.1%} accuracy)")

    if ollama_providers:
        print(f"‚îî‚îÄ‚îÄ Ollama: Zero cost, local control ({ollama_providers[0]['accuracy']:.1%} accuracy)")

## Section 5: Interactive Visualization Example

In [None]:
# One example of interactive visualization, then direct to source data

print("üìä Interactive Performance Visualization")
print("=" * 50)

# Create visualization if we have provider comparison data (at least two rows)
if provider_comparison_data and len(provider_comparison_data) > 1:
    # Extract accuracy data for visualization.
    # Values are parsed back out of the already-formatted table strings
    # (e.g. "45.0%"), so they carry only the precision shown in the table.
    viz_data = []
    for item in provider_comparison_data:
        # Rows for failed providers are marked 'ERROR' and are left out of the plot
        if item['Accuracy'] != 'ERROR':
            accuracy_val = float(item['Accuracy'].strip('%')) / 100
            time_val = float(item['Avg Time (s)']) if item['Avg Time (s)'] != 'N/A' else 0

            viz_data.append({
                'Provider': item['Provider'],
                'Accuracy': accuracy_val,
                'Avg_Time': time_val,
                'Questions': int(item['Questions']) if item['Questions'] != 'N/A' else 0
            })

    if viz_data:
        viz_df = pd.DataFrame(viz_data)

        # Create a simple performance visualization: bar chart on the left,
        # scatter plot on the right
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Accuracy comparison; bar colour encodes the same 0.45 / 0.35 thresholds
        # used for the GAIA target elsewhere
        providers = viz_df['Provider']
        accuracies = viz_df['Accuracy']
        colors = ['green' if acc >= 0.45 else 'orange' if acc >= 0.35 else 'red' for acc in accuracies]

        bars1 = ax1.bar(providers, accuracies, color=colors, alpha=0.7)
        ax1.axhline(y=0.45, color='red', linestyle='--', label='GAIA Target (45%)')
        ax1.set_ylabel('Accuracy')
        ax1.set_title('Provider Accuracy Comparison')
        ax1.set_ylim(0, 1)
        ax1.legend()

        # Add value labels on bars
        for bar, acc in zip(bars1, accuracies):
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                    f'{acc:.1%}', ha='center', va='bottom')

        # Rotate x-axis labels for readability
        plt.setp(ax1.get_xticklabels(), rotation=45, ha='right')

        # Speed vs Accuracy scatter plot; marker area scales with question count
        ax2.scatter(viz_df['Avg_Time'], viz_df['Accuracy'], 
                   s=viz_df['Questions']*10, alpha=0.6, c=colors)

        # Add provider labels
        for idx, row in viz_df.iterrows():
            ax2.annotate(row['Provider'], 
                        (row['Avg_Time'], row['Accuracy']),
                        xytext=(5, 5), textcoords='offset points',
                        fontsize=8)

        ax2.axhline(y=0.45, color='red', linestyle='--', label='GAIA Target')
        ax2.set_xlabel('Average Execution Time (seconds)')
        ax2.set_ylabel('Accuracy')
        ax2.set_title('Speed vs Accuracy Trade-off')
        ax2.legend()

        plt.tight_layout()
        plt.show()

        print("üìà Visualization shows accuracy and speed trade-offs")
        print("üí° Larger dots = more questions tested")
        print("üéØ Red line = GAIA performance target (45%)")

        # Analysis of the visualization
        best_overall = viz_df.loc[viz_df['Accuracy'].idxmax()]
        fastest = viz_df.loc[viz_df['Avg_Time'].idxmin()]

        print(f"\nüìä Visual Analysis:")
        print(f"‚îú‚îÄ‚îÄ Highest accuracy: {best_overall['Provider']} ({best_overall['Accuracy']:.1%})")
        print(f"‚îú‚îÄ‚îÄ Fastest execution: {fastest['Provider']} ({fastest['Avg_Time']:.1f}s)")

        # Efficiency score (accuracy / time); this adds an 'Efficiency' column to viz_df
        viz_df['Efficiency'] = viz_df['Accuracy'] / (viz_df['Avg_Time'] + 1)  # +1 to avoid division by zero
        most_efficient = viz_df.loc[viz_df['Efficiency'].idxmax()]
        print(f"‚îî‚îÄ‚îÄ Most efficient: {most_efficient['Provider']} (best accuracy/time ratio)")

# Point the reader at the result dictionaries built by the earlier cells
print(
    "\nüíæ Direct Data Access:",
    "For detailed analysis, access the source data:",
    "‚îú‚îÄ‚îÄ baseline_results: Performance across question types",
    "‚îú‚îÄ‚îÄ routing_analysis_results: Routing effectiveness data",
    "‚îú‚îÄ‚îÄ provider_test_results: Provider-specific performance",
    "‚îî‚îÄ‚îÄ All results include full evaluation metadata and analysis",
    sep="\n",
)

# Show how to access specific data (printed as copy-pasteable snippets)
print(
    "\nüîç Example Data Access:",
    "# Access baseline results",
    "baseline_results['level_1_only']['accuracy']",
    "\n# Access provider comparison",
    "provider_test_results['groq_standard']['strategy_performance']",
    "\n# Access detailed analysis",
    "provider_test_results['groq_standard']['full_result']['analysis']",
    sep="\n",
)

## Section 6: Failure Pattern Analysis & Optimization

In [None]:
# Deep analysis of failure patterns for improvement insights

print("üîç Comprehensive Failure Pattern Analysis")
print("=" * 50)

# Gather every run that kept its full result: baseline runs first, then
# provider runs. Each entry is tagged "<origin>_<key>" so the later output
# can say where it came from.
all_test_results = []
for origin, results_by_key in (
    ('baseline', baseline_results),
    ('provider', provider_test_results),
):
    for run_key, run in results_by_key.items():
        if 'full_result' in run:
            all_test_results.append({
                'source': f'{origin}_{run_key}',
                'result': run['full_result'],
            })

# source label -> failure analysis, for every run that produced failure patterns
failure_analyses = {}

for test_data in all_test_results:
    source = test_data['source']
    result = test_data['result']

    print(f"\nüîç Analyzing failures in {source}")
    print("-" * 40)

    try:
        failure_analysis = analyze_failure_patterns(result)

        if failure_analysis and 'failure_patterns' in failure_analysis:
            failure_analyses[source] = failure_analysis

            patterns = failure_analysis['failure_patterns']
            recommendations = failure_analysis.get('recommendations', [])

            # Print key failure insights
            print(f"üìä Failure Distribution:")

            # By level
            level_failures = patterns.get('by_level', {})
            total_failures = sum(level_failures.values())
            if total_failures > 0:
                print(f"‚îú‚îÄ‚îÄ By Level:")
                for level, count in level_failures.items():
                    percentage = count / total_failures * 100
                    print(f"‚îÇ   ‚îú‚îÄ‚îÄ Level {level}: {count} ({percentage:.1f}%)")

            # By strategy. Percentages are taken against the strategy counts'
            # own total rather than the by-level total: the two breakdowns are
            # independent, and the level total is 0 whenever 'by_level' is
            # missing, which used to raise ZeroDivisionError here.
            strategy_failures = patterns.get('by_strategy', {})
            if strategy_failures:
                strategy_total = sum(strategy_failures.values())
                print(f"‚îú‚îÄ‚îÄ By Strategy:")
                for strategy, count in strategy_failures.items():
                    percentage = count / strategy_total * 100 if strategy_total else 0.0
                    print(f"‚îÇ   ‚îú‚îÄ‚îÄ {strategy}: {count} ({percentage:.1f}%)")

            # Execution issues
            exec_failures = patterns.get('execution_failures', 0)
            if exec_failures > 0:
                print(f"‚îî‚îÄ‚îÄ Execution Failures: {exec_failures}")

            # Top recommendations (at most three)
            if recommendations:
                print(f"\nüí° Top Recommendations for {source}:")
                for i, rec in enumerate(recommendations[:3], 1):
                    print(f"  {i}. {rec}")
        else:
            print(f"‚úÖ No significant failure patterns found")

    except Exception as e:
        # A failing analysis is reported and skipped so the remaining runs are
        # still analysed.
        print(f"‚ùå Failure analysis error: {e}")

# Cross-test pattern analysis (only meaningful with two or more analysed runs)
if len(failure_analyses) >= 2:
    print(f"\nüîÑ Cross-Test Pattern Analysis")
    print("=" * 50)

    # Find common failure patterns across tests by summing each bucket
    all_level_failures = {}
    all_strategy_failures = {}
    all_execution_failures = 0

    for analysis in failure_analyses.values():
        patterns = analysis['failure_patterns']

        # Level and strategy counts are accumulated the same way
        for bucket_name, running_totals in (
            ('by_level', all_level_failures),
            ('by_strategy', all_strategy_failures),
        ):
            for label, count in patterns.get(bucket_name, {}).items():
                running_totals[label] = running_totals.get(label, 0) + count

        all_execution_failures += patterns.get('execution_failures', 0)

    print(f"üìä Aggregated Failure Patterns:")

    if all_level_failures:
        total_level_failures = sum(all_level_failures.values())
        print(f"‚îú‚îÄ‚îÄ Most problematic levels:")
        # Highest failure count first
        sorted_levels = sorted(all_level_failures.items(), key=lambda pair: pair[1], reverse=True)
        for level, count in sorted_levels:
            print(f"‚îÇ   ‚îú‚îÄ‚îÄ Level {level}: {count} failures ({count / total_level_failures * 100:.1f}%)")

    if all_strategy_failures:
        total_strategy_failures = sum(all_strategy_failures.values())
        print(f"‚îú‚îÄ‚îÄ Most problematic strategies:")
        sorted_strategies = sorted(all_strategy_failures.items(), key=lambda pair: pair[1], reverse=True)
        for strategy, count in sorted_strategies:
            print(f"‚îÇ   ‚îú‚îÄ‚îÄ {strategy}: {count} failures ({count / total_strategy_failures * 100:.1f}%)")

    if all_execution_failures > 0:
        print(f"‚îî‚îÄ‚îÄ Total execution failures: {all_execution_failures}")

    # Global recommendations; items 1-3 are conditional, 4-5 are always printed
    print(f"\nüéØ Global Optimization Priorities:")

    if all_level_failures:
        worst_level = max(all_level_failures, key=all_level_failures.get)
        print(f"1. Focus on Level {worst_level} performance improvement")

    if all_strategy_failures:
        worst_strategy = max(all_strategy_failures, key=all_strategy_failures.get)
        print(f"2. Optimize {worst_strategy.replace('_', ' ')} strategy")

    if all_execution_failures > 5:
        print(f"3. Improve system reliability (reduce execution failures)")

    print(f"4. Consider routing adjustments based on failure patterns")
    print(f"5. Enhance answer formatting and validation")

## Section 7: Production Readiness Assessment

In [None]:
# Final comprehensive assessment for production deployment

print("üè≠ Production Readiness Assessment")
print("=" * 50)

# Collect all performance data for assessment.
# Every section starts out as an empty dict; a section that is still empty
# afterwards means no usable results were found for it.
assessment_data = {
    section: {}
    for section in (
        'baseline_performance',
        'provider_performance',
        'routing_effectiveness',
        'system_reliability',
        'file_processing',
        'overall_metrics',
    )
}

# Baseline metrics: only runs that did not record an 'error' key are used.
successful_baselines = [
    run for run in baseline_results.values() if 'error' not in run
]
if successful_baselines:
    baseline_accuracies = [run['accuracy'] for run in successful_baselines]
    assessment_data['baseline_performance'] = dict(
        avg_accuracy=np.mean(baseline_accuracies),
        min_accuracy=np.min(baseline_accuracies),
        max_accuracy=np.max(baseline_accuracies),
        std_accuracy=np.std(baseline_accuracies),
        # Target counts as met if any single baseline run reaches 0.45.
        gaia_target_met=any(acc >= 0.45 for acc in baseline_accuracies),
    )

# Provider metrics, filtered the same way as the baselines.
successful_providers = [
    run for run in provider_test_results.values() if 'error' not in run
]
if successful_providers:
    # One list per per-run field, in the order the fields are named below.
    provider_accuracies, provider_times, provider_errors = (
        [run[field] for run in successful_providers]
        for field in ('accuracy', 'execution_time', 'error_rate')
    )

    assessment_data['provider_performance'] = dict(
        available_providers=len(successful_providers),
        avg_accuracy=np.mean(provider_accuracies),
        best_accuracy=np.max(provider_accuracies),
        avg_execution_time=np.mean(provider_times),
        fastest_time=np.min(provider_times),
        avg_error_rate=np.mean(provider_errors),
        # The lowest error rate is reported as the "best" reliability figure.
        best_reliability=np.min(provider_errors),
    )

# Routing effectiveness: a missing 'routing_accuracy' entry counts as 0.
if routing_analysis_results:
    routing_accuracies = [
        analysis['routing_stats'].get('routing_accuracy', 0)
        for analysis in routing_analysis_results.values()
    ]
    mean_routing_accuracy = np.mean(routing_accuracies)
    assessment_data['routing_effectiveness'] = dict(
        avg_routing_accuracy=mean_routing_accuracy,
        min_routing_accuracy=np.min(routing_accuracies),
        routing_working=mean_routing_accuracy >= 0.7,
    )

# System reliability assessment: accumulate execution errors and question
# counts over every baseline and provider result that carries a 'full_result'.
total_execution_failures = 0
total_questions = 0

for result in (*baseline_results.values(), *provider_test_results.values()):
    if 'full_result' not in result:
        # Entries without the detailed payload contribute to neither total.
        continue
    analysis = result['full_result'].get('analysis', {})
    total_execution_failures += analysis.get('error_analysis', {}).get('execution_errors', 0)
    total_questions += result.get('total_questions', 0)

# With no questions counted the success rate is reported as 0.
if total_questions > 0:
    system_reliability = 1 - (total_execution_failures / total_questions)
else:
    system_reliability = 0

assessment_data['system_reliability'] = dict(
    execution_success_rate=system_reliability,
    total_questions_tested=total_questions,
    total_failures=total_execution_failures,
    reliable=system_reliability >= 0.95,
)

# Overall metrics calculation: pool the accuracies of all successful baseline
# and provider runs (baselines first) into one list.
all_accuracies = [
    run['accuracy'] for run in (*successful_baselines, *successful_providers)
]

if all_accuracies:
    top_accuracy = np.max(all_accuracies)
    assessment_data['overall_metrics'] = dict(
        best_accuracy=top_accuracy,
        avg_accuracy=np.mean(all_accuracies),
        # Higher consistency = lower standard deviation across runs.
        consistency=1 - np.std(all_accuracies),
        gaia_target_achievement=top_accuracy >= 0.45,
        production_ready_accuracy=top_accuracy >= 0.50,
    )

# Production readiness scoring
print(f"üìä PRODUCTION READINESS SCORING")
print("=" * 50)


def _tier_points(value, tiers, floor):
    """Return the points of the first tier whose threshold ``value`` reaches.

    ``tiers`` is a sequence of ``(threshold, points)`` pairs ordered from the
    highest threshold to the lowest; ``floor`` is returned when ``value`` is
    below every threshold.
    """
    for threshold, points in tiers:
        if value >= threshold:
            return points
    return floor


readiness_score = 0
max_score = 100

# Each category below is scored only when its assessment section is non-empty;
# a skipped category adds nothing to readiness_score.

# Accuracy score (40 points)
if assessment_data['overall_metrics']:
    best_acc = assessment_data['overall_metrics']['best_accuracy']
    accuracy_score = _tier_points(
        best_acc, ((0.60, 40), (0.50, 35), (0.45, 30), (0.35, 20)), 10
    )
    readiness_score += accuracy_score
    print(f"‚úÖ Accuracy Score: {accuracy_score}/40 (Best: {best_acc:.1%})")

# Reliability score (25 points)
if assessment_data['system_reliability']:
    reliability = assessment_data['system_reliability']['execution_success_rate']
    reliability_score = _tier_points(
        reliability, ((0.98, 25), (0.95, 20), (0.90, 15)), 10
    )
    readiness_score += reliability_score
    print(f"‚úÖ Reliability Score: {reliability_score}/25 (Success Rate: {reliability:.1%})")

# Provider availability score (15 points)
if assessment_data['provider_performance']:
    available_providers = assessment_data['provider_performance']['available_providers']
    provider_score = _tier_points(
        available_providers, ((4, 15), (3, 12), (2, 8)), 5
    )
    readiness_score += provider_score
    print(f"‚úÖ Provider Score: {provider_score}/15 ({available_providers} providers available)")

# Routing effectiveness score (10 points)
if assessment_data['routing_effectiveness']:
    routing_acc = assessment_data['routing_effectiveness']['avg_routing_accuracy']
    routing_score = _tier_points(
        routing_acc, ((0.80, 10), (0.70, 8), (0.60, 6)), 3
    )
    readiness_score += routing_score
    print(f"‚úÖ Routing Score: {routing_score}/10 (Accuracy: {routing_acc:.1%})")

# Consistency score (10 points)
if assessment_data['overall_metrics']:
    consistency = assessment_data['overall_metrics']['consistency']
    consistency_score = _tier_points(
        consistency, ((0.90, 10), (0.80, 8), (0.70, 6)), 3
    )
    readiness_score += consistency_score
    print(f"‚úÖ Consistency Score: {consistency_score}/10 (Consistency: {consistency:.1%})")

print(f"\nüèÜ OVERALL READINESS SCORE: {readiness_score}/{max_score} ({readiness_score/max_score*100:.1f}%)")

# Production deployment recommendation
print(f"\nüöÄ PRODUCTION DEPLOYMENT RECOMMENDATION")
print("=" * 50)

# (minimum readiness score, (headline, explanation, next steps)), ordered from
# the highest bar to the lowest; the first bar the score clears wins.
_DEPLOYMENT_VERDICTS = (
    (85, (
        "üü¢ READY FOR PRODUCTION",
        "System demonstrates excellent performance, reliability, and consistency.",
        [
            "Deploy to production environment",
            "Set up monitoring and alerting",
            "Implement gradual rollout strategy",
            "Document operational procedures",
        ],
    )),
    (70, (
        "üü° READY WITH MINOR OPTIMIZATIONS",
        "System shows good performance but could benefit from targeted improvements.",
        [
            "Address identified failure patterns",
            "Optimize underperforming configurations",
            "Enhance error handling",
            "Conduct limited production trial",
        ],
    )),
    (55, (
        "üü† NEEDS IMPROVEMENT BEFORE PRODUCTION",
        "System shows promise but requires significant improvements.",
        [
            "Focus on accuracy improvements",
            "Enhance system reliability",
            "Optimize routing decisions",
            "Conduct additional testing cycles",
        ],
    )),
)

# Verdict used when the score clears none of the bars above.
_NOT_READY_VERDICT = (
    "üî¥ NOT READY FOR PRODUCTION",
    "System requires substantial development before production deployment.",
    [
        "Review core architecture",
        "Improve model configurations",
        "Enhance error handling and reliability",
        "Return to development phase",
    ],
)

for minimum_score, verdict in _DEPLOYMENT_VERDICTS:
    if readiness_score >= minimum_score:
        break
else:
    verdict = _NOT_READY_VERDICT

recommendation, details, next_steps = verdict

print(f"{recommendation}")
print(f"üìù {details}")
print(f"\nüìã Recommended Next Steps:")
for position, step in enumerate(next_steps, start=1):
    print(f"  {position}. {step}")

# Specific recommendations based on assessment data.
# Each check runs only when its assessment section is non-empty, and appends
# at most one message per concern to `recommendations`.
print(f"\nüéØ SPECIFIC OPTIMIZATION RECOMMENDATIONS")
print("=" * 50)

recommendations = []

# Accuracy-based recommendations
if assessment_data['overall_metrics']:
    best_acc = assessment_data['overall_metrics']['best_accuracy']
    if best_acc < 0.45:
        recommendations.append("üéØ Priority: Achieve GAIA benchmark target (45% accuracy)")
    elif best_acc < 0.55:
        recommendations.append("üìà Focus: Improve accuracy to competitive levels (55%+)")

# Provider-based recommendations
if assessment_data['provider_performance']:
    best_provider_acc = assessment_data['provider_performance']['best_accuracy']
    avg_provider_acc = assessment_data['provider_performance']['avg_accuracy']

    # A best-vs-average spread above 10 points suggests uneven provider setups.
    if best_provider_acc - avg_provider_acc > 0.1:
        recommendations.append("‚öñÔ∏è Standardize: Large performance gap between providers - optimize configurations")

    avg_time = assessment_data['provider_performance']['avg_execution_time']

    if avg_time > 30:
        recommendations.append("‚ö° Speed: Reduce average execution time below 30 seconds")

# Reliability-based recommendations
if assessment_data['system_reliability']:
    reliability = assessment_data['system_reliability']['execution_success_rate']
    if reliability < 0.95:
        recommendations.append("üõ°Ô∏è Reliability: Improve execution success rate above 95%")

# Routing-based recommendations
if assessment_data['routing_effectiveness']:
    routing_acc = assessment_data['routing_effectiveness']['avg_routing_accuracy']
    if routing_acc < 0.75:
        recommendations.append("üîÄ Routing: Improve complexity detection and routing accuracy")

if recommendations:
    for rec in recommendations:
        print(f"  ‚Ä¢ {rec}")
else:
    print("  ‚úÖ No critical optimizations needed - system performing well")

# Generate final comprehensive report
print(f"\nüìÑ Generating Comprehensive Production Report...")

# Pick the configuration of the error-free provider run with the highest
# accuracy. The comparison is strict against a starting value of 0, so a run
# must score above zero to be selected, and ties keep the earlier entry.
best_config = None
best_accuracy = 0

for provider_key, result in provider_test_results.items():
    if 'error' in result:
        continue
    if result.get('accuracy', 0) > best_accuracy:
        best_accuracy = result['accuracy']
        best_config = result['config']

if best_config:
    # Larger sample than the earlier tests, covering all levels plus
    # file and image questions.
    final_test_options = dict(
        agent_config_name=best_config,
        max_questions=30,
        target_levels=[1, 2, 3],
        include_files=True,
        include_images=True,
    )
    try:
        # Run one final comprehensive test with the best configuration
        final_test = run_gaia_test(**final_test_options)

        if not final_test:
            print(f"‚ö†Ô∏è Final test failed - using existing data for assessment")
        else:
            report = generate_test_report(final_test, best_config, save_to_file=True)
            print(f"‚úÖ Final report generated and saved")
            print(f"üìä Final test: {final_test['evaluation_metadata']['overall_accuracy']:.1%} accuracy")

    except Exception as e:
        # Any failure in the final run or report is reported, not raised,
        # so the assessment continues with the data already collected.
        print(f"‚ö†Ô∏è Final test error: {e} - using existing data for assessment")

def _summary_value(section, key, spec=""):
    """Format ``assessment_data[section][key]`` with ``spec``, or return "N/A".

    A section of ``assessment_data`` is an empty dict when no usable results
    were collected for it, so the key may legitimately be missing; indexing it
    directly would raise ``KeyError`` and abort the summary.
    """
    values = assessment_data.get(section) or {}
    if key not in values:
        return "N/A"
    return format(values[key], spec)


print(f"\nüéâ PRODUCTION VALIDATION COMPLETE")
print("=" * 50)
print(f"üìä Total Questions Tested: {_summary_value('system_reliability', 'total_questions_tested')}")
print(f"üèÜ Best Accuracy Achieved: {_summary_value('overall_metrics', 'best_accuracy', '.1%')}")
print(f"üõ°Ô∏è System Reliability: {_summary_value('system_reliability', 'execution_success_rate', '.1%')}")
print(f"‚ö° Available Providers: {_summary_value('provider_performance', 'available_providers')}")
print(f"üîÄ Routing Effectiveness: {_summary_value('routing_effectiveness', 'avg_routing_accuracy', '.1%')}")

# Reminder of the notebook variables that hold the collected data.
print(f"\nüíæ All test data available in variables:")
print(f"‚îú‚îÄ‚îÄ baseline_results: Baseline performance data")
print(f"‚îú‚îÄ‚îÄ provider_test_results: Provider-specific results")
print(f"‚îú‚îÄ‚îÄ routing_analysis_results: Routing effectiveness analysis")
print(f"‚îú‚îÄ‚îÄ failure_analyses: Comprehensive failure pattern analysis")
print(f"‚îî‚îÄ‚îÄ assessment_data: Production readiness metrics")

print(f"\nüöÄ System is now ready for production decision based on comprehensive testing!")

## Section 8: Data Export & Reporting

In [None]:
import time
# Export all results for further analysis and reporting

print("üíæ Data Export & Final Reporting")
print("=" * 40)

# Create comprehensive data export: one document bundling the raw result
# dictionaries with the derived assessment, readiness score and verdict.
export_data = {
    'test_session': {
        'timestamp': datetime.now().isoformat(),
        'total_questions_tested': assessment_data['system_reliability']['total_questions_tested'],
        'testing_duration': 'Session duration tracked',
        'framework_version': '2.0'
    },
    'baseline_results': baseline_results,
    'provider_results': provider_test_results,
    'routing_analysis': routing_analysis_results,
    'failure_analysis': failure_analyses,
    'production_assessment': assessment_data,
    'readiness_score': readiness_score,
    'recommendation': recommendation
}

# Save comprehensive results under logs/, with a timestamped file name.
export_file = Path("logs") / f"production_validation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

try:
    # open() does not create missing parent directories, so make sure the
    # target directory exists before writing.
    export_file.parent.mkdir(parents=True, exist_ok=True)
    with open(export_file, 'w', encoding='utf-8') as f:
        # default=str stringifies any value json cannot encode natively.
        json.dump(export_data, f, indent=2, default=str)

    print(f"‚úÖ Comprehensive results exported: {export_file}")
except Exception as e:
    # Best-effort export: report the problem and let the rest of the cell run.
    print(f"‚ö†Ô∏è Export error: {e}")

# Create summary CSV for quick analysis


def _summary_row(test_type, configuration, result):
    """Build one summary-table row from a successful test result dict."""
    return {
        'Test_Type': test_type,
        'Configuration': configuration,
        'Total_Questions': result['total_questions'],
        'Correct_Answers': result['correct_answers'],
        'Accuracy': result['accuracy'],
        'GAIA_Target_Met': result['accuracy'] >= 0.45
    }


summary_data = []

# Add baseline results (labelled by their key in baseline_results)
for config_key, result in baseline_results.items():
    if 'error' not in result:
        summary_data.append(_summary_row('Baseline', config_key, result))

# Add provider results (labelled by the configuration stored in the result)
for provider_key, result in provider_test_results.items():
    if 'error' not in result:
        summary_data.append(_summary_row('Provider', result['config'], result))

if summary_data:
    summary_df = pd.DataFrame(summary_data)
    summary_csv = Path("logs") / f"validation_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"

    try:
        # Make sure the target directory exists before writing the file.
        summary_csv.parent.mkdir(parents=True, exist_ok=True)
        summary_df.to_csv(summary_csv, index=False)
        print(f"‚úÖ Summary CSV exported: {summary_csv}")
    except Exception as e:
        # Best-effort export: report the problem and keep going.
        print(f"‚ö†Ô∏è CSV export error: {e}")

    # Display final summary table. This happens outside the try block so a
    # failed CSV write does not also hide the table from the notebook output.
    print(f"\nüìä FINAL SUMMARY TABLE")
    print("=" * 80)
    print(summary_df.to_string(index=False))

# Performance insights summary: headline numbers from the assessment, printed
# only for the sections that actually contain data.
print(f"\nüìà KEY PERFORMANCE INSIGHTS")
print("=" * 40)

if assessment_data['overall_metrics']:
    metrics = assessment_data['overall_metrics']

    if metrics['gaia_target_achievement']:
        target_status = '‚úÖ Achieved'
    else:
        target_status = '‚ùå Not achieved'

    if metrics['production_ready_accuracy']:
        production_status = '‚úÖ Yes'
    else:
        production_status = '‚ùå Needs improvement'

    print(f"üéØ Best Performance: {metrics['best_accuracy']:.1%} accuracy")
    print(f"üìä Average Performance: {metrics['avg_accuracy']:.1%} accuracy")
    print(f"üèÜ GAIA Target: {target_status}")
    print(f"üöÄ Production Ready: {production_status}")

if assessment_data['provider_performance']:
    provider_metrics = assessment_data['provider_performance']
    # 'best_reliability' holds an error rate, so it is inverted for display.
    best_success_rate = 1 - provider_metrics['best_reliability']
    print(f"‚ö° Fastest Provider: {provider_metrics['fastest_time']:.1f}s average")
    print(f"üõ°Ô∏è Best Reliability: {best_success_rate:.1%} success rate")

print(f"\nüéì Production Validation Complete!")
print(f"Use the exported data and reports for production deployment decisions.")