# GAIA Agent Builder
## Design and test agent architecture

**Objective:** Build a 4-agent system with proper GAIA formatting  
**Output:** Working agents with routing and format compliance

---

# Section 1: Dependencies and Model configs

In [None]:
import os
import sys
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import json
from datetime import datetime

# Load environment variables through python-dotenv before any cell needs them.
load_dotenv()

print("üîß Environment Setup Complete")
print(f"Python version: {sys.version}")
print(f"Working directory: {os.getcwd()}")

# Project files the rest of the notebook expects in the working directory.
required_files = [
    "gaia_agent_system.py",
    "dev_retriever.py",
    "gaia_embeddings.csv",
    "metadata.jsonl",
]
missing_files = [path for path in required_files if not os.path.exists(path)]

# Report the happy path first; otherwise list exactly what is absent.
if not missing_files:
    print("‚úÖ All required files found")
else:
    print(f"‚ùå Missing files: {missing_files}")
    print("Please ensure all required files are in the current directory")

In [None]:
from gaia_agent_system import (
    create_gaia_agent,           # was: create_hybrid_gaia_agent
    create_production_gaia_agent,  # this one stays the same
    ModelConfigs,                # was: GAIAModelConfigs
    GAIAConfig                   # was: HybridGAIAConfig
)

print("üöÄ GAIA Agent Builder - Test Notebook")
print("=" * 50)

# Fetch every preset plus the per-provider subsets; later cells reuse these names.
print("\nüìã Available Model Configurations:")
configs = ModelConfigs.get_all_configs()

openrouter_configs = ModelConfigs.get_openrouter_configs()
groq_configs = ModelConfigs.get_groq_configs()
google_configs = ModelConfigs.get_google_configs()
ollama_configs = ModelConfigs.get_ollama_configs()

# One (heading, presets) pair per provider, listed in display order.
provider_sections = [
    ("\nüåê OpenRouter Models (Free):", openrouter_configs),
    ("\n‚ö° Groq Models (Fast):", groq_configs),
    ("\nü§ñ Google Models:", google_configs),
    ("\nüè† Ollama Models (Local):", ollama_configs),
]

for heading, provider_configs in provider_sections:
    print(heading)
    for name, config in provider_configs.items():
        # primary_model is required; secondary_model is optional and shown as 'None'.
        primary = config['primary_model']
        secondary = config.get('secondary_model', 'None')
        print(f"  ‚îú‚îÄ‚îÄ {name}: {primary} ‚Üí {secondary}")

print(f"\nTotal configurations available: {len(configs)}")

# Static illustration of the primary -> secondary -> hardcoded fallback order.
fallback_chain_lines = (
    "\nüîÑ Fallback Chain Examples:",
    "Primary Model Fails ‚Üí Secondary Model ‚Üí Hardcoded Fallback",
    "  ‚îú‚îÄ‚îÄ qwen-qwq-32b ‚Üí llama3-70b-8192 ‚Üí qwen/qwen-2.5-coder-32b-instruct:free",
    "  ‚îú‚îÄ‚îÄ gemini-2.5-flash-preview-04-17 ‚Üí gemini-2.5-flash-preview-05-20 ‚Üí qwen/qwen-2.5-coder-32b-instruct:free",
    "  ‚îî‚îÄ‚îÄ qwen/qwen3-32b:free ‚Üí qwen/qwen3-14b:free ‚Üí qwen/qwen-2.5-coder-32b-instruct:free",
)
for chain_line in fallback_chain_lines:
    print(chain_line)

In [None]:
# Smoke tests: one preset per provider plus one hand-built GAIAConfig.
print("\nüß™ Quick Test Examples:")


def _run_quick_test(label, config, question, show_fallback=False):
    """Run one question through a freshly created agent and print the outcome.

    Args:
        label: Name used in the success/failure lines (e.g. "Groq").
        config: Value handed to create_gaia_agent; this cell passes either a
            preset name or a GAIAConfig instance.
        question: Question text passed to run_single_question.
        show_fallback: When True, print the fallback flag instead of the
            strategy and model lines (used by the custom-config test).

    Returns:
        True if the agent was created, answered and closed without raising,
        False otherwise.
    """
    try:
        agent = create_gaia_agent(config)
        try:
            result = agent.run_single_question(question)
            print(f"   Answer: {result.get('final_answer', 'No answer')}")
            if show_fallback:
                print(f"   Fallback used: {'Yes' if result.get('fallback_used') else 'No'}")
            else:
                print(f"   Strategy: {result.get('selected_strategy', 'Unknown')}")
                print(f"   Model used: {result.get('model_used', 'Unknown')}")
        finally:
            # Close the agent even when the question raised, so it is not leaked.
            agent.close()
    except Exception as e:
        print(f"   ‚ùå {label} test failed: {e}")
        return False
    print(f"   ‚úÖ {label} test successful!")
    return True


quick_test_outcomes = []

# Test 1: Using Groq with QwQ model
print("\n1. Testing Groq QwQ model with fallback:")
quick_test_outcomes.append(
    _run_quick_test("Groq", "qwen_qwq_groq", "What is 15% of 200?")
)

# Test 2: Using Google Gemini with fallback
print("\n2. Testing Google Gemini with fallback:")
quick_test_outcomes.append(
    _run_quick_test("Google", "gemini_flash_04", "Calculate the square root of 144")
)

# Test 3: Using OpenRouter with new naming
print("\n3. Testing OpenRouter with updated naming:")
quick_test_outcomes.append(
    _run_quick_test("OpenRouter", "qwen3_32b", "What is 25% of 400?")
)

# Test 4: Custom configuration using GAIAConfig with fallback
print("\n4. Testing custom configuration with manual fallback:")
try:
    custom_config = GAIAConfig(
        model_provider="groq",
        primary_model="qwen-qwq-32b",
        secondary_model="llama3-70b-8192",
        temperature=0.2,
        enable_model_fallback=True,
        debug_mode=True
    )
except Exception as e:
    # Building the config itself can fail; report it like any other test failure.
    print(f"   ‚ùå Custom config test failed: {e}")
    quick_test_outcomes.append(False)
else:
    quick_test_outcomes.append(
        _run_quick_test("Custom config", custom_config, "What is 10 + 25?", show_fallback=True)
    )

# Only claim success when every quick test actually passed.
failed_quick_tests = quick_test_outcomes.count(False)
if failed_quick_tests == 0:
    print("\n‚úÖ All updated imports and fallback models working correctly!")
else:
    print(f"\n‚ö†Ô∏è  {failed_quick_tests}/{len(quick_test_outcomes)} quick tests failed - see errors above")

print("\nüéØ Available Config Names:")
print("OpenRouter:", list(openrouter_configs.keys()))
print("Groq:", list(groq_configs.keys()))
print("Google:", list(google_configs.keys()))
print("Ollama:", list(ollama_configs.keys()))

# Section 2: Agent initialization

In [None]:
print("\nüß™ Testing Basic Agent Initialization...")

# Single source of truth for the preset, so the log line cannot drift from the call.
init_config_name = "qwen3_32b"

try:
    # Test with free OpenRouter model (most likely to work)
    agent = create_gaia_agent(init_config_name)
    try:
        print(f"‚úÖ Agent initialized successfully with {init_config_name}")

        # Test retriever
        test_search = agent.retriever.search("calculate compound interest", k=2)
        print(f"‚úÖ Retriever working - found {len(test_search)} similar examples")

        # Test metadata
        sample_metadata = agent.metadata_manager.get_test_sample(5)
        print(f"‚úÖ Metadata loaded - {len(sample_metadata)} samples available")

        # Show tool usage analysis
        tool_usage = agent.metadata_manager.analyze_tool_usage()
        print(f"‚úÖ Tool analysis complete - {len(tool_usage)} tool types identified")
    finally:
        # Always release the agent, including when one of the checks above raised.
        agent.close()
    print("‚úÖ Agent closed successfully")

except Exception as e:
    print(f"‚ùå Agent initialization failed: {e}")
    print("Check your API keys and file paths")

In [None]:
print("\nüîç Testing Single Question Execution...")

# One probe per complexity tier, each paired with the strategy we expect to be chosen.
test_questions = [
    {
        "complexity": "Simple",
        "question": "What is 25 + 17?",
        "expected_strategy": "direct_llm",
    },
    {
        "complexity": "Moderate",
        "question": "Calculate the compound interest on $1000 at 5% annually for 3 years",
        "expected_strategy": "smolag_agent",
    },
    {
        "complexity": "Complex",
        "question": "Analyze the correlation between these datasets: [1,2,3,4,5] and [2,4,6,8,10]",
        "expected_strategy": "smolag_agent",
    },
]

# A single agent instance is shared by all probes and closed at the end of the cell.
agent = create_gaia_agent("qwen3_32b")

test_results = []

for test_case in test_questions:
    complexity = test_case['complexity']
    question = test_case['question']
    expected_strategy = test_case['expected_strategy']

    print(f"\nüîç Testing {complexity} Question:")
    print(f"Q: {question}")

    try:
        result = agent.run_single_question(question)

        answer = result['final_answer']
        print(f"A: {answer}")
        strategy_used = result['selected_strategy']
        print(f"Strategy: {strategy_used}")
        print(f"Agent: {result.get('selected_agent', 'N/A')}")
        print(f"Time: {result.get('execution_time', 0):.2f}s")

        # Compare the chosen strategy with the expectation for this tier.
        strategy_match = strategy_used == expected_strategy
        if strategy_match:
            strategy_status = "‚úÖ"
        else:
            strategy_status = "‚ö†Ô∏è"
        print(f"Expected Strategy: {expected_strategy} {strategy_status}")

        record = {
            "complexity": complexity,
            "question": question,
            "answer": answer,
            "strategy_used": strategy_used,
            "expected_strategy": expected_strategy,
            "strategy_correct": strategy_match,
            "execution_time": result.get('execution_time', 0),
            "similar_examples": len(result.get('similar_examples', [])),
        }
        test_results.append(record)

    except Exception as e:
        # Keep a stub record so the summary can count failures.
        print(f"‚ùå Error: {e}")
        test_results.append({"complexity": complexity, "error": str(e)})

# Summarise only the probes that produced a result.
print("\nüìä Single Question Test Summary:")
successful_tests = [entry for entry in test_results if 'error' not in entry]
print(f"Successful tests: {len(successful_tests)}/{len(test_questions)}")

if successful_tests:
    correct_count = sum(entry['strategy_correct'] for entry in successful_tests)
    strategy_accuracy = correct_count / len(successful_tests)
    avg_time = np.mean([entry['execution_time'] for entry in successful_tests])
    print(f"Strategy selection accuracy: {strategy_accuracy:.2f}")
    print(f"Average execution time: {avg_time:.2f}s")

agent.close()

# Section 3: Agent Optimization

In [None]:
print("\nüîç Testing RAG Effectiveness...")

# Test different RAG example counts against one fixed question.
rag_test_counts = [1, 3, 5]
rag_test_question = "Calculate the interest rate needed to double an investment in 10 years"

# Keyed by "rag_<count>"; only counts that ran successfully get an entry.
rag_results = {}

for count in rag_test_counts:
    print(f"\nTesting with {count} RAG examples...")

    try:
        rag_agent = create_gaia_agent({
            "model_provider": "openrouter",
            "primary_model": "qwen/qwen-2.5-coder-32b-instruct:free",
            "rag_examples_count": count,
            "temperature": 0.7
        })

        try:
            result = rag_agent.run_single_question(rag_test_question)

            rag_results[f"rag_{count}"] = {
                "examples_count": count,
                "answer": result['final_answer'],
                "similar_examples_found": len(result.get('similar_examples', [])),
                "execution_time": result.get('execution_time', 0),
                "strategy": result['selected_strategy']
            }

            print(f"  Examples found: {rag_results[f'rag_{count}']['similar_examples_found']}")
            print(f"  Answer: {result['final_answer']}")
            print(f"  Strategy: {result['selected_strategy']}")
        finally:
            # Close the per-count agent even if the question or a result lookup raised.
            rag_agent.close()

    except Exception as e:
        print(f"  Error with {count} examples: {e}")

# Display RAG effectiveness (one row per successful configuration).
if rag_results:
    print(f"\nüìä RAG Effectiveness Results:")
    rag_df = pd.DataFrame(rag_results).T
    print(rag_df[['examples_count', 'similar_examples_found', 'strategy', 'execution_time']])

In [None]:
print("\nüß™ Running Small Batch Evaluation...")

# Number of questions sampled; used for both the log line and the call.
batch_sample_size = 15

try:
    # NOTE(review): "qwen_coder_free" is not one of the preset names used by the
    # other cells in this notebook - confirm it still exists after the renames.
    eval_agent = create_production_gaia_agent(
        model_config="qwen_coder_free",
        enable_logging=True,
        performance_tracking=True
    )

    try:
        # Run evaluation on small sample
        print(f"Running evaluation on {batch_sample_size} questions...")
        results_df = eval_agent.run_batch_evaluation(sample_size=batch_sample_size)

        # Column availability is checked once; the result schema is not fixed here.
        has_correctness = 'is_correct' in results_df
        has_strategy = 'strategy_used' in results_df

        # Detailed analysis
        print(f"\nüìà Detailed Analysis:")
        print(f"Total questions: {len(results_df)}")

        if has_correctness and results_df['is_correct'].notna().any():
            accuracy = results_df['is_correct'].mean()
            print(f"Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")
        else:
            print("Accuracy: Unable to calculate (no ground truth)")

        if 'execution_time' in results_df:
            avg_time = results_df['execution_time'].mean()
            print(f"Average execution time: {avg_time:.2f}s")

        # Strategy analysis
        if has_strategy:
            strategy_counts = results_df['strategy_used'].value_counts()
            print(f"\nStrategy Usage:")
            for strategy, count in strategy_counts.items():
                percentage = (count / len(results_df)) * 100
                print(f"  ‚îú‚îÄ‚îÄ {strategy}: {count} questions ({percentage:.1f}%)")

                if has_correctness:
                    strategy_accuracy = results_df[results_df['strategy_used'] == strategy]['is_correct'].mean()
                    print(f"      Accuracy: {strategy_accuracy:.3f}")

        # Level analysis (if available); aggregate only the columns that exist
        # so a missing metric does not abort the whole report.
        if 'level' in results_df:
            level_metrics = {}
            if has_correctness:
                level_metrics['is_correct'] = ['count', 'mean']
            if 'execution_time' in results_df:
                level_metrics['execution_time'] = 'mean'
            if level_metrics:
                level_analysis = results_df.groupby('level').agg(level_metrics).round(3)
                print(f"\nPerformance by GAIA Level:")
                print(level_analysis)

        # Agent usage analysis (needs both the agent and the strategy columns)
        if 'selected_agent' in results_df and has_strategy:
            smolag_data = results_df[results_df['strategy_used'] == 'smolag_agent']
            if len(smolag_data) > 0:
                print(f"\nSmolagAgent Usage:")
                agent_counts = smolag_data['selected_agent'].value_counts()
                # agent_name (not `agent`) so the notebook-level agent is not overwritten.
                for agent_name, count in agent_counts.items():
                    if pd.notna(agent_name):
                        print(f"  ‚îú‚îÄ‚îÄ {agent_name}: {count} times")
    finally:
        # Release the evaluation agent even when the run or the analysis raised.
        eval_agent.close()

except Exception as e:
    print(f"‚ùå Batch evaluation failed: {e}")

In [None]:
print("\nüéØ Performance Analysis and Recommendations")
print("=" * 50)

# Earlier cells may have been skipped, so check once which result sets exist.
have_test_results = 'test_results' in locals()
have_model_comparison = 'model_comparison' in locals()
have_rag_results = 'rag_results' in locals()
have_results_df = 'results_df' in locals()

total_tests = len(test_results) if have_test_results else 0
if total_tests > 0:
    successful_tests = sum(1 for entry in test_results if 'error' not in entry)
else:
    successful_tests = 0

print("\nüìä Overall Test Summary:")
print(f"Single question tests: {successful_tests}/{total_tests}")
models_tested = len(model_comparison) if have_model_comparison else 0
print(f"Model configurations tested: {models_tested}")
rag_configs_tested = len(rag_results) if have_rag_results else 0
print(f"RAG configurations tested: {rag_configs_tested}")

# Recommendations derived from whatever data was collected.
print("\nüí° Recommendations:")

# Model recommendations
if have_model_comparison:
    successful_models = [
        name for name, result in model_comparison.items() if 'error' not in result
    ]
    if not successful_models:
        print("‚ö†Ô∏è  No models worked successfully - check API keys")
    else:
        print(f"‚úÖ Working models: {', '.join(successful_models)}")

        # Fastest model among the entries that recorded a time.
        times = {
            name: result.get('execution_time', float('inf'))
            for name, result in model_comparison.items()
            if 'execution_time' in result
        }
        if times:
            fastest_model, fastest_time = min(times.items(), key=lambda item: item[1])
            print(f"‚ö° Fastest model: {fastest_model} ({fastest_time:.2f}s)")

# RAG recommendations
if have_rag_results and rag_results:
    examples_found = [entry['similar_examples_found'] for entry in rag_results.values()]
    avg_examples_found = np.mean(examples_found)
    print(f"üìö Average RAG examples found: {avg_examples_found:.1f}")

    if avg_examples_found < 2:
        print("‚ö†Ô∏è  Low RAG retrieval - check vector store")
    else:
        print("‚úÖ RAG system working well")

# Strategy recommendations
if have_test_results and successful_tests > 0:
    correct_strategies = sum(
        entry['strategy_correct'] for entry in test_results if 'strategy_correct' in entry
    )
    strategy_accuracy = correct_strategies / successful_tests
    if strategy_accuracy < 0.7:
        print("‚ö†Ô∏è  Strategy selection may need tuning")
    else:
        print("‚úÖ Strategy selection working well")

# Final recommendations
print("\nüéØ Next Steps:")
for next_step in (
    "1. If models are working: Run larger batch evaluation (50+ questions)",
    "2. If accuracy is low: Tune complexity_threshold parameter",
    "3. If too slow: Use faster models (Groq) or reduce RAG examples",
    "4. If SmolagAgents failing: Check tool imports and permissions",
):
    print(next_step)

# Export everything that was collected to a timestamped JSON file.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
results_path = f"gaia_test_results_{timestamp}.json"
test_summary = {
    "timestamp": timestamp,
    "single_question_tests": test_results if have_test_results else [],
    "model_comparison": model_comparison if have_model_comparison else {},
    "rag_effectiveness": rag_results if have_rag_results else {},
    "batch_evaluation": results_df.to_dict() if have_results_df else {},
}

# default=str stringifies any value json cannot encode natively.
with open(results_path, "w") as out_file:
    json.dump(test_summary, out_file, indent=2, default=str)

print(f"\nüíæ Test results saved to: {results_path}")
print("‚úÖ Testing complete!")

In [None]:
# Section banner for the demo cell: title line followed by a 50-character rule.
print("\nüé¨ Quick Demo for Showcase", "=" * 50, sep="\n")

def _fallback_demo_questions():
    """Return a fresh list of built-in demo questions for when metadata is unusable."""
    return [
        {'question': "What is 15% of 240?", 'answer': '36', 'level': 1},
        {'question': "Calculate the area of a circle with radius 7 meters", 'answer': '153.94', 'level': 1},
        {'question': "Find the population of Tokyo", 'answer': 'Unknown', 'level': 2}
    ]


def get_demo_questions_from_metadata(metadata_path: str = "metadata.jsonl", count: int = 3) -> list:
    """Extract real GAIA questions from a JSON-lines metadata file for demo purposes.

    Each line is parsed as JSON. Lines that are not valid JSON, are not JSON
    objects, or lack a string 'Question' are skipped. Only questions shorter
    than 150 characters are kept so the demo output stays readable.

    Selection tries to take one random question from each of levels 1, 2 and 3
    (in that order) and then fills any remaining slots at random without
    repeating a question.

    Args:
        metadata_path: Path to the JSON-lines metadata file.
        count: Maximum number of questions to return.

    Returns:
        A list of dicts with 'question', 'answer', 'level' and 'task_id' keys.
        When the file is missing, unreadable, or holds no usable question, the
        built-in fallback questions are returned instead (these have no
        'task_id' key).
    """
    # Function-scope imports, as in the original: `random` is not imported by
    # the notebook's import cell.
    import json
    import random

    try:
        # Load metadata
        questions = []
        with open(metadata_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    item = json.loads(line.strip())
                except json.JSONDecodeError:
                    continue

                # A line such as `42` or `[1, 2]` is valid JSON but not a record;
                # skip it instead of letting it abort the whole load.
                if not isinstance(item, dict) or 'Question' not in item:
                    continue

                # Prefer shorter questions for demo (better showcase)
                question_text = item['Question']
                if not isinstance(question_text, str) or len(question_text) >= 150:
                    continue

                questions.append({
                    'question': question_text,
                    'answer': item.get('Final answer', 'Unknown'),
                    'level': item.get('Level', 1),
                    'task_id': item.get('task_id', 'demo')
                })

        if not questions:
            print("‚ö†Ô∏è  No questions found in metadata, using fallback demo questions")
            return _fallback_demo_questions()

        demo_questions = []

        # Try to get one question from each level, lowest level first.
        for wanted_level in (1, 2, 3):
            candidates = [q for q in questions if q['level'] == wanted_level]
            if candidates and len(demo_questions) < count:
                demo_questions.append(random.choice(candidates))

        # Fill remaining slots with random questions that were not picked yet.
        remaining_questions = [q for q in questions if q not in demo_questions]
        while len(demo_questions) < count and remaining_questions:
            pick = random.choice(remaining_questions)
            demo_questions.append(pick)
            remaining_questions = [q for q in remaining_questions if q != pick]

        print(f"‚úÖ Loaded {len(questions)} real GAIA questions from metadata")
        print(f"üìã Selected {len(demo_questions)} questions for demo (Levels: {[q['level'] for q in demo_questions]})")

        return demo_questions

    except FileNotFoundError:
        print(f"‚ö†Ô∏è  Metadata file '{metadata_path}' not found, using fallback demo questions")
        return _fallback_demo_questions()
    except Exception as e:
        # Best-effort demo helper: any other failure also falls back.
        print(f"‚ö†Ô∏è  Error loading metadata: {e}, using fallback demo questions")
        return _fallback_demo_questions()

def run_quick_demo():
    """Run a quick three-question demo and print each result.

    Questions come from get_demo_questions_from_metadata(count=3). Each one is
    sent to an agent created from the "qwen3_32b" preset, and the answer,
    expected answer (when known), strategy, agent, time and confidence are
    printed. Any exception is caught and reported rather than raised, and the
    agent is closed whether or not the questions succeed.
    """
    print("\nüé¨ GAIA Agent Quick Demo")
    print("=" * 30)

    # Get real GAIA questions from metadata
    demo_questions = get_demo_questions_from_metadata(count=3)

    try:
        # Use config name from ModelConfigs
        demo_agent = create_gaia_agent("qwen3_32b")

        try:
            for i, q_data in enumerate(demo_questions, 1):
                question = q_data['question']
                expected_answer = q_data['answer']
                level = q_data['level']

                print(f"\nüîç Demo Question {i} (Level {level}):")
                print(f"Q: {question}")

                # Run the question through GAIA agent
                result = demo_agent.run_single_question(
                    question=question,
                    task_id=f"demo_{i}",
                    ground_truth=expected_answer,
                    level=level
                )

                agent_answer = result.get('final_answer', 'No answer')
                is_correct = result.get('debug_info', {}).get('is_correct', None)

                print(f"A: {agent_answer}")
                # 'Unknown' marks questions without a ground-truth answer.
                if expected_answer != 'Unknown':
                    print(f"Expected: {expected_answer}")
                    if is_correct is not None:
                        print(f"Correct: {'‚úÖ' if is_correct else '‚ùå'}")

                print(f"Strategy: {result.get('selected_strategy', 'Unknown')}")
                print(f"Agent: {result.get('selected_agent', 'N/A')}")
                print(f"Time: {result.get('execution_time', 0):.2f}s")
                print(f"Confidence: {result.get('confidence_score', 0):.2f}")

                # Show any errors or fallbacks
                errors = result.get('errors', [])
                if errors:
                    print(f"Errors: {len(errors)} encountered")
                if result.get('fallback_used', False):
                    print("‚ö†Ô∏è  Fallback strategy used")
        finally:
            # Release the agent even when a question raised part-way through.
            demo_agent.close()

        print("\n‚ú® Demo completed successfully!")
        print("üéØ This demo used real GAIA benchmark questions!")

    except Exception as e:
        print(f"‚ùå Demo failed: {e}")
        print("üí° Make sure you have proper API keys configured")

def run_extended_demo(question_count: int = 5):
    """Run a longer demo and print a per-question result plus an accuracy summary.

    Args:
        question_count: Number of questions requested from
            get_demo_questions_from_metadata; fewer may be returned.

    Questions whose result carries no 'is_correct' flag still count in the
    accuracy denominator. Any exception is caught and reported rather than
    raised, and the agent is closed whether or not the questions succeed.
    """
    print(f"\nüé¨ Extended GAIA Demo ({question_count} questions)")
    print("=" * 40)

    demo_questions = get_demo_questions_from_metadata(count=question_count)

    try:
        demo_agent = create_gaia_agent("qwen3_32b")

        try:
            correct_answers = 0
            total_questions = len(demo_questions)

            for i, q_data in enumerate(demo_questions, 1):
                question = q_data['question']
                expected_answer = q_data['answer']
                level = q_data['level']

                print(f"\nüìù Question {i}/{total_questions} (Level {level}):")
                # Truncate long questions to keep the log compact.
                print(f"Q: {question[:100]}{'...' if len(question) > 100 else ''}")

                result = demo_agent.run_single_question(
                    question=question,
                    ground_truth=expected_answer,
                    level=level
                )

                agent_answer = result.get('final_answer', 'No answer')
                is_correct = result.get('debug_info', {}).get('is_correct', None)

                print(f"A: {agent_answer}")

                if is_correct is not None:
                    print(f"Result: {'‚úÖ Correct' if is_correct else '‚ùå Incorrect'}")
                    if is_correct:
                        correct_answers += 1

                print(f"Time: {result.get('execution_time', 0):.1f}s | Strategy: {result.get('selected_strategy', 'Unknown')}")

            # Summary (skipped when no questions were available)
            if total_questions > 0:
                accuracy = correct_answers / total_questions
                print(f"\nüìä Extended Demo Results:")
                print(f"Accuracy: {correct_answers}/{total_questions} ({accuracy:.1%})")
                print(f"Questions from GAIA benchmark levels: {sorted(set(q['level'] for q in demo_questions))}")
        finally:
            # Release the agent even when a question raised part-way through.
            demo_agent.close()

        print("\nüéâ Extended demo completed!")

    except Exception as e:
        print(f"‚ùå Extended demo failed: {e}")

# Run the quick demo (comment this call out to skip it).
run_quick_demo()

# Uncomment for extended demo with more questions
# run_extended_demo(5)

# Closing banner framed by two 60-character rules.
closing_rule = "=" * 60
print("\n" + closing_rule)
print("üéâ GAIA Agent Builder Test Notebook Complete!")
print("Ready for GAIA benchmark evaluation!")
print(closing_rule)

# Section 6: GAIA Format Testing & Compliance

In [None]:
def create_gaia_formatter():
    """Create the GAIA answer cleaner and format validator.

    Returns:
        A ``(clean_gaia_answer, validate_gaia_format)`` tuple of functions.
    """
    # Imported here because the notebook's import cell does not import `re`.
    import re

    final_answer_marker = "FINAL ANSWER:"
    # Compiled once and shared by both helpers.
    article_with_trailing_space = re.compile(r'\b(the|a|an)\b\s*', flags=re.IGNORECASE)
    article_word = re.compile(r'\b(the|a|an)\b', re.IGNORECASE)
    digit_comma_digit = re.compile(r'(\d),(\d)')
    whitespace_run = re.compile(r'\s+')

    def clean_gaia_answer(text):
        """Clean an answer string according to the GAIA formatting rules.

        Keeps only the text after the last "FINAL ANSWER:" marker (if any),
        removes the articles the/a/an, removes commas between digits, removes
        '$' and '%' characters, and collapses whitespace.

        Args:
            text: Raw answer or full model response.

        Returns:
            The cleaned answer string.
        """
        # Extract final answer if present
        if final_answer_marker in text:
            text = text.split(final_answer_marker)[-1].strip()

        # Remove articles (the, a, an)
        text = article_with_trailing_space.sub('', text)

        # Remove commas from numbers
        text = digit_comma_digit.sub(r'\1\2', text)

        # Handle currency and percentages (remove unless specified)
        # This is simplified - real implementation needs context awareness
        text = text.replace('$', '').replace('%', '')

        # Clean extra whitespace
        text = whitespace_run.sub(' ', text).strip()

        return text

    def validate_gaia_format(response):
        """Check whether a response follows the GAIA answer format.

        Args:
            response: Full model response text.

        Returns:
            ``(is_valid, errors)`` where ``errors`` lists each violation found
            and ``is_valid`` is True only when that list is empty.
        """
        errors = []

        if final_answer_marker not in response:
            errors.append("Missing 'FINAL ANSWER:' prefix")
        else:
            # Only the text after the last marker is checked.
            answer = response.split(final_answer_marker)[-1].strip()

            # Check for common violations
            if article_word.search(answer):
                errors.append("Contains articles (the, a, an)")

            if re.search(r'\d,\d', answer):
                errors.append("Contains commas in numbers")

            if len(answer.split()) > 10:  # Arbitrary threshold
                errors.append("Answer too verbose - should be as few words as possible")

        return len(errors) == 0, errors

    return clean_gaia_answer, validate_gaia_format

def test_gaia_compliance():
    """Exercise the GAIA cleaner and validator on sample answers and print a report.

    Returns:
        The ``(clean_answer, validate_format)`` pair from create_gaia_formatter,
        so later cells can reuse it.
    """
    clean_answer, validate_format = create_gaia_formatter()

    # Each case: (input, expected cleaned output, whether validation should pass).
    # NOTE(review): some expected values (e.g. "1234" for "The answer is 1,234")
    # look stricter than what the cleaner removes - confirm the expectations.
    test_cases = [
        ("The answer is 1,234", "1234", True),
        ("Total: $25.50", "25.50", True),
        ("The city is Paris", "Paris", True),
        ("Cities: New York, Boston", "New York,Boston", True),
        ("I think the answer is definitely 42", "I think answer is definitely 42", False),  # Too verbose
    ]

    print("üß™ GAIA Compliance Testing:")
    print("=" * 50)

    total = len(test_cases)
    passed = 0

    for i, case in enumerate(test_cases, 1):
        input_text, expected, should_pass = case

        # Cleaning is checked on the raw input.
        cleaned = clean_answer(input_text)

        # Validation is checked on the input wrapped in a full response.
        is_valid, errors = validate_format(f"Let me think... FINAL ANSWER: {input_text}")

        # A case passes only when both checks match their expectations.
        test_passed = (cleaned == expected) and (is_valid == should_pass)
        if test_passed:
            passed += 1
            status = "‚úÖ"
        else:
            status = "‚ùå"

        print(f"{i}. {status} '{input_text}'")
        print(f"   Cleaned: '{cleaned}' (expected: '{expected}')")
        print(f"   Valid: {is_valid} | Errors: {errors if errors else 'None'}")
        print()

    print(f"üìä Compliance Results: {passed}/{total} ({passed/total*100:.0f}%) passed")

    return clean_answer, validate_format

# Run the compliance report once and keep both helpers for the cells that follow.
_gaia_helpers = test_gaia_compliance()
gaia_cleaner, gaia_validator = _gaia_helpers

In [None]:
def create_mock_agent_responses():
    """Check agent routing and GAIA formatting against canned questions.

    For each test question the agent chosen by route_question is compared with
    the expected one, a mock response ending in "FINAL ANSWER: ..." is built,
    and that response is checked with gaia_validator. A report is printed.

    NOTE(review): route_question and gaia_validator are taken from the notebook
    namespace; route_question is not defined in the visible part of this file -
    confirm an earlier cell provides it.

    Returns:
        True only when every question was routed as expected and every mock
        response passed format validation.
    """
    test_questions = [
        {
            "question": "Calculate 15% of 2500",
            "file": None,
            "expected_agent": "data_analyst",
            "expected_answer": "375"
        },
        {
            "question": "What is the population of Tokyo in 2023?",
            "file": None,
            "expected_agent": "web_researcher",
            "expected_answer": "37194000"
        },
        {
            "question": "Extract the total from this spreadsheet",
            "file": "data.xlsx",
            "expected_agent": "data_analyst",
            "expected_answer": "1250"
        },
        {
            "question": "Read the text in this PDF",
            "file": "document.pdf",
            "expected_agent": "document_reader",
            "expected_answer": "extracted text content"
        },
        {
            "question": "Explain the concept of entropy",
            "file": None,
            "expected_agent": "general_helper",
            "expected_answer": "measure of disorder in system"
        }
    ]

    def mock_agent_response(agent_type, question, expected_answer):
        """Build a mock response: agent-specific reasoning plus the final answer line."""
        reasoning = f"I am the {agent_type} agent. Let me process this question: {question[:50]}..."

        if agent_type == "data_analyst":
            reasoning += " I'll use mathematical calculations to solve this."
        elif agent_type == "web_researcher":
            reasoning += " I'll search for the most current information online."
        elif agent_type == "document_reader":
            reasoning += " I'll extract and process the document content."
        else:
            reasoning += " I'll use general reasoning to explain this concept."

        return f"{reasoning}\n\nFINAL ANSWER: {expected_answer}"

    print("üîÑ Testing Agent Routing & Responses:")
    print("=" * 50)

    routing_correct = 0
    format_correct = 0
    # Cases where routing AND format were both right; this is the real overall
    # score (min() of the two counters overstates it when they miss on
    # different cases).
    both_correct = 0
    total_tests = len(test_questions)

    for i, test in enumerate(test_questions, 1):
        # Test routing
        selected_agent = route_question(test["question"], test["file"])
        routing_ok = selected_agent == test["expected_agent"]
        if routing_ok:
            routing_correct += 1

        # Generate mock response
        response = mock_agent_response(selected_agent, test["question"], test["expected_answer"])

        # Test GAIA formatting
        is_valid, errors = gaia_validator(response)
        if is_valid:
            format_correct += 1

        if routing_ok and is_valid:
            both_correct += 1

        # Results
        route_status = "‚úÖ" if routing_ok else "‚ùå"
        format_status = "‚úÖ" if is_valid else "‚ùå"

        print(f"{i}. Q: '{test['question'][:40]}...'")
        print(f"   File: {test['file'] or 'None'}")
        print(f"   {route_status} Routing: {selected_agent} (expected: {test['expected_agent']})")
        print(f"   {format_status} Format: {'Valid' if is_valid else f'Errors: {errors}'}")
        print(f"   Response: {response[-50:]}...")  # Show end of response
        print()

    print(f"üìä Test Results:")
    print(f"  ‚îú‚îÄ‚îÄ Routing: {routing_correct}/{total_tests} ({routing_correct/total_tests*100:.0f}%)")
    print(f"  ‚îú‚îÄ‚îÄ Format: {format_correct}/{total_tests} ({format_correct/total_tests*100:.0f}%)")
    print(f"  ‚îî‚îÄ‚îÄ Overall: {both_correct}/{total_tests} ({both_correct/total_tests*100:.0f}%)")

    return routing_correct == total_tests and format_correct == total_tests

# Run the routing/format checks and keep the overall verdict for the summary.
all_tests_passed = create_mock_agent_responses()

# Final summary, framed by 50-character rules.
summary_rule = "=" * 50
summary_lines = [
    "\n" + summary_rule,
    "üéâ AGENT CHECKER COMPLETE!",
    summary_rule,
    "‚úÖ 4-agent architecture designed",
    "‚úÖ GAIA-compliant system prompts created",
    "‚úÖ Format compliance tested",
    "‚úÖ Routing logic validated",
    f"‚úÖ All tests passed: {all_tests_passed}",
    summary_rule,
]
for summary_line in summary_lines:
    print(summary_line)