# GAIA VALIDATOR

## Test correct initialization of the GAIAAgent object

In [None]:
# Environment setup
import sys
import os
from pathlib import Path
import traceback
from datetime import datetime

# Announce the test run together with basic facts about the environment.
run_started = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
header_rule = "=" * 50

print("üöÄ GAIA Agent Initialization Test")
print(header_rule)
print(f"üìÖ Start time: {run_started}")
print(f"üêç Python version: {sys.version}")
print(f"üìÅ Current directory: {os.getcwd()}")

In [None]:
# Check dependencies
# Import the GAIA components and report which provider API keys are present
# in the environment. Any failure is reported and then re-raised.
try:
    print("üîÑ Importing GAIA components...")
    from agent_logic import GAIAAgent, GAIAConfig
    from agent_interface import get_groq_config, create_gaia_agent
    print("‚úÖ Agent imports successful")

    # Environment variables that hold the provider API keys
    required_vars = ['ANTHROPIC_API_KEY', 'GOOGLE_API_KEY', 'OPENROUTER_API_KEY']
    available_providers = []

    for env_name in required_vars:
        if not os.environ.get(env_name):
            print(f"‚ö†Ô∏è  {env_name}: Not set")
            continue
        # "ANTHROPIC_API_KEY" -> "anthropic"
        provider = env_name.replace('_API_KEY', '').lower()
        available_providers.append(provider)
        print(f"‚úÖ {env_name}: Available")

    print(f"üéØ Available providers: {available_providers}")

except Exception as setup_error:
    print(f"‚ùå Setup failed: {setup_error}")
    raise

In [None]:
# Build the GAIAConfig used by the rest of the notebook from the provider
# chosen below, then echo the resulting settings.
print("üîÑ Setting up configuration...")

# Choose your provider: "groq", "anthropic" or "openrouter".
# Any other value falls back to the default GAIAConfig.
chosen_provider = "anthropic"

# Handle different return types correctly
if chosen_provider == "groq":
    # get_groq_config() returns GAIAConfig object directly
    gaia_config = get_groq_config()
    print(f"‚úÖ Using Groq config (GAIAConfig object)")

elif chosen_provider == "anthropic":
    # Create GAIAConfig for Anthropic
    gaia_config = GAIAConfig(
        model_provider="anthropic",
        model_name="claude-sonnet-4-20250514",
        temperature=0.1,
        enable_smart_routing=True,
        enable_context_bridge=True,
        enable_csv_logging=False,
        debug_mode=True
    )
    print(f"‚úÖ Created Anthropic config")

elif chosen_provider == "openrouter":
    # Create GAIAConfig for OpenRouter
    gaia_config = GAIAConfig(
        model_provider="openrouter",
        model_name="google/gemini-2.5-flash",
        temperature=0.1,
        enable_smart_routing=True,
        enable_context_bridge=True,
        enable_csv_logging=False,
        debug_mode=True
    )
    print(f"‚úÖ Created OpenRouter config")

else:
    # Fallback to default; say so explicitly so a misspelled provider name
    # does not go unnoticed.
    print(f"WARNING: unknown provider '{chosen_provider}' - falling back to default")
    gaia_config = GAIAConfig()
    print(f"‚úÖ Using default config")

# Display the configuration
print(f"üìã Configuration:")
print(f"   Provider: {gaia_config.model_provider}")
print(f"   Model: {gaia_config.model_name}")
print(f"   Temperature: {gaia_config.temperature}")
print(f"   Smart Routing: {gaia_config.enable_smart_routing}")
print(f"   Context Bridge: {gaia_config.enable_context_bridge}")

In [None]:
# Gaia agent initialization
# Try to construct the agent from gaia_config. On failure the error details
# and traceback are printed and gaia_agent is left as None so that later
# cells can detect the failure.
print("üî• TESTING GAIA AGENT INITIALIZATION")
print("=" * 50)

try:
    print("üöÄ Creating GAIAAgent (with SmolagAgent validation)...")
    gaia_agent = GAIAAgent(gaia_config)

    for success_line in (
        "\n‚úÖ SUCCESS! GAIA Agent initialized",
        "üéØ All SmolagAgent validations passed",
    ):
        print(success_line)

except Exception as init_error:
    failure_report = [
        f"\n‚ùå INITIALIZATION FAILED!",
        f"Error: {init_error}",
        f"Type: {init_error.__class__.__name__}",
        "\nüîç Full traceback:",
    ]
    print("\n".join(failure_report))
    traceback.print_exc()
    gaia_agent = None

In [None]:
# Quick Test (if initialization succeeded)

# Smoke-test the agent with one simple question, but only when the
# initialization cell produced a usable agent.
if locals().get('gaia_agent') is not None:
    print("üß™ QUICK FUNCTIONALITY TEST")
    print("=" * 30)

    try:
        test_question = "What is 15% of 200?"
        result = gaia_agent.process_question(test_question, "test_001")

        print(f"‚úÖ Test successful!")
        print(f"Question: {test_question}")
        # (label, result key, default shown when the key is missing)
        reported_fields = (
            ("Answer", 'final_answer', 'No answer'),
            ("Strategy", 'complexity', 'unknown'),
            ("Success", 'execution_successful', False),
        )
        for label, key, default in reported_fields:
            print(f"{label}: {result.get(key, default)}")

    except Exception as test_error:
        print(f"‚ùå Test failed: {test_error}")
        traceback.print_exc()

In [None]:
# Status Summary

# Summarize whether the agent was initialized and, if not, list the usual
# things to check.
print("\nüìä FINAL STATUS")
print("=" * 20)

if locals().get('gaia_agent') is None:
    for failure_line in (
        "‚ùå STATUS: FAILED",
        "üîç Check error messages above",
        "üí° Common fixes:",
        "   - Update SmolagAgent version",
        "   - Check API keys",
        "   - Review agent_logic.py",
    ):
        print(failure_line)
else:
    active_config = gaia_agent.config
    print("üéâ STATUS: SUCCESS")
    print("‚úÖ GAIA Agent ready for testing")
    print(f"‚úÖ Provider: {active_config.model_provider}")
    print(f"‚úÖ Model: {active_config.model_name}")

finished_at = datetime.now().strftime('%H:%M:%S')
print(f"üèÅ Test complete at {finished_at}")

## Testing Framework

### Declare the validator class, which wraps the functions imported from the testing framework

In [None]:
from agent_testing import run_quick_gaia_test, run_gaia_test, compare_agent_configs, run_smart_routing_test

class GAIAValidator:
    """Convenience wrapper around the ``agent_testing`` helper functions.

    Each test method (``quick``, ``full``, ``compare``) runs the matching
    helper, stores the returned dict in ``last_result`` and prints a short
    summary. ``insights`` prints a multi-section analysis of ``last_result``.
    """

    def __init__(self):
        """Start with no stored result and announce that the validator is ready."""
        self.last_result = None
        print("üéØ GAIA Validator ready")

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _pick_tier(value, tiers, fallback):
        """Return the message of the first ``(floor, message)`` tier with ``value >= floor``.

        ``tiers`` must be ordered from the highest floor to the lowest;
        ``fallback`` is returned when no tier matches.
        """
        for floor, message in tiers:
            if value >= floor:
                return message
        return fallback

    @staticmethod
    def _answer_text(result):
        """Return a result's ``final_answer`` as text ('' when missing or None).

        Answers are not guaranteed to be strings, so they are converted before
        any substring test or slicing is applied to them.
        """
        answer = result.get('final_answer')
        return '' if answer is None else str(answer)

    @staticmethod
    def _newest_match(pattern):
        """Return the most recently changed file matching ``pattern``, or None.

        Best effort: a file that disappears between globbing and the
        timestamp lookup simply results in None.
        """
        import glob
        import os

        try:
            matches = glob.glob(pattern, recursive=True)
            return max(matches, key=os.path.getctime) if matches else None
        except OSError:
            return None

    @staticmethod
    def _print_json_file(label, path):
        """Print a JSON log file location and a ``pd.read_json`` hint for it."""
        import os

        path = str(path)
        print(f"   {label}: {path}")
        # Only relative paths without a leading "./" get the prefix; absolute
        # paths must be left untouched to stay valid.
        if path.startswith('./') or os.path.isabs(path):
            hint_path = path
        else:
            hint_path = f"./{path}"
        print(f"      üí° Open in notebook: pd.read_json('{hint_path}')")

    # ------------------------------------------------------------------
    # Test runners
    # ------------------------------------------------------------------
    def quick(self, config="groq", questions=5):
        """Run ``run_quick_gaia_test`` and print the overall accuracy.

        Args:
            config: Agent configuration name passed to the test helper.
            questions: Number of questions, passed as ``num_questions``.

        Returns:
            Whatever ``run_quick_gaia_test`` returned (also kept in
            ``last_result``).
        """
        result = run_quick_gaia_test(config, num_questions=questions)
        self.last_result = result
        if result and 'overall_performance' in result:
            acc = result['overall_performance']['accuracy']
            print(f"‚úÖ {acc:.1%} accuracy")
        return result

    def full(self, config="groq", questions=20):
        """Run ``run_gaia_test`` and print accuracy against the 45% target.

        Args:
            config: Agent configuration name passed to the test helper.
            questions: Number of questions, passed as ``max_questions``.

        Returns:
            Whatever ``run_gaia_test`` returned (also kept in ``last_result``).
        """
        result = run_gaia_test(config, max_questions=questions)
        self.last_result = result
        if result and 'overall_performance' in result:
            overall = result['overall_performance']
            acc = overall['accuracy']
            total = overall['total_questions']
            correct = overall['correct_answers']
            print(f"‚úÖ {acc:.1%} accuracy ({correct}/{total})")
            print(f"GAIA Target: {'‚úÖ MET' if acc >= 0.45 else '‚ùå NOT MET'}")
        return result

    def compare(self, configs=None, questions=10):
        """Run ``compare_agent_configs`` and print the accuracy per configuration.

        Args:
            configs: Configuration names to compare. ``None`` (the default)
                means ``["groq", "google"]``; a fresh list is created per call
                so no mutable default is shared between calls.
            questions: Number of questions per configuration.

        Returns:
            Whatever ``compare_agent_configs`` returned (also kept in
            ``last_result``).
        """
        if configs is None:
            configs = ["groq", "google"]
        result = compare_agent_configs(configs, questions)
        self.last_result = result
        if result and 'comparison_results' in result:
            for config, data in result['comparison_results'].items():
                if 'accuracy' in data:
                    print(f"{config}: {data['accuracy']:.1%}")
        return result

    # ------------------------------------------------------------------
    # Analysis
    # ------------------------------------------------------------------
    def insights(self):
        """Print the full analysis of ``last_result``.

        Requires a stored result containing ``'overall_performance'``;
        otherwise a short notice is printed and nothing else happens.
        """
        if not self.last_result or 'overall_performance' not in self.last_result:
            print("‚ùå No results to analyze")
            return

        print(f"\n‚ú® COMPREHENSIVE INSIGHTS")
        print("=" * 60)

        # Sections are printed in this fixed order.
        self._show_log_file_links()
        self._analyze_execution_performance()
        self._analyze_accuracy_breakdown()
        self._analyze_strategy_performance()
        self._analyze_technical_performance()
        self._analyze_errors_and_failures()
        self._generate_actionable_recommendations()

    def _show_log_file_links(self):
        """Print where the JSON/CSV log files can be found, with load hints."""
        import glob
        import os

        print(f"üìÅ LOG FILES & DETAILED DATA")

        # Paths reported by the test run itself take precedence.
        execution_file = self.last_result.get('execution_file')
        evaluation_file = self.last_result.get('evaluation_file')

        # Fall back to the newest matching files under ./test_results, but only
        # for whichever path is still missing, so a path supplied by the
        # result is never overwritten by a guess.
        base_dir = "./test_results"
        can_search = (
            self.last_result.get('results', [])
            and self.last_result.get('evaluation_timestamp', '')
            and os.path.exists(base_dir)
        )
        if can_search:
            if not execution_file:
                execution_file = self._newest_match(f"{base_dir}/**/*execution*.json")
            if not evaluation_file:
                evaluation_file = self._newest_match(f"{base_dir}/**/*evaluation*.json")

        if execution_file:
            self._print_json_file("üìä Execution Details", execution_file)
        if evaluation_file:
            self._print_json_file("üìà Evaluation Results", evaluation_file)

        # Look for CSV logs (from agent_logging.py)
        csv_files = []
        for log_dir in ("./logs", "./test_results/logs"):
            if os.path.exists(log_dir):
                for kind in ("steps", "questions", "evaluation"):
                    csv_files.extend(glob.glob(f"{log_dir}/*{kind}*.csv"))
        # A file name may match more than one pattern; list each file once.
        csv_files = list(dict.fromkeys(csv_files))

        try:
            recent_csvs = sorted(csv_files, key=os.path.getctime, reverse=True)[:3]
        except OSError:
            # A log file vanished while being inspected; skip the listing.
            recent_csvs = []

        if recent_csvs:
            print(f"   üìã Recent CSV Logs:")
            for csv_file in recent_csvs:
                if "steps" in csv_file:
                    file_type = "Steps"
                elif "questions" in csv_file:
                    file_type = "Questions"
                else:
                    file_type = "Evaluation"
                print(f"      {file_type}: {csv_file}")
                print(f"         üí° Open: pd.read_csv('{csv_file}')")

        # Quick data access commands
        print(f"\n   üîß QUICK ACCESS COMMANDS:")
        print(f"      # View raw results structure")
        print(f"      validator.last_result.keys()")
        print(f"      ")
        print(f"      # Access specific data")
        print(f"      validator.last_result['overall_performance']")
        print(f"      validator.last_result['level_performance']")
        print(f"      validator.last_result['strategy_analysis']")

        if not execution_file and not evaluation_file:
            print(f"   ‚ö†Ô∏è  Log files not automatically detected")
            print(f"      Check ./test_results/ and ./logs/ directories")

        print()  # Add spacing before next section

    def _analyze_execution_performance(self):
        """Print execution success rate and answer quality for the last run."""
        overall = self.last_result['overall_performance']

        total = overall['total_questions']
        successful = overall['successful_executions']
        correct = overall['correct_answers']
        accuracy = overall['accuracy']

        print(f"üîß EXECUTION PERFORMANCE")
        print(f"   Total Questions: {total}")

        if not total:
            # No questions ran: the ratios below are undefined, so stop here
            # instead of dividing by zero.
            print("   No questions were executed - nothing to analyze")
            return

        print(f"   Successful Executions: {successful}/{total} ({successful/total:.1%})")
        print(f"   Correct Answers: {correct}/{total} ({accuracy:.1%})")

        # Execution success analysis
        failures = total - successful
        if successful == total:
            print(f"   ‚úÖ Perfect execution success - no technical failures")
        elif successful >= total * 0.9:
            print(f"   ‚úÖ Good execution success - minimal technical issues")
        elif successful >= total * 0.7:
            print(f"   ‚ö†Ô∏è  Moderate execution issues - {failures} failures")
        else:
            print(f"   ‚ùå Significant execution problems - {failures} failures")

        # Answer quality vs execution success
        if successful > 0:
            answer_quality = correct / successful
            print(f"   üìä Answer Quality (when executed): {answer_quality:.1%}")
            print(self._pick_tier(
                answer_quality,
                (
                    (0.6, "   üéØ High answer quality - agent reasoning is strong"),
                    (0.3, "   üìà Moderate answer quality - room for improvement"),
                ),
                "   üìâ Low answer quality - needs significant work",
            ))

    def _analyze_accuracy_breakdown(self):
        """Print overall accuracy with a rating, then accuracy per level."""
        accuracy = self.last_result['overall_performance']['accuracy']

        print(f"\nüéØ ACCURACY ANALYSIS")
        print(f"   Overall Accuracy: {accuracy:.1%}")

        # GAIA benchmark context
        print(self._pick_tier(
            accuracy,
            (
                (0.70, "   üèÜ EXCEPTIONAL - Top-tier performance!"),
                (0.60, "   üåü EXCELLENT - Competitive performance"),
                (0.45, "   ‚úÖ GOOD - Above GAIA 45% threshold"),
                (0.30, "   ‚ö†Ô∏è  FAIR - Below GAIA threshold, needs improvement"),
            ),
            "   ‚ùå POOR - Significant improvements needed",
        ))

        levels = self.last_result.get('level_performance', {})
        if not levels:
            return

        # Per-level commentary: (tiers, fallback) keyed by the level as text.
        level_commentary = {
            '1': (
                (
                    (0.8, "         ‚úÖ Excellent basic capability"),
                    (0.6, "         üìà Good basic capability"),
                ),
                "         ‚ö†Ô∏è  Basic capabilities need work",
            ),
            '2': (
                (
                    (0.5, "         ‚úÖ Strong intermediate reasoning"),
                    (0.3, "         üìà Developing intermediate skills"),
                ),
                "         üìâ Intermediate reasoning struggles",
            ),
            '3': (
                (
                    (0.3, "         üéØ Impressive advanced reasoning!"),
                    (0.1, "         üìà Some advanced capability"),
                ),
                "         üî¨ Advanced reasoning very challenging",
            ),
        }

        print(f"\n   üìà PERFORMANCE BY LEVEL:")
        # Levels are compared as text so that both '1' and 1 keys are handled.
        for level in sorted(levels, key=str):
            perf = levels[level]
            acc = perf['accuracy']
            print(f"      Level {level}: {acc:.1%} ({perf['correct']}/{perf['total']})")

            commentary = level_commentary.get(str(level))
            if commentary is not None:
                tiers, fallback = commentary
                print(self._pick_tier(acc, tiers, fallback))

    def _analyze_strategy_performance(self):
        """Print per-strategy accuracy and the hybrid-state metrics, if present."""
        strategy_analysis = self.last_result.get('strategy_analysis', {})
        if strategy_analysis:
            print(f"\nüõ§Ô∏è  STRATEGY & ROUTING ANALYSIS")

            total_strategies = sum(
                stats.get('total_questions', 0) for stats in strategy_analysis.values()
            )

            for strategy, stats in strategy_analysis.items():
                acc = stats.get('accuracy', 0)
                count = stats.get('total_questions', 0)
                percentage = (count / total_strategies * 100) if total_strategies > 0 else 0

                print(f"   {strategy}: {acc:.1%} accuracy ({count}q, {percentage:.0f}%)")

                # Strategy-specific insights, selected by name fragments
                name = strategy.lower()
                if 'one_shot' in name:
                    print(self._pick_tier(
                        acc,
                        (
                            (0.7, "      ‚úÖ Excellent simple question handling"),
                            (0.5, "      üìà Good simple question performance"),
                        ),
                        "      ‚ö†Ô∏è  Simple questions underperforming",
                    ))
                elif 'manager' in name or 'coordination' in name:
                    print(self._pick_tier(
                        acc,
                        (
                            (0.4, "      ‚úÖ Strong complex reasoning"),
                            (0.2, "      üìà Developing complex capabilities"),
                        ),
                        "      üîß Complex coordination needs work",
                    ))

        hybrid_metrics = self.last_result.get('hybrid_state_metrics', {})
        if not hybrid_metrics:
            return

        print(f"\nüåâ HYBRID STATE PERFORMANCE")

        context_usage = hybrid_metrics.get('context_bridge_usage', {})
        if context_usage:
            usage_pct = context_usage.get('usage_percentage', 0)
            total_questions = context_usage.get('total_questions', 0)
            bridge_used = context_usage.get('context_bridge_used', 0)

            print(f"   Context Bridge: {usage_pct:.1%} usage ({bridge_used}/{total_questions})")
            print(self._pick_tier(
                usage_pct,
                (
                    (0.9, "      ‚úÖ Excellent hybrid state integration"),
                    (0.7, "      üìà Good hybrid state usage"),
                ),
                "      ‚ö†Ô∏è  Inconsistent hybrid state integration",
            ))

        avg_time = hybrid_metrics.get('average_execution_time', 0)
        if avg_time > 0:
            print(f"   Avg Execution Time: {avg_time:.1f}s per question")

            if avg_time <= 15:
                print(f"      ‚ö° Fast execution - excellent efficiency")
            elif avg_time <= 30:
                print(f"      üìà Reasonable execution speed")
            elif avg_time <= 60:
                print(f"      ‚è±Ô∏è  Slow execution - consider optimization")
            else:
                print(f"      ‚ö†Ô∏è  Very slow execution - needs optimization")

    def _analyze_technical_performance(self):
        """Print timing, step-count and file-handling statistics per question."""
        results = self.last_result.get('results', [])
        if not results:
            return

        print(f"\n‚öôÔ∏è  TECHNICAL PERFORMANCE")

        # Execution time analysis (entries without a truthy time are ignored)
        execution_times = [r.get('execution_time', 0) for r in results if r.get('execution_time')]
        if execution_times:
            avg_time = sum(execution_times) / len(execution_times)
            max_time = max(execution_times)
            min_time = min(execution_times)

            print(f"   Execution Times:")
            print(f"      Average: {avg_time:.1f}s")
            print(f"      Range: {min_time:.1f}s - {max_time:.1f}s")

            if max_time > avg_time * 3:
                print(f"      ‚ö†Ô∏è  High variability - some questions much slower")

        # Step analysis (entries without a truthy step count are ignored)
        step_counts = [r.get('total_steps', 0) for r in results if r.get('total_steps')]
        if step_counts:
            avg_steps = sum(step_counts) / len(step_counts)

            print(f"   Step Counts:")
            print(f"      Average: {avg_steps:.1f} steps")
            print(f"      Maximum: {max(step_counts)} steps")

            if avg_steps <= 5:
                print(f"      ‚ö° Efficient processing - low step count")
            elif avg_steps <= 10:
                print(f"      üìà Moderate complexity processing")
            else:
                print(f"      üîß High step count - may need optimization")

        # File processing analysis
        file_questions = [r for r in results if r.get('has_file', False)]
        if file_questions:
            file_success = sum(1 for r in file_questions if r.get('execution_successful', False))
            print(f"   File Processing:")
            print(f"      File questions: {len(file_questions)}")
            print(f"      File success rate: {file_success/len(file_questions):.1%}")

    def _analyze_errors_and_failures(self):
        """Print failed executions, error-like answers and evaluation errors."""
        results = self.last_result.get('results', [])
        failed_results = [r for r in results if not r.get('execution_successful', True)]
        # Answers are normalised to text first; a None or dict answer must not
        # break the substring test or the slicing below.
        error_results = [
            r for r in results
            if r.get('error') or 'ERROR' in self._answer_text(r)
        ]

        if failed_results or error_results:
            print(f"\nüêõ ERROR ANALYSIS")

            if failed_results:
                print(f"   Execution Failures: {len(failed_results)}")

                # Categorize error types
                error_types = {}
                for result in failed_results:
                    error_type = result.get('error_type', 'unknown')
                    error_types[error_type] = error_types.get(error_type, 0) + 1

                print(f"   Error Types:")
                by_frequency = sorted(error_types.items(), key=lambda item: item[1], reverse=True)
                for error_type, count in by_frequency:
                    print(f"      {error_type}: {count} occurrences")

            if error_results:
                print(f"   Answer Errors: {len(error_results)}")

                # Show up to three sample messages, truncated to 100 characters
                for i, result in enumerate(error_results[:3], 1):
                    sample = self._answer_text(result)[:100]
                    if sample:
                        print(f"      Sample {i}: {sample}...")

        # Check for evaluation errors (like the 'dict' object has no attribute 'strip')
        eval_errors = [r for r in results if r.get('evaluation_error')]
        if eval_errors:
            print(f"\nüîç EVALUATION ERRORS")
            print(f"   Evaluation issues: {len(eval_errors)}")

            # Distinct messages in first-seen order; converting to text keeps
            # unhashable error values from breaking the de-duplication.
            unique_errors = dict.fromkeys(str(r.get('evaluation_error', '')) for r in eval_errors)
            for error in unique_errors:
                if error:
                    print(f"      {error}")

    def _generate_actionable_recommendations(self):
        """Print a numbered recommendation list and a final priority focus."""
        overall = self.last_result['overall_performance']
        accuracy = overall['accuracy']
        successful = overall['successful_executions']
        total = overall['total_questions']

        print(f"\nüí° ACTIONABLE RECOMMENDATIONS")
        print("-" * 40)

        recommendations = []

        # Accuracy-based recommendations
        if accuracy < 0.20:
            recommendations.extend([
                "üéØ CRITICAL: Focus on basic functionality - accuracy very low",
                "üîß Check agent configuration and tool integration",
                "üìö Verify GAIA answer formatting is working correctly"
            ])
        elif accuracy < 0.45:
            recommendations.extend([
                "üéØ PRIMARY: Work toward GAIA 45% threshold",
                "üìà Focus on Level 1 questions first - build foundation",
                "üîß Optimize basic reasoning and tool usage"
            ])
        elif accuracy < 0.60:
            recommendations.extend([
                "üéØ OPTIMIZE: Good performance - push toward excellence",
                "üìà Level 2 questions are the key improvement area",
                "‚ö° Consider model or prompt optimization"
            ])
        else:
            recommendations.extend([
                "üèÜ EXCELLENT: Maintain and monitor performance",
                "üìä Test with larger question sets to validate",
                "üî¨ Push boundaries with Level 3 questions"
            ])

        # Execution-based recommendations
        execution_rate = successful / total if total > 0 else 0
        if execution_rate < 0.9:
            recommendations.append("üîß CRITICAL: Fix execution failures before optimizing accuracy")

        # Strategy-based recommendations
        strategy_analysis = self.last_result.get('strategy_analysis', {})
        if strategy_analysis:
            one_shot_performance = 0
            manager_performance = 0

            for strategy, stats in strategy_analysis.items():
                name = strategy.lower()
                if 'one_shot' in name:
                    one_shot_performance = stats.get('accuracy', 0)
                elif 'manager' in name:
                    manager_performance = stats.get('accuracy', 0)

            # Only compare when both strategies reported a non-zero accuracy.
            if one_shot_performance > 0 and manager_performance > 0:
                if one_shot_performance > manager_performance * 1.5:
                    recommendations.append("üõ§Ô∏è  ROUTING: Consider routing more questions to one-shot LLM")
                elif manager_performance > one_shot_performance * 1.5:
                    recommendations.append("üõ§Ô∏è  ROUTING: Manager coordination is more effective")

        # Technical recommendations
        hybrid_metrics = self.last_result.get('hybrid_state_metrics', {})
        if hybrid_metrics:
            avg_time = hybrid_metrics.get('average_execution_time', 0)
            if avg_time > 30:
                recommendations.append("‚ö° PERFORMANCE: Optimize execution speed - currently slow")

            # Judge context-bridge usage only when usage data was reported;
            # missing data is not evidence of low usage.
            context_usage = hybrid_metrics.get('context_bridge_usage', {})
            if context_usage and context_usage.get('usage_percentage', 0) < 0.8:
                recommendations.append("üåâ INTEGRATION: Improve context bridge consistency")

        # Error-specific recommendations
        results = self.last_result.get('results', [])
        if any(r.get('evaluation_error') for r in results):
            recommendations.append("üîç CRITICAL: Fix evaluation errors preventing accurate assessment")

        # Display recommendations
        for i, rec in enumerate(recommendations, 1):
            print(f"   {i}. {rec}")

        if not recommendations:
            print("   üéâ No specific issues detected - excellent performance!")

        # Final priority guidance
        print(f"\nüöÄ PRIORITY FOCUS:")
        if execution_rate < 0.9:
            print("   1. Fix execution failures first")
            print("   2. Then focus on accuracy improvement")
        elif accuracy < 0.45:
            print("   1. Reach GAIA 45% threshold")
            print("   2. Level 1 questions are your foundation")
        elif accuracy < 0.60:
            print("   1. Optimize Level 2 performance")
            print("   2. Fine-tune routing and strategies")
        else:
            print("   1. Maintain excellent performance")
            print("   2. Scale testing and monitor consistency")

# Shared validator instance; the usage examples in this notebook and the
# quick-access hints printed by insights() refer to it by this name.
validator = GAIAValidator()

# GAIA Testing Methods Reference

## Validator Methods

### `validator.quick(config, questions)`
**Quick validation test - perfect for development**
- `config` (str, default="groq"): Agent configuration 
  - Options: `"groq"`, `"google"`, `"openrouter"`, `"ollama"`, `"performance"`, `"accuracy"`
- `questions` (int, default=5): Number of questions to test
- **Returns**: Test results dict
- **Use case**: Fast iteration during development

```python
result = validator.quick('groq', 5)
```

### `validator.full(config, questions)`
**Comprehensive test - for production validation**
- `config` (str, default="groq"): Agent configuration
- `questions` (int, default=20): Number of questions (20+ recommended for reliable results)
- **Returns**: Complete test results with level breakdown
- **Use case**: Final validation before deployment

```python
result = validator.full('groq', 20)
```

### `validator.compare(configs, questions)`
**Compare multiple configurations**
- `configs` (list, default=["groq", "google"]): List of configurations to compare
- `questions` (int, default=10): Questions per configuration
- **Returns**: Comparison results with rankings
- **Use case**: Choosing the best configuration

```python
result = validator.compare(['groq', 'google', 'performance'], 10)
```

### `validator.insights()`
**Analyze last test results**
- **No parameters**
- **Returns**: None (prints analysis)
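- **Use case**: Get actionable recommendations after a `quick` or `full` test (the analysis requires the `overall_performance` section, which comparison results do not contain)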

```python
validator.insights()
```

---

## Underlying Testing Functions

### `run_quick_gaia_test(agent_config_name, **kwargs)`
**Direct access to quick testing**
- `agent_config_name` (str): Configuration name
- `num_questions` (int, default=5): Number of questions
- `dataset_path` (str, default="./tests/gaia_data"): Dataset location
- **Returns**: Evaluation results dict

### `run_gaia_test(agent_config_name, dataset_path, max_questions, test_config)`
**Complete GAIA test workflow**
- `agent_config_name` (str, default="groq"): Configuration name
- `dataset_path` (str, default="./tests/gaia_data"): Dataset location
- `max_questions` (int, default=20): Maximum questions to test
- `test_config` (GAIATestConfig, optional): Advanced test configuration
- **Returns**: Complete evaluation results

### `compare_agent_configs(config_names, num_questions, dataset_path)`
**Compare multiple agent configurations**
- `config_names` (List[str]): List of configuration names
- `num_questions` (int, default=10): Questions per configuration
- `dataset_path` (str, default="./tests/gaia_data"): Dataset location
- **Returns**: Comparison results dict

### `run_smart_routing_test(agent_config_name, num_questions)`
**Test smart routing effectiveness**
- `agent_config_name` (str, default="performance"): Configuration name
- `num_questions` (int, default=15): Number of questions for analysis
- **Returns**: Routing analysis results

---

## Configuration Options

### Available Configurations
| Config Name | Provider | Model | Use Case |
|-------------|----------|-------|----------|
| `"groq"` | Groq | qwen-qwq-32b | Fast, reliable |
| `"google"` | Google | gemini-2.0-flash-preview | Balanced performance |
| `"openrouter"` | OpenRouter | qwen/qwen-2.5-coder-32b-instruct:free | Cost-effective |
| `"ollama"` | Ollama | qwen2.5-coder:32b | Local deployment |
| `"performance"` | Groq | qwen-qwq-32b | Optimized for speed |
| `"accuracy"` | Google | gemini-2.0-flash-preview | Optimized for accuracy |

---

## Test Result Structure

### Standard Result Format
```python
{
    "overall_performance": {
        "total_questions": 20,
        "correct_answers": 11,
        "accuracy": 0.55,
        "successful_executions": 19
    },
    "level_performance": {
        "1": {"accuracy": 0.70, "correct": 7, "total": 10},
        "2": {"accuracy": 0.44, "correct": 4, "total": 9},
        "3": {"accuracy": 0.0, "correct": 0, "total": 1}
    },
    "strategy_analysis": {
        "one_shot_llm": {"accuracy": 0.67, "total_questions": 12},
        "manager_coordination": {"accuracy": 0.38, "total_questions": 8}
    }
}
```

### Comparison Result Format
```python
{
    "comparison_results": {
        "groq": {"accuracy": 0.60, "correct_answers": 6, "total_questions": 10},
        "google": {"accuracy": 0.50, "correct_answers": 5, "total_questions": 10}
    },
    "timestamp": "2024-12-19T10:30:00",
    "test_questions": 10
}
```

---

## Usage Examples

### Basic Workflow
```python
# 1. Quick test
result = validator.quick('groq', 5)
validator.insights()

# 2. Full validation  
result = validator.full('groq', 20)
validator.insights()
```

### Configuration Comparison
```python
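# Compare multiple configs
result = validator.compare(['groq', 'google', 'performance'], 10)
# Note: insights() analyzes quick()/full() results. A comparison result has no
# 'overall_performance' section, so here it only reports "No results to analyze".
validator.insights()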
```

### Advanced Testing
```python
# Direct function access for custom workflows
from agent_testing import run_gaia_test, analyze_failure_patterns

result = run_gaia_test('groq', max_questions=50)
failure_analysis = analyze_failure_patterns(result)
```

---

## Performance Benchmarks

### GAIA Accuracy Targets
- **45%+**: GAIA benchmark threshold
- **50-60%**: Competitive performance
- **60%+**: Excellent performance

### Execution Time Guidelines
- **Quick test (5q)**: ~1-2 minutes
- **Full test (20q)**: ~5-8 minutes  
- **Comparison (3 configs, 10q each)**: ~10-15 minutes

### Recommended Question Counts
- **Development**: 5 questions (quick feedback)
- **Validation**: 20 questions (reliable results)
- **Production**: 50+ questions (comprehensive assessment)

# GAIA Testing

### Run a quick test batch
### `validator.quick(config, questions)`
**Quick validation test - perfect for development**
- `config` (str, default="groq"): Agent configuration 
  - Options: `"groq"`, `"google"`, `"openrouter"`, `"ollama"`, `"performance"`, `"accuracy"`
- `questions` (int, default=5): Number of questions to test
- **Returns**: Test results dict
- **Use case**: Fast iteration during development

### Evaluation results for the quick test

## Underlying Testing Functions

### `run_quick_gaia_test(agent_config_name, **kwargs)`
**Direct access to quick testing**
- `agent_config_name` (str): Configuration name
- `num_questions` (int, default=5): Number of questions
- `dataset_path` (str, default="./tests/gaia_data"): Dataset location
- **Returns**: Evaluation results dict

In [None]:
# Complete validator fix with enhanced insights.
# Running this cell reports the validator's state before the enhanced
# insight methods defined below are attached to it.

print("üöÄ COMPLETE VALIDATOR FIX - ENHANCED INSIGHTS")
print("=" * 60)

# Report what is currently available in the notebook namespace.
print("üîç CURRENT STATUS CHECK:")
if 'validator' not in globals():
    print("‚ùå No validator found")
else:
    print(f"‚úÖ Validator exists: {type(validator)}")
    if not hasattr(validator, 'last_result'):
        print("‚ùå No last_result attribute")
    else:
        print(f"üìä last_result type: {type(validator.last_result)}")
        if validator.last_result is None:
            print("‚ùå last_result is None - need to run test")
        else:
            print(f"‚úÖ Has test results")

print("\n" + "=" * 60)

# ENHANCED INSIGHTS METHODS - All in one place
def enhanced_insights(self):
    """Print the full insights report for the most recent test run.

    The report needs ``self.last_result`` to be a non-empty container that
    holds an ``'overall_performance'`` entry. When that is not the case,
    debugging details about ``last_result`` and usage hints are printed and
    the method returns without running any analysis. Otherwise each report
    section is printed in a fixed order by the matching helper method on
    ``self``.
    """
    _absent = object()
    latest = getattr(self, 'last_result', _absent)
    usable = latest is not _absent and latest and 'overall_performance' in latest

    if not usable:
        shown = 'MISSING ATTRIBUTE' if latest is _absent else latest
        print("‚ùå No results to analyze")
        print(f"Debug: last_result = {shown}")
        if latest is not _absent and latest:
            print(f"Debug: last_result type = {type(latest)}")
            if isinstance(latest, dict):
                print(f"Debug: last_result keys = {list(latest.keys())}")

        for hint in (
            "\nüí° TO FIX THIS:",
            "1. Run a test first:",
            "   result = validator.quick('groq', 5)",
            "2. Then try enhanced insights:",
            "   validator.enhanced_insights()",
        ):
            print(hint)
        return

    print(f"\n‚ú® COMPREHENSIVE INSIGHTS")
    print("=" * 60)

    # Sections run in report order; each helper is looked up only when its
    # turn comes, so earlier sections still print if a later one is missing.
    for section in (
        '_show_log_file_links',                   # 0. log file links
        '_analyze_execution_performance',         # 1. execution performance
        '_analyze_accuracy_breakdown',            # 2. accuracy breakdown
        '_analyze_strategy_performance',          # 3. strategy & routing
        '_analyze_technical_performance',         # 4. technical metrics
        '_analyze_errors_and_failures',           # 5. error analysis
        '_generate_actionable_recommendations',   # 6. recommendations
    ):
        getattr(self, section)()

def _show_log_file_links(self):
    """Print where the detailed log files for the latest run can be found.

    The execution log path is taken from ``self.last_result['execution_file']``
    when that entry is present and non-empty. Otherwise the newest
    ``*execution*.json`` found under ``./test_results``,
    ``./test_results/logs`` or ``./logs`` is used; the newest
    ``*evaluation*.json`` is always located that way. Up to three of the most
    recent step/question/evaluation CSV logs from ``./logs`` and
    ``./test_results/logs`` are listed as well, followed by copy-paste
    commands for inspecting ``validator.last_result``.

    Candidates from all directories are compared together by modification
    time, so a newer file in ``./logs`` is not shadowed by an older one in
    ``./test_results``. Files that cannot be stat'ed (for example because
    they disappeared during the scan) are skipped; other errors propagate
    instead of being silently swallowed.
    """
    import glob
    import os

    def _by_recency(paths):
        """Return the stat-able, de-duplicated paths, newest modification first."""
        stamped = []
        for path in set(paths):
            try:
                stamped.append((os.path.getmtime(path), path))
            except OSError:
                # File vanished or is unreadable: it cannot be offered as a link.
                continue
        stamped.sort(reverse=True)
        return [path for _, path in stamped]

    print(f"üìÅ LOG FILES & DETAILED DATA")

    # Prefer the path recorded in the result itself, if any.
    execution_file = self.last_result.get('execution_file')
    evaluation_file = None

    # Gather candidates from every known location before choosing, so the
    # directory order does not decide which file counts as "most recent".
    exec_candidates = []
    eval_candidates = []
    for base_dir in ("./test_results", "./test_results/logs", "./logs"):
        if os.path.isdir(base_dir):
            exec_candidates += glob.glob(f"{base_dir}/**/*execution*.json", recursive=True)
            eval_candidates += glob.glob(f"{base_dir}/**/*evaluation*.json", recursive=True)

    if not execution_file:
        newest_exec = _by_recency(exec_candidates)
        execution_file = newest_exec[0] if newest_exec else None
    newest_eval = _by_recency(eval_candidates)
    if newest_eval:
        evaluation_file = newest_eval[0]

    # Display file links
    if execution_file:
        print(f"   üìä Execution Details: {execution_file}")
        print(f"      üí° Open in notebook: pd.read_json('{execution_file}')")

    if evaluation_file:
        print(f"   üìà Evaluation Results: {evaluation_file}")
        print(f"      üí° Open in notebook: pd.read_json('{evaluation_file}')")

    # Look for CSV logs (non-recursive, same name patterns as before).
    csv_candidates = []
    for base_dir in ("./logs", "./test_results/logs"):
        if os.path.isdir(base_dir):
            for pattern in ("*steps*.csv", "*questions*.csv", "*evaluation*.csv"):
                csv_candidates += glob.glob(f"{base_dir}/{pattern}")

    recent_csvs = _by_recency(csv_candidates)[:3]
    if recent_csvs:
        print(f"   üìã Recent CSV Logs:")
        for csv_file in recent_csvs:
            # Classify on the file name only, so directory names cannot
            # influence the label.
            csv_name = os.path.basename(csv_file)
            if "steps" in csv_name:
                file_type = "Steps"
            elif "questions" in csv_name:
                file_type = "Questions"
            else:
                file_type = "Evaluation"
            print(f"      {file_type}: {csv_file}")
            print(f"         üí° Open: pd.read_csv('{csv_file}')")

    # Quick data access commands
    print(f"\n   üîß QUICK ACCESS COMMANDS:")
    print(f"      # View raw results structure")
    print(f"      validator.last_result.keys()")
    print(f"      # Access specific data")
    print(f"      validator.last_result['overall_performance']")

    if not execution_file and not evaluation_file:
        print(f"   ‚ö†Ô∏è  Log files not automatically detected")
        print(f"      Check ./test_results/ and ./logs/ directories")

    print()

def _analyze_execution_performance(self):
    """Print execution success and answer-quality figures for the last run.

    Reads ``total_questions``, ``successful_executions``, ``correct_answers``
    and ``accuracy`` from ``self.last_result['overall_performance']``.
    The success rate is rated against 100% / 90% / 70% bands, and when at
    least one execution succeeded the share of correct answers among the
    successful executions is rated against 60% / 30% bands.

    A run with zero questions prints the header, the question count and a
    short notice, then returns: no ratio can be computed for it.
    """
    overall = self.last_result['overall_performance']

    total = overall['total_questions']
    successful = overall['successful_executions']
    correct = overall['correct_answers']
    accuracy = overall['accuracy']

    print(f"üîß EXECUTION PERFORMANCE")
    print(f"   Total Questions: {total}")

    if not total:
        # Guard the divisions below; an empty run would otherwise raise
        # ZeroDivisionError (and be mislabelled as "perfect" if it did not).
        print(f"   ‚ö†Ô∏è  No questions were executed - nothing to analyze")
        return

    print(f"   Successful Executions: {successful}/{total} ({successful/total:.1%})")
    print(f"   Correct Answers: {correct}/{total} ({accuracy:.1%})")

    # Execution success analysis
    if successful == total:
        print(f"   ‚úÖ Perfect execution success - no technical failures")
    elif successful >= total * 0.9:
        print(f"   ‚úÖ Good execution success - minimal technical issues")
    elif successful >= total * 0.7:
        print(f"   ‚ö†Ô∏è  Moderate execution issues - {total - successful} failures")
    else:
        print(f"   ‚ùå Significant execution problems - {total - successful} failures")

    # Answer quality is measured only over runs that actually executed, to
    # separate reasoning quality from technical failures.
    if successful > 0:
        answer_quality = correct / successful
        print(f"   üìä Answer Quality (when executed): {answer_quality:.1%}")

        if answer_quality >= 0.6:
            print(f"   üéØ High answer quality - agent reasoning is strong")
        elif answer_quality >= 0.3:
            print(f"   üìà Moderate answer quality - room for improvement")
        else:
            print(f"   üìâ Low answer quality - needs significant work")

def _analyze_accuracy_breakdown(self):
    """Print the overall accuracy, its rating, and the per-level figures.

    The rating is the first band whose lower bound the overall accuracy
    reaches (70%, 60%, 45%, 30%); anything below 30% is rated poor. When
    ``self.last_result`` has a non-empty ``'level_performance'`` mapping,
    one line per level is printed in sorted key order.
    """
    accuracy = self.last_result['overall_performance']['accuracy']

    print(f"\nüéØ ACCURACY ANALYSIS")
    print(f"   Overall Accuracy: {accuracy:.1%}")

    # Bands are ordered from best to worst; the first one reached wins.
    rating_bands = (
        (0.70, f"   üèÜ EXCEPTIONAL - Top-tier performance!"),
        (0.60, f"   üåü EXCELLENT - Competitive performance"),
        (0.45, f"   ‚úÖ GOOD - Above GAIA 45% threshold"),
        (0.30, f"   ‚ö†Ô∏è  FAIR - Below GAIA threshold, needs improvement"),
    )
    rating = f"   ‚ùå POOR - Significant improvements needed"
    for floor, label in rating_bands:
        if accuracy >= floor:
            rating = label
            break
    print(rating)

    by_level = self.last_result.get('level_performance', {})
    if not by_level:
        return

    print(f"\n   üìà PERFORMANCE BY LEVEL:")
    for level_key in sorted(by_level):
        entry = by_level[level_key]
        print(f"      Level {level_key}: {entry['accuracy']:.1%} ({entry['correct']}/{entry['total']})")

def _analyze_strategy_performance(self):
    """Print per-strategy accuracy and, if present, hybrid-state metrics.

    For every entry of ``self.last_result['strategy_analysis']`` the
    accuracy, question count and share of all strategy-routed questions are
    printed. When ``self.last_result['hybrid_state_metrics']`` is non-empty,
    the context-bridge usage and the average execution time are printed as
    far as they are available. Missing sections print nothing.
    """
    per_strategy = self.last_result.get('strategy_analysis', {})
    if per_strategy:
        print(f"\nüõ§Ô∏è  STRATEGY & ROUTING ANALYSIS")

        routed_total = sum(entry.get('total_questions', 0) for entry in per_strategy.values())

        for strategy, entry in per_strategy.items():
            acc = entry.get('accuracy', 0)
            count = entry.get('total_questions', 0)
            # Share of questions routed to this strategy; 0 when nothing was routed.
            if routed_total > 0:
                share = count / routed_total * 100
            else:
                share = 0
            print(f"   {strategy}: {acc:.1%} accuracy ({count}q, {share:.0f}%)")

    hybrid = self.last_result.get('hybrid_state_metrics', {})
    if not hybrid:
        return

    print(f"\nüåâ HYBRID STATE PERFORMANCE")

    bridge_usage = hybrid.get('context_bridge_usage', {})
    if bridge_usage:
        print(f"   Context Bridge: {bridge_usage.get('usage_percentage', 0):.1%} usage")

    mean_seconds = hybrid.get('average_execution_time', 0)
    if mean_seconds > 0:
        print(f"   Avg Execution Time: {mean_seconds:.1f}s per question")

def _analyze_technical_performance(self):
    """Print execution-time statistics for the per-question results.

    Nothing is printed when ``self.last_result`` has no ``'results'``
    entries. Otherwise the section header is printed, followed by the
    average and the min-max range over all truthy ``'execution_time'``
    values (entries without a time, or with a time of 0, are ignored).
    """
    per_question = self.last_result.get('results', [])
    if not per_question:
        return

    print(f"\n‚öôÔ∏è  TECHNICAL PERFORMANCE")

    # Keep only entries that actually carry a non-zero timing.
    timings = [
        seconds
        for seconds in (entry.get('execution_time') for entry in per_question)
        if seconds
    ]
    if not timings:
        return

    mean_time = sum(timings) / len(timings)
    fastest = min(timings)
    slowest = max(timings)

    print(f"   Execution Times:")
    print(f"      Average: {mean_time:.1f}s")
    print(f"      Range: {fastest:.1f}s - {slowest:.1f}s")

def _analyze_errors_and_failures(self):
    """Print a summary of execution failures and evaluation errors.

    A result counts as an execution failure when its
    ``'execution_successful'`` entry is present and falsy, and as an
    evaluation error when it has a truthy ``'evaluation_error'`` entry.
    Nothing is printed when neither kind occurs. Each distinct evaluation
    error is listed once, in the order it first appears in the results, so
    the output is identical from run to run.
    """
    results = self.last_result.get('results', [])
    failed_results = [r for r in results if not r.get('execution_successful', True)]
    eval_errors = [r for r in results if r.get('evaluation_error')]

    if not (failed_results or eval_errors):
        return

    print(f"\nüêõ ERROR ANALYSIS")

    if failed_results:
        print(f"   Execution Failures: {len(failed_results)}")

    if eval_errors:
        print(f"   Evaluation Errors: {len(eval_errors)}")
        # De-duplicate on the printed form: dict keys keep first-seen order
        # (a set would not), and str() tolerates unhashable error payloads.
        unique_errors = dict.fromkeys(str(r['evaluation_error']) for r in eval_errors)
        for error in unique_errors:
            if error:
                print(f"      {error}")

def _generate_actionable_recommendations(self):
    """Print two recommendations chosen by the overall accuracy.

    The accuracy from ``self.last_result['overall_performance']`` selects
    one of four tiers: below 20%, below 45%, below 60%, or 60% and above.
    """
    accuracy = self.last_result['overall_performance']['accuracy']

    print(f"\nüí° ACTIONABLE RECOMMENDATIONS")
    print("-" * 40)

    # (exclusive upper bound, advice lines), checked from lowest to highest.
    tiers = (
        (0.20, (
            "   1. üéØ CRITICAL: Focus on basic functionality - accuracy very low",
            "   2. üîß Check agent configuration and tool integration",
        )),
        (0.45, (
            "   1. üéØ PRIMARY: Work toward GAIA 45% threshold",
            "   2. üìà Focus on Level 1 questions first",
        )),
        (0.60, (
            "   1. üéØ OPTIMIZE: Good performance - push toward excellence",
            "   2. üìà Level 2 questions are the key improvement area",
        )),
    )
    # Default applies when no upper bound is undercut (60% and above).
    advice = (
        "   1. üèÜ EXCELLENT: Maintain and monitor performance",
        "   2. üìä Test with larger question sets to validate",
    )
    for ceiling, lines in tiers:
        if accuracy < ceiling:
            advice = lines
            break

    for line in advice:
        print(line)

# Attach the enhanced-insights functions to the existing validator instance,
# or explain how to create one when none is defined yet.
if 'validator' not in globals():
    print("\n".join((
        "‚ùå No 'validator' found",
        "üí° Create validator first:",
        "   from agent_testing import GAIATestValidator",
        "   validator = GAIATestValidator()",
        "   result = validator.quick('groq', 5)",
        "   validator.enhanced_insights()",
    )))
else:
    print("üîß Updating existing validator with enhanced insights...")

    # Bind each function to this one instance under its own name, so that
    # calling validator.<name>() passes the validator as `self`.
    for _insight_fn in (
        enhanced_insights,
        _show_log_file_links,
        _analyze_execution_performance,
        _analyze_accuracy_breakdown,
        _analyze_strategy_performance,
        _analyze_technical_performance,
        _analyze_errors_and_failures,
        _generate_actionable_recommendations,
    ):
        setattr(validator, _insight_fn.__name__, _insight_fn.__get__(validator, validator.__class__))
    del _insight_fn  # do not leave the loop variable in the notebook namespace

    print("‚úÖ Enhanced insights added to existing validator!")

    # Tell the user whether there is already something to analyze.
    if getattr(validator, 'last_result', None):
        print("‚úÖ Test results found - enhanced_insights() ready to use!")
        print("üí° Now try: validator.enhanced_insights()")
    else:
        print("‚ö†Ô∏è  No test results found")
        print("üí° Run a test first: result = validator.quick('groq', 5)")
        print("üí° Then use: validator.enhanced_insights()")

print("\n".join((
    f"\nüéâ COMPLETE VALIDATOR FIX APPLIED!",
    "=" * 50,
    "‚úÖ Enhanced insights methods added",
    "‚úÖ Better error handling and debugging",
    "‚úÖ Rich analysis with log file links",
    "‚úÖ Comprehensive performance breakdown",
    "",
    "üöÄ READY TO USE:",
    "   validator.enhanced_insights()  # Rich analysis",
    "   validator.insights()          # Original basic analysis",
)))

In [None]:
# Quick test with the 'openrouter' configuration. No question count is passed,
# so the validator's own default applies.
result = validator.quick('openrouter')

In [None]:
# Print the enhanced report for the validator's most recent results
# (requires the cell that attaches enhanced_insights to have been run).
validator.enhanced_insights()

### Run full GAIA test

In [None]:
# Full validation: 20 questions with the 'openrouter' configuration.
# NOTE: this call is live (not commented out) - executing the cell starts the
# run. Comment it out to skip full validation.
result = validator.full('openrouter', 20)

In [None]:
# Print the enhanced report for the validator's most recent results
# (here: the full validation run started in the previous cell).
validator.enhanced_insights()

### Bug fixing and experimentation

In [None]:
# test_existing_infrastructure.py - Use your actual GAIA setup

from agent_logic import GAIAAgent, GAIAConfig, extract_file_info_from_task_id
from smolagents import CodeAgent, LiteLLMModel
import os

def test_with_existing_gaia_infrastructure():
    """Smoke-test a smolagents ``CodeAgent`` with the project's GAIA setup.

    Builds a ``GAIAConfig`` for OpenRouter, wraps the configured model in a
    ``LiteLLMModel`` (API key read from ``OPENROUTER_API_KEY``), and asks a
    tool-less ``CodeAgent`` to sum column B of a small generated spreadsheet.

    The spreadsheet is written into a private temporary directory rather than
    the current working directory, so an existing ``test_data.xlsx`` of the
    user's is never overwritten or deleted by this test.

    Returns:
        bool: True when the agent run completes without raising (whether or
        not the expected answer 60 is found in the result), False when the
        run raises; the traceback is printed in that case.
    """
    import re
    import tempfile
    import traceback

    import pandas as pd

    # Use your existing config
    config = GAIAConfig(
        model_provider="openrouter",
        model_name="google/gemini-2.5-flash",
        temperature=0.1,
        enable_context_bridge=True,
        debug_mode=True
    )

    # Create model using your existing pattern
    specialist_model = LiteLLMModel(
        model_id=f"openrouter/{config.model_name}",
        api_key=os.getenv("OPENROUTER_API_KEY"),
        temperature=config.temperature
    )

    # Create single test agent
    test_agent = CodeAgent(
        tools=[],
        model=specialist_model,
        add_base_tools=True,
        additional_authorized_imports=[
            "pandas", "openpyxl", "numpy", "json", "matplotlib", "seaborn"
        ]
    )

    # Test with actual GAIA task pattern
    test_task_id = "test_excel_123"
    test_question = "What is the sum of column B in the spreadsheet?"
    test_file_name = "test_data.xlsx"

    # The directory (and anything left in it) is removed when the block exits,
    # even if writing the spreadsheet or running the agent raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        # Create test state like your GAIAState
        test_state = {
            "task_id": test_task_id,
            "question": test_question,
            "file_path": os.path.join(scratch_dir, test_file_name),
            "file_name": test_file_name,
            "has_file": True
        }

        # Create test Excel file
        test_df = pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]})
        test_df.to_excel(test_state["file_path"], index=False)

        # Use your enhanced question pattern
        enhanced_question = f"""
{test_question}

File Information:
- File available at: {test_state['file_path']}
- File name: {test_state['file_name']}
- Access this file directly in your Python code using the file path above

Important: Use pandas.read_excel() to load the file and sum column B.
"""

        additional_args = {
            'file_path': test_state['file_path'],
            'file_name': test_state['file_name'],
            'has_file': True
        }

        print(f"üß™ Testing with OpenRouter model: {config.model_name}")
        print(f"üìÅ Test file: {test_state['file_name']}")

        try:
            result = test_agent.run(
                task=enhanced_question,
                additional_args=additional_args
            )

            print("‚úÖ GAIA infrastructure integration SUCCESS!")
            print(f"üìä Expected sum: 60 (10+20+30)")
            print(f"üìä Agent result: {result}")

            # Look for 60 as a number of its own, so answers such as 160,
            # 600 or 0.60 are not mistaken for the expected sum.
            if re.search(r"(?<![\d.])60(?!\d)", str(result)):
                print("üéØ Correct answer detected!")
            else:
                print("‚ö†Ô∏è Answer may not be correct, but execution succeeded")
            # This is an integration check: a completed run counts as success
            # regardless of the answer.
            return True

        except Exception as e:
            print(f"‚ùå Integration test FAILED: {e}")
            traceback.print_exc()
            return False
        finally:
            # Cleanup
            if os.path.exists(test_state['file_path']):
                os.remove(test_state['file_path'])
                print("üßπ Cleaned up test file")

# Run the integration check when this cell/script is executed directly and
# report whether the setup is ready for full integration.
if __name__ == "__main__":
    success = test_with_existing_gaia_infrastructure()
    if success:
        print("\nüéØ READY FOR FULL INTEGRATION")
    else:
        print("\nüîß NEEDS DEBUGGING")