# GAIA Evaluator
## Implementation plan, testing framework, and deployment

**Objective:** Complete implementation roadmap and evaluation system  
**Target:** 45-55% GAIA accuracy within $10 budget

---

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json
from pathlib import Path

# Your production system
from gaia_agent_system import (
    create_gaia_agent, 
    create_production_gaia_agent,
    GAIAConfig,
    ModelConfigs,
    run_gaia_benchmark,
    quick_test,
    compare_configs
)

# Testing framework
from hf_production_testing_agent_assignment import (
    GAIAProductionTestFramework,
    TestConfig,
    run_production_test,
    quick_agent_validation,
    compare_agent_configs,
    benchmark_best_config
)

# Notebook-wide display configuration: lift pandas' column limit, let pandas
# pick the display width itself, and use a larger default figure on a white grid.
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)
plt.rcParams.update({'figure.figsize': [12, 8]})
sns.set_style("whitegrid")

print("üöÄ GAIA Production Evaluation Environment Ready")
session_started = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"üìÖ Session: {session_started}")

## Section 1: Quick system validation

In [None]:
# Smoke-test the core components: retriever, agent factory, model configs.
print("üîß Testing Core Components...")

# Retriever: load the embeddings file, then run one sample search.
try:
    from dev_retriever import load_gaia_retriever
    retriever = load_gaia_retriever("gaia_embeddings.csv")
    if not (retriever and retriever.is_ready()):
        print("‚ùå Retriever: Failed to initialize")
    else:
        print("‚úÖ Retriever: Working")
        results = retriever.search("Calculate 15% of 100", k=2)
        print(f"  ‚îú‚îÄ‚îÄ Retrieved {len(results)} similar examples")
except Exception as exc:
    # Broad on purpose: a failure is reported and the remaining checks still run.
    print(f"‚ùå Retriever: Error - {exc}")

# Agent factory: build one agent and close it again.
try:
    agent = create_gaia_agent("qwen2.5_coder")
    print("‚úÖ Agent Creation: Working")
    agent.close()
except Exception as exc:
    print(f"‚ùå Agent Creation: Error - {exc}")

# Model configurations: total count, then a tally per provider.
configs = ModelConfigs.get_all_configs()
print(f"‚úÖ Model Configs: {len(configs)} configurations available")
for provider in ('openrouter', 'groq', 'google', 'ollama'):
    provider_configs = [
        name
        for name, settings in configs.items()
        if settings['model_provider'] == provider
    ]
    print(f"  ‚îú‚îÄ‚îÄ {provider}: {len(provider_configs)} configs")

In [None]:
# Run a few simple questions through quick_test and print what comes back.
print("\nüß™ Quick Functionality Test...")

test_questions = [
    "Calculate 25% of 800",
    "What is 15 + 27?",
    "Convert 100 fahrenheit to celsius",
]

for test_number, question in enumerate(test_questions, start=1):
    print(f"\nTest {test_number}: {question}")
    try:
        result = quick_test(question, "qwen2.5_coder")

        # An "error" entry in the result means the run failed without raising.
        if "error" in result:
            print(f"  ‚îî‚îÄ‚îÄ Error: {result['error']}")
            continue

        answer = result.get('final_answer', 'No answer')
        strategy = result.get('selected_strategy', 'Unknown')
        print(f"  ‚îú‚îÄ‚îÄ Answer: {answer}")
        print(f"  ‚îî‚îÄ‚îÄ Strategy: {strategy}")

    except Exception as exc:
        # Keep going: one failing question should not abort the others.
        print(f"  ‚îî‚îÄ‚îÄ Exception: {exc}")

print("\n‚úÖ Quick functionality test completed")

## Section 2: Single Configuration Deep Dive

In [None]:
# Deep dive into your best performing configuration
best_config = "qwen2.5_coder"  # Change this to your preferred config
deep_dive_questions = 25  # one value for both the config cap and the run size

print(f"üîç Deep Dive Analysis: {best_config}")
print("=" * 50)

# Create testing framework
test_config = TestConfig(
    max_questions_per_config=deep_dive_questions,
    max_total_budget=3.0,  # Limit budget for this test
    enable_performance_tracking=True,
    generate_visualizations=True,
)

framework = GAIAProductionTestFramework(test_config)

# Run comprehensive test
result = framework.test_single_configuration(best_config, deep_dive_questions)

# Headline metrics; every key falls back to 0 when the result lacks it.
overall_accuracy = result.get('accuracy', 0)
total_questions = result.get('total_questions', 0)
# Use a denominator of 1 when the question count is missing *or* zero, so a
# run that answered nothing prints a rate instead of raising ZeroDivisionError.
deep_dive_error_rate = result.get('error_count', 0) / (total_questions or 1)

print(f"\nüìä DETAILED RESULTS FOR {best_config}")
print("-" * 40)
print(f"Total Questions: {total_questions}")
print(f"Accuracy: {overall_accuracy:.3f} ({overall_accuracy*100:.1f}%)")
print(f"Average Time: {result.get('avg_execution_time', 0):.2f}s")
print(f"Total Cost: ${result.get('total_cost', 0):.3f}")
print(f"Error Rate: {deep_dive_error_rate:.3f}")
print(f"Fallback Usage: {result.get('fallback_usage', 0):.3f}")

# Level breakdown. The loop variables carry a "level_" prefix so they do not
# overwrite notebook globals such as avg_time that later cells read.
level_performance = result.get('level_breakdown', {})
if level_performance:
    print(f"\nüìà Performance by GAIA Level:")
    for level, stats in level_performance.items():
        level_accuracy_value = stats.get('accuracy', 0)
        level_total = stats.get('total_questions', 0)
        level_avg_time = stats.get('avg_execution_time', 0)
        print(f"  ‚îú‚îÄ‚îÄ {level}: {level_accuracy_value:.3f} accuracy ({level_total} questions, {level_avg_time:.2f}s avg)")

# Strategy breakdown
strategy_performance = result.get('strategy_breakdown', {})
if strategy_performance:
    print(f"\nüéØ Performance by Strategy:")
    for strategy, stats in strategy_performance.items():
        strategy_accuracy_value = stats.get('accuracy', 0)
        strategy_total = stats.get('total_questions', 0)
        print(f"  ‚îú‚îÄ‚îÄ {strategy}: {strategy_accuracy_value:.3f} accuracy ({strategy_total} questions)")

In [None]:
# Analyze the detailed results
def _preview(text, limit=50):
    """Return *text* cut to *limit* characters, with "..." appended when cut."""
    return text[:limit] + "..." if len(text) > limit else text


def _print_answer_samples(samples):
    """Print the question, predicted answer and expected answer of each row."""
    for number, (_, row) in enumerate(samples.iterrows(), start=1):
        print(f"  {number}. Q: {_preview(row['question'])}")
        print(f"     A: {row['predicted_answer']} (Expected: {row['ground_truth']})")


detailed_results = result.get('detailed_results', [])
if detailed_results:
    df_detailed = pd.DataFrame(detailed_results)

    print(f"\nüî¨ Detailed Analysis:")
    print(f"Total records: {len(df_detailed)}")

    # Accuracy by level
    if 'level' in df_detailed.columns:
        level_accuracy = df_detailed.groupby('level')['is_correct'].agg(['mean', 'count'])
        print(f"\nAccuracy by Level:")
        # itertuples keeps 'count' an integer; iterrows would upcast the row
        # to float and print e.g. "5.0 questions".
        for level, mean_accuracy, n_questions in level_accuracy.itertuples(name=None):
            print(f"  Level {level}: {mean_accuracy:.3f} ({n_questions} questions)")

    # Strategy effectiveness
    if 'strategy_used' in df_detailed.columns:
        strategy_accuracy = df_detailed.groupby('strategy_used')['is_correct'].agg(['mean', 'count'])
        print(f"\nStrategy Effectiveness:")
        for strategy, mean_accuracy, n_uses in strategy_accuracy.itertuples(name=None):
            print(f"  {strategy}: {mean_accuracy:.3f} ({n_uses} uses)")

    # Error analysis. A record counts as failed when its 'errors' entry is a
    # non-empty list; a missing 'errors' column is treated as "no errors",
    # matching the guards on the optional columns above.
    if 'errors' in df_detailed.columns:
        has_errors = df_detailed['errors'].apply(
            lambda errs: isinstance(errs, list) and len(errs) > 0
        )
        error_questions = df_detailed[has_errors]
    else:
        error_questions = df_detailed.iloc[0:0]

    if len(error_questions) > 0:
        print(f"\nError Analysis:")
        print(f"  Questions with errors: {len(error_questions)}")
        print(f"  Error rate: {len(error_questions)/len(df_detailed):.3f}")

        # Show sample errors
        print("  Sample errors:")
        for number, (_, row) in enumerate(error_questions.head(3).iterrows(), start=1):
            print(f"    {number}. {_preview(row['question'])}")
            errors = row['errors']
            if isinstance(errors, list) and errors:
                print(f"       Error: {errors[0]}")

    # Show some correct and incorrect examples
    print(f"\n‚úÖ Sample Correct Answers:")
    correct_samples = df_detailed[df_detailed['is_correct'] == True].head(3)
    _print_answer_samples(correct_samples)

    print(f"\n‚ùå Sample Incorrect Answers:")
    incorrect_samples = df_detailed[df_detailed['is_correct'] == False].head(3)
    _print_answer_samples(incorrect_samples)
else:
    print("‚ö†Ô∏è No detailed results available")

## Section 3: Multi-Configuration Comparison

In [None]:
# Compare your top configurations
print("üèÜ Multi-Configuration Comparison")
print("=" * 50)

# Select configurations to compare
configs_to_compare = [
    "qwen2.5_coder",      # OpenRouter - High performance
    "qwen_qwq_groq",      # Groq - Fast execution
    "gemini_flash_04",    # Google - Balanced
    "deepseek",           # OpenRouter - Alternative
]

# Fetch the registry once instead of once per listed configuration.
all_model_configs = ModelConfigs.get_all_configs()

print(f"Comparing {len(configs_to_compare)} configurations:")
for config in configs_to_compare:
    model_info = all_model_configs.get(config, {})
    provider = model_info.get('model_provider', 'unknown')
    model = model_info.get('primary_model', 'unknown')
    print(f"  ‚îú‚îÄ‚îÄ {config}: {provider}/{model}")

# Run comparison with budget management
comparison_df = compare_agent_configs(configs_to_compare, sample_size=15)

print(f"\nüìä COMPARISON RESULTS")
print("=" * 30)
print(comparison_df.round(3))

# Find best performers
best_accuracy = comparison_df.loc[comparison_df['accuracy'].idxmax()]
fastest_model = comparison_df.loc[comparison_df['avg_time'].idxmin()]

# Accuracy per dollar. A zero-cost row with zero accuracy divides 0/0 and
# yields NaN; score it 0 so idxmax always has a usable value. A zero-cost row
# with positive accuracy stays +inf and therefore still ranks first.
cost_efficiency = (comparison_df['accuracy'] / comparison_df['total_cost']).fillna(0.0)
most_efficient = comparison_df.loc[cost_efficiency.idxmax()]

print(f"\nüèÜ TOP PERFORMERS")
print(f"‚îú‚îÄ‚îÄ Best Accuracy: {best_accuracy['config']} ({best_accuracy['accuracy']:.3f})")
print(f"‚îú‚îÄ‚îÄ Fastest: {fastest_model['config']} ({fastest_model['avg_time']:.2f}s)")
print(f"‚îî‚îÄ‚îÄ Most Cost-Efficient: {most_efficient['config']}")

In [None]:
# Create comprehensive comparison visualizations: a 2x2 grid with accuracy,
# execution time, cost vs accuracy, and error rate per configuration.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_accuracy, ax_time), (ax_cost, ax_errors) = axes
config_names = comparison_df['config']

# 1. Accuracy comparison, with the 45% GAIA target drawn as a reference line
ax_accuracy.bar(config_names, comparison_df['accuracy'])
ax_accuracy.set_title('Accuracy by Configuration')
ax_accuracy.set_ylabel('Accuracy')
ax_accuracy.tick_params(axis='x', rotation=45)
ax_accuracy.axhline(y=0.45, color='red', linestyle='--', label='GAIA Target')
ax_accuracy.legend()

# Add value labels just above each bar
for bar_index, accuracy_value in enumerate(comparison_df['accuracy']):
    ax_accuracy.text(bar_index, accuracy_value + 0.01, f'{accuracy_value:.3f}',
                     ha='center', va='bottom')

# 2. Execution time comparison
ax_time.bar(config_names, comparison_df['avg_time'])
ax_time.set_title('Average Execution Time')
ax_time.set_ylabel('Time (seconds)')
ax_time.tick_params(axis='x', rotation=45)

# 3. Cost vs Performance, each point labelled with its configuration name
ax_cost.scatter(comparison_df['total_cost'], comparison_df['accuracy'], s=100)
for name, cost, accuracy_value in zip(config_names,
                                      comparison_df['total_cost'],
                                      comparison_df['accuracy']):
    ax_cost.annotate(name, (cost, accuracy_value),
                     xytext=(5, 5), textcoords='offset points', fontsize=9)
ax_cost.set_xlabel('Total Cost ($)')
ax_cost.set_ylabel('Accuracy')
ax_cost.set_title('Cost vs Performance')
ax_cost.grid(True, alpha=0.3)

# 4. Error rates
ax_errors.bar(config_names, comparison_df['error_rate'])
ax_errors.set_title('Error Rate by Configuration')
ax_errors.set_ylabel('Error Rate')
ax_errors.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Summary table
print(f"\nüìã SUMMARY TABLE")
summary_table = comparison_df[['config', 'accuracy', 'avg_time', 'total_cost', 'error_rate']].copy()
summary_table['gaia_target_met'] = summary_table['accuracy'] >= 0.45
# 0 accuracy at 0 cost divides 0/0 (NaN); report that as 0 efficiency.
# A zero-cost row with positive accuracy is left as +inf.
summary_table['cost_efficiency'] = (
    summary_table['accuracy'] / summary_table['total_cost']
).fillna(0.0)

print(summary_table.round(3).to_string(index=False))

## Section 4: Production Readiness Assessment

In [None]:
 Run comprehensive production test with best configuration
print("üöÄ Production Readiness Assessment")
print("=" * 50)

# Identify best configuration from comparison
best_config_name = comparison_df.loc[comparison_df['accuracy'].idxmax(), 'config']
print(f"Testing production readiness with: {best_config_name}")

# Configure production test
production_config = TestConfig(
    max_questions_per_config=50,  # Substantial test
    max_total_budget=5.0,         # Reserve budget for final submission
    enable_model_comparison=False,
    enable_level_analysis=True,
    enable_error_analysis=True,
    enable_performance_tracking=True,
    generate_visualizations=True
)

# Run production test
production_framework = GAIAProductionTestFramework(production_config)
production_results = production_framework.run_comprehensive_evaluation([best_config_name], 50)

# Extract results
eval_results = production_results['evaluation_results']
analysis = production_results['analysis']

print("\nüéØ PRODUCTION ASSESSMENT RESULTS")
print("=" * 40)

# Overall metrics (each one falls back to 0 when the analysis omits it)
summary = analysis.get('summary', {})
overall_accuracy = summary.get('overall_accuracy', 0)
print(f"Total Questions: {summary.get('total_questions_tested', 0)}")
print(f"Overall Accuracy: {overall_accuracy:.3f} ({overall_accuracy*100:.1f}%)")
print(f"Average Response Time: {summary.get('average_execution_time', 0):.2f}s")
print(f"Total Cost: ${summary.get('total_cost', 0):.3f}")
print(f"Error Rate: {summary.get('error_rate', 0):.3f}")

# GAIA compliance; these names are read again by later cells.
gaia_compliance = analysis.get('gaia_compliance', {})
target_met = gaia_compliance.get('meets_target', False)
target_accuracy = gaia_compliance.get('target_accuracy', 0.45)
achieved_accuracy = gaia_compliance.get('achieved_accuracy', 0)
target_label = '‚úÖ YES' if target_met else '‚ùå NO'

print("\nüéØ GAIA Compliance Assessment:")
print(f"‚îú‚îÄ‚îÄ Target Accuracy: {target_accuracy:.1%}")
print(f"‚îú‚îÄ‚îÄ Achieved Accuracy: {achieved_accuracy:.1%}")
print(f"‚îî‚îÄ‚îÄ Target Met: {target_label}")

# Level performance; keys are shown with their 'level_' prefix removed.
level_performance = gaia_compliance.get('level_performance_distribution', {})
if level_performance:
    print("\nüìà Level Performance:")
    for level_key, level_stats in level_performance.items():
        level_num = level_key.replace('level_', '')
        level_accuracy_value = level_stats.get('accuracy', 0)
        level_total = level_stats.get('total_questions', 0)
        print(f"‚îú‚îÄ‚îÄ Level {level_num}: {level_accuracy_value:.1%} ({level_total} questions)")

# Budget analysis: (line prefix, analysis key, number format) per printed row.
budget_analysis = analysis.get('budget_analysis', {})
budget_lines = (
    ("‚îú‚îÄ‚îÄ Budget Allocated: $", 'budget_allocated', ".2f"),
    ("‚îú‚îÄ‚îÄ Budget Used: $", 'budget_used', ".2f"),
    ("‚îú‚îÄ‚îÄ Budget Remaining: $", 'budget_remaining', ".2f"),
    ("‚îú‚îÄ‚îÄ Cost per Question: $", 'cost_per_question', ".3f"),
    ("‚îî‚îÄ‚îÄ Efficiency Score: ", 'efficiency_score', ".3f"),
)
print("\nüí∞ Budget Analysis:")
for label, key, spec in budget_lines:
    print(f"{label}{budget_analysis.get(key, 0):{spec}}")

# Recommendations
recommendations = analysis.get('recommendations', [])
print("\nüí° Recommendations:")
for rec in recommendations:
    print(f"‚îú‚îÄ‚îÄ {rec}")

In [None]:
# Production readiness checklist: evaluate each criterion, print pass/fail,
# then classify the agent by the share of criteria it passes.
print("\n‚úÖ PRODUCTION READINESS CHECKLIST")
print("=" * 40)

mean_response_time = summary.get('average_execution_time', 0)
budget_used = budget_analysis.get('budget_used', 0)
budget_cap = budget_analysis.get('budget_allocated', 0) * 0.8  # "reasonable" = under 80%
level_1_accuracy = level_performance.get('level_1', {}).get('accuracy', 0)
level_2_accuracy = level_performance.get('level_2', {}).get('accuracy', 0)

readiness_criteria = {
    "Accuracy >= 45%": achieved_accuracy >= 0.45,
    "Average response time < 60s": mean_response_time < 60,
    "Error rate < 10%": summary.get('error_rate', 0) < 0.10,
    "Budget usage reasonable": budget_used < budget_cap,
    "Level 1 accuracy > 60%": level_1_accuracy > 0.60,
    "Level 2 accuracy > 30%": level_2_accuracy > 0.30,
    "Fallback rate < 20%": summary.get('fallback_usage_rate', 0) < 0.20,
}

for criterion, passed in readiness_criteria.items():
    if passed:
        status = "‚úÖ"
    else:
        status = "‚ùå"
    print(f"{status} {criterion}")

total_criteria = len(readiness_criteria)
readiness_score = sum(1 for passed in readiness_criteria.values() if passed)

print(f"\nüéØ READINESS SCORE: {readiness_score}/{total_criteria} ({readiness_score/total_criteria*100:.1f}%)")

# Verdict bands: at least 80% of criteria, at least 60%, or below.
if readiness_score >= total_criteria * 0.8:
    verdict = "üü¢ PRODUCTION READY - Agent meets most criteria"
elif readiness_score >= total_criteria * 0.6:
    verdict = "üü° NEEDS IMPROVEMENT - Agent meets basic criteria but has room for optimization"
else:
    verdict = "üî¥ NOT READY - Agent needs significant improvements before production deployment"
print(verdict)

## Section 5: Error Analysis and Improvement

In [None]:
# Analyze errors in detail for improvement opportunities
print(f"\nüîç Detailed Error Analysis")
print("=" * 40)

# Get detailed results from production test: flatten the per-question records
# of every evaluated configuration into a single list.
detailed_results = [
    record
    for config_result in eval_results.values()
    if "detailed_results" in config_result
    for record in config_result["detailed_results"]
]

if detailed_results:
    df_production = pd.DataFrame(detailed_results)

    # Error categorization: a record "has errors" when its 'errors' entry is a
    # non-empty list.
    error_questions = df_production[df_production['errors'].apply(lambda x: len(x) if isinstance(x, list) else 0) > 0]
    incorrect_questions = df_production[df_production['is_correct'] == False]

    print(f"Total questions analyzed: {len(df_production)}")
    print(f"Questions with errors: {len(error_questions)}")
    print(f"Incorrect answers: {len(incorrect_questions)}")
    print(f"Questions using fallback: {len(df_production[df_production['fallback_used'] == True])}")

    # Analyze patterns in incorrect answers
    if len(incorrect_questions) > 0:
        print(f"\n‚ùå Analysis of Incorrect Answers:")

        # By level. Guarded like 'strategy_used' below, so records without a
        # 'level' field skip this section instead of raising KeyError.
        if 'level' in incorrect_questions.columns:
            # Count questions per level once, not by re-filtering per group.
            questions_per_level = df_production['level'].value_counts().to_dict()
            print(f"Incorrect answers by level:")
            for level, count in incorrect_questions.groupby('level').size().items():
                total_for_level = questions_per_level.get(level, 0)
                level_error_rate = count / total_for_level if total_for_level > 0 else 0
                print(f"  ‚îú‚îÄ‚îÄ Level {level}: {count}/{total_for_level} ({level_error_rate:.1%} error rate)")

        # By strategy
        if 'strategy_used' in incorrect_questions.columns:
            questions_per_strategy = df_production['strategy_used'].value_counts().to_dict()
            print(f"\nIncorrect answers by strategy:")
            for strategy, count in incorrect_questions.groupby('strategy_used').size().items():
                total_for_strategy = questions_per_strategy.get(strategy, 0)
                strategy_error_rate = count / total_for_strategy if total_for_strategy > 0 else 0
                print(f"  ‚îú‚îÄ‚îÄ {strategy}: {count}/{total_for_strategy} ({strategy_error_rate:.1%} error rate)")

        # Show a random sample of incorrect questions for improvement
        # (unseeded, so it changes from run to run).
        print(f"\nüéØ Questions for Improvement (Sample):")
        sample_incorrect = incorrect_questions.sample(min(5, len(incorrect_questions)))
        for number, (_, row) in enumerate(sample_incorrect.iterrows(), start=1):
            question = row['question'][:60] + "..." if len(row['question']) > 60 else row['question']
            print(f"  {number}. Q: {question}")
            print(f"     Expected: {row['ground_truth']}")
            print(f"     Got: {row['predicted_answer']}")
            print(f"     Level: {row.get('level', 'Unknown')}, Strategy: {row.get('strategy_used', 'Unknown')}")
            if isinstance(row['errors'], list) and row['errors']:
                print(f"     Error: {row['errors'][0]}")
            print()

# Performance optimization suggestions: each metric that misses its threshold
# contributes a fixed pair (or triple) of suggestions, in a fixed order.
print("\nüí° Performance Optimization Suggestions:")

optimization_suggestions = []

# Accuracy improvements
if achieved_accuracy < 0.50:
    optimization_suggestions.extend([
        "Consider using higher-performance models for complex questions",
        "Improve RAG context by adding more diverse examples",
        "Enhance prompt engineering with better GAIA formatting instructions",
    ])

# Speed improvements (avg_time is read again by the summary and report cells)
avg_time = summary.get('average_execution_time', 0)
if avg_time > 30:
    optimization_suggestions.extend([
        "Optimize agent selection logic to reduce unnecessary steps",
        "Consider using faster models for simple questions",
    ])

# Cost optimization
cost_per_question = budget_analysis.get('cost_per_question', 0)
if cost_per_question > 0.10:
    optimization_suggestions.extend([
        "Implement better model tiering to use cheaper models when appropriate",
        "Optimize retry logic to avoid unnecessary API calls",
    ])

# Error reduction
error_rate = summary.get('error_rate', 0)
if error_rate > 0.05:
    optimization_suggestions.extend([
        "Improve error handling and recovery mechanisms",
        "Add input validation to prevent malformed requests",
    ])

# Fallback optimization
fallback_rate = summary.get('fallback_usage_rate', 0)
if fallback_rate > 0.15:
    optimization_suggestions.extend([
        "Investigate and fix root causes of SmolagAgent failures",
        "Improve agent reliability and timeout handling",
    ])

if not optimization_suggestions:
    print("  ‚úÖ System is performing well across all metrics!")
else:
    for position, suggestion in enumerate(optimization_suggestions, start=1):
        print(f"  {position}. {suggestion}")

In [None]:
# Generate executive summary: fold four weighted components into one score
# and map that score to a letter grade.
print("\nüìã EXECUTIVE SUMMARY")
print("=" * 50)

# Each component is (actual, target, weight).
speed_score = 1 - min(avg_time / 60, 1)  # Normalized speed score
cost_score = min(budget_analysis.get('efficiency_score', 0) / 10, 1)
reliability_score = 1 - summary.get('error_rate', 0)  # Error rate inverted
grade_components = {
    "Accuracy": (achieved_accuracy, 0.45, 40),
    "Speed": (speed_score, 0.5, 20),
    "Cost Efficiency": (cost_score, 0.5, 20),
    "Reliability": (reliability_score, 0.95, 20),
}

overall_score = 0
total_weight = 0

print("Performance Component Analysis:")
for component, (actual, target, weight) in grade_components.items():
    # A component's score is its share of the target, capped at 100%.
    if target > 0:
        normalized_score = min(actual / target, 1.0)
    else:
        normalized_score = 0
    overall_score += normalized_score * weight
    total_weight += weight

    if normalized_score >= 0.8:
        status = "‚úÖ"
    elif normalized_score >= 0.6:
        status = "‚ö†Ô∏è"
    else:
        status = "‚ùå"
    print(f"  {status} {component}: {actual:.3f} (target: {target:.3f}) - Score: {normalized_score:.1%}")

final_grade = overall_score / total_weight

# The first cutoff the score reaches decides the letter; below all of them is "F".
letter_grade = "F"
for cutoff, letter in ((0.9, "A"), (0.8, "B"), (0.7, "C"), (0.6, "D")):
    if final_grade >= cutoff:
        letter_grade = letter
        break

target_label = '‚úÖ YES' if target_met else '‚ùå NO'
print("\nüéØ OVERALL ASSESSMENT:")
print(f"‚îú‚îÄ‚îÄ Final Score: {final_grade:.1%}")
print(f"‚îú‚îÄ‚îÄ Letter Grade: {letter_grade}")
print(f"‚îî‚îÄ‚îÄ GAIA Target Met: {target_label}")

# Final recommendation: deploy only when the grade is at least 80% and the
# GAIA target is met; a grade of at least 70% earns a monitored deployment.
print("\nüöÄ PRODUCTION DEPLOYMENT RECOMMENDATION:")

meets_deploy_bar = final_grade >= 0.8 and target_met
if meets_deploy_bar:
    recommendation = "üü¢ DEPLOY TO PRODUCTION"
    details = [
        "System meets GAIA accuracy targets",
        "Performance metrics are acceptable",
        "Ready for production workloads",
        f"Recommended configuration: {best_config_name}",
    ]
elif final_grade >= 0.7:
    recommendation = "üü° DEPLOY WITH MONITORING"
    details = [
        "System shows good performance but needs monitoring",
        "Consider implementing additional safeguards",
        "Monitor performance closely in production",
        "Plan for iterative improvements",
    ]
else:
    recommendation = "üî¥ FURTHER DEVELOPMENT NEEDED"
    details = [
        "System requires additional optimization",
        "Focus on accuracy and reliability improvements",
        "Consider alternative model configurations",
        "Conduct additional testing before deployment",
    ]

print(f"\n{recommendation}")
for detail in details:
    print(f"‚îú‚îÄ‚îÄ {detail}")

# Cost projection for production, scaled from the measured per-question cost.
print("\nüí∞ Production Cost Projection:")
unit_cost = budget_analysis.get('cost_per_question', 0)
questions_per_day = 100  # Estimate
daily_cost = questions_per_day * unit_cost
monthly_cost = daily_cost * 30  # 30-day month

print(f"‚îú‚îÄ‚îÄ Cost per question: ${unit_cost:.3f}")
print(f"‚îú‚îÄ‚îÄ Estimated daily cost (100 questions): ${daily_cost:.2f}")
print(f"‚îî‚îÄ‚îÄ Estimated monthly cost: ${monthly_cost:.2f}")

print("\nüìä Testing Session Complete!")
print(f"Results saved in: {production_framework.results_dir}")

In [None]:
# Save comprehensive testing report: gather the session's results into one
# dictionary and write it out as a timestamped JSON file.
session_metadata = {
    "timestamp": datetime.now().isoformat(),
    "configurations_tested": configs_to_compare,
    "best_configuration": best_config_name,
    "total_questions_tested": summary.get('total_questions_tested', 0),
}
performance_summary = {
    "overall_accuracy": achieved_accuracy,
    "gaia_target_met": target_met,
    "average_response_time": avg_time,
    "total_cost": summary.get('total_cost', 0),
    "error_rate": summary.get('error_rate', 0),
    "final_grade": final_grade,
    "letter_grade": letter_grade,
}
readiness_assessment = {
    "readiness_score": f"{readiness_score}/{total_criteria}",
    "readiness_percentage": readiness_score/total_criteria*100,
    "criteria_met": readiness_criteria,
}
report_data = {
    "session_metadata": session_metadata,
    "performance_summary": performance_summary,
    "detailed_analysis": analysis,
    "readiness_assessment": readiness_assessment,
    "recommendations": {
        "deployment_recommendation": recommendation,
        "optimization_suggestions": optimization_suggestions,
        "production_configuration": best_config_name,
    },
}

# Save to file
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
report_file = Path(f"gaia_evaluation_report_{timestamp}.json")

# default=str turns any value json cannot encode natively into its str() form.
report_file.write_text(json.dumps(report_data, indent=2, default=str))

print(f"üìã Comprehensive report saved: {report_file}")

# Create summary table for easy reference: one row describing the best
# configuration and the final verdict.
# The recommendation string is "<status emoji> <text>", so the emoji is the
# FIRST whitespace-separated token. Index 1 would give the word "DEPLOY" or
# "FURTHER", and "DEPLOY" cannot distinguish a clean deployment from a
# monitored one.
recommendation_status = recommendation.split()[0]

summary_df = pd.DataFrame([{
    "Configuration": best_config_name,
    "Accuracy": f"{achieved_accuracy:.1%}",
    "GAIA Target Met": "‚úÖ" if target_met else "‚ùå",
    "Avg Response Time": f"{avg_time:.2f}s",
    "Cost per Question": f"${budget_analysis.get('cost_per_question', 0):.3f}",
    "Final Grade": letter_grade,
    "Recommendation": recommendation_status,  # Extract status emoji
}])

print(f"\nüìä FINAL SUMMARY TABLE:")
print(summary_df.to_string(index=False))

print(f"\nüéâ GAIA Evaluation Complete!")
print(f"Your system is ready for the next phase of development.")