# GAIA Dataset Analysis
## Understanding the 165 GAIA validation examples

**Objective:** Analyze GAIA patterns and build RAG vector store  
**Output:** Tool priorities and FAISS index for agent development

---

In [None]:
# Setup and imports
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, OrderedDict
import random
import re
from pathlib import Path
from datetime import datetime
import os
from typing import Dict, List, Optional, Tuple
import warnings
# Install a global "ignore" filter: every warning raised after this point is
# suppressed (presumably to keep the notebook output readable).
warnings.filterwarnings('ignore')

# Set up plotting style: matplotlib's 'default' style with seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Print the notebook banner
print("üîç GAIA Dataset Analysis")
print("=" * 40)
print("Goal: Understand patterns and build vector store")
print("=" * 40)

# Section 1: Load & Explore GAIA Data

In [None]:
def load_gaia_metadata(file_path='metadata.jsonl'):
    """Load and parse the GAIA validation dataset from a JSON Lines file.

    Args:
        file_path: Path to a file holding one JSON object per line.
            Blank lines are ignored.

    Returns:
        A list with one parsed object per non-blank line. If ``file_path``
        does not exist, the result of ``create_sample_gaia_data()`` is
        returned instead. On any other error (for example a malformed line)
        the error is printed and an empty list is returned.
    """
    try:
        json_QA = []
        # Read as UTF-8 explicitly so parsing does not depend on the
        # platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank or trailing empty line is not a record; passing
                    # it to json.loads would abort the whole load.
                    continue
                json_QA.append(json.loads(line))

        print(f"‚úÖ Successfully loaded {len(json_QA)} GAIA examples")
        return json_QA
    except FileNotFoundError:
        # Report the path that was actually requested, not a fixed name.
        print(f"‚ùå {file_path} not found. Creating sample data for demonstration.")
        return create_sample_gaia_data()
    except Exception as e:
        print(f"‚ùå Error loading GAIA data: {e}")
        return []

def create_sample_gaia_data():
    """Return three hand-written GAIA-style records used when no dataset file is available.

    Each record carries the keys 'task_id', 'Question', 'Level',
    'Final answer', 'file_name' and 'Annotator Metadata' (with 'Steps' and
    'Tools'). A notice is printed before the list is returned.
    """
    def _record(task_id, question, level, answer, steps, tools, file_name=None):
        # Assemble one record with the same key layout for every sample.
        return {
            'task_id': task_id,
            'Question': question,
            'Level': level,
            'Final answer': answer,
            'file_name': file_name,
            'Annotator Metadata': {
                'Steps': steps,
                'Tools': tools,
            },
        }

    sample_data = [
        _record(
            'sample_001',
            'What is the population of Seattle according to the 2020 census?',
            1,
            '737015',
            'Search for Seattle population 2020 census data',
            'web browser\nsearch engine',
        ),
        _record(
            'sample_002',
            'Calculate the compound interest on $5000 at 3.5% annual rate for 10 years',
            2,
            '7052.78',
            'Use compound interest formula: A = P(1 + r)^t',
            'calculator',
        ),
        _record(
            'sample_003',
            'What is the average temperature in the attached Excel file?',
            1,
            '23.4',
            'Open Excel file, calculate average of temperature column',
            'excel\ncalculator',
            file_name='temperature_data.xlsx',
        ),
    ]
    print("üìù Using sample GAIA data for demonstration")
    return sample_data

# Load the dataset into the module-level `json_QA` list, which the analysis
# functions in the following cells read as a global.
json_QA = load_gaia_metadata()

In [None]:
def analyze_qa_patterns_for_retrieval():
    """
    Print summary statistics of the loaded Q&A pairs that matter for retrieval.

    Reads the module-level ``json_QA`` list and reports the level
    distribution, question lengths, answer lengths and how many records have
    a file attachment. Returns ``None``; nothing is printed when ``json_QA``
    is empty.
    """
    if not json_QA:
        return

    total = len(json_QA)

    def _level_order(level):
        # Records without a 'Level' are tallied under the string 'Unknown'.
        # Sorting ints together with that string raises TypeError, so numeric
        # levels are ordered first and anything else by its text.
        if isinstance(level, (int, float)):
            return (0, level, '')
        return (1, 0, str(level))

    print("üìä Q&A RETRIEVAL ANALYSIS")
    print("=" * 60)

    # Basic stats relevant for retrieval
    print(f"Total Q&A pairs: {total}")

    # Level distribution (affects retrieval quality)
    level_counts = Counter(q.get('Level', 'Unknown') for q in json_QA)
    print(f"\nLevel distribution (retrieval diversity):")
    for level in sorted(level_counts, key=_level_order):
        count = level_counts[level]
        print(f"  Level {level}: {count} questions ({count/total*100:.1f}%)")

    # Question length analysis (affects embedding quality).
    # `or ''` also covers records whose 'Question' is present but None.
    question_lengths = [len(q.get('Question') or '') for q in json_QA]
    print(f"\nQuestion length analysis:")
    print(f"  Average length: {np.mean(question_lengths):.0f} characters")
    print(f"  Min/Max: {min(question_lengths)}/{max(question_lengths)} characters")

    # Answer format analysis (what the LLM should produce)
    answer_lengths = [len(str(q.get('Final answer', ''))) for q in json_QA]
    print(f"\nAnswer format analysis:")
    print(f"  Average answer length: {np.mean(answer_lengths):.1f} characters")
    print(f"  Short answers (<10 chars): {sum(1 for l in answer_lengths if l < 10)}")
    print(f"  Long answers (>50 chars): {sum(1 for l in answer_lengths if l > 50)}")

    # File attachment analysis (affects retrieval context); an empty or
    # missing 'file_name' counts as "no file".
    files_present = sum(1 for q in json_QA if q.get('file_name'))
    text_only = total - files_present
    print(f"\nFile attachment distribution:")
    print(f"  With files: {files_present} ({files_present/total*100:.1f}%)")
    print(f"  Text-only: {text_only} ({text_only/total*100:.1f}%)")

def sample_qa_for_vector_store():
    """
    Print up to three randomly chosen records in the form used for the vector store.

    Records are drawn from the module-level ``json_QA`` list. For each one
    the "Question / Final answer" text is printed, followed by its source id,
    level and file name.
    """
    divider = "=" * 60

    print("\nüìù SAMPLE Q&A PAIRS FOR VECTOR STORE")
    print(divider)
    print("This is exactly what retriever will return:")
    print()

    how_many = min(3, len(json_QA))
    picks = random.sample(json_QA, how_many)

    for entry_no, record in enumerate(picks, start=1):
        # Pure Q&A text: this string is the stored document content
        question = record['Question']
        answer = record['Final answer']
        source = record.get('task_id', 'N/A')
        level = record.get('Level', 'N/A')
        attachment = record.get('file_name', 'None')

        print(f"Vector Store Entry {entry_no}:")
        print("-" * 40)
        print(f"Question : {question}\n\nFinal answer : {answer}")
        print(f"\nMetadata: {{'source': '{source}'}}")
        print(f"Level: {level} | File: {attachment}")
        print(divider)

# Run core analysis: print the dataset statistics, then show a few sample
# vector-store entries drawn from `json_QA`.
analyze_qa_patterns_for_retrieval()
sample_qa_for_vector_store()

# Section 2: Tool Usage Analysis

In [None]:
import re
from collections import Counter, OrderedDict

def analyze_tool_usage_fixed():
    """Count how often each (normalized) tool is named in the annotator metadata.

    Reads the module-level ``json_QA`` list. For every record that has
    ``'Annotator Metadata'`` with a ``'Tools'`` entry, the tools text is split
    into lines, list markers are stripped, and each name is normalized
    (lower-cased, numbering / leading article / parenthesized text removed,
    known variants mapped to a canonical name). Entries that normalize to
    ``'none'`` or to an empty string are ignored. A ranked report is printed.

    Returns:
        A ``(tools_counter, tool_details)`` tuple:
        ``tools_counter`` is an ``OrderedDict`` mapping tool name to count,
        most frequent first; ``tool_details`` is a list of dicts with keys
        'tool', 'original', 'question_id', 'level' and 'has_file', one per
        counted tool mention. When ``json_QA`` is empty the tuple is
        ``(OrderedDict(), [])`` so callers can always unpack two values.
    """
    if not json_QA:
        # Same two-value shape as the normal return; returning a bare {} made
        # `tools_counter, tool_details = analyze_tool_usage_fixed()` fail.
        return OrderedDict(), []

    # Canonical name -> variants. Built once per call instead of once per tool.
    # NOTE: matching is by substring and the first matching canonical wins, so
    # 'media player' always resolves to 'audio player', never 'video player'.
    normalizations = {
        'web browser': ['browser', 'web browsers', 'internet browser'],
        'search engine': ['search engines', 'google search', 'web search'],
        'calculator': ['math calculator', 'calculations', 'calculation tool'],
        'excel': ['microsoft excel', 'spreadsheet', 'ms excel'],
        'pdf viewer': ['pdf reader', 'pdf access', 'pdf'],
        'image recognition': ['image recognition tools', 'image analysis', 'image processing'],
        'text editor': ['word processor', 'text processing'],
        'file manager': ['file explorer', 'file system'],
        'audio player': ['music player', 'media player'],
        'video player': ['video viewer', 'media player']
    }

    def normalize_tool_name(tool):
        """Normalize tool names to remove duplicates"""
        # Convert to lowercase
        tool = tool.lower().strip()

        # Remove numbered prefixes (1., 2., 3., etc.)
        tool = re.sub(r'^\d+\.\s*', '', tool)

        # Remove articles (a, an, the)
        tool = re.sub(r'^(a|an|the)\s+', '', tool)

        # Remove parentheses and content inside
        tool = re.sub(r'\([^)]*\)', '', tool)

        # Remove extra whitespace
        tool = ' '.join(tool.split())

        # An exact variant match is also a substring match, so one test suffices.
        for canonical, variants in normalizations.items():
            if any(variant in tool for variant in variants):
                return canonical

        return tool

    def _level_order(item):
        # Levels may be ints or None (record without 'Level'); order numbers
        # first and everything else by its text so sorting never raises.
        level = item[0]
        if isinstance(level, (int, float)):
            return (0, level, '')
        return (1, 0, str(level))

    tools = []
    tool_details = []
    levels_by_tool = {}  # tool name -> Counter of levels, filled in the same pass

    for sample in json_QA:
        if 'Annotator Metadata' in sample and 'Tools' in sample['Annotator Metadata']:
            tools_text = sample['Annotator Metadata']['Tools']

            # Parse tools (handle different formats): one tool per line
            for tool_line in tools_text.split('\n'):
                tool = tool_line.strip()

                # Skip empty lines
                if not tool:
                    continue

                # Remove bullet points and list markers
                tool = re.sub(r'^[-‚Ä¢*]\s*', '', tool)

                # Normalize the tool name
                normalized_tool = normalize_tool_name(tool)

                if normalized_tool and normalized_tool != 'none':
                    level = sample.get('Level')
                    tools.append(normalized_tool)
                    tool_details.append({
                        'tool': normalized_tool,
                        'original': tool,
                        'question_id': sample.get('task_id'),
                        'level': level,
                        # Truthiness, as in the other cells: an empty-string
                        # file name means "no attachment".
                        'has_file': bool(sample.get('file_name'))
                    })
                    levels_by_tool.setdefault(normalized_tool, Counter())[level] += 1

    # Count tool frequencies
    tools_counter = OrderedDict(Counter(tools).most_common())

    print("üéØ Fixed Tool Usage Priority Analysis:")
    print(f"Total tool instances: {len(tools)}")
    print(f"Unique tools identified: {len(tools_counter)}")
    print("\nüìä Implementation Priority (by frequency):")

    for i, (tool, count) in enumerate(tools_counter.items(), 1):
        if count >= 20:
            priority = "üî¥ CRITICAL"
        elif count >= 10:
            priority = "üü† HIGH"
        elif count >= 5:
            priority = "üü° MEDIUM"
        else:
            priority = "üü¢ LOW"

        print(f"  {i:2d}. {tool:<25} : {count:3d} occurrences {priority}")

        if i <= 10:  # Show top 10 details
            # Show which levels use this tool most
            level_usage = levels_by_tool.get(tool, {})
            level_str = ", ".join(
                f"L{k}:{v}" for k, v in sorted(level_usage.items(), key=_level_order)
            )
            print(f"      ‚îî‚îÄ‚îÄ Level usage: {level_str}")

    return tools_counter, tool_details

def create_implementation_roadmap(tools_counter):
    """Print a three-phase roadmap plus a suggested implementation per tool.

    Tools are bucketed by usage count (>=20 critical, 10-19 high, 5-9 medium);
    tools used fewer than five times appear in no phase. The first eight
    entries of ``tools_counter`` are then listed next to a recommended
    implementation name, falling back to ``Custom<ToolName>Tool`` for tools
    without a predefined mapping.
    """
    print("\nüöÄ GAIA Agent Implementation Roadmap:")
    print("=" * 50)

    # Bucket the tools by frequency, keeping the counter's ordering
    ranked = list(tools_counter.items())
    critical_tools = [(name, uses) for name, uses in ranked if uses >= 20]
    high_tools = [(name, uses) for name, uses in ranked if 10 <= uses < 20]
    medium_tools = [(name, uses) for name, uses in ranked if 5 <= uses < 10]

    print("üî¥ PHASE 1 - CRITICAL (implement first):")
    for name, uses in critical_tools:
        print(f"  ‚úÖ {name} ({uses} uses)")

    print("\nüü† PHASE 2 - HIGH PRIORITY:")
    for name, uses in high_tools:
        print(f"  üîß {name} ({uses} uses)")

    print("\nüü° PHASE 3 - MEDIUM PRIORITY:")
    for name, uses in medium_tools:
        print(f"  ‚öôÔ∏è {name} ({uses} uses)")

    # Suggested concrete implementations for the known tool names
    print("\nüõ†Ô∏è RECOMMENDED TOOL MAPPING:")
    tool_mapping = {
        'web browser': 'ContentRetrieverTool + WebDriverTool',
        'search engine': 'GoogleSearchTool + SerperTool',
        'calculator': 'GAIACalculatorTool + PythonREPL',
        'excel': 'GetAttachmentTool + PandasTool',
        'pdf viewer': 'ContentRetrieverTool + PyPDFTool',
        'image recognition': 'VisionTool + ImageAnalysisTool',
        'text editor': 'TextProcessingTool',
        'file manager': 'GetAttachmentTool + FileSystemTool'
    }

    for name, _ in ranked[:8]:
        if name in tool_mapping:
            implementation = tool_mapping[name]
        else:
            # No predefined mapping: derive a placeholder class name
            implementation = f"Custom{name.title().replace(' ', '')}Tool"
        print(f"  {name:<20} ‚Üí {implementation}")

# Run the fixed analysis and derive the roadmap from its frequency table
tools_counter, tool_details = analyze_tool_usage_fixed()
create_implementation_roadmap(tools_counter)

# Show some examples of what was normalized
print("\nüîç Normalization Examples:")
unique_originals = {}
for detail in tool_details[:20]:  # Only the first 20 parsed entries are inspected
    tool, original = detail['tool'], detail['original']
    # Collect each distinct original spelling once per normalized tool name
    spellings = unique_originals.setdefault(tool, [])
    if original not in spellings:
        spellings.append(original)

# Of the first five tools, print those seen under more than one spelling
for tool, originals in list(unique_originals.items())[:5]:
    if len(originals) <= 1:
        continue
    print(f"  '{tool}' ‚Üê {originals}")

In [None]:
# Create visualization of tool usage
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch

def create_tool_usage_visualization(tools_counter):
    """Draw a two-panel figure of tool usage and print a text summary.

    The left panel is a horizontal bar chart of the first 15 entries of
    ``tools_counter`` colored by priority tier; the right panel is a pie chart
    of how many tools fall into each tier. Tiers: >=20 critical, 10-19 high,
    5-9 medium, <5 low. Prints a notice and returns when ``tools_counter`` is
    empty.
    """
    if not tools_counter:
        print("‚ùå No tools data to visualize")
        return

    def _tier_color(uses):
        # Bar color for a usage count, by priority tier
        if uses >= 20:
            return '#DC2626'  # Red - CRITICAL
        if uses >= 10:
            return '#F59E0B'  # Orange - HIGH
        if uses >= 5:
            return '#10B981'  # Green - MEDIUM
        return '#6B7280'  # Gray - LOW

    # One figure, two side-by-side axes
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 10))

    # ===== MAIN BAR CHART =====
    leading = list(tools_counter.items())[:15]
    tool_names = [name for name, _ in leading]
    tool_counts = [uses for _, uses in leading]
    bar_colors = [_tier_color(uses) for uses in tool_counts]
    rows = range(len(tool_names))

    bars = ax1.barh(rows, tool_counts, color=bar_colors, alpha=0.8)
    ax1.set_yticks(rows)
    ax1.set_yticklabels(tool_names, fontsize=10)
    ax1.set_xlabel('Usage Frequency', fontsize=12, fontweight='bold')
    ax1.set_title('GAIA Tool Usage Analysis\n(Normalized & Cleaned)', fontsize=14, fontweight='bold')
    ax1.invert_yaxis()

    # Count labels: inside the bar (white) when it is wider than 10,
    # otherwise just past its end (black)
    for bar, count in zip(bars, tool_counts):
        width = bar.get_width()
        fits_inside = width > 10
        if fits_inside:
            label_x, align, label_color = width - 2, 'right', 'white'
        else:
            label_x, align, label_color = width + 0.5, 'left', 'black'

        ax1.text(label_x, bar.get_y() + bar.get_height()/2,
                 str(count), va='center', ha=align,
                 fontweight='bold', color=label_color, fontsize=9)

    # Shaded vertical bands marking the priority zones
    ax1.axvspan(20, max(tool_counts) + 5, alpha=0.1, color='red', label='Critical Zone')
    ax1.axvspan(10, 20, alpha=0.1, color='orange', label='High Zone')
    ax1.axvspan(5, 10, alpha=0.1, color='green', label='Medium Zone')

    # Legend built from explicit patches, one per tier
    legend_elements = [
        Patch(facecolor='#DC2626', label='üî¥ CRITICAL (‚â•20 uses)'),
        Patch(facecolor='#F59E0B', label='üü† HIGH (10-19 uses)'),
        Patch(facecolor='#10B981', label='üü° MEDIUM (5-9 uses)'),
        Patch(facecolor='#6B7280', label='üü¢ LOW (<5 uses)')
    ]
    ax1.legend(handles=legend_elements, loc='lower right', fontsize=10)

    # ===== IMPLEMENTATION PRIORITY PIE CHART =====
    # Tier sizes are computed over ALL tools, not just the 15 plotted above
    all_counts = list(tools_counter.values())
    critical_count = sum(1 for uses in all_counts if uses >= 20)
    high_count = sum(1 for uses in all_counts if 10 <= uses < 20)
    medium_count = sum(1 for uses in all_counts if 5 <= uses < 10)
    low_count = sum(1 for uses in all_counts if uses < 5)

    tiers = [
        ('CRITICAL', critical_count, '#DC2626'),
        ('HIGH', high_count, '#F59E0B'),
        ('MEDIUM', medium_count, '#10B981'),
        ('LOW', low_count, '#6B7280'),
    ]

    # Only show non-zero segments
    shown = [tier for tier in tiers if tier[1] > 0]

    if shown:
        labels, counts, wedge_colors = zip(*shown)

        wedges, texts, autotexts = ax2.pie(counts, labels=labels, colors=wedge_colors, autopct='%1.0f%%',
                                           startangle=90, textprops={'fontsize': 10})

        # Make the percentage labels readable on the colored wedges
        for autotext in autotexts:
            autotext.set_color('white')
            autotext.set_fontweight('bold')

        ax2.set_title('Implementation Priority Distribution\n(Tool Count by Priority)',
                      fontsize=12, fontweight='bold')

        # Total number of distinct tools, drawn at the pie's center
        total_tools = len(tools_counter)
        ax2.text(0, 0, f'{total_tools}\nTotal\nTools', ha='center', va='center',
                 fontsize=14, fontweight='bold',
                 bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))

    plt.tight_layout()
    plt.show()

    # ===== SUMMARY STATS =====
    total_instances = sum(tools_counter.values())

    print("\nüìä VISUALIZATION SUMMARY:")
    print("=" * 40)
    print(f"Total unique tools: {len(tools_counter)}")
    print(f"Total tool instances: {total_instances}")
    print(f"üî¥ Critical tools (‚â•20): {critical_count}")
    print(f"üü† High priority (10-19): {high_count}")
    print(f"üü° Medium priority (5-9): {medium_count}")
    print(f"üü¢ Low priority (<5): {low_count}")

    # Top five tools with their share of all tool mentions
    print(f"\nüéØ TOP 5 TOOLS (% of total usage):")
    for rank, (tool, count) in enumerate(list(tools_counter.items())[:5], 1):
        percentage = (count / total_instances) * 100
        print(f"  {rank}. {tool:<20}: {count:3d} uses ({percentage:5.1f}%)")

def create_level_breakdown_chart(tool_details):
    """Plot a stacked bar chart of the ten most-mentioned tools, split by level.

    ``tool_details`` is a list of dicts that each have a 'tool' key and
    (optionally) a 'level' key. Does nothing when the list is empty.
    """
    if not tool_details:
        return

    from collections import Counter

    # level -> {tool -> number of mentions}
    level_tool_usage = {}
    for detail in tool_details:
        per_level = level_tool_usage.setdefault(detail.get('level', 'Unknown'), {})
        name = detail['tool']
        per_level[name] = per_level.get(name, 0) + 1

    plt.figure(figsize=(14, 8))

    # The ten most frequent tools across all levels define the x axis
    mention_counts = Counter(detail['tool'] for detail in tool_details)
    top_10_tools = [name for name, _ in mention_counts.most_common(10)]

    # One row of per-tool counts for every level, in sorted level order
    levels = sorted(level_tool_usage)
    level_data = {
        level: [level_tool_usage[level].get(name, 0) for name in top_10_tools]
        for level in levels
    }

    # Stack the levels on top of each other; `bottom` tracks the running height
    bottom = np.zeros(len(top_10_tools))
    colors_level = ['#EF4444', '#F59E0B', '#10B981']  # cycled when there are more than three levels

    for position, level in enumerate(levels):
        shade = colors_level[position % len(colors_level)]
        plt.bar(top_10_tools, level_data[level], bottom=bottom,
                label=f'Level {level}', color=shade, alpha=0.8)
        bottom += level_data[level]

    plt.xlabel('Tools', fontsize=12, fontweight='bold')
    plt.ylabel('Usage Count', fontsize=12, fontweight='bold')
    plt.title('Tool Usage by GAIA Difficulty Level\n(Top 10 Tools)', fontsize=14, fontweight='bold')
    plt.xticks(rotation=45, ha='right')
    plt.legend(title='GAIA Level', fontsize=10)
    plt.tight_layout()
    plt.show()

# Draw the charts only if the tool analysis cell has already produced
# non-empty results in this session.
if globals().get('tools_counter'):
    print("üé® Creating enhanced visualizations...")
    create_tool_usage_visualization(tools_counter)

    # The per-level chart additionally needs the detailed mention list
    if globals().get('tool_details'):
        create_level_breakdown_chart(tool_details)
else:
    print("‚ùå Run the fixed tool analysis first to generate visualizations")
    print("Execute: tools_counter, tool_details = analyze_tool_usage_fixed()")

In [None]:
def generate_implementation_recommendations(tools_counter):
    """Print tool recommendations grouped by usage, plus attachment file types.

    Tools with at least 10 mentions are listed as essential, 5-9 as important,
    and the first five of the remainder as optional. Afterwards the file
    extensions of all attachments in the module-level ``json_QA`` list are
    tallied and printed. Returns ``None``; does nothing when ``tools_counter``
    is empty.
    """
    if not tools_counter:
        return

    print("\nüí° Implementation Recommendations:")
    print("=" * 45)

    # Single pass over the counter, preserving its ordering inside each group
    essential, important, optional = [], [], []
    for tool, count in tools_counter.items():
        if count >= 10:
            essential.append((tool, count))
        elif count >= 5:
            important.append((tool, count))
        else:
            optional.append((tool, count))

    sections = (
        (f"üî¥ ESSENTIAL TOOLS (Implement First):", essential),
        (f"\nüü° IMPORTANT TOOLS (Implement Second):", important),
        (f"\nüü¢ OPTIONAL TOOLS (If Budget Allows):", optional[:5]),  # top 5 optional only
    )
    for heading, entries in sections:
        print(heading)
        for tool, count in entries:
            print(f"  ‚îú‚îÄ‚îÄ {tool}: {count} occurrences")

    # File type analysis: count lower-cased suffixes of attached file names
    print(f"\nüìÅ File Processing Requirements:")
    attached_names = [q.get('file_name', '') for q in json_QA if q.get('file_name')]
    ext_counts = Counter(
        Path(name).suffix.lower() for name in attached_names if '.' in name
    )
    for ext, count in ext_counts.most_common():
        print(f"  ‚îú‚îÄ‚îÄ {ext}: {count} files")

# Generate recommendations from the `tools_counter` produced by the tool
# usage analysis cell above.
generate_implementation_recommendations(tools_counter)

# Section 3: Load the Weaviate Vector Store and Retrieve Question-Answer Samples

In [None]:
# Check for running instances of Weaviate
def cleanup_weaviate_connections():
    """Best-effort cleanup: connect to the local Weaviate instance and close the client.

    Opens a client against localhost (HTTP port 8080, gRPC port 50051) and
    closes it immediately, printing a confirmation. Any ordinary failure
    (the ``weaviate`` package missing, nothing listening, close failing) is
    deliberately ignored. Interpreter-level signals such as
    ``KeyboardInterrupt`` are NOT swallowed.
    """
    try:
        # Try to connect to existing instance and close it
        import weaviate
        client = weaviate.connect_to_local(port=8080, grpc_port=50051)
        if client:
            client.close()
            print("‚úÖ Closed existing Weaviate connection")
    except Exception:
        # No existing connection to close. `Exception` rather than a bare
        # `except` so Ctrl-C / SystemExit still interrupt the notebook.
        pass

# Run cleanup first, before any retriever in the following cells connects
cleanup_weaviate_connections()

# Wait a moment for ports to be released (fixed 2-second pause)
import time
time.sleep(2)

In [None]:
import pandas as pd
import json
import time
import base64
from dev_retriever import load_gaia_retriever, DevelopmentGAIARetriever
from langchain_core.messages import HumanMessage  # Correct import
import random
import numpy as np

In [None]:
from dev_retriever import load_gaia_retriever

# Quick setup
retriever = load_gaia_retriever()

if retriever:
    # Test it works
    retriever.test_retrieval()

    # Try a search; only index into the results when something came back
    results = retriever.search("Calculate compound interest")
    if results:
        print(results[0].page_content)
    else:
        print("‚ùå No results found")
else:
    # The test cells below guard with `if retriever:`, so a falsy retriever
    # is tolerated here as well instead of raising AttributeError.
    results = []
    print("‚ùå No retriever available")

In [None]:
def test_basic_setup(csv_path='gaia_embeddings.csv'):
    """Check that the embeddings CSV exists and has the expected layout.

    Args:
        csv_path: Location of the embeddings CSV. Defaults to
            'gaia_embeddings.csv' in the current working directory.

    Returns:
        True when the file exists, has the columns 'content', 'source' and
        'embedding_b64', contains at least one row, and the first row's
        'embedding_b64' value decodes from base64 into a float32 vector.
        False otherwise; the reason is printed.
    """
    print("üß™ TEST 1: Basic Setup and Connection")
    print("=" * 50)

    # Check if CSV exists
    import os
    if not os.path.exists(csv_path):
        print(f"‚ùå {csv_path} not found!")
        print("üí° Run 'python build_vectorstore.py' first")
        return False

    # Load CSV and check structure
    try:
        df = pd.read_csv(csv_path)
        print(f"‚úÖ CSV loaded: {len(df)} documents")

        # Check required columns for OPTIMIZED format
        required_cols = ['content', 'source', 'embedding_b64']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            print(f"‚ùå Missing columns: {missing_cols}")
            print(f"Available columns: {list(df.columns)}")
            return False

        print("‚úÖ All required columns present")

        # The checks below sample the first row; report an empty file
        # explicitly rather than via an out-of-bounds indexing error.
        if df.empty:
            print("‚ùå CSV contains no documents")
            return False

        first_row = df.iloc[0]

        # Test optimized embedding format: base64 text -> raw bytes -> float32 vector
        embedding_bytes = base64.b64decode(first_row['embedding_b64'])
        embedding_vector = np.frombuffer(embedding_bytes, dtype=np.float32)
        print(f"‚úÖ Optimized embedding dimension: {len(embedding_vector)}")

        # Test simplified metadata format
        sample_source = first_row['source']
        print(f"‚úÖ Metadata format: {{'source': '{sample_source}'}}")

        return True

    except Exception as e:
        # Any read/decode problem counts as a failed check, not a crash
        print(f"‚ùå CSV test failed: {e}")
        return False

# Run basic setup test and keep its boolean outcome in `setup_success`
setup_success = test_basic_setup()

In [None]:
def test_basic_search(retriever):
    """Run four canned queries through ``retriever.search`` and report the hit rate.

    Each query asks for a single result (``k=1``). A query counts as a
    success when the search returns a non-empty result; errors raised by a
    search are printed and counted as failures.

    Returns:
        True when more than half of the queries succeed, otherwise False
        (also False when ``retriever`` is falsy).
    """
    print("\nüß™ TEST 2: Basic Search Functionality")
    print("=" * 50)

    if not retriever:
        print("‚ùå No retriever available")
        return False

    test_queries = [
        "Calculate compound interest",
        "What is the population of a city?",
        "Analyze data in Excel file",
        "Find information about scientific research"
    ]

    hits = 0
    for number, query in enumerate(test_queries, start=1):
        print(f"\nüîç Test Query {number}: '{query}'")

        try:
            # Time only the search call itself
            started = time.time()
            results = retriever.search(query, k=1)
            elapsed = time.time() - started

            if not results:
                print("‚ùå No results found")
                continue

            print(f"‚úÖ Found result in {elapsed:.3f}s")
            top = results[0]
            print(f"   Content preview: {top.page_content[:100]}...")
            print(f"   Source: {top.metadata.get('source', 'N/A')}")
            hits += 1

        except Exception as e:
            print(f"‚ùå Search error: {e}")

    success_rate = hits / len(test_queries)
    print(f"\nüìä Search Success Rate: {hits}/{len(test_queries)} ({success_rate:.1%})")

    return success_rate > 0.5

# Run basic search test only when a retriever was created; otherwise record
# the test as failed so `search_success` is always defined.
if retriever:
    search_success = test_basic_search(retriever)
else:
    print("‚è≠Ô∏è Skipping search test - no retriever")
    search_success = False

In [None]:
def test_example_student_interface(retriever):
    """Exercise ``retriever.retriever_node`` with three sample questions.

    Each question is wrapped in a single ``HumanMessage`` and passed to
    ``retriever.retriever_node``. A question passes when the returned mapping
    has more than one entry under "messages" and the last message's content
    contains the expected reference-example sentence. Exceptions are printed
    and counted as failures.

    Returns:
        True when more than half of the questions pass, otherwise False
        (also False when ``retriever`` is falsy).
    """
    print("\nüß™ TEST 3: Example Student's Retriever Interface")
    print("=" * 50)

    if not retriever:
        print("‚ùå No retriever available")
        return False

    # Questions standing in for the agent's incoming state messages
    test_questions = [
        "How do I calculate the area of a circle?",
        "What's the GDP of France?",
        "How can I analyze this Excel spreadsheet?"
    ]
    expected_marker = "Here I provide a similar question and answer for reference:"

    passed = 0
    for number, question in enumerate(test_questions, start=1):
        print(f"\nüîÑ Testing Example Student Interface {number}")
        print(f"Question: '{question}'")

        try:
            # The node receives the message list and returns a dict of messages
            state_messages = [HumanMessage(content=question)]
            result = retriever.retriever_node(state_messages)

            has_example = "messages" in result and len(result["messages"]) > 1
            if not has_example:
                print("‚ùå No example returned")
                continue

            # The retrieved example is expected to be the last message
            example_msg = result["messages"][-1]
            print("‚úÖ Retriever node works")
            print(f"   Retrieved example length: {len(example_msg.content)} chars")

            if expected_marker not in example_msg.content:
                print("‚ùå Incorrect format")
                continue

            print("‚úÖ Correct example student format")
            passed += 1

        except Exception as e:
            print(f"‚ùå Interface error: {e}")

    success_rate = passed / len(test_questions)
    print(f"\nüìä Interface Success Rate: {passed}/{len(test_questions)} ({success_rate:.1%})")

    return success_rate > 0.5

# Run example student interface test only when a retriever was created;
# otherwise record the test as failed so `interface_success` is always defined.
if retriever:
    interface_success = test_example_student_interface(retriever)
else:
    print("‚è≠Ô∏è Skipping interface test - no retriever")
    interface_success = False

In [None]:
from langchain_core.messages import HumanMessage
def test_example_interface(retriever):
    """Check the ``retriever.retriever_node`` interface on three sample questions.

    For each question a one-element ``HumanMessage`` list is handed to
    ``retriever.retriever_node``. The question passes when the result holds
    more than one item under "messages" and the final message's content
    includes the expected reference-example sentence. Any exception is
    printed and treated as a failure.

    Returns:
        True when more than half of the questions pass, otherwise False
        (also False when ``retriever`` is falsy).
    """
    print("\nüß™ TEST 4: Example Student's Retriever Interface")
    print("=" * 50)

    if not retriever:
        print("‚ùå No retriever available")
        return False

    def _question_passes(question):
        # Run one question through the retriever node and validate the reply
        try:
            result = retriever.retriever_node([HumanMessage(content=question)])

            if not ("messages" in result and len(result["messages"]) > 1):
                print("‚ùå No example returned")
                return False

            # The retrieved example is expected to be the last message
            example_msg = result["messages"][-1]
            print("‚úÖ Retriever node works")
            print(f"   Retrieved example length: {len(example_msg.content)} chars")

            if "Here I provide a similar question and answer for reference:" in example_msg.content:
                print("‚úÖ Correct example student format")
                return True

            print("‚ùå Incorrect format")
            return False

        except Exception as e:
            print(f"‚ùå Interface error: {e}")
            return False

    # Questions standing in for the agent's incoming state messages
    test_questions = [
        "How do I calculate the area of a circle?",
        "What's the GDP of France?",
        "How can I analyze this Excel spreadsheet?"
    ]

    interface_success = 0
    for index, question in enumerate(test_questions, 1):
        print(f"\nüîÑ Testing Example Student Interface {index}")
        print(f"Question: '{question}'")
        if _question_passes(question):
            interface_success += 1

    total = len(test_questions)
    success_rate = interface_success / total
    print(f"\nüìä Interface Success Rate: {interface_success}/{total} ({success_rate:.1%})")

    return success_rate > 0.5

# Run example interface test only when a retriever was created. Note that the
# result overwrites the `interface_success` value set by the previous cell.
if retriever:
    interface_success = test_example_interface(retriever)
else:
    print("‚è≠Ô∏è Skipping interface test - no retriever")
    interface_success = False

In [None]:
def test_retrieval_quality(retriever):
    """Test quality of retrieved examples"""
    print("\nüß™ TEST 5: Retrieval Quality Assessment")
    print("=" * 50)
    
    if not retriever:
        print("‚ùå No retriever available")
        return False
    
    # Load original Q&A for comparison
    try:
        with open('metadata.jsonl', 'r') as f:
            original_qa = [json.loads(line) for line in f]
    except:
        print("‚ùå Cannot load original Q&A for comparison")
        return False
    
    quality_scores = []
    
    # Test 1: Same questions (should find exact matches)
    print("üéØ Test 1: Exact retrieval (using original questions)")
    sample_questions = random.sample(original_qa, min(3, len(original_qa)))
    
    for i, sample in enumerate(sample_questions, 1):
        question = sample['Question']
        
        print(f"\nüîç Exact Test {i}")
        print(f"Query: {question[:80]}...")
        
        try:
            results = retriever.search(question, k=1)
            
            if results:
                result_content = results[0].page_content
                
                # Check for exact match (this is GOOD!)
                if question in result_content:
                    print("‚úÖ Perfect exact match found")
                    quality_scores.append(1.0)
                else:
                    print("‚ö†Ô∏è  No exact match - might be similarity threshold issue")
                    quality_scores.append(0.7)
                
                print(f"   Retrieved: {results[0].page_content[:100]}...")
                
            else:
                print("‚ùå No results retrieved")
                quality_scores.append(0.0)
                
        except Exception as e:
            print(f"‚ùå Search error: {e}")
            quality_scores.append(0.0)
    
    # Test 2: Similar questions (test generalization)
    print("\nüéØ Test 2: Similarity retrieval (using modified questions)")
    
    similarity_tests = [
        {
            "original": "Calculate compound interest on $5000 at 3% for 10 years",
            "modified": "How do I compute compound interest for an investment?",
            "expected_keywords": ["interest", "calculate", "compound"]
        },
        {
            "original": "What is the population of Seattle?", 
            "modified": "How many people live in a major US city?",
            "expected_keywords": ["population", "city", "people"]
        }
    ]
    
    for i, test in enumerate(similarity_tests, 1):
        print(f"\nüîç Similarity Test {i}")
        print(f"Query: {test['modified']}")
        
        try:
            results = retriever.search(test['modified'], k=3)
            
            if results:
                # Check if retrieved results are relevant
                found_relevant = False
                
                for result in results:
                    result_content = result.page_content.lower()
                    
                    # Check for keyword relevance
                    keyword_matches = sum(1 for keyword in test['expected_keywords'] 
                                        if keyword.lower() in result_content)
                    
                    if keyword_matches >= 1:
                        found_relevant = True
                        break
                
                if found_relevant:
                    print("‚úÖ Found relevant similar example")
                    quality_scores.append(1.0)
                else:
                    print("‚ö†Ô∏è  Retrieved example seems unrelated")
                    quality_scores.append(0.3)
                
                print(f"   Retrieved: {results[0].page_content[:100]}...")
                
            else:
                print("‚ùå No results retrieved")
                quality_scores.append(0.0)
                
        except Exception as e:
            print(f"‚ùå Search error: {e}")
            quality_scores.append(0.0)
    
    avg_quality = np.mean(quality_scores) if quality_scores else 0
    print(f"\nüìä Average Quality Score: {avg_quality:.2f}/1.0")
    
    # Interpretation
    if avg_quality >= 0.9:
        print("üéâ Excellent retrieval quality!")
    elif avg_quality >= 0.7:
        print("‚úÖ Good retrieval quality")
    elif avg_quality >= 0.5:
        print("‚ö†Ô∏è  Acceptable retrieval quality")
    else:
        print("‚ùå Poor retrieval quality - needs investigation")
    
    return avg_quality > 0.6

# Score retrieval quality. Without a retriever the assessment is skipped
# and recorded as not passed.
if not retriever:
    print("‚è≠Ô∏è Skipping quality test - no retriever")
    quality_success = False
else:
    quality_success = test_retrieval_quality(retriever)

In [None]:
def test_performance_benchmarks(retriever):
    """Test retrieval performance and speed"""
    print("\nüß™ TEST 6: Performance Benchmarks")
    print("=" * 50)
    
    if not retriever:
        print("‚ùå No retriever available")
        return False
    
    # Speed test
    test_queries = [
        "Calculate compound interest",
        "Population data analysis", 
        "Excel spreadsheet processing",
        "Scientific research information",
        "Mathematical computation"
    ]
    
    search_times = []
    
    print("üöÄ Speed Test:")
    for query in test_queries:
        try:
            start_time = time.time()
            results = retriever.search(query, k=1)
            search_time = time.time() - start_time
            search_times.append(search_time)
            print(f"  {query[:30]:<30}: {search_time:.3f}s")
        except Exception as e:
            print(f"  {query[:30]:<30}: ERROR ({e})")
    
    if search_times:
        avg_time = np.mean(search_times)
        max_time = max(search_times)
        min_time = min(search_times)
        
        print(f"\nüìä Performance Results:")
        print(f"  Average search time: {avg_time:.3f}s")
        print(f"  Min/Max time: {min_time:.3f}s / {max_time:.3f}s")
        
        # Performance assessment
        if avg_time < 0.1:
            print("‚úÖ Excellent performance (<100ms)")
            return True
        elif avg_time < 0.5:
            print("‚úÖ Good performance (<500ms)")
            return True
        else:
            print("‚ö†Ô∏è  Slow performance (>500ms)")
            return False
    else:
        print("‚ùå No successful searches for performance test")
        return False

# Benchmark search latency. Without a retriever the benchmark is skipped
# and recorded as not passed.
if not retriever:
    print("‚è≠Ô∏è Skipping performance test - no retriever")
    performance_success = False
else:
    performance_success = test_performance_benchmarks(retriever)

In [None]:
def print_test_summary():
    """Print comprehensive test summary.

    Reads the result flags that the individual test cells store as module
    globals (``setup_success``, ``retriever``, ``search_success``,
    ``interface_success``, ``quality_success``, ``performance_success``).
    A flag that was never set counts as a failure. Prints one PASS/FAIL row
    per test followed by an overall verdict. Returns nothing.
    """
    print("\n" + "=" * 60)
    print("üéØ WEAVIATE VECTOR STORE TEST SUMMARY")
    print("=" * 60)

    # A missing name means the corresponding cell was never run.
    state = globals()
    tests = [
        ("CSV Setup", state.get('setup_success', False)),
        ("Retriever Init", state.get('retriever') is not None),
        ("Basic Search", state.get('search_success', False)),
        ("Example Student Interface", state.get('interface_success', False)),
        ("Retrieval Quality", state.get('quality_success', False)),
        ("Performance", state.get('performance_success', False)),
    ]

    passed_tests = sum(1 for _, success in tests if success)
    total_tests = len(tests)
    mostly_passing = passed_tests >= total_tests * 0.8

    print(f"Test Results: {passed_tests}/{total_tests} passed")
    print()

    # Pad to the longest label (never below the historical width of 20)
    # so the status column stays aligned for every row.
    label_width = max(20, max(len(test_name) for test_name, _ in tests))
    for test_name, success in tests:
        status = "‚úÖ PASS" if success else "‚ùå FAIL"
        print(f"  {test_name:<{label_width}}: {status}")

    print()

    if passed_tests == total_tests:
        print("üéâ ALL TESTS PASSED!")
        print("‚úÖ Vector store is ready for agent integration")
        print("‚úÖ Compatible with example student's approach")
        print("‚úÖ Production ready")
    elif mostly_passing:
        print("‚ö†Ô∏è  MOSTLY WORKING")
        print("‚úÖ Core functionality works")
        print("üí° Minor issues to address")
    else:
        print("‚ùå SIGNIFICANT ISSUES")
        print("üîß Requires debugging before agent integration")

    print("\nüéØ Ready for agent development!" if mostly_passing else "\nüîß Fix issues before proceeding")

# Print the final PASS/FAIL summary built from the result flags that the
# test cells stored as module globals.
print_test_summary()

In [None]:
# Release the retriever, if this session created one
if globals().get('retriever'):
    retriever.close()
    print("\nüßπ Retriever closed and cleaned up")

# Section 4: Creation of test batches

In [None]:
import gaia_dataset_utils

# List the module's public attributes (names without a leading underscore)
print("Available functions:", [name for name in dir(gaia_dataset_utils) if not name.startswith('_')])

# Report whether the module exposes quick_dataset_check
print(
    "‚úÖ quick_dataset_check found!"
    if hasattr(gaia_dataset_utils, 'quick_dataset_check')
    else "‚ùå quick_dataset_check missing!"
)

In [None]:
from gaia_dataset_utils import quick_dataset_check, GAIADatasetManager

# Check the dataset structure and file references. The directory is given
# relative to the working directory, consistent with the
# "./tests/gaia_data/metadata.json" path this notebook inspects; the previous
# "/tests/gaia_data" pointed at the filesystem root instead.
quick_dataset_check("./tests/gaia_data")

In [None]:
import json
import os

# Inspect the raw structure of metadata.json: top-level type, keys or length,
# and the first entry.
metadata_path = "./tests/gaia_data/metadata.json"

if os.path.exists(metadata_path):
    with open(metadata_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Metadata type: {type(data)}")

    # The size summary is printed per branch: calling len() up front would
    # raise TypeError for scalar or null JSON before the fallback is reached.
    if isinstance(data, dict):
        print(f"Metadata keys/length: {list(data.keys())}")
        if data:
            first_key = next(iter(data))
            print(f"First item: {first_key}: {data[first_key]}")
        else:
            # An empty object has no first key to show.
            print("First item: Empty dict")
    elif isinstance(data, list):
        print(f"Metadata keys/length: {len(data)}")
        print(f"First item: {data[0] if data else 'Empty list'}")
    else:
        print(f"Unexpected data structure: {data}")
else:
    print(f"‚ùå metadata.json not found at {metadata_path}")