# GAIA Dataset Analysis
## Understanding the 165 GAIA validation examples

**Objective:** Analyze GAIA patterns and build RAG vector store  
**Output:** Tool priorities and a Weaviate vector store for agent development

---

In [None]:
# Setup and imports
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, OrderedDict
import random
import re
from pathlib import Path
from datetime import datetime
import os
from typing import Dict, List, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style: stock matplotlib style plus seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Banner so the notebook output starts with a visible header
print("üîç GAIA Dataset Analysis")
print("=" * 40)
print("Goal: Understand patterns and build vector store")
print("=" * 40)

# Section 1: Load & Explore GAIA Data

In [None]:
def load_gaia_metadata(file_path='metadata.jsonl'):
    """Load the GAIA validation set from a JSON Lines file.

    Args:
        file_path: Path to a file holding one JSON object per line.

    Returns:
        A list with one parsed object per non-blank line. If the file does
        not exist, the result of ``create_sample_gaia_data()`` is returned
        instead. On any other error (for example a malformed line) the
        error is printed and an empty list is returned.
    """
    try:
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(file_path, 'r', encoding='utf-8') as f:
            # Skip blank lines (e.g. a trailing newline at end of file) so
            # that json.loads('') cannot discard the whole dataset.
            json_QA = [json.loads(line) for line in f if line.strip()]
    except FileNotFoundError:
        print(f"‚ùå {file_path} not found. Creating sample data for demonstration.")
        return create_sample_gaia_data()
    except Exception as e:
        print(f"‚ùå Error loading GAIA data: {e}")
        return []

    print(f"‚úÖ Successfully loaded {len(json_QA)} GAIA examples")
    return json_QA

def create_sample_gaia_data():
    """Return three hand-written GAIA-shaped records for demonstration.

    Used as a fallback by ``load_gaia_metadata`` when the metadata file is
    missing. Each record carries the keys the rest of the notebook reads:
    ``task_id``, ``Question``, ``Level``, ``Final answer``, ``file_name`` and
    ``Annotator Metadata`` (with ``Steps`` and newline-separated ``Tools``).

    Returns:
        A list of three dicts.
    """
    sample_data = [
        {
            'task_id': 'sample_001',
            'Question': 'What is the population of Seattle according to the 2020 census?',
            'Level': 1,
            'Final answer': '737015',
            'file_name': None,
            'Annotator Metadata': {
                'Steps': 'Search for Seattle population 2020 census data',
                'Tools': 'web browser\nsearch engine'
            }
        },
        {
            'task_id': 'sample_002',
            'Question': 'Calculate the compound interest on $5000 at 3.5% annual rate for 10 years',
            'Level': 2,
            # 5000 * (1 + 0.035) ** 10 = 7052.99 (rounded to cents), matching
            # the formula given in 'Steps'.
            'Final answer': '7052.99',
            'file_name': None,
            'Annotator Metadata': {
                'Steps': 'Use compound interest formula: A = P(1 + r)^t',
                'Tools': 'calculator'
            }
        },
        {
            'task_id': 'sample_003',
            'Question': 'What is the average temperature in the attached Excel file?',
            'Level': 1,
            'Final answer': '23.4',
            'file_name': 'temperature_data.xlsx',
            'Annotator Metadata': {
                'Steps': 'Open Excel file, calculate average of temperature column',
                'Tools': 'excel\ncalculator'
            }
        }
    ]
    print("üìù Using sample GAIA data for demonstration")
    return sample_data

# Load the dataset into the module-level `json_QA` list that the analysis
# functions below read as a global.
json_QA = load_gaia_metadata()

In [None]:
# Basic dataset statistics
if json_QA:
    print(f"\nüìà Dataset Overview:")
    print(f"  ‚îú‚îÄ‚îÄ Total Questions: {len(json_QA)}")

    # Level distribution (only reported when the first record has a 'Level')
    if 'Level' in json_QA[0]:
        levels = [q.get('Level', 'Unknown') for q in json_QA]
        level_counts = Counter(levels)
        print(f"  ‚îú‚îÄ‚îÄ Level Distribution:")
        # Sort on the string form: records without a level are counted under
        # the str 'Unknown', and a plain sorted() over mixed int/str keys
        # raises TypeError.
        for level in sorted(level_counts, key=str):
            print(f"  ‚îÇ   ‚îú‚îÄ‚îÄ Level {level}: {level_counts[level]} questions")

    # File attachment analysis: a missing, None or empty file_name all count
    # as "no file".
    files_present = sum(1 for q in json_QA if q.get('file_name'))
    print(f"  ‚îú‚îÄ‚îÄ Questions with Files: {files_present}")
    print(f"  ‚îî‚îÄ‚îÄ Questions without Files: {len(json_QA) - files_present}")

In [None]:
def analyze_gaia_patterns(sample_size=5):
    """Print a random handful of records from the global ``json_QA``.

    Args:
        sample_size: Upper bound on how many records to show; capped at the
            number of loaded records.

    Returns:
        None. Does nothing when ``json_QA`` is empty.
    """
    if not json_QA:
        return

    print(f"\nüîç Sample Question Analysis (Random {sample_size}):")
    print("=" * 120)

    how_many = min(sample_size, len(json_QA))
    picked = random.sample(json_QA, how_many)

    position = 0
    for record in picked:
        position += 1
        print(f"\nüìù Question {position}:")
        print(f"ID: {record.get('task_id', 'N/A')}")
        print(f"Level: {record.get('Level', 'N/A')}")
        # Question text is cut to 100 characters; the ellipsis is always added.
        print(f"Question: {record.get('Question', 'N/A')[:100]}...")
        print(f"File: {record.get('file_name', 'None')}")

        if 'Annotator Metadata' in record:
            annotations = record['Annotator Metadata']
            # Steps are cut to 80 characters, tools are shown in full.
            print(f"Steps: {annotations.get('Steps', 'N/A')[:80]}...")
            print(f"Tools: {annotations.get('Tools', 'N/A')}")

        print(f"Answer: {record.get('Final answer', 'N/A')}")
        print("-" * 40)

# Run pattern analysis with the default sample size (5 random records)
analyze_gaia_patterns()

# Section 2: Tool Usage Analysis

In [None]:
import re
from collections import Counter, OrderedDict

# Canonical tool name -> variants that collapse into it. A variant matches when
# it occurs anywhere inside the cleaned tool string, and the first canonical
# entry with a match wins (so 'media player' always resolves to 'audio player').
_TOOL_NORMALIZATIONS = {
    'web browser': ['browser', 'web browsers', 'internet browser'],
    'search engine': ['search engines', 'google search', 'web search'],
    'calculator': ['math calculator', 'calculations', 'calculation tool'],
    'excel': ['microsoft excel', 'spreadsheet', 'ms excel'],
    'pdf viewer': ['pdf reader', 'pdf access', 'pdf'],
    'image recognition': ['image recognition tools', 'image analysis', 'image processing'],
    'text editor': ['word processor', 'text processing'],
    'file manager': ['file explorer', 'file system'],
    'audio player': ['music player', 'media player'],
    'video player': ['video viewer', 'media player']
}


def _normalize_tool_name(tool):
    """Map one raw tool string to its canonical lower-case name."""
    tool = tool.lower().strip()
    # Remove numbered prefixes (1., 2., 3., etc.)
    tool = re.sub(r'^\d+\.\s*', '', tool)
    # Remove a leading article (a, an, the)
    tool = re.sub(r'^(a|an|the)\s+', '', tool)
    # Remove parentheses and the content inside
    tool = re.sub(r'\([^)]*\)', '', tool)
    # Collapse runs of whitespace
    tool = ' '.join(tool.split())

    for canonical, variants in _TOOL_NORMALIZATIONS.items():
        if any(variant in tool for variant in variants):
            return canonical
    return tool


def _priority_label(count):
    """Return the priority tag printed next to a tool with ``count`` uses."""
    if count >= 20:
        return "üî¥ CRITICAL"
    if count >= 10:
        return "üü† HIGH"
    if count >= 5:
        return "üü° MEDIUM"
    return "üü¢ LOW"


def analyze_tool_usage_fixed(data=None):
    """Count normalized tool mentions across GAIA records and print a ranking.

    Args:
        data: Optional list of GAIA records (dicts). When omitted, the
            module-level ``json_QA`` list is analysed.

    Returns:
        A ``(tools_counter, tool_details)`` tuple. ``tools_counter`` is an
        ``OrderedDict`` of tool name -> count, most frequent first.
        ``tool_details`` is a list with one dict per mention (keys ``tool``,
        ``original``, ``question_id``, ``level``, ``has_file``). Both are
        empty when there are no records, so callers can always unpack two
        values.
    """
    samples = json_QA if data is None else data
    if not samples:
        return OrderedDict(), []

    tools = []
    tool_details = []

    for sample in samples:
        metadata = sample.get('Annotator Metadata') or {}
        tools_text = metadata.get('Tools') or ''

        # One tool per line; lines may carry bullets or numbering.
        for tool_line in tools_text.split('\n'):
            tool = tool_line.strip()
            if not tool:
                continue

            # Remove bullet points and list markers
            tool = re.sub(r'^[-‚Ä¢*]\s*', '', tool)

            normalized_tool = _normalize_tool_name(tool)
            if normalized_tool and normalized_tool != 'none':
                tools.append(normalized_tool)
                tool_details.append({
                    'tool': normalized_tool,
                    'original': tool,
                    'question_id': sample.get('task_id'),
                    'level': sample.get('Level'),
                    # Truthiness, not "is not None": an empty-string
                    # file_name also means "no attachment", matching the
                    # file count in the dataset overview.
                    'has_file': bool(sample.get('file_name'))
                })

    # Count tool frequencies, most common first
    tools_counter = OrderedDict(Counter(tools).most_common())

    # Per-tool level histogram, built in one pass instead of rescanning
    # tool_details for every printed tool.
    level_usage_by_tool = {}
    for detail in tool_details:
        level_usage_by_tool.setdefault(detail['tool'], Counter())[detail['level']] += 1

    print("üéØ Fixed Tool Usage Priority Analysis:")
    print(f"Total tool instances: {len(tools)}")
    print(f"Unique tools identified: {len(tools_counter)}")
    print("\nüìä Implementation Priority (by frequency):")

    for i, (tool, count) in enumerate(tools_counter.items(), 1):
        priority = _priority_label(count)
        print(f"  {i:2d}. {tool:<25} : {count:3d} occurrences {priority}")

        if i <= 10:  # Show level details for the top 10 only
            level_usage = level_usage_by_tool.get(tool, {})
            # key=str keeps the sort safe when some records have no level.
            level_str = ", ".join(
                f"L{k}:{v}" for k, v in sorted(level_usage.items(), key=lambda kv: str(kv[0]))
            )
            print(f"      ‚îî‚îÄ‚îÄ Level usage: {level_str}")

    return tools_counter, tool_details

def create_implementation_roadmap(tools_counter):
    """Print a three-phase roadmap plus a suggested tool mapping.

    Tools are bucketed by usage count: 20 or more is phase 1, 10-19 is
    phase 2, 5-9 is phase 3; tools used fewer than 5 times are not listed in
    any phase. The mapping section covers the first 8 entries of
    ``tools_counter`` in its iteration order.

    Args:
        tools_counter: Mapping of tool name -> usage count.

    Returns:
        None.
    """
    print("\nüöÄ GAIA Agent Implementation Roadmap:")
    print("=" * 50)

    ranked = list(tools_counter.items())

    # (heading, per-item marker, inclusive lower bound, exclusive upper bound)
    phases = (
        ("üî¥ PHASE 1 - CRITICAL (implement first):", "‚úÖ", 20, None),
        ("\nüü† PHASE 2 - HIGH PRIORITY:", "üîß", 10, 20),
        ("\nüü° PHASE 3 - MEDIUM PRIORITY:", "‚öôÔ∏è", 5, 10),
    )
    buckets = [
        [(name, uses) for name, uses in ranked
         if uses >= low and (high is None or uses < high)]
        for _, _, low, high in phases
    ]

    for (heading, marker, _, _), members in zip(phases, buckets):
        print(heading)
        for name, uses in members:
            print(f"  {marker} {name} ({uses} uses)")

    # Map to actual tool implementations
    print("\nüõ†Ô∏è RECOMMENDED TOOL MAPPING:")
    known_implementations = {
        'web browser': 'ContentRetrieverTool + WebDriverTool',
        'search engine': 'GoogleSearchTool + SerperTool',
        'calculator': 'GAIACalculatorTool + PythonREPL',
        'excel': 'GetAttachmentTool + PandasTool',
        'pdf viewer': 'ContentRetrieverTool + PyPDFTool',
        'image recognition': 'VisionTool + ImageAnalysisTool',
        'text editor': 'TextProcessingTool',
        'file manager': 'GetAttachmentTool + FileSystemTool'
    }

    for name, _ in ranked[:8]:
        if name in known_implementations:
            implementation = known_implementations[name]
        else:
            # Unknown tools get a generated placeholder class name.
            implementation = f"Custom{name.title().replace(' ', '')}Tool"
        print(f"  {name:<20} ‚Üí {implementation}")

# Run the fixed analysis and print the roadmap derived from it
tools_counter, tool_details = analyze_tool_usage_fixed()
create_implementation_roadmap(tools_counter)

# Show some examples of what was normalized: group the raw spellings seen in
# the first 20 mentions under their canonical tool name.
print("\nüîç Normalization Examples:")
unique_originals = {}
for detail in tool_details[:20]:
    tool, original = detail['tool'], detail['original']
    originals = unique_originals.setdefault(tool, [])
    if original not in originals:
        originals.append(original)

# Of the first 5 canonical names, print only those with more than one spelling.
for tool, originals in list(unique_originals.items())[:5]:
    if len(originals) > 1:
        print(f"  '{tool}' ‚Üê {originals}")

In [None]:
# Create visualization of tool usage
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch

def create_tool_usage_visualization(tools_counter):
    """Draw a bar chart and a pie chart of tool usage, then print a summary.

    Left axes: horizontal bars for the first 15 entries of ``tools_counter``,
    coloured by priority threshold (>=20, 10-19, 5-9, <5). Right axes: pie
    chart of how many tools fall into each priority band. After showing the
    figure, totals and the first five tools with their share of all usage
    are printed.

    Args:
        tools_counter: Mapping of tool name -> usage count. Presumably
            ordered by descending count (as ``analyze_tool_usage_fixed``
            builds it); this function only takes the first N items and does
            not sort.

    Returns:
        None. Prints a message and returns early when ``tools_counter`` is
        empty.
    """

    if not tools_counter:
        print("‚ùå No tools data to visualize")
        return

    # Create figure with subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 10))

    # ===== MAIN BAR CHART =====
    # Top 15 tools
    top_tools = list(tools_counter.items())[:15]
    tool_names = [item[0] for item in top_tools]
    tool_counts = [item[1] for item in top_tools]

    # Updated color scheme with new thresholds
    colors = []
    for count in tool_counts:
        if count >= 20:
            colors.append('#DC2626')  # Red - CRITICAL
        elif count >= 10:
            colors.append('#F59E0B')  # Orange - HIGH
        elif count >= 5:
            colors.append('#10B981')  # Green - MEDIUM
        else:
            colors.append('#6B7280')  # Gray - LOW

    bars = ax1.barh(range(len(tool_names)), tool_counts, color=colors, alpha=0.8)
    ax1.set_yticks(range(len(tool_names)))
    ax1.set_yticklabels(tool_names, fontsize=10)
    ax1.set_xlabel('Usage Frequency', fontsize=12, fontweight='bold')
    ax1.set_title('GAIA Tool Usage Analysis\n(Normalized & Cleaned)', fontsize=14, fontweight='bold')
    # Flip the y axis so the first entry is drawn at the top
    ax1.invert_yaxis()

    # Add count labels on bars
    for i, (bar, count) in enumerate(zip(bars, tool_counts)):
        # Position label inside bar if bar is wide enough, otherwise outside
        label_x = bar.get_width() - 2 if bar.get_width() > 10 else bar.get_width() + 0.5
        label_color = 'white' if bar.get_width() > 10 else 'black'

        ax1.text(label_x, bar.get_y() + bar.get_height()/2,
                str(count), va='center', ha='right' if bar.get_width() > 10 else 'left',
                fontweight='bold', color=label_color, fontsize=9)

    # Add priority zone backgrounds
    # NOTE(review): when max(tool_counts) + 5 is below 20 (e.g. with the small
    # sample dataset) the first span's bounds are reversed, and all three spans
    # are drawn regardless of the data range - confirm this renders as intended.
    ax1.axvspan(20, max(tool_counts) + 5, alpha=0.1, color='red', label='Critical Zone')
    ax1.axvspan(10, 20, alpha=0.1, color='orange', label='High Zone')
    ax1.axvspan(5, 10, alpha=0.1, color='green', label='Medium Zone')

    # Enhanced legend: explicit handles are passed, so the span labels above
    # are not what the legend shows.
    legend_elements = [
        Patch(facecolor='#DC2626', label='üî¥ CRITICAL (‚â•20 uses)'),
        Patch(facecolor='#F59E0B', label='üü† HIGH (10-19 uses)'),
        Patch(facecolor='#10B981', label='üü° MEDIUM (5-9 uses)'),
        Patch(facecolor='#6B7280', label='üü¢ LOW (<5 uses)')
    ]
    ax1.legend(handles=legend_elements, loc='lower right', fontsize=10)

    # ===== IMPLEMENTATION PRIORITY PIE CHART =====
    # Calculate priority distribution over ALL tools, not just the 15 plotted
    critical_count = sum(1 for count in tools_counter.values() if count >= 20)
    high_count = sum(1 for count in tools_counter.values() if 10 <= count < 20)
    medium_count = sum(1 for count in tools_counter.values() if 5 <= count < 10)
    low_count = sum(1 for count in tools_counter.values() if count < 5)

    priority_labels = ['CRITICAL', 'HIGH', 'MEDIUM', 'LOW']
    priority_counts = [critical_count, high_count, medium_count, low_count]
    priority_colors = ['#DC2626', '#F59E0B', '#10B981', '#6B7280']

    # Only show non-zero segments
    non_zero_data = [(label, count, color) for label, count, color in
                     zip(priority_labels, priority_counts, priority_colors) if count > 0]

    if non_zero_data:
        # Note: this rebinds `colors`, which held the bar colours above
        labels, counts, colors = zip(*non_zero_data)

        wedges, texts, autotexts = ax2.pie(counts, labels=labels, colors=colors, autopct='%1.0f%%',
                                          startangle=90, textprops={'fontsize': 10})

        # Enhance pie chart text
        for autotext in autotexts:
            autotext.set_color('white')
            autotext.set_fontweight('bold')

        ax2.set_title('Implementation Priority Distribution\n(Tool Count by Priority)',
                     fontsize=12, fontweight='bold')

        # Add total count in center
        total_tools = len(tools_counter)
        ax2.text(0, 0, f'{total_tools}\nTotal\nTools', ha='center', va='center',
                fontsize=14, fontweight='bold',
                bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))

    plt.tight_layout()
    plt.show()

    # ===== SUMMARY STATS =====
    print("\nüìä VISUALIZATION SUMMARY:")
    print("=" * 40)
    print(f"Total unique tools: {len(tools_counter)}")
    print(f"Total tool instances: {sum(tools_counter.values())}")
    print(f"üî¥ Critical tools (‚â•20): {critical_count}")
    print(f"üü† High priority (10-19): {high_count}")
    print(f"üü° Medium priority (5-9): {medium_count}")
    print(f"üü¢ Low priority (<5): {low_count}")

    # Show top 5 with percentages
    # (divides by the sum of all counts; assumes that sum is non-zero)
    total_instances = sum(tools_counter.values())
    print(f"\nüéØ TOP 5 TOOLS (% of total usage):")
    for i, (tool, count) in enumerate(list(tools_counter.items())[:5], 1):
        percentage = (count / total_instances) * 100
        print(f"  {i}. {tool:<20}: {count:3d} uses ({percentage:5.1f}%)")

def create_level_breakdown_chart(tool_details):
    """Show a stacked bar chart of the 10 most-mentioned tools, split by level.

    Args:
        tool_details: List of per-mention dicts; each must have a ``tool``
            key and may have a ``level`` key (missing levels are grouped
            under ``'Unknown'``).

    Returns:
        None. Returns immediately when ``tool_details`` is empty.
    """

    if not tool_details:
        return

    # level -> {tool -> number of mentions}
    usage_by_level = {}
    for entry in tool_details:
        lvl = entry.get('level', 'Unknown')
        name = entry['tool']
        per_tool = usage_by_level.setdefault(lvl, {})
        per_tool[name] = per_tool.get(name, 0) + 1

    # Create stacked bar chart
    plt.figure(figsize=(14, 8))

    # The 10 most frequent tools across all levels define the x axis
    from collections import Counter
    overall = Counter([entry['tool'] for entry in tool_details])
    top_10_tools = [name for name, _ in overall.most_common(10)]

    # One list of counts per level, aligned with top_10_tools
    levels = sorted(usage_by_level.keys())
    series = {
        lvl: [usage_by_level[lvl].get(name, 0) for name in top_10_tools]
        for lvl in levels
    }

    # Each level is drawn on top of the running total of the previous ones
    bottom = np.zeros(len(top_10_tools))
    palette = ['#EF4444', '#F59E0B', '#10B981']  # Red, Orange, Green for levels 1,2,3

    for position, lvl in enumerate(levels):
        bar_color = palette[position % len(palette)]
        plt.bar(top_10_tools, series[lvl], bottom=bottom,
                label=f'Level {lvl}', color=bar_color, alpha=0.8)
        bottom += series[lvl]

    plt.xlabel('Tools', fontsize=12, fontweight='bold')
    plt.ylabel('Usage Count', fontsize=12, fontweight='bold')
    plt.title('Tool Usage by GAIA Difficulty Level\n(Top 10 Tools)', fontsize=14, fontweight='bold')
    plt.xticks(rotation=45, ha='right')
    plt.legend(title='GAIA Level', fontsize=10)
    plt.tight_layout()
    plt.show()

# Usage after running the fixed analysis: draw the charts only when the
# analysis results exist in this session and are non-empty.
if not globals().get('tools_counter'):
    print("‚ùå Run the fixed tool analysis first to generate visualizations")
    print("Execute: tools_counter, tool_details = analyze_tool_usage_fixed()")
else:
    print("üé® Creating enhanced visualizations...")
    create_tool_usage_visualization(tools_counter)

    if globals().get('tool_details'):
        create_level_breakdown_chart(tool_details)

In [None]:
def generate_implementation_recommendations(tools_counter):
    """Print tiered tool recommendations and attachment file-type counts.

    Tiers by usage count: 10 or more is essential, 5-9 is important, fewer
    than 5 is optional (only the first five optional tools are printed).
    The file-type section counts lower-cased extensions of the ``file_name``
    values in the global ``json_QA``.

    Args:
        tools_counter: Mapping of tool name -> usage count.

    Returns:
        None. Returns immediately when ``tools_counter`` is empty.
    """
    if not tools_counter:
        return

    print("\nüí° Implementation Recommendations:")
    print("=" * 45)

    ranked = list(tools_counter.items())
    essential = [pair for pair in ranked if pair[1] >= 10]
    important = [pair for pair in ranked if 5 <= pair[1] < 10]
    optional = [pair for pair in ranked if pair[1] < 5]

    tiers = (
        ("üî¥ ESSENTIAL TOOLS (Implement First):", essential),
        ("\nüü° IMPORTANT TOOLS (Implement Second):", important),
        ("\nüü¢ OPTIONAL TOOLS (If Budget Allows):", optional[:5]),
    )
    for heading, members in tiers:
        print(heading)
        for name, uses in members:
            print(f"  ‚îú‚îÄ‚îÄ {name}: {uses} occurrences")

    # File type analysis
    print("\nüìÅ File Processing Requirements:")
    with_files = [q for q in json_QA if q.get('file_name')]
    if with_files:
        # Names without a dot are ignored entirely
        suffixes = [
            Path(q.get('file_name', '')).suffix.lower()
            for q in with_files
            if '.' in q.get('file_name', '')
        ]
        for ext, how_many in Counter(suffixes).most_common():
            print(f"  ‚îú‚îÄ‚îÄ {ext}: {how_many} files")

# Generate recommendations from the tool counts computed by the analysis cell
generate_implementation_recommendations(tools_counter)

# Section 3: Build Weaviate Vector Store

In [None]:
# GAIA Weaviate Vector Store - LangChain Implementation with Efficient Serialization
# Uses LangChain's serialize_to_bytes() to avoid model bloat

# Weaviate GAIA Vector Store Implementation
# Modern, numpy 2.0 compatible, production-ready

import os
import json
import time
from typing import List, Dict, Tuple, Optional
import weaviate
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType

# Configuration
# Default endpoint argument for connect_to_weaviate() and GAIAWeaviateStore
WEAVIATE_URL = "http://localhost:8080"  # Local Docker
# Collection that the create/populate/search helpers below operate on
COLLECTION_NAME = "GAIAExamples"
# Model name handed to both the Weaviate text2vec-huggingface vectorizer and
# the LangChain HuggingFaceEmbeddings wrapper
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

def check_weaviate_dependencies():
    """Report which of the three optional packages can be imported.

    Tries, in order, ``weaviate``, ``langchain_weaviate.WeaviateVectorStore``
    and ``langchain_huggingface.HuggingFaceEmbeddings``, printing one status
    line for each. If anything is missing, the install commands are printed.

    Returns:
        True when all three imports succeed, False otherwise.
    """
    def _try_client():
        import weaviate

    def _try_langchain_store():
        from langchain_weaviate import WeaviateVectorStore

    def _try_embeddings():
        from langchain_huggingface import HuggingFaceEmbeddings

    # (import attempt, pip/poetry package name, success line, failure line)
    checks = (
        (_try_client, "weaviate-client",
         "‚úÖ Weaviate client available", "‚ùå Weaviate client not available"),
        (_try_langchain_store, "langchain-weaviate",
         "‚úÖ LangChain Weaviate available", "‚ùå LangChain Weaviate not available"),
        (_try_embeddings, "langchain-huggingface",
         "‚úÖ HuggingFace embeddings available", "‚ùå HuggingFace embeddings not available"),
    )

    missing = []
    for attempt, package, ok_line, fail_line in checks:
        try:
            attempt()
        except ImportError:
            missing.append(package)
            print(fail_line)
        else:
            print(ok_line)

    if not missing:
        return True

    print(f"\nüì¶ Missing packages: {missing}")
    print("Install with:")
    for package in missing:
        print(f"  poetry add {package}")
    return False

def setup_local_weaviate():
    """Print a Docker Compose file for a local Weaviate instance.

    Nothing is written to disk or started; the YAML and a short instruction
    are only printed for the user to copy.

    Returns:
        None.
    """
    compose_yaml = """
version: '3.4'
services:
  weaviate:
    command:
    - --host
    - 0.0.0.0
    - --port
    - '8080'
    - --scheme
    - http
    image: cr.weaviate.io/semitechnologies/weaviate:1.26.1
    ports:
    - 8080:8080
    - 50051:50051
    restart: on-failure:0
    environment:
      QUERY_DEFAULTS_LIMIT: 25
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      DEFAULT_VECTORIZER_MODULE: 'text2vec-huggingface'
      ENABLE_MODULES: 'text2vec-huggingface'
      HUGGINGFACE_APIKEY: 'YOUR_HUGGINGFACE_KEY'  # Optional for local models
      CLUSTER_HOSTNAME: 'node1'
"""

    output_lines = (
        "üê≥ Docker Compose for Weaviate:",
        "Save this as docker-compose.yml and run: docker-compose up -d",
        compose_yaml,
    )
    for text in output_lines:
        print(text)

def connect_to_weaviate(url: str = WEAVIATE_URL) -> Optional[weaviate.WeaviateClient]:
    """Connect to a local Weaviate instance and return a ready client.

    Args:
        url: HTTP endpoint of the instance. Its host and port are used for
            the connection (falling back to ``localhost`` and ``8080`` when
            the URL does not contain them); gRPC is always attempted on port
            50051.

    Returns:
        A connected client when the instance reports ready, otherwise None.
        Failures are printed, never raised.
    """
    from urllib.parse import urlparse

    client = None
    try:
        # Honour the caller's URL instead of always dialling localhost:8080.
        parsed = urlparse(url)
        client = weaviate.connect_to_local(
            host=parsed.hostname or "localhost",
            port=parsed.port or 8080,
            grpc_port=50051
        )

        if client.is_ready():
            print(f"‚úÖ Connected to Weaviate at {url}")
            return client

        print(f"‚ùå Weaviate not ready at {url}")

    except Exception as e:
        print(f"‚ùå Failed to connect to Weaviate: {e}")
        print("üí° Make sure Weaviate is running with Docker:")
        print("   docker-compose up -d")

    # Not returning the client: release its connection rather than leaking it.
    if client is not None:
        try:
            client.close()
        except Exception:
            pass  # best-effort cleanup; the failure itself was already reported
    return None

def create_gaia_collection(client: weaviate.WeaviateClient) -> bool:
    """(Re)create the GAIA collection with its schema and vectorizer.

    Destructive: if a collection named ``COLLECTION_NAME`` already exists it
    is deleted first, along with whatever it contains.

    Args:
        client: Connected Weaviate client.

    Returns:
        True when the collection was created, False when any step raised
        (the error is printed).
    """
    # (property name, Weaviate data type); "content" is the combined
    # question/answer/steps text built when the collection is populated.
    schema = (
        ("task_id", "TEXT"),
        ("question", "TEXT"),
        ("answer", "TEXT"),
        ("level", "INT"),
        ("has_file", "BOOL"),
        ("steps", "TEXT"),
        ("content", "TEXT"),
    )

    try:
        # Start from a clean slate
        if client.collections.exists(COLLECTION_NAME):
            print(f"üóëÔ∏è Deleting existing collection: {COLLECTION_NAME}")
            client.collections.delete(COLLECTION_NAME)

        properties = [
            Property(name=prop_name, data_type=getattr(DataType, type_name))
            for prop_name, type_name in schema
        ]

        # NOTE(review): no property is excluded from vectorization here, so
        # presumably every TEXT property is embedded, not only "content" -
        # confirm against the text2vec-huggingface module defaults.
        client.collections.create(
            name=COLLECTION_NAME,
            properties=properties,
            # Server-side embedding through the text2vec-huggingface module
            vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_huggingface(
                model=EMBEDDING_MODEL
            ),
            # HNSW vector index with the client's default settings
            vector_index_config=wvc.config.Configure.VectorIndex.hnsw(),
        )

    except Exception as e:
        print(f"‚ùå Failed to create collection: {e}")
        return False

    print(f"‚úÖ Created collection: {COLLECTION_NAME}")
    return True

def _build_gaia_object(sample: Dict, index: int) -> Dict:
    """Turn one GAIA record into the property dict stored in Weaviate."""
    question = sample.get('Question', '')
    answer = sample.get('Final answer', '')
    steps = (sample.get('Annotator Metadata') or {}).get('Steps') or ""

    # "content" concatenates question, answer and steps into one text field.
    content = f"Question: {question}"
    if answer:
        content += f"\nAnswer: {answer}"
    if steps:
        content += f"\nSteps: {steps}"

    return {
        "task_id": sample.get('task_id', f'gaia_{index}'),
        "question": question,
        "answer": answer,
        "level": sample.get('Level', 1),
        # Truthiness, not "is not None": an empty-string file_name also means
        # "no attachment", matching the file count in the dataset overview.
        "has_file": bool(sample.get('file_name')),
        "steps": steps,
        "content": content,
    }


def populate_gaia_collection(client: weaviate.WeaviateClient, json_QA: List[Dict], max_examples: Optional[int] = None) -> bool:
    """Insert GAIA records into the ``COLLECTION_NAME`` collection.

    Args:
        client: Connected Weaviate client; the collection must already exist.
        json_QA: GAIA records (dicts) to insert.
        max_examples: If given and non-zero, only the first ``max_examples``
            records are inserted; None or 0 means no limit.

    Returns:
        True when every object was accepted by the batch, False when there
        was nothing to insert, when the batch reported failed objects, or
        when an exception occurred (it is printed with a traceback).
    """
    try:
        collection = client.collections.get(COLLECTION_NAME)

        # Limit examples if specified
        examples = json_QA[:max_examples] if max_examples else json_QA
        print(f"üìö Populating collection with {len(examples)} examples...")

        data_objects = [_build_gaia_object(sample, i) for i, sample in enumerate(examples)]

        # Nothing to do - and the average below would divide by zero.
        if not data_objects:
            print("‚ùå No examples to insert")
            return False

        average_length = sum(len(obj['content']) for obj in data_objects) / len(data_objects)
        print(f"  ‚îú‚îÄ‚îÄ Average content length: {average_length:.0f} chars")

        # Batch insert with automatic vectorization
        print("üîÑ Inserting data with automatic vectorization...")
        start_time = time.time()

        with collection.batch.dynamic() as batch:
            for obj in data_objects:
                batch.add_object(obj)

        duration = time.time() - start_time

        # Batch errors do not raise; they are collected on the batch wrapper.
        failed = getattr(collection.batch, "failed_objects", None) or []

        # Verify insertion
        total_objects = collection.aggregate.over_all(total_count=True).total_count

        if failed:
            print(f"‚ùå {len(failed)} of {len(data_objects)} objects failed to insert")
            return False

        print(f"‚úÖ Successfully inserted {total_objects} objects in {duration:.2f} seconds")
        if duration > 0:  # guard against a zero reading from a coarse clock
            print(f"  ‚îú‚îÄ‚îÄ Rate: {total_objects/duration:.1f} objects/second")
        print(f"  ‚îî‚îÄ‚îÄ Collection: {COLLECTION_NAME}")

        return True

    except Exception as e:
        print(f"‚ùå Failed to populate collection: {e}")
        import traceback
        traceback.print_exc()
        return False

def search_gaia_examples(client: weaviate.WeaviateClient, query: str, k: int = 3) -> List[Tuple[Dict, float]]:
    """Run a ``near_text`` query against the GAIA collection.

    Args:
        client: Connected Weaviate client.
        query: Free-text query.
        k: Maximum number of hits to return.

    Returns:
        A list of ``(properties, similarity)`` tuples in the order Weaviate
        returned them, where ``similarity`` is ``1 - distance`` (a missing
        distance is treated as 0). An empty list is returned, and the error
        printed, if the query fails.
    """
    wanted_fields = ('task_id', 'question', 'answer', 'level', 'has_file', 'steps')

    try:
        gaia = client.collections.get(COLLECTION_NAME)

        # Ask for both distance and certainty; only distance is used below.
        reply = gaia.query.near_text(
            query=query,
            limit=k,
            return_metadata=wvc.query.MetadataQuery(distance=True, certainty=True)
        )

        hits = []
        for hit in reply.objects:
            meta = {field: hit.properties.get(field) for field in wanted_fields}
            # Higher score = more similar
            similarity_score = 1 - (hit.metadata.distance or 0)
            hits.append((meta, similarity_score))

        return hits

    except Exception as e:
        print(f"‚ùå Search failed: {e}")
        return []

def print_search_results(query: str, results: List[Tuple[Dict, float]]):
    """Print a query and its ``(metadata, score)`` hits in readable form.

    Args:
        query: The query text that produced ``results``.
        results: Hits as ``(metadata dict, similarity score)`` tuples.
            Question and answer are cut to 100 characters and always
            followed by an ellipsis.

    Returns:
        None.
    """
    print(f"\nüîç Query: '{query}'")
    print("=" * 50)

    if results:
        rank = 0
        for meta, score in results:
            rank += 1
            print(f"\n{rank}. Similarity: {score:.3f} | Level: {meta.get('level', 'N/A')}")
            print(f"   Question: {meta.get('question', '')[:100]}...")
            print(f"   Answer: {meta.get('answer', '')[:100]}...")
            if meta.get('has_file'):
                print(f"   üìé Has attachment")
    else:
        print("‚ùå No results found")

def get_collection_info(client: weaviate.WeaviateClient):
    """Print object count, vectorizer settings and level distribution.

    Args:
        client: Connected Weaviate client.

    Returns:
        None. Prints a message and returns when the collection does not
        exist; any exception is caught and printed.
    """
    try:
        if not client.collections.exists(COLLECTION_NAME):
            print(f"‚ùå Collection {COLLECTION_NAME} does not exist")
            return

        collection = client.collections.get(COLLECTION_NAME)

        # A grouped aggregation yields per-group results only, so the overall
        # count comes from its own ungrouped query.
        total_count = collection.aggregate.over_all(total_count=True).total_count

        # Per-level counts: one group per distinct "level" value
        grouped = collection.aggregate.over_all(
            total_count=True,
            group_by="level"
        )
        level_counts = {
            group.grouped_by.value: group.total_count
            for group in grouped.groups
        }

        print(f"üìä Collection Info: {COLLECTION_NAME}")
        print(f"  ‚îú‚îÄ‚îÄ Total objects: {total_count}")
        print(f"  ‚îú‚îÄ‚îÄ Vectorizer: text2vec-huggingface")
        print(f"  ‚îú‚îÄ‚îÄ Model: {EMBEDDING_MODEL}")
        print(f"  ‚îî‚îÄ‚îÄ Level distribution: {level_counts}")

    except Exception as e:
        print(f"‚ùå Error getting collection info: {e}")

# LANGCHAIN INTEGRATION

def create_langchain_weaviate_store(client: weaviate.WeaviateClient):
    """Build a LangChain ``WeaviateVectorStore`` over the GAIA collection.

    Args:
        client: Weaviate client handed to the LangChain wrapper.

    Returns:
        The vector store, or ``None`` when importing the LangChain packages
        or constructing the store fails (the error is printed).
    """
    try:
        from langchain_weaviate import WeaviateVectorStore
        from langchain_huggingface import HuggingFaceEmbeddings

        # Properties that LangChain should return as document metadata.
        metadata_fields = ["task_id", "question", "answer", "level", "has_file", "steps"]

        # Embedding model used on the LangChain side (CPU, small batches).
        embedder = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'batch_size': 8},
        )

        store_options = {
            "client": client,
            "index_name": COLLECTION_NAME,
            "text_key": "content",
            "embedding": embedder,
            "attributes": metadata_fields,
        }
        langchain_store = WeaviateVectorStore(**store_options)

        print("‚úÖ LangChain Weaviate store created")
        return langchain_store

    except Exception as e:
        print(f"‚ùå Failed to create LangChain store: {e}")
        return None

def langchain_search_examples(vectorstore, query: str, k: int = 3) -> List[Dict]:
    """Run a scored similarity search through the LangChain store.

    Args:
        vectorstore: Object exposing ``similarity_search_with_score(query, k=...)``
            that yields ``(document, score)`` pairs.
        query: Text to search for.
        k: Number of hits requested from the store.

    Returns:
        One dict per hit: a copy of the document metadata extended with
        ``content`` (the page text) and ``similarity_score`` (the score as a
        float). On any error the message is printed and ``[]`` is returned.
    """
    def _to_record(doc, score) -> Dict:
        # Copy so the caller's document metadata is never mutated.
        record = doc.metadata.copy()
        record['content'] = doc.page_content
        record['similarity_score'] = float(score)
        return record

    try:
        hits = vectorstore.similarity_search_with_score(query, k=k)
        return [_to_record(doc, score) for doc, score in hits]
    except Exception as e:
        print(f"‚ùå LangChain search failed: {e}")
        return []

# PRODUCTION FUNCTIONS FOR GAIA AGENT

class GAIAWeaviateStore:
    """Weaviate-backed example store for the GAIA agent.

    The constructor connects immediately; call :meth:`is_ready` before using
    the store. Call :meth:`close` (or use the instance as a context manager)
    to release the underlying Weaviate connection when finished.
    """

    def __init__(self, weaviate_url: str = WEAVIATE_URL):
        """Connect to Weaviate at ``weaviate_url`` and attach the vector store."""
        self.client = None
        self.vectorstore = None
        self.url = weaviate_url
        self._connect()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False  # never suppress exceptions from the with-body

    def _connect(self):
        """Connect to Weaviate and set up the vector store.

        The vector store is only created when the connection succeeded and
        the GAIA collection already exists; otherwise it stays ``None``.
        """
        self.client = connect_to_weaviate(self.url)
        if self.client and self.client.collections.exists(COLLECTION_NAME):
            self.vectorstore = create_langchain_weaviate_store(self.client)

    def close(self):
        """Release the Weaviate connection. Safe to call more than once."""
        client, self.client, self.vectorstore = self.client, None, None
        if client is not None:
            try:
                client.close()
            except Exception as e:
                # Closing is best-effort; report instead of raising.
                print(f"Error closing Weaviate client: {e}")

    def is_ready(self) -> bool:
        """Return True when both the client and the vector store exist."""
        return self.client is not None and self.vectorstore is not None

    def get_relevant_examples(self, question: str, k: int = 3, min_score: float = 0.7) -> List[Dict]:
        """Fetch up to ``k`` similar GAIA examples for ``question``.

        Args:
            question: Text to search for.
            k: Number of hits requested from the vector store.
            min_score: Hits whose ``similarity_score`` is below this value
                are dropped.

        Returns:
            The surviving result dicts, or ``[]`` when the store is not ready
            or the search fails.
        """
        if not self.is_ready():
            return []

        try:
            results = langchain_search_examples(self.vectorstore, question, k=k)
            return [r for r in results if r.get('similarity_score', 0) >= min_score]
        except Exception as e:
            print(f"‚ùå Error getting examples: {e}")
            return []

    def format_for_prompt(self, examples: List[Dict], max_examples: int = 3) -> str:
        """Render examples as a text section suitable for an LLM prompt.

        Args:
            examples: Result dicts; ``level``, ``question``, ``answer`` and
                ``has_file`` are read from each.
            max_examples: Only the first ``max_examples`` entries are rendered.

        Returns:
            The formatted section, or a fixed "nothing found" sentence when
            ``examples`` is empty.
        """
        if not examples:
            return "No relevant GAIA examples found."

        # Collect fragments and join once instead of growing a string in a loop.
        parts = ["üìö Relevant GAIA Examples:\n\n"]
        for i, ex in enumerate(examples[:max_examples], 1):
            parts.append(f"Example {i} (Level {ex.get('level', 'N/A')}):\n")
            parts.append(f"Q: {ex.get('question', '')}\n")
            parts.append(f"A: {ex.get('answer', '')}\n")

            if ex.get('has_file'):
                parts.append("üìé Involves file processing\n")

            parts.append("\n")

        return "".join(parts)

    def select_agent_with_context(self, question: str) -> Tuple[str, str]:
        """Pick an agent name from similar examples and return it with context.

        Returns:
            ``(agent_name, context)`` where ``context`` is the prompt section
            built from the retrieved examples. Rules are checked in order:
            any example with a file -> ``document_processor``; any example
            question containing "calculat" -> ``data_analyst``; containing
            "search" -> ``web_researcher``; otherwise ``general_assistant``.
        """
        examples = self.get_relevant_examples(question, k=2, min_score=0.6)
        context = self.format_for_prompt(examples, max_examples=2)

        # A stored question may be missing or None; treat that as empty text
        # so the keyword checks cannot raise.
        questions = [str(ex.get('question') or '').lower() for ex in examples]

        if any(ex.get('has_file', False) for ex in examples):
            return "document_processor", context
        if any('calculat' in q for q in questions):
            return "data_analyst", context
        if any('search' in q for q in questions):
            return "web_researcher", context
        return "general_assistant", context

# SETUP AND TESTING FUNCTIONS

def complete_weaviate_setup(json_QA: List[Dict], max_examples: Optional[int] = None) -> Optional[GAIAWeaviateStore]:
    """Run the full Weaviate setup for the GAIA project.

    Checks dependencies, connects, creates and populates the collection,
    prints collection info, runs one sample search and finally builds the
    production store.

    Args:
        json_QA: GAIA examples to load into the collection.
        max_examples: Passed through to ``populate_gaia_collection``.

    Returns:
        A ready ``GAIAWeaviateStore``, or ``None`` when any step fails.
    """
    print("üöÄ Complete Weaviate Setup for GAIA")
    print("=" * 50)

    # Check dependencies
    if not check_weaviate_dependencies():
        return None

    # Connect to Weaviate
    client = connect_to_weaviate()
    if not client:
        print("üí° To start Weaviate locally:")
        setup_local_weaviate()
        return None

    try:
        # Create collection
        if not create_gaia_collection(client):
            return None

        # Populate with data
        if not populate_gaia_collection(client, json_QA, max_examples):
            return None

        # Get collection info
        get_collection_info(client)

        # Test search
        print("\nüß™ Testing search functionality...")
        results = search_gaia_examples(client, "calculate compound interest", k=3)
        print_search_results("calculate compound interest", results)
    finally:
        # GAIAWeaviateStore opens its own connection, so this setup client
        # is only needed for the steps above and must be released here on
        # every path (success or early return).
        client.close()

    # Create production store
    store = GAIAWeaviateStore()
    if store.is_ready():
        print("\n‚úÖ GAIA Weaviate store ready for production!")
        return store

    print("\n‚ùå Failed to create production store")
    # The store is being discarded; do not leave its connection open.
    if store.client is not None:
        store.client.close()
    return None

# Usage sketch for the GAIA project (an inert string literal; it never runs):
"""
# Setup Weaviate for GAIA
store = complete_weaviate_setup(json_QA, max_examples=100)

if store:
    # Use in your agent
    question = "How do I calculate compound interest?"
    agent, context = store.select_agent_with_context(question)
    print(f"Selected agent: {agent}")
    print(f"Context: {context}")
"""

# Probe the Weaviate/LangChain packages once; the setup cell reuses this flag.
deps_available = check_weaviate_dependencies()
print(f"‚úÖ All dependencies available: {deps_available}")

if not deps_available:
    print(
        "\nüì¶ Install missing packages with:",
        "poetry add weaviate-client langchain-weaviate langchain-huggingface",
        sep="\n",
    )

# Report the NumPy version in use
import numpy as np
print(f"üìä NumPy version: {np.__version__}")

In [None]:
# Start Weaviate (Docker Setup)
# Probe for a running Weaviate instance; if none answers, show how to start one.

client_test = connect_to_weaviate()

if client_test is not None:
    print("‚úÖ Weaviate is running and ready!")
    client_test.close()  # the probe connection is no longer needed
else:
    print("üê≥ Weaviate not running. Setup Docker Compose:")
    setup_local_weaviate()
    print(
        "\nüí° Steps to start Weaviate:",
        "1. Save the docker-compose.yml content above to a file",
        "2. Run: docker-compose up -d",
        "3. Wait ~30 seconds for startup",
        "4. Re-run this cell to verify connection",
        sep="\n",
    )

In [None]:
# Cell 3: Load GAIA Data
# Make sure the GAIA examples are available as ``json_QA``.

def read_gaia_examples(jsonl_path="metadata.jsonl", json_path="metadata.json"):
    """Load GAIA examples from disk.

    Args:
        jsonl_path: JSON-lines file tried first; only records with a
            non-empty ``"Final answer"`` are kept.
        json_path: Fallback JSON file; its ``"validation"`` list is used.

    Returns:
        A list of example dicts, or ``[]`` when neither file exists.
    """
    # Option 1: JSON-lines metadata
    if os.path.exists(jsonl_path):
        print(f"üìÅ Loading from {jsonl_path}")
        examples = []
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank or trailing empty lines are not records and
                    # would otherwise crash json.loads.
                    continue
                item = json.loads(line)
                if item.get("Final answer"):  # Only validation examples
                    examples.append(item)
        return examples

    # Option 2: single JSON document
    if os.path.exists(json_path):
        print(f"üìÅ Loading from {json_path}")
        with open(json_path, "r", encoding="utf-8") as f:
            gaia_data = json.load(f)
        return gaia_data.get("validation", [])

    return []


# Reuse json_QA when an earlier cell already loaded it
if 'json_QA' in globals() and json_QA:
    print(f"‚úÖ GAIA data already loaded: {len(json_QA)} examples")
else:
    # Load GAIA data (adjust the paths as needed)
    json_QA = read_gaia_examples()

    # Option 3: Load your existing data
    # json_QA = your_existing_gaia_data

    print(f"üìä Loaded {len(json_QA)} GAIA examples")

if json_QA:
    sample = json_QA[0]
    print(f"Sample keys: {list(sample.keys())}")
    print(f"Sample question: {sample.get('Question', '')[:100]}...")

In [None]:
# Cell 4: Complete Weaviate Setup (Main Setup)
# Creates the collection, loads the GAIA examples into it, and runs a test search.

if not (json_QA and deps_available):
    print("‚ùå Cannot proceed: missing GAIA data or dependencies")
else:
    print("üöÄ Starting complete Weaviate setup...")

    # Cap on how many examples are indexed; start small and raise as needed.
    max_examples = 100

    store = complete_weaviate_setup(json_QA, max_examples=max_examples)

    if not store:
        print("\nüí• Setup failed - check error messages above")
    else:
        print(
            "\nüéâ Weaviate setup successful!",
            "‚úÖ Ready for GAIA agent integration",
            sep="\n",
        )

In [None]:
# Cell 5: Test Basic Search Functionality
# Runs a handful of sample queries directly against Weaviate and prints the hits.

if 'store' not in globals() or not store or not store.is_ready():
    print("‚ùå Store not available. Run Cell 4 first.")
else:
    print("üîç Testing Weaviate Search Functionality")
    print("=" * 50)

    test_queries = [
        "calculate compound interest rate",
        "extract text from PDF document",
        "analyze image data",
        "solve mathematical equation",
        "process Excel spreadsheet",
        "find information online",
        "convert file format",
    ]

    for query in test_queries:
        print(f"\nüîç Query: '{query}'")

        # Query Weaviate directly through the store's client
        results = search_gaia_examples(store.client, query, k=3)

        if not results:
            print("  No results found")
            continue

        for i, (meta, score) in enumerate(results, start=1):
            print(f"  {i}. Score: {score:.3f} | Level: {meta.get('level')}")
            print(f"     Answer: {meta.get('answer', '')[:80]}...")

In [None]:
# Cell 6: Test LangChain Integration
# Exercises example retrieval and prompt formatting through the store wrapper.

if 'store' not in globals() or not store or not store.is_ready():
    print("‚ùå Store not available. Run Cell 4 first.")
else:
    print("ü¶ú Testing LangChain Integration")
    print("=" * 40)

    test_question = "How do I calculate compound interest on a monthly basis?"
    print(f"Question: {test_question}")

    # Retrieve similar examples, keeping only hits scoring at least 0.5
    examples = store.get_relevant_examples(test_question, k=3, min_score=0.5)

    print(f"\nFound {len(examples)} relevant examples:")
    for i, ex in enumerate(examples, start=1):
        print(f"\n{i}. Similarity: {ex.get('similarity_score', 0):.3f}")
        print(f"   Level: {ex.get('level')}")
        print(f"   Q: {ex.get('question', '')[:100]}...")
        print(f"   A: {ex.get('answer', '')[:100]}...")

    # Render at most two of the examples as a prompt section
    formatted_prompt = store.format_for_prompt(examples, max_examples=2)
    print("\nüìã Formatted for prompt:")
    print(formatted_prompt)

In [None]:
# Cell 7: Test Agent Selection
# Shows which agent the store picks for a few representative questions.

if 'store' not in globals() or not store or not store.is_ready():
    print("‚ùå Store not available. Run Cell 4 first.")
else:
    print("ü§ñ Testing Agent Selection")
    print("=" * 35)

    test_scenarios = [
        "Calculate the compound interest for a $10,000 investment",
        "Extract the main points from this PDF document",
        "Search for recent news about AI developments",
        "Analyze the data in this CSV file",
        "Convert this audio file to text",
        "What is the capital of France?",
    ]

    for scenario in test_scenarios:
        print(f"\nüìù Scenario: {scenario}")

        # Agent name plus the prompt context built from similar examples
        agent, context = store.select_agent_with_context(scenario)

        print(f"üéØ Selected Agent: {agent}")
        print(f"üìö Context length: {len(context)} characters")

        # Preview only the first three lines of the context
        context_preview = '\n'.join(context.split('\n', 3)[:3])
        print(f"üìñ Context preview: {context_preview}...")

In [None]:
# Cell 8: Collection Information and Statistics
# Prints collection statistics, times one sample search, and reports host memory.

import time  # imported here so the cell does not depend on an earlier cell for it

if 'store' in globals() and store and store.is_ready():
    print("üìä Weaviate Collection Analysis")
    print("=" * 40)

    # Get collection info
    get_collection_info(store.client)

    # Additional performance testing
    print("\n‚ö° Performance Analysis:")

    # Time a single search. perf_counter is the clock intended for measuring
    # short durations; time.time() can be too coarse for a fast query.
    start_time = time.perf_counter()
    test_results = search_gaia_examples(store.client, "test query", k=5)
    search_time = time.perf_counter() - start_time

    print(f"  ‚îú‚îÄ‚îÄ Search time: {search_time:.3f} seconds")
    print(f"  ‚îú‚îÄ‚îÄ Results returned: {len(test_results)}")
    if search_time > 0:
        print(f"  ‚îî‚îÄ‚îÄ Speed: {len(test_results)/search_time:.1f} results/second")
    else:
        # Guard the division: an elapsed time of exactly zero would raise.
        print("  ‚îî‚îÄ‚îÄ Speed: n/a (elapsed time below timer resolution)")

    # Memory usage if available (psutil is optional)
    try:
        import psutil
        memory = psutil.virtual_memory()
        print(f"\nüíæ Memory Usage: {memory.percent:.1f}%")
        print(f"  ‚îî‚îÄ‚îÄ Available: {memory.available // 1024 // 1024} MB")
    except ImportError:
        print("\nüíæ Install psutil for memory monitoring: poetry add psutil")

    print("\n‚úÖ Collection analysis complete!")

else:
    print("‚ùå Store not available. Run Cell 4 first.")

In [None]:
# Cell 9: Production Integration Functions
# Functions ready for your main GAIA agent

class GAIAAgentWithWeaviate:
    """Chooses an agent role for a GAIA question using examples from Weaviate."""

    def __init__(self, weaviate_store: GAIAWeaviateStore):
        """Keep the store and register the known agent roles."""
        self.store = weaviate_store
        # Agent name -> short description that is embedded in the system prompt.
        self.agents = dict(
            data_analyst="Handles calculations, data analysis, and mathematical problems",
            web_researcher="Searches for information online and retrieves content",
            document_processor="Processes files, extracts text, handles attachments",
            general_assistant="Handles general questions and reasoning tasks",
        )

    def process_question(self, question: str) -> Dict:
        """Summarise how a question would be routed.

        Returns a dict with the question, the selected agent and its
        description, the prompt context, and statistics about the retrieved
        examples (count, whether any has a file, their levels).
        """
        agent_name, context = self.store.select_agent_with_context(question)

        # Retrieved separately so per-example details can be reported.
        similar = self.store.get_relevant_examples(question, k=2, min_score=0.6)

        summary = {"question": question, "selected_agent": agent_name}
        summary["agent_description"] = self.agents.get(agent_name, "Unknown agent")
        summary["context"] = context
        summary["relevant_examples"] = len(similar)
        summary["has_file_examples"] = any(ex.get('has_file', False) for ex in similar)
        summary["example_levels"] = [ex.get('level') for ex in similar]
        return summary

    def create_system_prompt(self, question: str) -> str:
        """Build the system prompt: agent role, example context, task, answer rules."""
        result = self.process_question(question)

        return f"""You are a general AI assistant working on GAIA benchmark questions.

Selected Agent: {result['selected_agent']} - {result['agent_description']}

{result['context']}

Your task: {question}

Report your thoughts, and finish with: FINAL ANSWER: [YOUR FINAL ANSWER].
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list.
- Numbers: no commas, no units ($ %) unless specified
- Strings: no articles (the, a, an), no abbreviations, digits as text unless specified  
- Lists: apply above rules to each element"""

# Smoke-test the production integration on a few sample questions
if 'store' not in globals() or not store or not store.is_ready():
    print("‚ùå Store not available. Run Cell 4 first.")
else:
    print("üéØ Testing Production Integration")
    print("=" * 40)

    gaia_agent = GAIAAgentWithWeaviate(store)

    test_questions = [
        "Calculate the compound interest on $5000 at 3% annually for 10 years",
        "What is the population of Tokyo in 2024?",
        "Extract the key findings from the attached research paper",
    ]

    for question in test_questions:
        print(f"\nüìù Question: {question}")

        # Routing summary for this question
        result = gaia_agent.process_question(question)
        print(f"ü§ñ Agent: {result['selected_agent']}")
        print(f"üìä Examples found: {result['relevant_examples']}")
        print(f"üìé Has file examples: {result['has_file_examples']}")

        # Full system prompt for the same question
        system_prompt = gaia_agent.create_system_prompt(question)
        print(f"üìÑ System prompt length: {len(system_prompt)} chars")

        # Preview: the first three prompt lines joined by spaces
        preview = system_prompt.split('\n', 3)[:3]
        print(f"üìñ Prompt preview: {' '.join(preview)}...")

In [None]:
# Cell 10: Save and Load Configuration
# Save your setup for future use

def save_weaviate_config():
    """Write the current Weaviate settings to ``weaviate_gaia_config.json``.

    Returns:
        The configuration dict that was written.
    """
    config = dict(
        collection_name=COLLECTION_NAME,
        embedding_model=EMBEDDING_MODEL,
        weaviate_url=WEAVIATE_URL,
        setup_complete=True,
    )
    # json_QA only exists once the data-loading cell has run; record 0 otherwise.
    config["total_examples"] = len(json_QA) if 'json_QA' in globals() else 0

    serialized = json.dumps(config, indent=2)
    with open("weaviate_gaia_config.json", "w") as f:
        f.write(serialized)

    print("üíæ Configuration saved to: weaviate_gaia_config.json")
    return config

def load_weaviate_config():
    """Load the saved Weaviate configuration.

    Reads ``weaviate_gaia_config.json`` from the current working directory.

    Returns:
        The parsed configuration, or ``None`` when the file is missing or
        does not contain valid JSON (a message is printed in both cases).
    """
    try:
        with open("weaviate_gaia_config.json", "r", encoding="utf-8") as f:
            config = json.load(f)
    except FileNotFoundError:
        print("‚ùå No configuration file found")
        return None
    except json.JSONDecodeError as e:
        # A truncated or hand-edited file should read as "no usable config"
        # rather than crash the caller.
        print(f"Configuration file is not valid JSON: {e}")
        return None

    print("üìÇ Configuration loaded from: weaviate_gaia_config.json")
    return config

# Persist the current configuration, then read it back as a round-trip check
if 'store' not in globals() or not store or not store.is_ready():
    print("‚ùå Store not available. Run Cell 4 first.")
else:
    config = save_weaviate_config()
    print(f"‚úÖ Saved configuration: {config}")

    # Reload and compare with what was just saved
    reloaded_config = load_weaviate_config()
    print(f"üîÑ Reloaded config matches: {config == reloaded_config}")

print(
    "\nüéâ All notebook cells complete!",
    "‚úÖ Your Weaviate GAIA vector store is ready for production use!",
    sep="\n",
)

In [None]:
# Cell 11: Quick Restart Function
# Use this to quickly restart from a saved state

def quick_restart_weaviate():
    """Rebuild the GAIA store from a previously completed setup.

    Requires a saved configuration file, the Weaviate dependencies, a
    reachable Weaviate instance and an existing GAIA collection.

    Returns:
        A ready ``GAIAWeaviateStore``, or ``None`` when any precondition
        fails (a message explaining which one is printed).
    """
    # Load config (only its presence is checked)
    config = load_weaviate_config()
    if not config:
        print("‚ùå No saved configuration. Run full setup first.")
        return None

    # Check dependencies
    if not check_weaviate_dependencies():
        return None

    # Connect to Weaviate
    client = connect_to_weaviate()
    if not client:
        print("‚ùå Weaviate not running. Start with: docker-compose up -d")
        return None

    # This client is only a probe: GAIAWeaviateStore opens its own
    # connection, so close the probe as soon as the check is done.
    try:
        collection_exists = client.collections.exists(COLLECTION_NAME)
    finally:
        client.close()

    if not collection_exists:
        print("‚ùå Collection doesn't exist. Run full setup first.")
        return None

    # Create store
    store = GAIAWeaviateStore()

    if store.is_ready():
        print("‚ö° Quick restart successful!")
        get_collection_info(store.client)
        return store

    print("‚ùå Quick restart failed")
    # The store is being discarded; do not leave its connection open.
    if store.client is not None:
        store.client.close()
    return None

# Use this for quick restarts:
# store = quick_restart_weaviate()

print("\nüöÄ Use quick_restart_weaviate() to quickly reload your setup!")


In [None]:
# Cell 12: Integration with Your Main GAIA Agent
# Reference integration code (kept in an inert string) plus a final status report.

"""
# Example integration with your main GAIA agent workflow:

def gaia_agent_with_rag(question: str) -> str:
    '''Main GAIA agent function with RAG support'''
    
    # Initialize Weaviate store
    store = GAIAWeaviateStore()
    
    if not store.is_ready():
        # Fallback without RAG
        return process_without_rag(question)
    
    # Create agent with RAG
    gaia_agent = GAIAAgentWithWeaviate(store)
    
    # Get system prompt with context
    system_prompt = gaia_agent.create_system_prompt(question)
    
    # Process with your LLM (replace with your actual LLM call)
    response = your_llm_call(system_prompt)
    
    return response

# Test the integration
test_question = "Calculate compound interest on $10,000 at 5% for 3 years"
result = gaia_agent_with_rag(test_question)
print(f"Result: {result}")
"""

print(
    "üìö Integration example provided above!",
    "üéØ Ready to integrate with your main GAIA agent!",
    sep="\n",
)

# Final status report
if 'store' not in globals() or not store or not store.is_ready():
    print(
        "\n‚ùå STATUS: Setup incomplete",
        "   ‚îî‚îÄ‚îÄ Run cells 1-4 to complete setup",
        sep="\n",
    )
else:
    print("\n‚úÖ STATUS: Weaviate GAIA store is READY")
    print(f"   ‚îú‚îÄ‚îÄ Collection: {COLLECTION_NAME}")
    print(f"   ‚îú‚îÄ‚îÄ Embedding model: {EMBEDDING_MODEL}")
    print(f"   ‚îú‚îÄ‚îÄ NumPy version: {np.__version__}")
    print("   ‚îî‚îÄ‚îÄ Agent integration: Ready")