# 02. Multi-Provider Testing

This notebook explores TensorZero's multi-provider capabilities:
- Testing different LLM providers
- Comparing response quality and performance
- Understanding provider-specific behaviors
- Gradually enabling more providers

In [None]:
import os
import time
import json
import pandas as pd
from datetime import datetime
from tensorzero import TensorZeroGateway
from dotenv import load_dotenv

# Pull settings (API keys etc.) from a local .env file into the process environment
load_dotenv()

# Build the gateway client used by every cell below
GATEWAY_URL = "http://localhost:3000"
client = TensorZeroGateway(GATEWAY_URL)
print("‚úÖ Connected to TensorZero gateway")

## 1. Current Provider Setup

Let's start with the providers we have configured and gradually add more.

In [None]:
# Variants that can be exercised right now, as
# (variant name, human-readable label, provider key) triples
current_variants = [
    ("gpt4", "OpenAI GPT-4", "openai"),
    ("gpt4_mini", "OpenAI GPT-4o Mini", "openai"),
    # Further entries are added here as more providers get enabled
]

# Look up each provider's API key in the environment (None when unset)
_KEY_ENV_VARS = {
    "OpenAI": "OPENAI_API_KEY",
    "Anthropic": "ANTHROPIC_API_KEY",
    "xAI": "XAI_API_KEY",
}
api_keys = {label: os.getenv(env_var) for label, env_var in _KEY_ENV_VARS.items()}

# Report presence only -- the key values themselves are never printed
print("API Key Status:")
for provider_label, key_value in api_keys.items():
    if key_value:
        marker, state = "‚úÖ", "Set"
    else:
        marker, state = "‚ùå", "Missing"
    print(f"{marker} {provider_label}: {state}")

## 2. Provider Performance Testing

Let's test response time and quality across different providers.

In [None]:
def test_variant_performance(variant_name, provider_name, test_prompt, function_name="chat"):
    """Test a specific variant and return performance metrics."""
    start_time = time.time()
    
    try:
        response = client.inference(
            function_name=function_name,
            variant_name=variant_name,
            input={
                "messages": [
                    {"role": "user", "content": test_prompt}
                ]
            }
        )
        
        end_time = time.time()
        response_time = end_time - start_time
        
        # Extract text content
        content_text = ""
        if hasattr(response, 'content') and response.content:
            if isinstance(response.content, list) and len(response.content) > 0:
                content_text = response.content[0].text if hasattr(response.content[0], 'text') else str(response.content[0])
            else:
                content_text = str(response.content)
        
        return {
            "variant": variant_name,
            "provider": provider_name,
            "inference_id": response.inference_id,
            "response_time": response_time,
            "content_length": len(content_text),
            "content": content_text[:200] + "..." if len(content_text) > 200 else content_text,
            "full_content": content_text,
            "success": True,
            "error": None,
            "timestamp": datetime.now().isoformat()
        }
    
    except Exception as e:
        end_time = time.time()
        response_time = end_time - start_time
        
        return {
            "variant": variant_name,
            "provider": provider_name,
            "inference_id": None,
            "response_time": response_time,
            "content_length": 0,
            "content": "",
            "full_content": "",
            "success": False,
            "error": str(e),
            "timestamp": datetime.now().isoformat()
        }

In [None]:
# Prompt catalogue: one entry per kind of task we want to compare providers on
test_scenarios = [
    dict(
        name="Creative Writing",
        prompt="Write a creative short story about a robot discovering emotions (max 100 words).",
    ),
    dict(
        name="Technical Explanation",
        prompt="Explain how TensorZero's gateway architecture works in simple terms.",
    ),
    dict(
        name="Code Generation",
        prompt="Write a Python function that calculates the Fibonacci sequence using recursion.",
    ),
    dict(
        name="Analysis",
        prompt="Compare the advantages and disadvantages of microservices vs monolithic architecture.",
    ),
]

# Run every configured variant against every scenario; one record per call
results = []

for scenario in test_scenarios:
    scenario_name = scenario['name']
    print(f"\nüß™ Testing: {scenario_name}")
    print("=" * 50)

    # The third tuple element (provider key) is not needed for the run itself
    for variant_name, provider_display, _provider_key in current_variants:
        print(f"Testing {provider_display} ({variant_name})...")

        outcome = test_variant_performance(
            variant_name=variant_name,
            provider_name=provider_display,
            test_prompt=scenario['prompt']
        )

        # Tag the record so later cells can group and filter by scenario
        outcome['scenario'] = scenario_name
        results.append(outcome)

        if not outcome['success']:
            print(f"  ‚ùå Failed: {outcome['error']}")
        else:
            print(f"  ‚úÖ {outcome['response_time']:.2f}s - {outcome['content_length']} chars")
            print(f"  üìù {outcome['content']}")

        print()

print(f"\nüìä Completed {len(results)} tests")

## 3. Performance Analysis

In [None]:
# Convert results to DataFrame for analysis
df = pd.DataFrame(results)

if df.empty:
    # An empty result list produces a frame with no columns at all, so the
    # df['success'] lookups below would raise KeyError. Supply the columns this
    # cell reads so it falls through to the "no results" message instead.
    df = pd.DataFrame(
        columns=["provider", "response_time", "content_length", "success", "error"]
    )

# Boolean mask of successful calls; astype(bool) keeps the mask valid even when
# the column has object dtype (e.g. the empty frame built above).
succeeded = df['success'].astype(bool)

# Filter successful results for analysis (copy so later edits don't touch df)
successful_df = df[succeeded].copy()

if not successful_df.empty:
    print("üìà Performance Summary")
    print("=" * 40)

    # Response-time statistics per provider (std is NaN for a single sample)
    avg_times = successful_df.groupby('provider')['response_time'].agg(['mean', 'std', 'min', 'max']).round(3)
    print("\nResponse Times by Provider:")
    print(avg_times)

    # Content-length statistics per provider
    avg_length = successful_df.groupby('provider')['content_length'].agg(['mean', 'std', 'min', 'max']).round(1)
    print("\nContent Length by Provider:")
    print(avg_length)

    # Success rate is computed over ALL attempts (df), not just the successes
    success_rate = df.groupby('provider')['success'].agg(['sum', 'count'])
    success_rate['success_rate'] = (success_rate['sum'] / success_rate['count']) * 100
    print("\nSuccess Rate by Provider:")
    print(success_rate[['success_rate']])

else:
    print("‚ùå No successful results to analyze")

# Show any errors
error_results = df[~succeeded]
if not error_results.empty:
    print("\n‚ùå Errors:")
    for _, row in error_results.iterrows():
        print(f"  {row['provider']}: {row['error']}")

## 4. Response Quality Comparison

Let's compare the quality of responses for a specific scenario.

In [None]:
# Scenario whose full responses are printed one after another
comparison_scenario = "Technical Explanation"

# Keep only the successful runs recorded for that scenario
comparison_results = []
for record in results:
    if record['scenario'] == comparison_scenario and record['success']:
        comparison_results.append(record)

print(f"üîç Detailed Comparison: {comparison_scenario}")
print("=" * 60)

for entry in comparison_results:
    provider_label = entry['provider']
    elapsed = entry['response_time']
    print(f"\nü§ñ {provider_label} ({elapsed:.2f}s)")
    print("-" * 40)
    # Full, untruncated response text
    print(entry['full_content'])
    print(f"\nüìä Length: {entry['content_length']} chars | ID: {entry['inference_id']}")

## 5. Adding Anthropic Claude

Now let's add Anthropic Claude to our configuration and test it.

In [None]:
# Claude variants can only be tested when an Anthropic key is in the environment
anthropic_key = os.getenv("ANTHROPIC_API_KEY")

if not anthropic_key:
    print("‚ùå Anthropic API key not found. Add it to .env file to test Claude variants.")
else:
    print("‚úÖ Anthropic API key found. We can add Claude variants.")
    # Configuration hint, assembled line by line so the exact whitespace of
    # the emitted text is visible; the trailing "" yields the final newline.
    claude_config_hint = "\n".join([
        "To add Claude variants, update config/tensorzero.toml with:",
        "    ",
        "[functions.chat.variants.claude3_sonnet]",
        'type = "chat_completion"',
        'model = "anthropic::claude-3-sonnet-20240229"',
        "",
        "[functions.chat.variants.claude3_haiku]",
        'type = "chat_completion" ',
        'model = "anthropic::claude-3-haiku-20240307"',
        "",
        "Then restart services with: poe down && poe up",
        "",
    ])
    print(claude_config_hint)

## 6. Testing Function Variants

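Let's test the haiku generation function (`generate_haiku`) with its configured `gpt_4o_mini` variant. Once the function has variants for other providers, the same loop can be used to compare them.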

In [None]:
# Exercise the dedicated haiku function with a few different subjects
print("üéã Testing Haiku Generation")
print("=" * 30)

haiku_prompts = [
    "Write a haiku about artificial intelligence.",
    "Write a haiku about the ocean at sunset.",
    "Write a haiku about coding late at night."
]

for haiku_prompt in haiku_prompts:
    print(f"\nPrompt: {haiku_prompt}")
    print("-" * 40)

    # The variant name below belongs to the haiku function, not to "chat"
    haiku_result = test_variant_performance(
        function_name="generate_haiku",
        variant_name="gpt_4o_mini",
        provider_name="OpenAI GPT-4o Mini",
        test_prompt=haiku_prompt,
    )

    if not haiku_result['success']:
        print(f"‚ùå Failed: {haiku_result['error']}")
        continue

    print(f"‚úÖ {haiku_result['full_content']}")
    print(f"   ({haiku_result['response_time']:.2f}s)")

## Key Findings

Use the results above to record your observations in each of these areas:

1. **Performance**: Record the response times and reliability of each provider
2. **Quality**: Note differences in response style and content quality
3. **Reliability**: Track success rates and error patterns
4. **Use Cases**: Identify which providers work best for specific scenarios

## Next Steps

1. Add more providers (Anthropic, xAI) to the configuration
2. Implement A/B testing and routing strategies
3. Add structured output functions (JSON schema validation)
4. Test fallback mechanisms

Next notebook: We'll explore observability and tracing features.