# Full Experiment Workflow

This notebook demonstrates how to run complete experiments comparing multiple prompts across models.

**Note**: This notebook uses async execution for 10-50x speedup over sequential processing.

In [None]:
import asyncio
from pathlib import Path

from prompt_sandbox.config.schema import PromptConfig
from prompt_sandbox.prompts.template import PromptTemplate
from prompt_sandbox.evaluators import BLEUEvaluator, ROUGEEvaluator, BERTScoreEvaluator
from prompt_sandbox.experiments import AsyncExperimentRunner, ExperimentConfig

## Setup: Create Mock Model for Demo

For this demo, we'll use a mock model. In real usage, you'd use `OllamaBackend` or `HuggingFaceBackend`.

In [None]:
from prompt_sandbox.models.base import ModelBackend, GenerationResult

class DemoModel(ModelBackend):
    """Mock backend that returns canned answers for the demo questions.

    The model recognises three question patterns (``2+2``, the capital of
    France, who invented Python) and answers either tersely or in a full
    sentence depending on ``response_style``. Anything else yields
    ``"I don't know."``.

    Args:
        name: Identifier reported as ``model_name`` on every result.
        response_style: ``"verbose"`` for full-sentence answers; any other
            value (default ``"direct"``) gives the bare answer.
    """

    # Simulated per-request latency in seconds; also reported as generation_time.
    _LATENCY_SECONDS = 0.1

    def __init__(self, name: str, response_style: str = "direct"):
        # NOTE(review): ModelBackend.__init__ is not invoked here - confirm the
        # base class does not need it.
        self.model_name = name
        self.response_style = response_style

    def _pick_response(self, prompt: str) -> str:
        """Return the canned answer matching *prompt* in the configured style."""
        text = prompt.lower()
        if "2+2" in text or "2 + 2" in text:
            direct = "4"
            verbose = "Let me calculate that. 2 plus 2 equals 4."
        elif "capital" in text and "france" in text:
            direct = "Paris"
            verbose = "The capital city of France is Paris."
        elif "python" in text and "invented" in text:
            direct = "Guido van Rossum"
            verbose = "Python was invented by Guido van Rossum."
        else:
            # Unknown questions get the same fallback in both styles.
            return "I don't know."
        return verbose if self.response_style == "verbose" else direct

    def _build_result(self, prompt: str) -> GenerationResult:
        """Wrap the canned answer for *prompt* in a ``GenerationResult``."""
        response = self._pick_response(prompt)
        return GenerationResult(
            prompt=prompt,
            generated_text=response,
            # Token count is approximated by whitespace-separated words.
            tokens_generated=len(response.split()),
            generation_time=self._LATENCY_SECONDS,
            model_name=self.model_name,
        )

    async def generate_async(self, prompt: str, **kwargs) -> GenerationResult:
        """Answer *prompt* after a short non-blocking pause.

        Extra keyword arguments are accepted for interface compatibility and
        ignored.
        """
        await asyncio.sleep(self._LATENCY_SECONDS)  # Simulate API latency
        return self._build_result(prompt)

    def generate(self, prompt: str, **kwargs) -> GenerationResult:
        """Blocking counterpart of :meth:`generate_async`.

        This deliberately avoids ``asyncio.run``: that call raises
        ``RuntimeError`` when an event loop is already running, which is the
        case inside a Jupyter notebook. Extra keyword arguments are ignored.
        """
        import time

        time.sleep(self._LATENCY_SECONDS)  # Simulate API latency
        return self._build_result(prompt)

# Two demo backends that differ only in answer style (terse vs. full sentence)
model_a = DemoModel(name="model-a", response_style="direct")
model_b = DemoModel(name="model-b", response_style="verbose")

## Define Prompts to Test

Let's create three different prompting strategies:

In [None]:
# Template text for each prompting strategy, keyed by prompt name.
# Every template takes a single "question" variable.
strategy_templates = {
    # Strategy 1: Direct question
    "direct": "Q: {{question}}\nA:",
    # Strategy 2: Chain-of-thought
    "chain_of_thought": "Q: {{question}}\nLet's think step by step and answer:\nA:",
    # Strategy 3: Instructional
    "instructional": "Answer the following question concisely.\nQuestion: {{question}}\nAnswer:",
}

# Build the templates in declaration order and keep the individual names available.
prompt_direct, prompt_cot, prompt_instructional = (
    PromptTemplate(PromptConfig(name=label, template=text, variables=["question"]))
    for label, text in strategy_templates.items()
)

prompts = [prompt_direct, prompt_cot, prompt_instructional]
print(f"Created {len(prompts)} prompt templates")

## Create Test Cases

Define questions with expected answers:

In [None]:
# (question, expected answer) pairs used to build the test cases below
qa_pairs = [
    ("What is 2+2?", "4"),
    ("What is the capital of France?", "Paris"),
    ("Who invented Python?", "Guido van Rossum"),
]

# Each test case carries the template variables under "input" and the
# reference answer under "expected_output".
test_cases = [
    {"input": {"question": question}, "expected_output": answer}
    for question, answer in qa_pairs
]

print(f"Created {len(test_cases)} test cases")

## Configure Experiment

Set up the experiment with prompts, models, and evaluators:

In [None]:
# Directory that receives the saved results (created if it does not exist yet)
output_dir = Path("experiment_results")
output_dir.mkdir(exist_ok=True)

# Bundle prompts, models, evaluators and test cases into one experiment
config = ExperimentConfig(
    name="prompt_comparison_demo",
    prompts=prompts,
    models=[model_a, model_b],
    evaluators=[
        BLEUEvaluator(),
        ROUGEEvaluator(),
        # Add BERTScoreEvaluator() here if wanted; left out to keep the demo fast
    ],
    test_cases=test_cases,
    save_results=True,
    output_dir=output_dir
)

# One run is performed per (prompt, model, test case) combination
n_prompts = len(config.prompts)
n_models = len(config.models)
n_cases = len(config.test_cases)

print("Experiment configuration:")
print(f"  - {n_prompts} prompts")
print(f"  - {n_models} models")
print(f"  - {n_cases} test cases")
print(f"  - Total runs: {n_prompts * n_models * n_cases}")

## Run Experiment (Async)

Execute all prompt × model × test-case combinations concurrently:

In [None]:
# Run experiment
runner = AsyncExperimentRunner(config)
results = await runner.run_async()

print(f"\n‚úÖ Experiment complete! Generated {len(results)} results")

## Analyze Results

Get summary statistics:

In [None]:
# Summary statistics, keyed by (prompt name, model name)
summary = runner.get_summary()

# The emoji and the plus-minus sign were mis-encoded in the original strings; restored here.
print("\n📊 Summary Statistics:\n")
for (prompt_name, model_name), stats in summary.items():
    print(f"=== {prompt_name} + {model_name} ===")
    # Each metric entry provides a mean and a standard deviation
    for metric, values in stats["scores"].items():
        print(f"  {metric.upper()}: {values['mean']:.3f} (±{values['std']:.3f})")
    print()

## Find Best Configuration

Use the comparator to identify winners:

In [None]:
from prompt_sandbox.experiments.comparator import ResultComparator

comparator = ResultComparator(results)

# Find best prompt for each model, per metric.
# The trophy emoji was mis-encoded in the original string; restored here.
for model_name in ["model-a", "model-b"]:
    print(f"\n🏆 Best prompts for {model_name}:")
    for metric in ["bleu", "rouge"]:
        best_prompt, score = comparator.get_best_prompt(model_name, metric)
        print(f"  {metric.upper()}: {best_prompt} (score: {score:.3f})")

## Inspect Individual Results

Look at actual prompts and responses:

In [None]:
# Show the first three results in detail.
# The magnifier emoji was mis-encoded in the original string; restored here.
print("\n🔍 Sample Results:\n")
for i, result in enumerate(results[:3], start=1):
    print(f"Result {i}:")
    print(f"  Prompt: {result['prompt_name']}")
    print(f"  Model: {result['model_name']}")
    print(f"  Input: {result['input']}")
    print(f"  Expected: {result['expected_output']}")
    print(f"  Actual: {result['actual_output']}")
    # A metric missing from the scores is displayed as 0
    print(f"  Scores: BLEU={result['evaluation_scores'].get('bleu', 0):.3f}, "
          f"ROUGE={result['evaluation_scores'].get('rouge', 0):.3f}")
    print()

## Save and Load Results

Results are automatically saved. You can reload them later:

In [None]:
from prompt_sandbox.experiments.storage import ResultStorage

# Load the saved results back from disk, looked up by experiment name
storage = ResultStorage(output_dir)
loaded_results = storage.load_results("prompt_comparison_demo")

# The status emoji was mis-encoded in the original string; restored to the intended check mark.
print(f"✅ Loaded {len(loaded_results)} results from disk")
print(f"Results saved in: {output_dir}")

## Next Steps

- See `03_visualization.ipynb` for plotting and visualization
- Try with real models: `OllamaBackend('llama3.1')` or `HuggingFaceBackend('meta-llama/Llama-2-7b')`
- Add more evaluators like `BERTScoreEvaluator()` for semantic similarity