# Visualization and Analysis

This notebook demonstrates how to visualize experiment results with charts and heatmaps.

In [None]:
import matplotlib.pyplot as plt
from pathlib import Path

from prompt_sandbox.visualization import ResultVisualizer
from prompt_sandbox.experiments.storage import ResultStorage
from prompt_sandbox.experiments.comparator import ResultComparator

# Enable inline plotting
%matplotlib inline
plt.style.use('seaborn-v0_8-darkgrid')

## Load Experiment Results

First, load results from a previous experiment:

In [None]:
# Load results (you'll need to run 02_full_experiment.ipynb first)
results_dir = Path("experiment_results")

if results_dir.exists():
    # Real run: read the stored results of the "prompt_comparison_demo" experiment.
    storage = ResultStorage(results_dir)
    results = storage.load_results("prompt_comparison_demo")
    print(f"✅ Loaded {len(results)} results")
else:
    print("⚠️  No results found. Run 02_full_experiment.ipynb first.")
    # Fall back to a small mock grid (2 prompts x 2 models, BLEU and ROUGE each)
    # so that the remaining cells can still be executed for demonstration.
    results = [
        {
            "prompt_name": "direct",
            "model_name": "model-a",
            "evaluation_scores": {"bleu": 0.75, "rouge": 0.82}
        },
        {
            "prompt_name": "chain_of_thought",
            "model_name": "model-a",
            "evaluation_scores": {"bleu": 0.68, "rouge": 0.79}
        },
        {
            "prompt_name": "direct",
            "model_name": "model-b",
            "evaluation_scores": {"bleu": 0.72, "rouge": 0.85}
        },
        {
            "prompt_name": "chain_of_thought",
            "model_name": "model-b",
            "evaluation_scores": {"bleu": 0.70, "rouge": 0.88}
        }
    ]
    print("Using mock results for demonstration")

## Create Visualizer

Initialize the visualization engine:

In [None]:
# Wrap the loaded (or mock) results in the plotting helper used by the chart cells.
visualizer = ResultVisualizer(results)
print("✅ Visualizer ready")

## Plot 1: Compare Prompts

Bar chart comparing different prompts for a single model:

In [None]:
# Bar chart comparing the prompts on model-a, scored by BLEU.
# output_path=None is the "display inline" setting noted by the notebook author.
visualizer.plot_prompt_comparison(model_name="model-a", metric="bleu", output_path=None)

# Tidy the layout, then render the figure in the cell output.
plt.tight_layout()
plt.show()

In [None]:
# Bar chart comparing the prompts on model-a again, this time scored by ROUGE.
visualizer.plot_prompt_comparison(model_name="model-a", metric="rouge", output_path=None)

# Tidy the layout, then render the figure in the cell output.
plt.tight_layout()
plt.show()

## Plot 2: Compare Models

Bar chart comparing different models for a single prompt:

In [None]:
# Bar chart comparing the models on the "direct" prompt, scored by BLEU.
visualizer.plot_model_comparison(prompt_name="direct", metric="bleu", output_path=None)

# Tidy the layout, then render the figure in the cell output.
plt.tight_layout()
plt.show()

## Plot 3: Heatmap Overview

Heatmap showing all prompt × model combinations:

In [None]:
# Heatmap of BLEU scores across the results.
visualizer.plot_metric_heatmap(metric="bleu", output_path=None)

# Tidy the layout, then render the figure in the cell output.
plt.tight_layout()
plt.show()

In [None]:
# Heatmap of ROUGE scores across the results.
visualizer.plot_metric_heatmap(metric="rouge", output_path=None)

# Tidy the layout, then render the figure in the cell output.
plt.tight_layout()
plt.show()

## Statistical Analysis

Get detailed statistics using the comparator:

In [None]:
# Build the comparator once; it is reused by the statistics and winner cells.
comparator = ResultComparator(results)

# Compare all prompts for model-a
prompt_comparison = comparator.compare_prompts("model-a", "bleu")

print("📊 Prompt Comparison (model-a, BLEU):")
print("-" * 50)
# Each stats entry is read for its 'mean', 'std', 'min' and 'max' keys.
for prompt_name, stats in prompt_comparison.items():
    print(f"{prompt_name:20s}: {stats['mean']:.3f} ± {stats['std']:.3f}")
    # The empty 20-character field aligns the range under the mean column.
    print(f"{'':20s}  Range: [{stats['min']:.3f}, {stats['max']:.3f}]")
    print()

In [None]:
# Compare models for direct prompt
model_comparison = comparator.compare_models("direct", "bleu")

print("📊 Model Comparison (direct prompt, BLEU):")
print("-" * 50)
# Each stats entry is read for its 'mean', 'std', 'min' and 'max' keys.
for model_name, stats in model_comparison.items():
    print(f"{model_name:20s}: {stats['mean']:.3f} ± {stats['std']:.3f}")
    # The empty 20-character field aligns the range under the mean column.
    print(f"{'':20s}  Range: [{stats['min']:.3f}, {stats['max']:.3f}]")
    print()

## Find Winners

Identify best configurations:

In [None]:
print("🏆 Best Configurations:\n")

# Best prompt for each model
# NOTE(review): the model and prompt names below are hard-coded to match the mock
# results; confirm they also match the names in a real loaded experiment.
for model in ["model-a", "model-b"]:
    for metric in ["bleu", "rouge"]:
        # get_best_prompt is unpacked as a (name, score) pair.
        best_prompt, score = comparator.get_best_prompt(model, metric)
        print(f"Best for {model} ({metric.upper()}): {best_prompt} (score: {score:.3f})")

print()

# Best model for each prompt
for prompt in ["direct", "chain_of_thought"]:
    for metric in ["bleu", "rouge"]:
        # get_best_model is unpacked as a (name, score) pair.
        best_model, score = comparator.get_best_model(prompt, metric)
        print(f"Best for {prompt} ({metric.upper()}): {best_model} (score: {score:.3f})")

## Save Plots to Files

Export visualizations for reports:

In [None]:
# Create output directory for plots (no error if it already exists)
plots_dir = Path("plots")
plots_dir.mkdir(exist_ok=True)

# Save prompt comparison
visualizer.plot_prompt_comparison(
    model_name="model-a",
    metric="bleu",
    output_path=plots_dir / "prompt_comparison.png"
)

# Save model comparison
visualizer.plot_model_comparison(
    prompt_name="direct",
    metric="bleu",
    output_path=plots_dir / "model_comparison.png"
)

# Save heatmaps, one file per metric
visualizer.plot_metric_heatmap(
    metric="bleu",
    output_path=plots_dir / "heatmap_bleu.png"
)

visualizer.plot_metric_heatmap(
    metric="rouge",
    output_path=plots_dir / "heatmap_rouge.png"
)

print(f"✅ Plots saved to: {plots_dir}")

## Custom Analysis

Create your own visualizations:

In [None]:
import numpy as np

# Distinct prompt and model names present in the results, alphabetically sorted.
prompts = sorted({r["prompt_name"] for r in results})
models = sorted({r["model_name"] for r in results})

# BLEU (x) against ROUGE (y), one scatter series per prompt.
fig, ax = plt.subplots(figsize=(10, 6))

for prompt in prompts:
    # Score dicts of every result that used this prompt; a missing metric plots as 0.
    matching = [r["evaluation_scores"] for r in results if r["prompt_name"] == prompt]
    bleu_scores = [scores.get("bleu", 0) for scores in matching]
    rouge_scores = [scores.get("rouge", 0) for scores in matching]

    ax.scatter(bleu_scores, rouge_scores, label=prompt, s=100, alpha=0.7)

# Shared bold styling for both axis labels.
axis_label_font = dict(fontsize=12, fontweight='bold')
ax.set_xlabel('BLEU Score', **axis_label_font)
ax.set_ylabel('ROUGE Score', **axis_label_font)
ax.set_title('BLEU vs ROUGE Correlation', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Summary

This notebook demonstrated:
- Loading experiment results
- Creating bar charts for prompt and model comparisons
- Generating heatmaps for overview analysis
- Statistical comparison with mean and standard deviation
- Finding best configurations
- Saving plots to files
- Custom visualization with matplotlib

All visualizations are publication-ready and can be included in reports!