# Labeled Scenarios Walkthrough

This notebook explores labeled scenarios — categorized test cases that give you visibility into test coverage across your AI system.

## What You'll Learn

1. How scenarios are organized by category
2. How to run filtered evaluations
3. How to interpret the coverage matrix
4. How to identify gaps in testing

In [None]:
import sys
# Put ../setup_agent at the front of the import search path. The path is
# relative, so it is resolved against the kernel's current working directory.
# NOTE(review): presumably this is where `evaluator` lives — confirm.
sys.path.insert(0, "../setup_agent")

from evaluator import load_scenarios, flatten_scenarios, run_scenario

## 1. Understanding the Scenario Structure

Scenarios are organized hierarchically by category and subcategory:

In [None]:
# Load the scenario definitions; later cells reuse `scenarios`.
scenarios = load_scenarios()

# Print one line per category, then one indented line per subcategory with
# the number of scenarios it holds.
print("Scenario Categories:")
print("-" * 40)
for category_name, category_body in scenarios.items():
    print(f"\n{category_name}/")
    # Only dict-valued categories have subcategories to list.
    if not isinstance(category_body, dict):
        continue
    for subcategory_name, entries in category_body.items():
        # Anything that is not a list is reported as holding zero scenarios.
        if isinstance(entries, list):
            n_entries = len(entries)
        else:
            n_entries = 0
        print(f"  └── {subcategory_name}: {n_entries} scenarios")

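## 2. Filtering Scenarios

`flatten_scenarios` turns the nested structure into a flat sequence of `(category, subcategory, scenario)` entries, and can optionally filter by category, subcategory, or difficulty:

In [None]: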
def _report(label, selected):
    """Print ``<label>: <number of entries>`` and return the selection unchanged."""
    print(f"{label}: {len(selected)}")
    return selected


# No filter: every scenario
all_scenarios = _report("Total scenarios", flatten_scenarios(scenarios))

# Restrict to the single_tool category
single_tool = _report(
    "Single-tool scenarios",
    flatten_scenarios(scenarios, category_filter="single_tool"),
)

# Restrict to the sql_only subcategory
sql_only = _report(
    "SQL-only scenarios",
    flatten_scenarios(scenarios, subcategory_filter="sql_only"),
)

# Restrict by difficulty label
edge_cases = _report(
    "Edge case scenarios",
    flatten_scenarios(scenarios, difficulty_filter="edge_case"),
)

## 3. Running a Single Scenario

Let's run one scenario and see the details:

In [None]:
# Take the first flattened entry and unpack its (category, subcategory, scenario) triple.
category, subcategory, scenario = all_scenarios[0]
difficulty = scenario.get('difficulty', 'unknown')

# Describe the scenario before running it; the last line also emits the
# blank separator line.
print(f"Running: {scenario['id']}")
print(f"Category: {category}/{subcategory}")
print(f"Query: {scenario['query']}")
print(f"Difficulty: {difficulty}", end="\n\n")

result = run_scenario(category, subcategory, scenario)

# Report the outcome fields in a fixed order.
for label, attr in (
    ("Passed", "passed"),
    ("Tools used", "tools_used"),
    ("Tool check", "tool_check"),
    ("Content check", "content_check"),
):
    print(f"{label}: {getattr(result, attr)}")

# Errors are only shown when the result carries any.
errors = result.errors
if errors:
    print(f"Errors: {errors}")

## 4. Running a Category

Let's run the scenarios in a specific subcategory (only the first three, to keep the demo quick):

In [None]:
# Run a small sample of the vector_only scenarios.
vector_scenarios = flatten_scenarios(scenarios, subcategory_filter="vector_only")

# Cap the demo so the cell stays quick; set to None to run every scenario.
DEMO_LIMIT = 3
QUERY_PREVIEW_CHARS = 40
demo_scenarios = vector_scenarios[:DEMO_LIMIT]

# Report how many scenarios actually run, not just how many matched the filter.
print(f"Running {len(demo_scenarios)} of {len(vector_scenarios)} vector_only scenarios:")
print("-" * 40)

results = []
for cat, subcat, scenario in demo_scenarios:
    query = scenario['query']
    # Only add an ellipsis when the query was really cut short.
    if len(query) > QUERY_PREVIEW_CHARS:
        preview = f"{query[:QUERY_PREVIEW_CHARS]}..."
    else:
        preview = query
    # flush=True makes the progress line visible while the scenario is running,
    # instead of only once the pass/fail mark completes the line.
    print(f"  {scenario['id']}: {preview}", end=" ", flush=True)
    result = run_scenario(cat, subcat, scenario)
    results.append(result)
    print("✓" if result.passed else "✗")

passed = sum(1 for r in results if r.passed)
print(f"\nResults: {passed}/{len(results)}")

## 5. Coverage Analysis

The key benefit of labeled scenarios is visibility into coverage:

In [None]:
# Count scenarios per "category/subcategory" key, overall and per difficulty label.
# Scenarios without a "difficulty" field are counted under "unknown".
coverage = {}
for cat, subcat, scenario in all_scenarios:
    key = f"{cat}/{subcat}"
    entry = coverage.setdefault(key, {"total": 0, "by_difficulty": {}})
    entry["total"] += 1

    diff = scenario.get("difficulty", "unknown")
    entry["by_difficulty"][diff] = entry["by_difficulty"].get(diff, 0) + 1

# Build the header first so the horizontal rules can match its width exactly.
header = (
    f"{'Category':<35} {'Total':<8} {'Easy':<8} {'Ambig':<8} {'Edge':<8} {'Other':<8}"
)
rule = "-" * len(header)

print("Coverage Matrix:")
print(rule)
print(header)
print(rule)

for key in sorted(coverage):
    data = coverage[key]
    by_difficulty = data["by_difficulty"]
    easy = by_difficulty.get("straightforward", 0)
    ambig = by_difficulty.get("ambiguous", 0)
    edge = by_difficulty.get("edge_case", 0)
    # Everything not in the three named columns (e.g. "unknown") lands here,
    # so each row's columns add up to its Total.
    other = data["total"] - easy - ambig - edge
    print(f"{key:<35} {data['total']:<8} {easy:<8} {ambig:<8} {edge:<8} {other:<8}")