# 🧪 Manual Testing: Evaluation Module

This notebook allows you to manually test the evaluation module:
1. **Load ground truth** from CSV
2. **Create mock predictions** with different accuracy levels
3. **Calculate metrics** (precision, recall, F1)
4. **Generate reports** (CSV, JSON, comparison)
5. **Test cloud pipeline** with GCS (optional)

---

## 1. Import Required Libraries and Evaluation Module

In [None]:
import os
import sys
import json
import random
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional

# Put the current working directory at the front of the import path (once),
# so the project-local imports below can be resolved from it.
PROJECT_ROOT = Path.cwd()
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(PROJECT_ROOT))

# Import evaluation module components
from evaluation.models import (
    VideoAnnotation,
    PredictionResult,
    PredictionSet,
    GroundTruthDataset,
    EvaluationConfig,
    ModelConfig,
)
from evaluation.ground_truth_loader import GroundTruthLoader, ANNOTATION_CATEGORIES
from evaluation.metrics.calculator import MetricsCalculator, ModelEvaluationResult
from evaluation.reports.generator import ReportGenerator

# Report that the imports resolved, plus a short summary of what was loaded.
print(
    "‚úì Imports successful!",
    f"Project root: {PROJECT_ROOT}",
    f"Annotation categories: {len(ANNOTATION_CATEGORIES)}",
    f"Categories: {ANNOTATION_CATEGORIES[:5]}...",
    sep="\n",
)

## 2. Load Ground Truth Data

Load the ground truth dataset from your CSV file. This contains the actual annotations for each video.

In [None]:
# --- Settings: edit these to match your environment ---
GROUND_TRUTH_PATH = "cleaned_groundtruth_values_only.csv"  # CSV with the ground truth annotations
SAMPLE_SIZE = None  # An integer (e.g. 50) for a quick run, None for all data
OUTPUT_DIR = "./manual_test_results"

# Build the loader with a fixed seed and read the dataset.
loader = GroundTruthLoader(
    dataset_path=GROUND_TRUTH_PATH, sample_size=SAMPLE_SIZE, random_seed=42
)
ground_truth = loader.load()

# Headline numbers for the loaded dataset.
print("‚úì Ground truth loaded!")
print(f"  Total videos: {ground_truth.total_count}")
print(f"  Valid videos: {ground_truth.valid_count}")
validation_errors = ground_truth.validation_errors
print(f"  Validation errors: {len(validation_errors)}")

# Show at most three errors, each cut to its first 100 characters.
if validation_errors:
    print("\n‚ö†Ô∏è First 3 validation errors:")
    for error_text in validation_errors[:3]:
        print(f"  - {error_text[:100]}...")

In [None]:
# Peek at the first five videos and tally their annotation values:
# 1 or 2 counts as endorsed, -1 as conflict, 0 as absent.
print("üìä Sample of loaded videos:\n")

for position, video in enumerate(ground_truth.videos[:5], start=1):
    values = list(video.annotations.values())
    endorsed = len([value for value in values if value in {1, 2}])
    conflict = len([value for value in values if value == -1])
    absent = len([value for value in values if value == 0])
    first_three = dict(list(video.annotations.items())[:3])

    print(f"{position}. Video ID: {video.video_id}")
    print(f"   Endorsed: {endorsed}, Conflict: {conflict}, Absent: {absent}")
    print(f"   Sample annotations: {first_three}")
    print()

## 3. Create Mock Predictions

Create predictions with different accuracy levels to test the metrics calculation. You can also load your own predictions from a JSON file.

In [None]:
def create_mock_predictions(
    ground_truth: GroundTruthDataset,
    model_name: str = "mock_model",
    accuracy_rate: float = 0.7,
    failure_rate: float = 0.05,
    seed: int = 42,
) -> PredictionSet:
    """Create mock predictions with configurable accuracy.

    Every video in ``ground_truth.videos`` yields exactly one
    ``PredictionResult``. With probability ``failure_rate`` the video is
    recorded as a failed prediction (empty ``predictions``, ``success=False``).
    Otherwise each category keeps its ground-truth value with probability
    ``accuracy_rate`` and is replaced by a different value drawn from
    ``-1, 0, 1, 2`` the rest of the time.

    Args:
        ground_truth: Dataset whose videos are used as the reference.
        model_name: Name stored on the returned ``PredictionSet``.
        accuracy_rate: Per-category probability of copying the ground truth.
        failure_rate: Per-video probability of simulating a failed prediction.
        seed: Seed for the private random generator; equal seeds give equal
            output for the same inputs.

    Returns:
        A ``PredictionSet`` holding one result per video, with the success and
        failure counts and the ids of the failed videos.
    """
    # A private generator produces the same sequence as random.seed(seed)
    # followed by module-level calls, but leaves the global `random` state
    # untouched for any other code running in the kernel.
    rng = random.Random(seed)

    label_values = (-1, 0, 1, 2)
    predictions = []
    failed_ids = []

    for video in ground_truth.videos:
        # Draw order per video (failure check, per-category draws, inference
        # time) is fixed so that a given seed always yields the same result.
        if rng.random() < failure_rate:
            predictions.append(PredictionResult(
                video_id=video.video_id,
                predictions={},
                success=False,
                error_message="Simulated prediction failure",
                inference_time=0.0,
            ))
            failed_ids.append(video.video_id)
            continue

        pred_annotations = {}
        for category, gt_value in video.annotations.items():
            if rng.random() < accuracy_rate:
                pred_annotations[category] = gt_value  # Correct
            else:
                # Pick any label other than the true one.
                wrong_values = [v for v in label_values if v != gt_value]
                pred_annotations[category] = rng.choice(wrong_values)  # Wrong

        predictions.append(PredictionResult(
            video_id=video.video_id,
            predictions=pred_annotations,
            success=True,
            error_message=None,
            inference_time=rng.uniform(0.1, 2.0),
        ))

    # Every video is either a failure or a success, so the counts follow
    # directly from the two lists.
    failure_count = len(failed_ids)
    return PredictionSet(
        model_name=model_name,
        predictions=predictions,
        total_count=len(predictions),
        success_count=len(predictions) - failure_count,
        failure_count=failure_count,
        failed_video_ids=failed_ids,
    )

print("‚úì Mock prediction function defined!")

In [None]:
# Each entry is (model name, accuracy_rate, failure_rate) for one simulated model.
model_configs = [
    ("high_accuracy_model", 0.85, 0.02),
    ("medium_accuracy_model", 0.65, 0.05),
    ("low_accuracy_model", 0.45, 0.10),
]

# Generate one prediction set per configuration, keyed by model name.
predictions_dict = {}
for mock_name, mock_accuracy, mock_failure_rate in model_configs:
    predictions_dict[mock_name] = create_mock_predictions(
        ground_truth,
        model_name=mock_name,
        accuracy_rate=mock_accuracy,
        failure_rate=mock_failure_rate,
    )
    mock_set = predictions_dict[mock_name]

    print(f"üì¶ {mock_name}:")
    counts = (
        f"Total: {mock_set.total_count}, "
        f"Success: {mock_set.success_count}, "
        f"Failed: {mock_set.failure_count}"
    )
    print(f"   {counts}")
    print()

### Option: Load Your Own Predictions

If you have actual predictions, you can load them from a JSON file. The format should be:
```json
[
    {
        "video_id": "7441889182883829025",
        "predictions": {
            "Self_Direction_Thought": 0,
            "Self_Direction_Action": 1,
            ...
        },
        "success": true
    },
    ...
]
```

In [None]:
# Optional: add real predictions alongside the mock models.
# Uncomment the block below and point YOUR_PREDICTIONS_FILE at a JSON file that
# holds a list of objects with "video_id" and "predictions" keys. The keys
# "success", "error_message" and "inference_time" are optional and fall back to
# True, None and 0.0 respectively.
# NOTE: when the file does not exist the block does nothing (there is no else
# branch), so double-check the path if "your_model" never shows up in the results.
# YOUR_PREDICTIONS_FILE = "your_predictions.json"

# if Path(YOUR_PREDICTIONS_FILE).exists():
#     with open(YOUR_PREDICTIONS_FILE, 'r') as f:
#         data = json.load(f)
#     
#     predictions = []
#     for item in data:
#         pred = PredictionResult(
#             video_id=item['video_id'],
#             predictions=item['predictions'],
#             success=item.get('success', True),
#             error_message=item.get('error_message'),
#             inference_time=item.get('inference_time', 0.0),
#         )
#         predictions.append(pred)
#     
#     # Counts and failed ids are derived from each entry's success flag.
#     your_predictions = PredictionSet(
#         model_name="your_model",
#         predictions=predictions,
#         total_count=len(predictions),
#         success_count=sum(1 for p in predictions if p.success),
#         failure_count=sum(1 for p in predictions if not p.success),
#         failed_video_ids=[p.video_id for p in predictions if not p.success],
#     )
#     # Registered under "your_model" so the metrics cells pick it up with the mocks.
#     predictions_dict["your_model"] = your_predictions
#     print(f"‚úì Loaded {len(predictions)} predictions from {YOUR_PREDICTIONS_FILE}")

# Reminder shown while the loader above is still commented out.
print("‚ÑπÔ∏è Uncomment the code above to load your own predictions")

## 4. Calculate Metrics

Use the `MetricsCalculator` to compute precision, recall, and F1 scores for each model.

In [None]:
# Score every prediction set against the ground truth, keyed by model name.
results = {}

for model_name, predictions in predictions_dict.items():
    print(f"\nüîç Calculating metrics for {model_name}...")

    # A fresh calculator per model; the 0.05 threshold excludes rare categories (<5%).
    calculator = MetricsCalculator(ground_truth=ground_truth, min_frequency_threshold=0.05)
    results[model_name] = result = calculator.calculate_model_metrics(predictions)

    # Coverage counters, read and printed one at a time.
    for label, attribute in (
        ("Matched videos", "matched_with_ground_truth"),
        ("Unmatched predictions", "unmatched_count"),
        ("Missing predictions", "missing_count"),
    ):
        print(f"   ‚úì {label}: {getattr(result, attribute)}")

print(f"\n‚úì Calculated metrics for {len(results)} models")

## 5. Validate and Inspect Metric Results

View the aggregate and per-category metrics to verify correctness.

In [None]:
# Side-by-side table of the macro F1 scores for every evaluated model.
table_width = 80
header_cells = ("Model", "Endorsed F1", "Conflict F1", "Combined F1", "Categories")

print("üìä AGGREGATE METRICS COMPARISON")
print("=" * table_width)
print("{:<25} {:>12} {:>12} {:>12} {:>12}".format(*header_cells))
print("-" * table_width)

for model_name, result in results.items():
    # One row per model: three macro F1 values plus the endorsed category count.
    row = "{:<25} {:>12.4f} {:>12.4f} {:>12.4f} {:>12}".format(
        model_name,
        result.endorsed_aggregate.macro_f1,
        result.conflict_aggregate.macro_f1,
        result.combined_aggregate.macro_f1,
        result.endorsed_aggregate.categories_evaluated,
    )
    print(row)

print("-" * table_width)

In [None]:
# Per-category table for the model with the highest endorsed macro F1
# (the first such model wins a tie).
best_name = max(results, key=lambda name: results[name].endorsed_aggregate.macro_f1)
best_model = (best_name, results[best_name])

rule_width = 90
print(f"\nüìà PER-CATEGORY METRICS: {best_name}")
print("=" * rule_width)
print("{:<30} {:>12} {:>12} {:>10} {:>8} {:>8} {:>8}".format(
    "Category", "Endorsed F1", "Conflict F1", "Support", "TP", "FP", "FN"
))
print("-" * rule_width)

result = best_model[1]
for category in ANNOTATION_CATEGORIES:
    endorsed = result.per_category_endorsed.get(category)
    conflict = result.per_category_conflict.get(category)
    if not endorsed:
        # No (truthy) endorsed metrics for this category: no row is printed.
        continue

    # A missing conflict entry is shown as an F1 of 0.
    conflict_f1 = conflict.f1 if conflict else 0
    print("{:<30} {:>12.4f} {:>12.4f} {:>10} {:>8} {:>8} {:>8}".format(
        category,
        endorsed.f1,
        conflict_f1,
        endorsed.support,
        endorsed.true_positives,
        endorsed.false_positives,
        endorsed.false_negatives,
    ))

print("-" * rule_width)

## 6. Generate and Inspect Reports

Generate CSV and JSON reports for the evaluation results.

In [None]:
# Make sure the report directory exists (parents included) before writing.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# One timestamp is shared by all reports generated in this run.
generator = ReportGenerator(OUTPUT_DIR)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_results = list(results.values())
generated_files = generator.generate_all_reports(model_results, timestamp)

# List each generated report with its file name and size on disk.
print("üìÑ GENERATED REPORTS")
print("=" * 60)
for report_type, path in generated_files.items():
    size_in_bytes = os.path.getsize(path)
    print(f"  {report_type}: {path.name} ({size_in_bytes} bytes)")

print(f"\n‚úì All reports saved to: {OUTPUT_DIR}")

In [None]:
# Preview the comparison report when one was generated; otherwise fall back
# to the first JSON report.
preview_limit = 30
comparison_file = [
    path for report_type, path in generated_files.items() if "comparison" in report_type
]

if not comparison_file:
    json_files = [
        path for report_type, path in generated_files.items() if "json" in report_type
    ]
    if json_files:
        print("üìã JSON REPORT PREVIEW (first model)")
        print("=" * 60)
        with open(json_files[0], 'r') as f:
            data = json.load(f)
        # Only the summary and the endorsed aggregate section are shown.
        print(json.dumps(data["summary"], indent=2))
        print(json.dumps(data["aggregate_metrics"]["endorsed"], indent=2))
else:
    print("üìã COMPARISON REPORT PREVIEW")
    print("=" * 60)
    with open(comparison_file[0], 'r') as f:
        lines = f.readlines()

    # Show the head of the file, then note how many lines were left out.
    for line in lines[:preview_limit]:
        print(line.rstrip())
    hidden_count = len(lines) - preview_limit
    if hidden_count > 0:
        print(f"... ({hidden_count} more lines)")

---
## 7. Test Cloud Pipeline (Optional)

Test loading scripts from Google Cloud Storage. Make sure you have:
1. Authenticated with GCloud: `gcloud auth application-default login`
2. Scripts uploaded to your bucket

In [None]:
# Where the scripts live in Google Cloud Storage.
BUCKET_NAME = "videos-scripts-and-annotations"  # Your GCS bucket
SCRIPTS_PREFIX = "saved_scripts/POC_scripts"  # Path to scripts in bucket

# The GCS client is optional: record whether it can be imported so the
# cloud cells can skip themselves when it is missing.
try:
    from google.cloud import storage
except ImportError:
    GCS_AVAILABLE = False
    print("‚ö†Ô∏è google-cloud-storage not installed. Run: pip install google-cloud-storage")
else:
    GCS_AVAILABLE = True
    print("‚úì Google Cloud Storage library available")

In [None]:
# List up to ten objects under the scripts prefix; skipped without the GCS library.
if not GCS_AVAILABLE:
    print("‚è≠Ô∏è Skipping GCS test - library not installed")
else:
    try:
        client = storage.Client()
        bucket = client.bucket(BUCKET_NAME)

        print(f"üìÇ Listing scripts in gs://{BUCKET_NAME}/{SCRIPTS_PREFIX}/")
        print("-" * 60)

        # max_results caps the listing, hence the "+" in the count below.
        blobs = list(bucket.list_blobs(prefix=SCRIPTS_PREFIX, max_results=10))

        if not blobs:
            print("  ‚ö†Ô∏è No scripts found in this location")
            print("  Make sure you've uploaded scripts to the bucket")
        else:
            for blob in blobs:
                print(f"  {blob.name} ({blob.size} bytes)")
            print(f"\n‚úì Found {len(blobs)}+ scripts")
    except Exception as e:
        # Any failure here is reported with a hint to re-authenticate.
        print(f"‚ùå Error connecting to GCS: {e}")
        print("\nTry running: gcloud auth application-default login")

In [None]:
# Test script loading through the ScriptLoader.
# Like the listing cell, this one reports problems instead of aborting: a
# failure for one script is printed and the remaining videos are still checked.
if GCS_AVAILABLE:
    # NOTE(review): imported here rather than in the top import cell, presumably
    # because the loader needs the optional GCS dependency - confirm.
    from evaluation.adapters.script_loader import ScriptLoader
    
    script_loader = ScriptLoader()
    
    print("üîÑ Testing script loading from GCS...")
    print("-" * 60)
    
    # Test with a few videos from ground truth
    test_videos = ground_truth.videos[:3]
    
    for video in test_videos:
        # Construct GCS URI for the script
        script_uri = f"gs://{BUCKET_NAME}/{SCRIPTS_PREFIX}/{video.video_id}.txt"
        
        try:
            content = script_loader.load_script(script_uri)
        except Exception as exc:
            # Auth, network or permission errors for one script should not
            # stop the check of the others.
            print(f"‚úó {video.video_id}: Failed to load script ({exc})")
            print()
            continue
        
        if content:
            # Show the size and a one-line preview of the first 100 characters.
            preview = content[:100].replace('\n', ' ')
            print(f"‚úì {video.video_id}: {len(content)} chars")
            print(f"  Preview: {preview}...")
        else:
            print(f"‚úó {video.video_id}: Script not found")
        print()
    
    print(f"Cache size: {script_loader.get_cache_size()} scripts")
else:
    print("‚è≠Ô∏è Skipping script loading test - library not installed")

### Upload Local Scripts to GCS (Optional)

If you have local scripts you want to test with, uncomment the code in the cell below and run it to upload them.

In [None]:
# Optional: push local script files to the bucket used by the cloud cells.
# Uncomment the block below and set LOCAL_SCRIPTS_DIR; every *.txt file in that
# directory is uploaded to gs://BUCKET_NAME/SCRIPTS_PREFIX/<file name>.
# NOTE: the upload only runs when GCS_AVAILABLE is true and the directory
# exists; otherwise the block does nothing.
# LOCAL_SCRIPTS_DIR = "./my_local_scripts"  # Change to your scripts directory

# if GCS_AVAILABLE and Path(LOCAL_SCRIPTS_DIR).exists():
#     client = storage.Client()
#     bucket = client.bucket(BUCKET_NAME)
#     
#     script_files = list(Path(LOCAL_SCRIPTS_DIR).glob("*.txt"))
#     print(f"üì§ Uploading {len(script_files)} scripts to GCS...")
#     
#     for script_file in script_files:
#         # The object name keeps the local file name under SCRIPTS_PREFIX.
#         blob_path = f"{SCRIPTS_PREFIX}/{script_file.name}"
#         blob = bucket.blob(blob_path)
#         blob.upload_from_filename(str(script_file))
#         print(f"  ‚úì Uploaded: {script_file.name}")
#     
#     print(f"\n‚úì All scripts uploaded to gs://{BUCKET_NAME}/{SCRIPTS_PREFIX}/")

# Reminder shown while the upload code above is still commented out.
print("‚ÑπÔ∏è Uncomment the code above to upload local scripts to GCS")

---
## Summary

This notebook tested the evaluation module with:
- ✅ Ground truth loading from CSV
- ✅ Mock predictions with different accuracy levels
- ✅ Metrics calculation (precision, recall, F1)
- ✅ Report generation (CSV, JSON, comparison)
- ✅ Cloud pipeline testing (optional)

**Next Steps:**
1. Replace mock predictions with your actual model predictions
2. Upload your scripts to GCS and verify they load correctly
3. Run the full evaluation pipeline with `python run_evaluation.py`