# Multimodal Model Evaluation Notebook

This notebook provides an interactive interface for running multimodal model evaluations.

## Features:
- Specify device (CPU, CUDA GPU, or Apple Silicon MPS) and platform (local/edge)
- Choose model and adapter type
- Configure evaluation parameters
- View results interactively
- Compare different configurations

In [2]:
# Import required libraries
import json
import yaml
from pathlib import Path
from datetime import datetime
import time
from typing import Dict, Any, Optional

# Import our evaluation components
from agent.runner import LocalAgentRunner
from agent.adapters.registry import get_adapter
from orchestrator.cli import create_unique_output_dir

## Configuration 

Choose Device:

In [3]:
# Device configuration: which device and platform the suite is built for.
DEVICE_CONFIG = dict(
    device="cpu",      # one of "cpu", "cuda", "mps" (for Apple Silicon)
    platform="local",  # one of "local", "edge"
)

Choose the model to run and evaluate:

In [4]:
# Model selection and generation settings; every entry is copied into the
# suite's "adapter_config" section when the suite is built.
MODEL_CONFIG = dict(
    model_name="microsoft/DialoGPT-medium",
    adapter_type="huggingface",  # one of "huggingface", "vlm_example", "custom"
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7,
)

Evaluation Parameters:

In [5]:
# Trial counts and sampling rate; these become the suite's "run" section
# (warmup / repeats / sample_rate_hz).
EVAL_CONFIG = dict(
    warmup_trials=2,
    num_trials=5,
    sample_rate_hz=2,
)

Tasks:

In [6]:
# The single task the suite will contain.
TASK_CONFIG = dict(
    type="vlm_caption",  # one of "vlm_caption", "image_classification", "asr"
    prompt="Describe what you see in this image",
    image_path="/mock/path/to/test_image.jpg",  # Mock path for local testing
)

## Create Evaluation Suite
Create the suite configuration from the parameters above:

In [7]:
def create_evaluation_suite(
    device_config: Optional[Dict[str, Any]] = None,
    model_config: Optional[Dict[str, Any]] = None,
    eval_config: Optional[Dict[str, Any]] = None,
    task_config: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Assemble the suite configuration dictionary for one evaluation run.

    Called with no arguments, the suite is built from the notebook-level
    DEVICE_CONFIG, MODEL_CONFIG, EVAL_CONFIG and TASK_CONFIG dictionaries.
    Any of them can be overridden per call, which makes it possible to build
    several suites for comparison without editing and re-running the
    configuration cells.

    Args:
        device_config: Mapping with "device" and "platform" keys.
        model_config: Mapping with "adapter_type", "model_name",
            "max_new_tokens", "do_sample" and "temperature" keys.
        eval_config: Mapping with "warmup_trials", "num_trials" and
            "sample_rate_hz" keys.
        task_config: Mapping with "type", "prompt" and "image_path" keys.

    Returns:
        A dict with "name", "description", "device", "platform",
        "adapter_config", "run" and "tasks" entries. The name carries a
        second-resolution timestamp of the moment the suite was created.

    Raises:
        KeyError: If a required key is missing from one of the mappings.
    """
    # Fall back to the notebook globals only for arguments that were not given.
    device_config = DEVICE_CONFIG if device_config is None else device_config
    model_config = MODEL_CONFIG if model_config is None else model_config
    eval_config = EVAL_CONFIG if eval_config is None else eval_config
    task_config = TASK_CONFIG if task_config is None else task_config

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    suite = {
        "name": f"notebook_eval_{timestamp}",
        "description": "Interactive notebook evaluation",
        "device": device_config["device"],
        "platform": device_config["platform"],

        "adapter_config": {
            "adapter_type": model_config["adapter_type"],
            "model_name": model_config["model_name"],
            "max_new_tokens": model_config["max_new_tokens"],
            "do_sample": model_config["do_sample"],
            "temperature": model_config["temperature"],
            # The device is repeated here so the adapter section is self-contained.
            "device": device_config["device"],
        },

        "run": {
            "warmup": eval_config["warmup_trials"],
            "repeats": eval_config["num_trials"],
            "sample_rate_hz": eval_config["sample_rate_hz"],
        },

        "tasks": [
            {
                "id": f"{task_config['type']}_task",
                "type": task_config["type"],
                "prompt": task_config["prompt"],
                "image": task_config["image_path"],
            }
        ]
    }

    return suite

# Build the suite from the configuration cells above and echo it for review.
suite_config = create_evaluation_suite()
print("Evaluation suite created:", json.dumps(suite_config, indent=2), sep="\n")

Evaluation suite created:
{
  "name": "notebook_eval_20250814_233152",
  "description": "Interactive notebook evaluation",
  "device": "cpu",
  "platform": "local",
  "adapter_config": {
    "adapter_type": "huggingface",
    "model_name": "microsoft/DialoGPT-medium",
    "max_new_tokens": 50,
    "do_sample": true,
    "temperature": 0.7,
    "device": "cpu"
  },
  "run": {
    "warmup": 2,
    "repeats": 5,
    "sample_rate_hz": 2
  },
  "tasks": [
    {
      "id": "vlm_caption_task",
      "type": "vlm_caption",
      "prompt": "Describe what you see in this image",
      "image": "/mock/path/to/test_image.jpg"
    }
  ]
}


## Run Evaluation
Execute the evaluation suite:

In [8]:
def run_evaluation(suite_config: Dict[str, Any]) -> Dict[str, Any]:
    """Run the complete evaluation suite and report progress on stdout.

    Args:
        suite_config: Suite dictionary. This function reads its "name",
            "device", "adapter_config" -> "model_name", and "run" ->
            "repeats" / "sample_rate_hz" entries, and passes the whole
            dictionary on to the adapter factory and the runner.

    Returns:
        A dict with:
            "results": whatever ``runner.run_suite`` returned,
            "output_dir": the directory created for this run,
            "duration": elapsed seconds spent inside ``run_suite``.

    Raises:
        Exception: Any error raised while loading the adapter, creating the
            runner or running the suite is printed and then re-raised.
    """

    print(f"Starting eval: {suite_config['name']}")
    print(f"Device: {suite_config['device']}")
    print(f"Model: {suite_config['adapter_config']['model_name']}")
    print(f"Trials: {suite_config['run']['repeats']}")

    # Create output directory
    output_dir = create_unique_output_dir(suite_config['name'])

    try:
        # Create adapter and runner
        print("\nLoading adapter...")
        adapter = get_adapter(suite_config)

        print("\nCreating runner...")
        runner = LocalAgentRunner(adapter, suite_config['run']['sample_rate_hz'])

        # Run the suite. perf_counter() is monotonic, so the measured duration
        # cannot be distorted by system clock adjustments during the run.
        print("\n⚡ Running evaluation suite...")
        started = time.perf_counter()
        results = runner.run_suite(suite_config, output_dir)
        duration = time.perf_counter() - started

        print(f"\nCompleted in {duration:.1f} seconds")

        return {
            "results": results,
            "output_dir": output_dir,
            "duration": duration
        }

    except Exception as e:
        # Surface the failure in the cell output, then let the traceback through.
        print(f"\nEvaluation failed: {e}")
        raise

# Run the evaluation; `evaluation_result` is read again by the later
# result-display cell, so this cell must run before it.
evaluation_result = run_evaluation(suite_config)

Starting eval: notebook_eval_20250814_233152
Device: cpu
Model: microsoft/DialoGPT-medium
Trials: 5

Loading adapter...
✅ Loaded adapter: HuggingFaceVLMAdapter with config: {'adapter_type': 'huggingface', 'model_name': 'microsoft/DialoGPT-medium', 'max_new_tokens': 50, 'do_sample': True, 'temperature': 0.7, 'device': 'cpu'}

Creating runner...

⚡ Running evaluation suite...
Running 2 warmup trials...
  Warmup 1/2
Running trial 41282370:
  Trial completed: 267.1ms
  Warmup 2/2
Running trial d99c3197:
  Trial completed: 145.3ms
Warmup completed

Task: vlm_caption_task
  Trial 1/5
Running trial dfa119ca:
  Trial completed: 159.2ms
  Trial 2/5
Running trial 2569e6fe:
  Trial completed: 157.4ms
  Trial 3/5
Running trial 94b6f5de:
  Trial completed: 165.5ms
  Trial 4/5
Running trial ff036aff:
  Trial completed: 147.7ms
  Trial 5/5
Running trial 31183241:
  Trial completed: 164.5ms

 Results saved to results/notebook_eval_20250814_233152_20250814_233154

Completed in 5.4 seconds


## View Results
Display the evaluation results:

In [None]:
def _format_metric(value: Any) -> str:
    """Format a metric with one decimal place, or "N/A" if missing or non-numeric."""
    try:
        return f"{float(value):.1f}"
    except (TypeError, ValueError):
        return "N/A"


def display_results(evaluation_result: Dict[str, Any]) -> None:
    """Print a per-task summary of an evaluation run.

    Args:
        evaluation_result: Dict with "results" (mapping of task id to a list
            of per-trial dicts), "output_dir" and "duration" (seconds).

    For each task this prints the number of trials, timing statistics over the
    trials that recorded ``timing.total_ms``, and the memory / utilization
    figures found in the first trial's ``resources`` entry. Metrics that are
    absent (for example GPU figures on a CPU-only run) are shown as "N/A"
    instead of raising.
    """
    results = evaluation_result["results"]
    output_dir = evaluation_result["output_dir"]

    print("Results Summary")
    print(f"Output: {output_dir}")
    print(f"Duration: {evaluation_result['duration']:.1f}s")
    print("\n" + "="*50)

    for task_id, task_results in results.items():
        print(f"\nTask: {task_id}")
        print(f"Trials completed: {len(task_results)}")

        if not task_results:
            continue

        # Only trials that actually recorded a timing contribute to the stats;
        # substituting 0 for a missing value would drag avg and min down.
        timings = []
        for trial in task_results:
            total_ms = (trial.get('timing') or {}).get('total_ms')
            if isinstance(total_ms, (int, float)):
                timings.append(total_ms)

        if timings:
            avg_time = sum(timings) / len(timings)
            min_time = min(timings)
            max_time = max(timings)
            print(f"Timing (ms): avg={avg_time:.1f}, min={min_time:.1f}, max={max_time:.1f}")

        # Resource figures are taken from the first trial only.
        resources = task_results[0].get('resources') or {}

        if 'memory' in resources:
            mem = resources['memory'] or {}
            ram = _format_metric(mem.get('ram_mb_peak'))
            vram = _format_metric(mem.get('vram_mb_peak'))
            print(f"Memory: {ram} MB RAM, {vram} MB VRAM")

        if 'utilization' in resources:
            util = resources['utilization'] or {}
            cpu = _format_metric(util.get('cpu_util_pct_avg'))
            gpu = _format_metric(util.get('gpu_util_pct_avg'))
            print(f"Utilization: {cpu}% CPU, {gpu}% GPU")
    
# Display the results of the run stored in `evaluation_result`
display_results(evaluation_result)

Results Summary
Output: results/notebook_eval_20250814_222747_20250814_223247
Duration: 5.4s


Task: vlm_caption_task
Trials completed: 5
Timing (ms): avg=155.9, min=145.8, max=168.2
Memory: 15111.7 MB RAM, 15111.7 MB VRAM
Utilization: 27.2% CPU, 19.2% GPU


## Save Results

In [None]:
    # Look in the directory the evaluation run actually wrote to. Calling
    # create_unique_output_dir() again here would presumably hand back a new
    # directory (the name and the suffixed path in the run output suggest so),
    # which could never contain this run's metrics.json.
    output_dir = Path(evaluation_result["output_dir"])
    # Check if metrics were computed
    metrics_file = output_dir / "metrics.json"
    if metrics_file.exists():
        print("\nComputed Metrics:")
        with open(metrics_file, encoding="utf-8") as f:
            metrics = json.load(f)
        print(json.dumps(metrics, indent=2))
        print("Saved!")
    else:
        # Say so explicitly instead of finishing the cell with no output.
        print(f"\nNo metrics file found at {metrics_file}")
