In [1]:
import json
from pathlib import Path
import sys

# Add src to path to import utils
sys.path.insert(0, str(Path.cwd().parent / "src"))

from utils.evaluate_mcq_order import load_examples
from utils.mcq_order_models import MCQOrderExample

# Number of benchmark examples to inspect in this notebook; change it here only.
NUM_EXAMPLES = 10

# Both paths are relative to the notebook's working directory.
dataset_path = Path("../data/mcq_event_timeline_strong.jsonl")
# Directory that per-example audio filenames are resolved against.
audio_root = Path("../data/audio")

# Load at most NUM_EXAMPLES examples from the benchmark file
# (presumably the leading records of the JSONL; confirm against load_examples).
examples = load_examples(dataset_path, limit=NUM_EXAMPLES)

print(f"Loaded {len(examples)} examples\n")
print("=" * 80)

Loaded 10 examples



In [2]:
# Display raw temporal data (onset, offset, duration, etc.) for each example


def _format_seconds(value):
    """Format a numeric value as seconds with three decimals.

    Any non-numeric value (for example the 'N/A' placeholder or None) is
    returned as plain text, so a missing field never raises a format error.
    """
    if isinstance(value, (int, float)):
        return f"{value:.3f}s"
    return str(value)


def _print_temporal_fields(event, indent):
    """Print the index, timing and occurrence fields of one raw event dict.

    Args:
        event: Mapping read with the keys 'event_index', 'onset', 'offset',
            'duration', 'occurrence_count' and 'last_offset'. A missing key
            is printed as 'N/A'.
        indent: Whitespace prefix written before every printed line.
    """
    print(f"{indent}Event Index: {event.get('event_index', 'N/A')}")
    print(f"{indent}Onset: {_format_seconds(event.get('onset', 'N/A'))}")
    print(f"{indent}Offset: {_format_seconds(event.get('offset', 'N/A'))}")
    print(f"{indent}Duration: {_format_seconds(event.get('duration', 'N/A'))}")
    print(f"{indent}Occurrence Count: {event.get('occurrence_count', 'N/A')}")
    print(f"{indent}Last Offset: {_format_seconds(event.get('last_offset', 'N/A'))}")


rule = "=" * 80

print("Raw Temporal Data from Base Dataset\n")
print(rule)

for i, example in enumerate(examples, 1):
    raw_data = example.raw
    base_event = raw_data.get("base_event", {})

    print(f"\n{rule}")
    print(f"Example {i}: {example.example_id}")
    print(f"Audio: {example.audio_filename}")
    print(rule)

    # Base event: the event the question is asked about.
    print("\n📍 Base Event (the event being asked about):")
    print(f"   Text: \"{base_event.get('text', 'N/A')}\"")
    _print_temporal_fields(base_event, indent="   ")

    # Options: pair each parsed option with its raw record via the label.
    print("\n📋 Options with Temporal Information:")
    for option in example.options:
        # First raw option whose label matches, or None when there is no match.
        option_raw = next(
            (opt for opt in raw_data.get("options", []) if opt.get("label") == option.label),
            None,
        )

        print(f"\n   {option.label}. {option.text}")
        print(f"      Type: {option.option_type}")

        if option_raw:
            if option.option_type == "event":
                _print_temporal_fields(option_raw, indent="      ")
            else:
                print("      (No temporal data - this is a 'none' option)")

        if option.label == example.answer_label:
            print("      ✅ CORRECT ANSWER")

    print()

Raw Temporal Data from Base Dataset


Example 1: 100476.mp3__0
Audio: 100476.mp3

📍 Base Event (the event being asked about):
   Text: "A saxophone plays a soft melody with oscillating volume."
   Event Index: 0
   Onset: 0.294s
   Offset: 7.786s
   Duration: 7.492s
   Occurrence Count: 1
   Last Offset: 7.786s

📋 Options with Temporal Information:

   A. This is the last event, no immediate event after.
      Type: none
      (No temporal data - this is a 'none' option)

   B. A saxophone plays a soft melody with varying volume and fades out.
      Type: event
      Event Index: 1
      Onset: 7.904s
      Offset: 19.000s
      Duration: 11.096s
      Occurrence Count: 1
      Last Offset: 19.000s
      ✅ CORRECT ANSWER

   C. A string is plucked repeatedly without pauses.
      Type: event
      Event Index: 2
      Onset: 14.456s
      Offset: 18.921s
      Duration: 4.466s
      Occurrence Count: 1
      Last Offset: 18.921s


Example 2: 100476.mp3__2
Audio: 100476.mp3

üì

In [3]:
# Display each example with its question and audio
from IPython.display import Audio, display, Markdown

def _load_decisions(path):
    """Read a decisions JSONL file into a dict keyed by each record's 'example_id'.

    Blank lines are skipped. If the same id occurs more than once, the last
    record read replaces the earlier one.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no 'example_id' key.
    """
    decisions = {}
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            if line.strip():
                record = json.loads(line)
                decisions[record["example_id"]] = record
    return decisions


# Load Audio Flamingo 3 predictions.
# NOTE(review): the run directory is named "audio-flamingo-3-no-audio";
# confirm this is the run that should be shown under the label below.
decisions_path = Path("../results/mcq-order/audio-flamingo-3-no-audio/20260217_090228/decisions.jsonl")
decisions_by_example_id = {}

if decisions_path.exists():
    decisions_by_example_id = _load_decisions(decisions_path)
    print(f"✅ Loaded {len(decisions_by_example_id)} Audio Flamingo 3 predictions\n")
else:
    # A missing results file is not fatal: the examples are still shown without predictions.
    print(f"⚠️  Decisions file not found: {decisions_path}\n")

rule = "=" * 80

for i, example in enumerate(examples, 1):
    print(f"\n{rule}")
    print(f"Example {i}: {example.example_id}")
    print(rule)

    # Question
    print("\n📝 Question:")
    print(f"   {example.question}\n")

    # Options, with the correct one ticked
    print("📋 Options:")
    for option in example.options:
        marker = "✅" if option.label == example.answer_label else "  "
        print(f"   {marker} {option.label}. {option.text}")

    print(f"\n🎯 Correct Answer: {example.answer_label} - {example.answer_text}")

    # Model prediction for this example, if one was loaded
    if example.example_id in decisions_by_example_id:
        decision = decisions_by_example_id[example.example_id]
        is_correct = decision["is_correct"]
        status_icon = "✅" if is_correct else "❌"
        print(f"\n🤖 Audio Flamingo 3 Prediction: {status_icon}")
        print(f"   Predicted: {decision['predicted_label']} - {decision['predicted_text']}")
        print(f"   {'Correct!' if is_correct else 'Incorrect'}")
    else:
        print("\n⚠️  No prediction found for this example")

    # Inline audio player when the file is available locally
    audio_path = audio_root / example.audio_filename
    if audio_path.exists():
        print(f"\n🔊 Audio: {example.audio_filename}")
        display(Audio(str(audio_path)))
    else:
        print(f"\n⚠️  Audio file not found: {audio_path}")
        print("   (Audio directory may need to be extracted from data/audio.zip)")

    print()

⚠️  Decisions file not found: ../results/mcq-order/audio-flamingo-3-no-audio/20260217_090228/decisions.jsonl


Example 1: 100476.mp3__0

📝 Question:
   What happens immediately after this event first appears: "A saxophone plays a soft melody with oscillating volume."?

📋 Options:
      A. This is the last event, no immediate event after.
   ✅ B. A saxophone plays a soft melody with varying volume and fades out.
      C. A string is plucked repeatedly without pauses.

🎯 Correct Answer: B - A saxophone plays a soft melody with varying volume and fades out.

⚠️  No prediction found for this example

⚠️  Audio file not found: ../data/audio/100476.mp3
   (Audio directory may need to be extracted from data/audio.zip)


Example 2: 100476.mp3__2

📝 Question:
   What happens immediately after this event first appears: "A string is plucked repeatedly without pauses."?

📋 Options:
   ✅ A. This is the last event, no immediate event after.
      B. A saxophone plays a