In [17]:
import json
from pathlib import Path
import sys
import random

# Add src to path to import utils
sys.path.insert(0, str(Path.cwd().parent / "src"))

from utils.evaluate_mcq_order import load_examples
from utils.mcq_order_models import MCQOrderExample

# Load all examples, then draw a reproducible random sample from those predicted by Audio Flamingo
dataset_path = Path("../data/mcq_event_timeline_strong.jsonl")
audio_root = Path("../data/audio")

# Sampling configuration. A private, seeded RNG makes Restart & Run All show the
# same examples every time and leaves the global `random` state untouched.
SAMPLE_SIZE = 10
SAMPLE_SEED = 42
sample_rng = random.Random(SAMPLE_SEED)

# Load Audio Flamingo decisions to get example_ids that were predicted.
# The file is read as JSONL: one JSON object per non-blank line, keyed by "example_id".
decisions_path = Path("../results/mcq-order/audio-flamingo-3/20260217_140006/decisions.jsonl")
predicted_example_ids = set()
decisions_by_example_id = {}
if decisions_path.exists():
    with open(decisions_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                decision = json.loads(line)
                example_id = decision["example_id"]
                predicted_example_ids.add(example_id)
                decisions_by_example_id[example_id] = decision
    print(f"Found {len(predicted_example_ids)} examples predicted by Audio Flamingo")
else:
    print(f"‚ö†Ô∏è  Decisions file not found: {decisions_path}")
    # Kept as None (falsy) so the fallback branch below is taken.
    predicted_example_ids = None

# Load all examples from dataset
all_examples = load_examples(dataset_path, limit=None)

# Filter to only examples that were predicted by Audio Flamingo
if predicted_example_ids:
    predicted_examples = [ex for ex in all_examples if ex.example_id in predicted_example_ids]
    print(f"Filtered to {len(predicted_examples)} examples that were predicted")
    # Sample at most SAMPLE_SIZE predicted examples (fewer if not enough are available)
    examples = sample_rng.sample(predicted_examples, min(SAMPLE_SIZE, len(predicted_examples)))
    # Sanity check: every sampled example should have an entry in the decision index
    missing_predictions = [ex.example_id for ex in examples if ex.example_id not in decisions_by_example_id]
    if missing_predictions:
        print(f"‚ö†Ô∏è  Warning: {len(missing_predictions)} sampled examples missing predictions: {missing_predictions}")
else:
    # Fallback: sample from all examples when no predictions could be loaded
    examples = sample_rng.sample(all_examples, min(SAMPLE_SIZE, len(all_examples)))
    print("‚ö†Ô∏è  Warning: Using all examples (decisions file not found)")

print(f"Randomly sampled {len(examples)} examples for display")
print(f"All examples have predictions: {all(ex.example_id in decisions_by_example_id for ex in examples)}\n")
print("=" * 80)

Found 1000 examples predicted by Audio Flamingo
Filtered to 1000 examples that were predicted
Randomly sampled 10 examples for display
All examples have predictions: True



In [18]:
# Display raw temporal data (onset, offset, duration, etc.) for each example


def _format_seconds(record, key):
    """Return the value stored under ``key`` in ``record``, formatted for display.

    Numeric values are rendered with three decimals and an ``s`` suffix, a
    missing key yields ``'N/A'``, and any other value is rendered unchanged.
    """
    value = record.get(key, 'N/A')
    if isinstance(value, (int, float)):
        return f"{value:.3f}s"
    return f"{value}"


def _print_temporal_fields(record, indent):
    """Print the temporal fields of one raw event dict, one per line, each prefixed by ``indent``."""
    print(f"{indent}Event Index: {record.get('event_index', 'N/A')}")
    print(f"{indent}Onset: {_format_seconds(record, 'onset')}")
    print(f"{indent}Offset: {_format_seconds(record, 'offset')}")
    print(f"{indent}Duration: {_format_seconds(record, 'duration')}")
    print(f"{indent}Occurrence Count: {record.get('occurrence_count', 'N/A')}")
    print(f"{indent}Last Offset: {_format_seconds(record, 'last_offset')}")


print("Raw Temporal Data from Base Dataset\n")
print("=" * 80)

for i, example in enumerate(examples, 1):
    raw_data = example.raw
    # `or {}` / `or []` also cover an explicit null in the record, which a
    # `.get(key, default)` would pass through as None and crash on below.
    base_event = raw_data.get("base_event") or {}
    raw_options = raw_data.get("options") or []

    print(f"\n{'='*80}")
    print(f"Example {i}: {example.example_id}")
    print(f"Audio: {example.audio_filename}")
    print(f"{'='*80}")

    # Display base event temporal information
    print(f"\nüìç Base Event (the event being asked about):")
    print(f"   Text: \"{base_event.get('text', 'N/A')}\"")
    _print_temporal_fields(base_event, "   ")

    # Display options with temporal information
    print(f"\nüìã Options with Temporal Information:")
    for option in example.options:
        # First raw option whose label matches this parsed option (None if absent)
        option_raw = next(
            (opt for opt in raw_options if opt.get("label") == option.label),
            None,
        )

        print(f"\n   {option.label}. {option.text}")
        print(f"      Type: {option.option_type}")

        if option_raw:
            if option.option_type == "event":
                _print_temporal_fields(option_raw, "      ")
            else:
                print(f"      (No temporal data - this is a 'none' option)")

        if option.label == example.answer_label:
            print(f"      ‚úÖ CORRECT ANSWER")

    print()

Raw Temporal Data from Base Dataset


Example 1: 145480.mp3__3
Audio: 145480.mp3

üìç Base Event (the event being asked about):
   Text: "Music plays to announce a stop."
   Event Index: 3
   Onset: 12.566s
   Offset: 15.915s
   Duration: 3.349s
   Occurrence Count: 1
   Last Offset: 15.915s

üìã Options with Temporal Information:

   A. This is the last event, no immediate event after.
      Type: none
      (No temporal data - this is a 'none' option)

   B. Train wheels squeaking metallically as the train brakes.
      Type: event
      Event Index: 0
      Onset: 0.000s
      Offset: 3.102s
      Duration: 3.102s
      Occurrence Count: 4
      Last Offset: 16.207s

   C. The train wagon sways loudly as the train slows down and then subsides as it comes to a full stop.
      Type: event
      Event Index: 1
      Onset: 0.000s
      Offset: 23.694s
      Duration: 23.694s
      Occurrence Count: 1
      Last Offset: 23.694s

   D. Soft rustling of fabric rubbing against itself.
 

In [19]:
# Display each example with its question and audio
from IPython.display import Audio, display, Markdown

# Reuse the prediction lookup built by the first cell when it exists and is
# non-empty; otherwise rebuild it from the decisions file.
if globals().get('decisions_by_example_id'):
    print(f"‚úÖ Using {len(decisions_by_example_id)} Audio Flamingo 3 predictions from first cell\n")
else:
    decisions_path = Path("../results/mcq-order/audio-flamingo-3/20260217_140006/decisions.jsonl")
    decisions_by_example_id = {}
    if not decisions_path.exists():
        print(f"‚ö†Ô∏è  Decisions file not found: {decisions_path}\n")
    else:
        with open(decisions_path, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                if not raw_line.strip():
                    continue  # skip blank lines
                record = json.loads(raw_line)
                decisions_by_example_id[record["example_id"]] = record
        print(f"‚úÖ Loaded {len(decisions_by_example_id)} Audio Flamingo 3 predictions\n")

rule = '=' * 80
for position, example in enumerate(examples, start=1):
    print(f"\n{rule}")
    print(f"Example {position}: {example.example_id}")
    print(rule)

    # Question text
    print(f"\nüìù Question:")
    print(f"   {example.question}\n")

    # Answer options, with the correct one flagged
    print("üìã Options:")
    for option in example.options:
        if option.label == example.answer_label:
            marker = "‚úÖ"
        else:
            marker = "  "
        print(f"   {marker} {option.label}. {option.text}")

    print(f"\nüéØ Correct Answer: {example.answer_label} - {example.answer_text}")

    # Audio Flamingo 3 prediction for this example, if one was recorded
    if example.example_id not in decisions_by_example_id:
        print(f"\n‚ö†Ô∏è  No prediction found for this example")
    else:
        decision = decisions_by_example_id[example.example_id]
        is_correct = decision["is_correct"]
        status_icon = "‚úÖ" if is_correct else "‚ùå"
        verdict = 'Correct!' if is_correct else 'Incorrect'
        print(f"\nü§ñ Audio Flamingo 3 Prediction: {status_icon}")
        print(f"   Predicted: {decision['predicted_label']} - {decision['predicted_text']}")
        print(f"   {verdict}")

    # Inline audio player, when the clip is available on disk
    audio_path = audio_root / example.audio_filename
    if not audio_path.exists():
        print(f"\n‚ö†Ô∏è  Audio file not found: {audio_path}")
        print(f"   (Audio directory may need to be extracted from data/audio.zip)")
    else:
        print(f"\nüîä Audio: {example.audio_filename}")
        display(Audio(str(audio_path)))

    print()

‚úÖ Using 1000 Audio Flamingo 3 predictions from first cell


Example 1: 145480.mp3__3

üìù Question:
   What happens immediately after this event first appears: "Music plays to announce a stop."?

üìã Options:
      A. This is the last event, no immediate event after.
      B. Train wheels squeaking metallically as the train brakes.
      C. The train wagon sways loudly as the train slows down and then subsides as it comes to a full stop.
      D. Soft rustling of fabric rubbing against itself.
   ‚úÖ E. Footsteps.

üéØ Correct Answer: E - Footsteps.

ü§ñ Audio Flamingo 3 Prediction: ‚ùå
   Predicted: B - Train wheels squeaking metallically as the train brakes.
   Incorrect

üîä Audio: 145480.mp3




Example 2: 120777.mp3__2

üìù Question:
   What happens immediately after this event first appears: "A high-pitched sound of train wheels braking on tracks."?

üìã Options:
   ‚úÖ A. This is the last event, no immediate event after.
      B. A train horn honks.
      C. A slow train is passing by.

üéØ Correct Answer: A - This is the last event, no immediate event after.

ü§ñ Audio Flamingo 3 Prediction: ‚ùå
   Predicted: B - A train horn honks.
   Incorrect

üîä Audio: 120777.mp3




Example 3: 127182.mp3__2

üìù Question:
   What happens immediately after this event first appears: "A metallic object is struck repeatedly with a stable and gentle truck engine sound in the background."?

üìã Options:
   ‚úÖ A. This is the last event, no immediate event after.
      B. An engine runs steadily with a clear noise.
      C. Repeated whirring noise of hooking up a caravan to a truck.

üéØ Correct Answer: A - This is the last event, no immediate event after.

ü§ñ Audio Flamingo 3 Prediction: ‚ùå
   Predicted: B - An engine runs steadily with a clear noise.
   Incorrect

üîä Audio: 127182.mp3




Example 4: 115929.mp3__2

üìù Question:
   What happens immediately after this event first appears: "An indescribable robotic stutter occurs periodically."?

üìã Options:
      A. A robotic stuttering sound in a fast rhythmic pattern.
      B. Repeated robotic sounds.
   ‚úÖ C. This is the last event, no immediate event after.

üéØ Correct Answer: C - This is the last event, no immediate event after.

ü§ñ Audio Flamingo 3 Prediction: ‚ùå
   Predicted: A - A robotic stuttering sound in a fast rhythmic pattern.
   Incorrect

üîä Audio: 115929.mp3




Example 5: 156363.mp3__1

üìù Question:
   What happens immediately after this event first appears: "The motor of a lawn mower or construction machine is running."?

üìã Options:
   ‚úÖ A. This is the last event, no immediate event after.
      B. Drilling sounds on a construction site.

üéØ Correct Answer: A - This is the last event, no immediate event after.

ü§ñ Audio Flamingo 3 Prediction: ‚ùå
   Predicted: B - Drilling sounds on a construction site.
   Incorrect

üîä Audio: 156363.mp3




Example 6: 113344.mp3__1

üìù Question:
   What happens immediately after this event first appears: "An unclear radio message is heard in the distance."?

üìã Options:
      A. This is the last event, no immediate event after.
      B. An engine is running nearby.
   ‚úÖ C. A loud beeping sound is heard in the distance.

üéØ Correct Answer: C - A loud beeping sound is heard in the distance.

ü§ñ Audio Flamingo 3 Prediction: ‚ùå
   Predicted: B - An engine is running nearby.
   Incorrect

üîä Audio: 113344.mp3




Example 7: 118341.mp3__0

üìù Question:
   What happens immediately after this event first appears: "Repeated hammering as a nail is being driven."?

üìã Options:
      A. This is the last event, no immediate event after.
      B. A hammer strikes a nail three times with varying pauses in between.
   ‚úÖ C. A hammer hitting a nail repeatedly with varying intensities.

üéØ Correct Answer: C - A hammer hitting a nail repeatedly with varying intensities.

ü§ñ Audio Flamingo 3 Prediction: ‚ùå
   Predicted: B - A hammer strikes a nail three times with varying pauses in between.
   Incorrect

üîä Audio: 118341.mp3




Example 8: 104110.mp3__1

üìù Question:
   What happens immediately after this event first appears: "A man laughs with a cartoony, scary, evil voice."?

üìã Options:
   ‚úÖ A. A man laughs with a deep, cartoonishly evil voice.
      B. This is the last event, no immediate event after.
      C. A man laughs in a cartoony evil voice.

üéØ Correct Answer: A - A man laughs with a deep, cartoonishly evil voice.

ü§ñ Audio Flamingo 3 Prediction: ‚úÖ
   Predicted: A - A man laughs with a deep, cartoonishly evil voice.
   Correct!

üîä Audio: 104110.mp3




Example 9: 155296.mp3__3

üìù Question:
   What happens immediately after this event first appears: "Metallic clinking of coins."?

üìã Options:
      A. A continuous quiet white noise from ventilation.
      B. This is the last event, no immediate event after.
   ‚úÖ C. Quiet distant thumping.
      D. A woman is talking nearby.
      E. A woman is speaking in the distance.
      F. Receipt machines whirring and printing.
      G. Paper rustling.
      H. A car passes by.
      I. Metallic clacking of coins.
      J. A machine is buzzing.
      K. A woman is talking in the distance.
      L. People are talking in the distance.
      M. A receipt machine is printing with a buzzing sound.

üéØ Correct Answer: C - Quiet distant thumping.

ü§ñ Audio Flamingo 3 Prediction: ‚ùå
   Predicted: I - Metallic clacking of coins.
   Incorrect

üîä Audio: 155296.mp3




Example 10: 141529.mp3__4

üìù Question:
   What happens immediately after this event first appears: "Someone blows softly into a microphone, creating an airy whoosh."?

üìã Options:
      A. A calm rainstorm with soft, steady raindrops and gentle wind.
      B. An alarm gradually increases in volume until it reaches a steady, repetitive tone.
      C. Someone utters a brief, disappointed sound in a quiet, shocked tone.
      D. A loud thunderclap strikes nearby, echoing and gradually fading out.
   ‚úÖ E. This is the last event, no immediate event after.

üéØ Correct Answer: E - This is the last event, no immediate event after.

ü§ñ Audio Flamingo 3 Prediction: ‚ùå
   Predicted: D - A loud thunderclap strikes nearby, echoing and gradually fading out.
   Incorrect

üîä Audio: 141529.mp3





## Curated Interesting Audio Cases

This section surfaces non-random, high-signal examples:
- **Audio helped** (`audio_only_correct`)
- **Audio hurt** (`noaudio_only_correct`)
- **Hard failures** (`both_wrong_diff_pred`)

For each model pair and category, one case is selected, prioritizing larger option sets and parse/format failures on the weaker side (the side that answered incorrectly).


In [20]:
from pathlib import Path
import json
import pandas as pd
from IPython.display import Audio, Markdown, display

# Self-contained setup for curated audio/no-audio case selection.
# The repo root is the parent directory when the kernel runs inside `notebooks/`,
# otherwise the current working directory itself.
_cwd = Path.cwd()
REPO_ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd
DEBUG_OUT_DIR = REPO_ROOT.joinpath('results', 'mcq-order', 'debug_bundle')
DATASET_PATH = REPO_ROOT.joinpath('data', 'mcq_event_timeline_strong.jsonl')

display(Markdown(f"Using debug bundle: `{DEBUG_OUT_DIR}`"))

# The review queue is mandatory; fail with a pointer to the command that creates it.
queue_all_path = DEBUG_OUT_DIR / 'human_review_queue.csv'
if queue_all_path.exists():
    queue_all = pd.read_csv(queue_all_path)
else:
    raise FileNotFoundError(f"Missing {queue_all_path}. Run `make debug-mcq-bundle` first.")

# The long-format decisions table is optional; an empty frame stands in when absent.
latest_decisions_path = DEBUG_OUT_DIR / 'latest_decisions_long.csv'
if latest_decisions_path.exists():
    decisions_long = pd.read_csv(latest_decisions_path)
else:
    decisions_long = pd.DataFrame()

# Build dataset + decision indices used by show_review_case.
# Dataset rows are indexed by their string `id`; rows without one are skipped.
_dataset_index = {}
with DATASET_PATH.open('r', encoding='utf-8') as dataset_file:
    for raw_line in dataset_file:
        if raw_line.strip():
            record = json.loads(raw_line)
            record_id = record.get('id')
            if isinstance(record_id, str):
                _dataset_index[record_id] = record

# One independent sub-frame of decisions per example_id (empty mapping if unavailable).
_decisions_by_example = {}
if not decisions_long.empty and 'example_id' in decisions_long.columns:
    for group_key, group_frame in decisions_long.groupby('example_id'):
        _decisions_by_example[group_key] = group_frame.copy()


def show_review_case(example_id: str, *, play_audio: bool = True) -> None:
    """Print one dataset example, show the recorded predictions for it, and optionally play its audio.

    Args:
        example_id: Key looked up in the module-level ``_dataset_index``.
        play_audio: When true, embed an audio player if the example's
            ``audio_filename`` resolves to an existing file under
            ``REPO_ROOT / 'data' / 'audio'``.

    Returns:
        None. Everything is emitted through ``print`` / ``display``. An unknown
        ``example_id`` prints a notice and returns early.
    """
    record = _dataset_index.get(example_id)
    if record is None:
        print(f'Example not found in dataset: {example_id}')
        return

    wide_rule = '=' * 100
    thin_rule = '-' * 100

    # Header: identifying fields of the example
    print(wide_rule)
    print('Example:', example_id)
    header_fields = (
        ('Audio:', 'audio_filename'),
        ('Question:', 'question'),
        ('Answer label:', 'answer_label'),
        ('Answer type:', 'answer_type'),
    )
    for caption, field in header_fields:
        print(caption, record.get(field, ''))

    # Options: one line each; entries that are not dicts are skipped
    print(thin_rule)
    print('Options:')
    for opt in record.get('options', []):
        if isinstance(opt, dict):
            print("{}. {} [{}]".format(opt.get('label', ''), opt.get('text', ''), opt.get('type', '')))

    # Predictions: table of rows recorded for this example, ordered by model name
    print(thin_rule)
    preds = _decisions_by_example.get(example_id)
    has_predictions = preds is not None and not preds.empty
    if has_predictions:
        shown_columns = [
            'model_name', 'predicted_label', 'predicted_type', 'answer_label',
            'is_correct', 'parse_status', 'raw_prediction',
        ]
        display(preds[shown_columns].sort_values('model_name'))
    else:
        print('No predictions found in latest_decisions_long.csv for this example.')

    # Audio: only attempted when requested and a non-empty filename is present
    if not play_audio:
        return
    audio_rel = record.get('audio_filename')
    if not (isinstance(audio_rel, str) and audio_rel):
        return
    audio_path = REPO_ROOT / 'data' / 'audio' / audio_rel
    if audio_path.exists():
        display(Audio(filename=str(audio_path)))
    else:
        print('Audio file missing at', audio_path)


def _has_parse_issue(series: pd.Series) -> pd.Series:
    return series.astype(str).str.lower().str.contains('invalid|empty|missing', regex=True)

# Derive helper columns on a fresh frame: numeric option count (unparseable -> 0)
# and per-side flags for parse/format problems.
queue_all = queue_all.assign(
    n_options=pd.to_numeric(queue_all['n_options'], errors='coerce').fillna(0),
    audio_parse_issue=_has_parse_issue(queue_all['audio_parse_status']),
    noaudio_parse_issue=_has_parse_issue(queue_all['noaudio_parse_status']),
)

interesting_frames = []
categories = [
    ('audio_only_correct', 'Audio helped'),
    ('noaudio_only_correct', 'Audio hurt'),
    ('both_wrong_diff_pred', 'Both wrong, different predictions'),
]


def _parse_issue_bonus(frame, category):
    """Return the per-row score bonus for parse issues within one category.

    The side that got the answer wrong earns +5 when it has a parse issue; in
    any other category, a parse issue on either side earns +3.
    """
    audio_issue = frame['audio_parse_issue']
    noaudio_issue = frame['noaudio_parse_issue']
    if category == 'audio_only_correct':
        return 5 * noaudio_issue.astype(int)
    if category == 'noaudio_only_correct':
        return 5 * audio_issue.astype(int)
    return 3 * (audio_issue | noaudio_issue).astype(int)


# Keep the single highest-scoring row for every (model pair, category) combination.
pair_names = sorted(queue_all['pair_name'].dropna().unique())
for pair_name in pair_names:
    pair_df = queue_all.loc[queue_all['pair_name'] == pair_name]
    for category, bucket_label in categories:
        cat_df = pair_df.loc[pair_df['category'] == category]
        if cat_df.empty:
            continue

        scored = cat_df.assign(
            interesting_score=cat_df['n_options'] + _parse_issue_bonus(cat_df, category)
        )
        top_row = (
            scored
            .sort_values(['interesting_score', 'n_options'], ascending=[False, False])
            .head(1)
            .assign(bucket=bucket_label)
        )
        interesting_frames.append(top_row)

if not interesting_frames:
    interesting_cases = pd.DataFrame()
    display(Markdown('_No interesting cases found in review queue._'))
else:
    summary_columns = [
        'pair_name', 'bucket', 'category', 'interesting_score', 'example_id', 'n_options',
        'answer_label', 'audio_prediction', 'noaudio_prediction',
        'audio_parse_status', 'noaudio_parse_status', 'question',
    ]
    combined = pd.concat(interesting_frames, ignore_index=True)
    interesting_cases = combined[summary_columns].sort_values(['pair_name', 'bucket'])
    display(Markdown('### Curated Interesting Cases (Auto-selected, non-random)'))
    display(interesting_cases)



Using debug bundle: `/Users/younisskandah/Documents/GitHub/TACoBeLAL/results/mcq-order/debug_bundle`

### Curated Interesting Cases (Auto-selected, non-random)

Unnamed: 0,pair_name,bucket,category,interesting_score,example_id,n_options,answer_label,audio_prediction,noaudio_prediction,audio_parse_status,noaudio_parse_status,question
0,audio-flamingo-3__vs__audio-flamingo-3-no-audio,Audio helped,audio_only_correct,17,128064.mp3__3,17,A,A,F,exact-label,exact-label,What happens immediately after this event firs...
1,audio-flamingo-3__vs__audio-flamingo-3-no-audio,Audio hurt,noaudio_only_correct,17,128064.mp3__2,17,B,C,B,exact-label,exact-label,What happens immediately after this event firs...
2,audio-flamingo-3__vs__audio-flamingo-3-no-audio,"Both wrong, different predictions",both_wrong_diff_pred,17,128064.mp3__9,17,L,A,H,exact-label,exact-label,What happens immediately after this event firs...
3,qwen2-audio-7b-instruct__vs__qwen2-audio-7b-in...,Audio helped,audio_only_correct,19,167750.mp3__1,14,A,A,INVALID,exact-label,invalid,What happens immediately after this event firs...
4,qwen2-audio-7b-instruct__vs__qwen2-audio-7b-in...,Audio hurt,noaudio_only_correct,13,155296.mp3__5,13,A,F,A,exact-label,regex-label,What happens immediately after this event firs...
5,qwen2-audio-7b-instruct__vs__qwen2-audio-7b-in...,"Both wrong, different predictions",both_wrong_diff_pred,20,128064.mp3__2,17,B,C,INVALID,exact-label,invalid,What happens immediately after this event firs...


In [None]:
# Render full case details for curated examples: a markdown summary header
# followed by the complete example view (options, predictions, audio).
if interesting_cases.empty:
    print('No curated examples to render.')
else:
    for _, case in interesting_cases.iterrows():
        summary_lines = [
            f"### {case['bucket']} | {case['pair_name']} | {case['example_id']}",
            f"- category: `{case['category']}`",
            f"- n_options: `{int(case['n_options'])}` | answer: `{case['answer_label']}`",
            f"- audio pred: `{case['audio_prediction']}` ({case['audio_parse_status']})",
            f"- no-audio pred: `{case['noaudio_prediction']}` ({case['noaudio_parse_status']})",
        ]
        display(Markdown("\n".join(summary_lines)))
        show_review_case(str(case['example_id']), play_audio=True)


### Audio helped | audio-flamingo-3__vs__audio-flamingo-3-no-audio | 128064.mp3__3
- category: `audio_only_correct`
- n_options: `17` | answer: `A`
- audio pred: `A` (exact-label)
- no-audio pred: `F` (exact-label)

Example: 128064.mp3__3
Audio: 128064.mp3
Question: What happens immediately after this event first appears: "A short, fast, high-pitched pig grunt."?
Answer label: A
Answer type: event
----------------------------------------------------------------------------------------------------
Options:
A. A pig grunts gutturally and raspily in bursts while people talk in the background. [event]
B. A raspy, guttural, high-pitched pig grunt. [event]
C. A pig is grunting continuously. [event]
D. A pig grunts quietly twice. [event]
E. A pig grunts gutturally and raspily, with the pitch increasing. [event]
F. A pig grunts shortly and quickly. [event]
G. Several birds chirping softly and continuously outdoors. [event]
H. A pig grunts continuously in a quiet, monotone manner. [event]
I. A pig grunts twice with a fast, raspy sound separated by a quiet pause. [event]
J. A bird chirps once with a high-pitched tone. [event]
K. High-pitched, continuous raspy pig grunt. [event]
L. Muffled, indistinct overla

Unnamed: 0,model_name,predicted_label,predicted_type,answer_label,is_correct,parse_status,raw_prediction
298,audio-flamingo-3,A,event,A,True,exact-label,A
1298,audio-flamingo-3-no-audio,F,event,A,False,exact-label,F
2298,qwen2-audio-7b-instruct,B,event,A,False,exact-label,B
3298,qwen2-audio-7b-instruct-no-audio,INVALID,invalid,A,False,invalid,"Based on the given description, the immediate ..."
4298,random,G,event,A,False,,


### Audio hurt | audio-flamingo-3__vs__audio-flamingo-3-no-audio | 128064.mp3__2
- category: `noaudio_only_correct`
- n_options: `17` | answer: `B`
- audio pred: `C` (exact-label)
- no-audio pred: `B` (exact-label)

Example: 128064.mp3__2
Audio: 128064.mp3
Question: What happens immediately after this event first appears: "A pig grunts twice with a fast, raspy sound separated by a quiet pause."?
Answer label: B
Answer type: event
----------------------------------------------------------------------------------------------------
Options:
A. Several birds chirping softly and continuously outdoors. [event]
B. A short, fast, high-pitched pig grunt. [event]
C. A pig grunts gutturally and raspily in bursts while people talk in the background. [event]
D. A bird chirps softly and continuously. [event]
E. A bird chirps once with a high-pitched tone. [event]
F. A bird chirping quietly outdoors. [event]
G. A pig grunts quietly twice. [event]
H. Birds chirping continuously with faint background chatter. [event]
I. High-pitched, continuous raspy pig grunt. [event]
J. This is the last event, no immediate event after. [none]
K. A bird emits an abrupt high-pitched call, possibly while flying away. [event]
L. Muf

Unnamed: 0,model_name,predicted_label,predicted_type,answer_label,is_correct,parse_status,raw_prediction
297,audio-flamingo-3,C,event,B,False,exact-label,C
1297,audio-flamingo-3-no-audio,B,event,B,True,exact-label,B
2297,qwen2-audio-7b-instruct,C,event,B,False,exact-label,C
3297,qwen2-audio-7b-instruct-no-audio,INVALID,invalid,B,False,invalid,"Based on the given description, the immediate ..."
4297,random,G,event,B,False,,


### Both wrong, different predictions | audio-flamingo-3__vs__audio-flamingo-3-no-audio | 128064.mp3__9
- category: `both_wrong_diff_pred`
- n_options: `17` | answer: `L`
- audio pred: `A` (exact-label)
- no-audio pred: `H` (exact-label)

Example: 128064.mp3__9
Audio: 128064.mp3
Question: What happens immediately after this event first appears: "A pig grunts gutturally and raspily, with the pitch increasing."?
Answer label: L
Answer type: event
----------------------------------------------------------------------------------------------------
Options:
A. A pig grunts gutturally and raspily in bursts while people talk in the background. [event]
B. A pig grunts twice with a fast, raspy sound separated by a quiet pause. [event]
C. A pig grunts shortly and quickly. [event]
D. A pig is grunting continuously. [event]
E. High-pitched, continuous raspy pig grunt. [event]
F. A bird chirping quietly outdoors. [event]
G. A bird emits an abrupt high-pitched call, possibly while flying away. [event]
H. A raspy, guttural, high-pitched pig grunt. [event]
I. A short, fast, high-pitched pig grunt. [event]
J. A bird chirps once with a high-pitched tone. [event]
K. Several birds chirping softly and continuously outdoors. [event]
L. A pig

Unnamed: 0,model_name,predicted_label,predicted_type,answer_label,is_correct,parse_status,raw_prediction
299,audio-flamingo-3,A,event,L,False,exact-label,A
1299,audio-flamingo-3-no-audio,H,event,L,False,exact-label,H
2299,qwen2-audio-7b-instruct,B,event,L,False,exact-label,B
3299,qwen2-audio-7b-instruct-no-audio,B,event,L,False,regex-label,"Based on the given description, the most likel..."
4299,random,A,event,L,False,,


### Audio helped | qwen2-audio-7b-instruct__vs__qwen2-audio-7b-instruct-no-audio | 167750.mp3__1
- category: `audio_only_correct`
- n_options: `14` | answer: `A`
- audio pred: `A` (exact-label)
- no-audio pred: `INVALID` (invalid)

Example: 167750.mp3__1
Audio: 167750.mp3
Question: What happens immediately after this event first appears: "The pot lid rattles and bangs."?
Answer label: A
Answer type: event
----------------------------------------------------------------------------------------------------
Options:
A. A cracking sound of something hitting wood. [event]
B. A shrill grinding humming noise from a drill gradually decreases in intensity. [event]
C. Footsteps of a man walking slowly. [event]
D. Someone is speaking briefly in the background. [event]
E. This is the last event, no immediate event after. [none]
F. A brief metallic clank of metal hitting the ground. [event]
G. A person is speaking briefly in the background. [event]
H. A shrill grinding humming noise from a drill that fluctuates in intensity. [event]
I. A drill drilling into wood with fast consecutive banging noises. [event]
J. A drill makes a short humming noise while drilling briefly. [event]
K. A drill makes a grinding humming noise. [event

Unnamed: 0,model_name,predicted_label,predicted_type,answer_label,is_correct,parse_status,raw_prediction
823,audio-flamingo-3,N,event,A,False,exact-label,N
1823,audio-flamingo-3-no-audio,N,event,A,False,exact-label,N
2823,qwen2-audio-7b-instruct,A,event,A,True,exact-label,A
3823,qwen2-audio-7b-instruct-no-audio,INVALID,invalid,A,False,invalid,"Based on the given event description, the most..."
4823,random,B,event,A,False,,


### Audio hurt | qwen2-audio-7b-instruct__vs__qwen2-audio-7b-instruct-no-audio | 155296.mp3__5
- category: `noaudio_only_correct`
- n_options: `13` | answer: `A`
- audio pred: `F` (exact-label)
- no-audio pred: `A` (regex-label)

Example: 155296.mp3__5
Audio: 155296.mp3
Question: What happens immediately after this event first appears: "A car passes by."?
Answer label: A
Answer type: event
----------------------------------------------------------------------------------------------------
Options:
A. People are talking in the distance. [event]
B. Metallic clacking of coins. [event]
C. A continuous quiet white noise from ventilation. [event]
D. A machine is buzzing. [event]
E. A woman is talking nearby. [event]
F. A receipt machine is printing with a buzzing sound. [event]
G. Receipt machines whirring and printing. [event]
H. Quiet distant thumping. [event]
I. Metallic clinking of coins. [event]
J. A woman is speaking in the distance. [event]
K. This is the last event, no immediate event after. [none]
L. Paper rustling. [event]
M. A woman is talking in the distance. [event]
----------------------------------------------------------------------------------------------------


Unnamed: 0,model_name,predicted_label,predicted_type,answer_label,is_correct,parse_status,raw_prediction
602,audio-flamingo-3,A,event,A,True,exact-label,A
1602,audio-flamingo-3-no-audio,A,event,A,True,exact-label,A
2602,qwen2-audio-7b-instruct,F,event,A,False,exact-label,F
3602,qwen2-audio-7b-instruct-no-audio,A,event,A,True,regex-label,"Based on the given event ""A car passes by,"" th..."
4602,random,L,event,A,False,,


### Both wrong, different predictions | qwen2-audio-7b-instruct__vs__qwen2-audio-7b-instruct-no-audio | 128064.mp3__2
- category: `both_wrong_diff_pred`
- n_options: `17` | answer: `B`
- audio pred: `C` (exact-label)
- no-audio pred: `INVALID` (invalid)

Example: 128064.mp3__2
Audio: 128064.mp3
Question: What happens immediately after this event first appears: "A pig grunts twice with a fast, raspy sound separated by a quiet pause."?
Answer label: B
Answer type: event
----------------------------------------------------------------------------------------------------
Options:
A. Several birds chirping softly and continuously outdoors. [event]
B. A short, fast, high-pitched pig grunt. [event]
C. A pig grunts gutturally and raspily in bursts while people talk in the background. [event]
D. A bird chirps softly and continuously. [event]
E. A bird chirps once with a high-pitched tone. [event]
F. A bird chirping quietly outdoors. [event]
G. A pig grunts quietly twice. [event]
H. Birds chirping continuously with faint background chatter. [event]
I. High-pitched, continuous raspy pig grunt. [event]
J. This is the last event, no immediate event after. [none]
K. A bird emits an abrupt high-pitched call, possibly while flying away. [event]
L. Muf

Unnamed: 0,model_name,predicted_label,predicted_type,answer_label,is_correct,parse_status,raw_prediction
297,audio-flamingo-3,C,event,B,False,exact-label,C
1297,audio-flamingo-3-no-audio,B,event,B,True,exact-label,B
2297,qwen2-audio-7b-instruct,C,event,B,False,exact-label,C
3297,qwen2-audio-7b-instruct-no-audio,INVALID,invalid,B,False,invalid,"Based on the given description, the immediate ..."
4297,random,G,event,B,False,,
