In [None]:
import json
import os
import re
import warnings
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")


In [None]:
# Load input/ground truth pairs.
# The file is opened explicitly as UTF-8 so that tags containing non-ASCII
# characters decode the same way regardless of the platform's default encoding.
with open('data/input_ground_truth_pairs.json', 'r', encoding='utf-8') as f:
    input_gt_pairs = json.load(f)

print(f"Loaded {len(input_gt_pairs)} sound clips with input/ground truth pairs")
# Only preview an entry when there is one; indexing [0] on an empty list raises.
if input_gt_pairs:
    print(f"Example entry: {input_gt_pairs[0]}")
else:
    print("Warning: input/ground truth file contains no entries")


In [None]:
# Semantic similarity threshold (adjustable); passed to the evaluation
# functions further down as their similarity cut-off.
SIMILARITY_THRESHOLD = 0.7

# Load SBERT model for semantic similarity
print("Loading SBERT model...")
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
print("SBERT model loaded successfully!")

print(f"Semantic similarity threshold: {SIMILARITY_THRESHOLD}")


In [None]:
# Load existing evaluation results
evaluation_files = {
    'clap_baseline': 'eval/clap_baseline_results.json',
    'clap_df': 'eval/clap_baseline_df_sbert_space_alpha0.7_threshold0.7_results.json',
    'rankst_k1': 'eval/rankst_k1_eval.json',
    'rankst_k2': 'eval/rankst_k2_eval.json',
    'rankst_k3': 'eval/rankst_k3_eval.json'
}

# Top-level keys that may hold the per-clip result list, in lookup priority
# order, paired with the message printed when that key is used.
RESULT_CONTAINER_KEYS = (
    ('detailed_results', "Using 'detailed_results' key (CLAP format)"),
    ('evaluation_details', "Using 'evaluation_details' key (RankST format)"),
    ('results', "Using 'results' key"),
)


def extract_result_records(data):
    """
    Pull the list of per-clip result records out of one parsed evaluation file.

    Args:
        data: The object returned by ``json.load`` for an evaluation file.

    Returns:
        A ``(records, note)`` tuple: ``records`` is the list of per-clip results
        and ``note`` is a short description of where it was found.

    Raises:
        ValueError: If ``data`` is neither a list nor a dict containing one of
            the known container keys, or if the container is not a list.
    """
    if isinstance(data, list):
        # A bare top-level list is itself the result list.
        return data, "Using whole file as results"

    if isinstance(data, dict):
        for key, note in RESULT_CONTAINER_KEYS:
            if key in data:
                records = data[key]
                if not isinstance(records, list):
                    raise ValueError(
                        f"'{key}' holds a {type(records).__name__}, expected a list of results"
                    )
                return records, note
        raise ValueError(
            f"no known results key found (top-level keys: {list(data.keys())})"
        )

    raise ValueError(f"unsupported top-level JSON type: {type(data).__name__}")


existing_results = {}

print("=== Loading Evaluation Files ===")
for system_name, file_path in evaluation_files.items():
    if not os.path.exists(file_path):
        print(f"Missing: {file_path}")
        continue

    print(f"Found {file_path}")
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Inspect file structure
        if isinstance(data, dict):
            print(f"  File structure keys: {list(data.keys())}")
        else:
            print(f"  File structure: top-level {type(data).__name__}")

        # Handle different file structures
        records, source_note = extract_result_records(data)
        print(f"  {source_note}")
        print(f"  Loaded {len(records)} results for {system_name}")

        # Show sample result structure
        if records and isinstance(records[0], dict):
            sample_result = records[0]
            print(f"  Sample result keys: {list(sample_result.keys())}")

        # Store only after every check passed, so a failed load never leaves a
        # half-validated entry behind for the cells that consume existing_results.
        existing_results[system_name] = records

    except Exception as e:
        # Best effort: one unreadable file must not stop the other systems loading.
        print(f"  Error loading {file_path}: {e}")

print(f"\nSuccessfully loaded results for {len(existing_results)} systems")
if existing_results:
    print(f"Systems: {list(existing_results.keys())}")
else:
    print("No evaluation files found! Please run the individual evaluation notebooks first.")


In [None]:
# Verify that we loaded the data correctly
print("=== Data Loading Verification ===")
for system_name, results in existing_results.items():
    print(f"\n{system_name}:")

    # Only a non-empty list of dict records can be inspected. Anything else
    # (empty list, or a non-list container) is reported instead of raising on [0].
    if not (isinstance(results, list) and results and isinstance(results[0], dict)):
        print(f"  ✗ No results loaded")
        continue

    sample = results[0]
    print(f"  Total results: {len(results)}")
    print(f"  Sample sound_id: {sample.get('sound_id')}")
    # The title may be present but null; slicing None would raise TypeError.
    sample_title = sample.get('title', 'N/A')
    if sample_title is None:
        sample_title = 'N/A'
    print(f"  Sample title: {str(sample_title)[:50]}...")
    print(f"  All sample keys: {list(sample.keys())}")

    # For debugging, show the actual values in some keys
    if 'recommended_tags' in sample:
        print(f"  📋 recommended_tags value: {sample['recommended_tags']}")
        print(f"  📋 recommended_tags type: {type(sample['recommended_tags'])}")
    if 'ground_truth_tags' in sample:
        print(f"  📋 ground_truth_tags value: {sample['ground_truth_tags']}")

    # Check for predicted tags field
    if 'predicted_tags' in sample:
        print(f"  ✓ Uses 'predicted_tags' (CLAP format)")
        print(f"    Sample predictions: {sample['predicted_tags'][:3]}")
    elif 'recommended_tags' in sample:
        print(f"  ✓ Uses 'recommended_tags' (RankST format)")
        print(f"    Sample predictions: {sample['recommended_tags'][:3]}")
    else:
        print(f"  ✗ No predicted/recommended tags found!")

    # Check for ground truth
    if 'ground_truth_tags' in sample:
        print(f"  ✓ Has ground_truth_tags: {sample['ground_truth_tags'][:3]}")
    else:
        print(f"  ⚠ No ground_truth_tags, will use input_gt_pairs")

    # Check for scores
    if 'prediction_scores' in sample:
        print(f"  ✓ Has prediction scores: {sample['prediction_scores'][:3]}")
    else:
        print(f"  - No prediction scores (expected for RankST)")

# Also show raw file structure
print(f"\n=== RAW FILE STRUCTURE DEBUG ===")
for system_name, file_path in evaluation_files.items():
    if not os.path.exists(file_path):
        continue

    print(f"\n{system_name} ({file_path}):")
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers malformed JSON and undecodable bytes; this is a
        # debug listing, so report the file and keep going.
        print(f"  Could not re-read file: {e}")
        continue

    if not isinstance(data, dict):
        print(f"  Top-level value is a {type(data).__name__}, not a dict")
        continue

    print(f"  Top-level keys: {list(data.keys())}")
    for details_key in ('evaluation_details', 'detailed_results'):
        details = data.get(details_key)
        if isinstance(details, list) and details and isinstance(details[0], dict):
            sample = details[0]
            print(f"  {details_key}[0] keys: {list(sample.keys())}")
            print(f"  {details_key} length: {len(details)}")

valid_systems = [s for s in existing_results.values() if s]
print(f"\nTotal systems successfully loaded: {len(valid_systems)}")

if len(valid_systems) == 0:
    print("\n⚠ WARNING: No systems loaded successfully!")
    print("Please check that the evaluation files exist and are in the correct format.")
else:
    print("✓ Data loading appears successful, proceeding with SBERT evaluation...")


In [None]:
# Collect all unique tags from all systems and ground truth
print("Collecting all unique tags...")

all_tags = set()

# Add ground truth tags (normalized: lowercase, hyphens replaced with spaces)
for pair in input_gt_pairs:
    all_tags.update(tag.lower().replace('-', ' ') for tag in pair['ground_truth_tags'])

# Keys under which a result may store its predictions, in the same priority
# order that evaluate_system_with_sbert uses. 'recommended_tags' (RankST) must be
# included here: a tag without an embedding is skipped by compute_semantic_hits,
# so leaving a format out silently removes its semantic hits.
PREDICTION_TAG_KEYS = ('predicted_tags', 'recommended_tags', 'recommendations')

# Add predicted tags from all systems
for system_name, results in existing_results.items():
    for result in results:
        for tags_key in PREDICTION_TAG_KEYS:
            if tags_key in result:
                all_tags.update(tag.lower().replace('-', ' ') for tag in result[tags_key])
                break

        # The evaluator prefers a result's own ground truth over input_gt_pairs,
        # so those tags need embeddings as well.
        embedded_gt_tags = result.get('ground_truth_tags') or []
        all_tags.update(tag.lower().replace('-', ' ') for tag in embedded_gt_tags)

# Sorted so the encoding order (and therefore the embedding matrix) is
# reproducible between runs; set iteration order is not.
all_unique_tags = sorted(all_tags)
print(f"Total unique tags to encode: {len(all_unique_tags)}")

# Encode all tags with SBERT
print("Encoding tags with SBERT...")
tag_sbert_embeddings = sbert_model.encode(all_unique_tags, show_progress_bar=True)
print(f"SBERT embeddings shape: {tag_sbert_embeddings.shape}")

# Create tag to embedding mapping
tag_to_sbert = {tag: embedding for tag, embedding in zip(all_unique_tags, tag_sbert_embeddings)}
print("SBERT encoding completed!")


In [None]:
def compute_semantic_hits(predicted_tags, ground_truth_tags, tag_to_sbert, similarity_threshold=0.7):
    """
    Find predicted tags that semantically match a ground truth tag.

    A predicted tag is a hit when the highest cosine similarity between its
    embedding and any ground truth embedding is greater than or equal to
    ``similarity_threshold``.

    Args:
        predicted_tags: Iterable of predicted tag strings. They are normalized
            here (lowercase, hyphens replaced with spaces) before lookup.
        ground_truth_tags: Sequence of ground truth tag strings. They are used
            as given, so the caller must pass them already normalized.
        tag_to_sbert: Mapping from normalized tag to its embedding vector.
            Tags missing from the mapping are skipped on either side.
        similarity_threshold: Minimum best similarity for a hit.

    Returns:
        ``(hits, semantic_matches)``: ``hits`` is the list of normalized
        predicted tags that matched, in input order; ``semantic_matches`` is a
        parallel list of dicts with keys ``'predicted'``, ``'matched_gt'`` and
        ``'similarity'``.
    """
    hits = []
    semantic_matches = []

    # Stack the usable ground truth embeddings once, so each predicted tag needs
    # a single matrix-vector product instead of one library call per tag pair.
    known_gt_tags = [tag for tag in ground_truth_tags if tag in tag_to_sbert]
    if known_gt_tags:
        gt_matrix = np.asarray([tag_to_sbert[tag] for tag in known_gt_tags], dtype=float)
        gt_norms = np.linalg.norm(gt_matrix, axis=1)

    for raw_tag in predicted_tags:
        # Normalize predicted tags (lowercase + replace hyphens with spaces)
        pred_tag = raw_tag.lower().replace('-', ' ')
        if pred_tag not in tag_to_sbert:
            continue

        # The best match starts at similarity 0.0 with no partner, so only a
        # strictly positive similarity can name a matched ground truth tag.
        max_similarity = 0.0
        best_match = None

        if known_gt_tags:
            pred_embedding = np.asarray(tag_to_sbert[pred_tag], dtype=float)
            denominators = gt_norms * np.linalg.norm(pred_embedding)
            # A zero-length vector gets similarity 0 instead of a division by zero.
            similarities = np.divide(
                gt_matrix @ pred_embedding,
                denominators,
                out=np.zeros_like(denominators),
                where=denominators > 0,
            )
            # argmax returns the first maximum, i.e. the earliest ground truth tag on ties.
            best_index = int(np.argmax(similarities))
            if similarities[best_index] > max_similarity:
                max_similarity = float(similarities[best_index])
                best_match = known_gt_tags[best_index]

        if max_similarity >= similarity_threshold:
            hits.append(pred_tag)
            semantic_matches.append({
                'predicted': pred_tag,
                'matched_gt': best_match,
                'similarity': float(max_similarity)
            })

    return hits, semantic_matches

# Test the semantic similarity function
test_pred = ['percussion', 'beat', 'rhythm']
test_gt = ['drum', 'drums', 'drumming']
# Normalize test ground truth
test_gt_normalized = [tag.lower().replace('-', ' ') for tag in test_gt]

# tag_to_sbert only holds tags that occur in the loaded data. A tag without an
# embedding is skipped silently by compute_semantic_hits, so the example would
# print empty results even though nothing was compared. Encode whatever the
# example needs into a private copy of the lookup, leaving tag_to_sbert untouched.
test_tag_to_sbert = dict(tag_to_sbert)
test_pred_normalized = [tag.lower().replace('-', ' ') for tag in test_pred]
missing_test_tags = [
    tag for tag in dict.fromkeys(test_pred_normalized + test_gt_normalized)
    if tag not in test_tag_to_sbert
]
if missing_test_tags:
    test_tag_to_sbert.update(zip(missing_test_tags, sbert_model.encode(missing_test_tags)))

test_hits, test_matches = compute_semantic_hits(test_pred, test_gt_normalized, test_tag_to_sbert, SIMILARITY_THRESHOLD)
print(f"Test semantic hits: {test_hits}")
print(f"Test matches: {test_matches}")


In [None]:
def evaluate_system_with_sbert(system_name, results_data, tag_to_sbert, similarity_threshold=0.7):
    """
    Apply SBERT semantic evaluation to a system's results.

    Args:
        system_name: Name used in progress and warning messages.
        results_data: Iterable of per-clip result dicts. Predictions are read
            from 'predicted_tags', 'recommended_tags' or 'recommendations'
            (first key present wins).
        tag_to_sbert: Mapping from normalized tag to embedding, passed through
            to compute_semantic_hits.
        similarity_threshold: Similarity cut-off passed to compute_semantic_hits.

    Returns:
        List of enhanced result dicts, one per evaluated clip, holding the
        normalized ground truth, the top 10 predictions, semantic and exact
        hits, and precision@10 / recall / F1 computed from the semantic hits.
        Results without predictions or without ground truth are skipped with a
        printed warning.

    Note:
        Ground truth missing from a result is looked up by 'sound_id' in the
        notebook-level ``input_gt_pairs``.
    """
    print(f"Evaluating {system_name} with SBERT...")

    evaluated_results = []
    # sound_id -> entry of input_gt_pairs. Built on first use so that systems
    # whose results carry their own ground truth never pay for it, and so the
    # fallback is one dict lookup per result instead of a scan of the whole list.
    gt_pair_by_sound_id = None

    for result in tqdm(results_data, desc=f"Processing {system_name}"):
        # Handle different result structures for predicted tags
        if 'predicted_tags' in result:
            # CLAP format
            predicted_tags = result['predicted_tags']
        elif 'recommended_tags' in result:
            # RankST format
            predicted_tags = result['recommended_tags']
        elif 'recommendations' in result:
            predicted_tags = result['recommendations']
        else:
            print(f"Warning: No predicted tags found in result for {system_name}")
            print(f"  Available keys: {list(result.keys())}")
            continue

        # Get ground truth tags
        if 'ground_truth_tags' in result:
            ground_truth_tags = result['ground_truth_tags']
        else:
            # Find from input_gt_pairs
            sound_id = result.get('sound_id')
            if gt_pair_by_sound_id is None:
                gt_pair_by_sound_id = {}
                for pair in input_gt_pairs:
                    # setdefault keeps the first entry per sound_id, which is
                    # what a front-to-back scan would have returned.
                    gt_pair_by_sound_id.setdefault(pair['sound_id'], pair)
            gt_pair = gt_pair_by_sound_id.get(sound_id)
            if gt_pair:
                ground_truth_tags = gt_pair['ground_truth_tags']
            else:
                print(f"Warning: No ground truth found for sound_id {sound_id}")
                continue

        # Normalize ground truth tags
        ground_truth_tags_normalized = [tag.lower().replace('-', ' ') for tag in ground_truth_tags]

        # Take top 10 predictions
        top_10_predictions = predicted_tags[:10]

        # Calculate semantic hits
        semantic_hits, semantic_matches = compute_semantic_hits(
            top_10_predictions, ground_truth_tags_normalized, tag_to_sbert, similarity_threshold
        )

        # Also compute exact hits for comparison
        predicted_tags_normalized = [tag.lower().replace('-', ' ') for tag in top_10_predictions]
        exact_hits = list(set(predicted_tags_normalized) & set(ground_truth_tags_normalized))

        # Calculate metrics. Precision always divides by 10, even when a system
        # returned fewer than 10 tags.
        precision = len(semantic_hits) / 10.0
        # NOTE(review): semantic_hits counts predicted tags, not distinct ground
        # truth tags, so several predictions matching the same ground truth tag
        # can push this ratio above 1.0. Confirm whether that is intended.
        recall = len(semantic_hits) / len(ground_truth_tags_normalized) if ground_truth_tags_normalized else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

        # Create enhanced result
        enhanced_result = {
            'sound_id': result.get('sound_id'),
            'title': result.get('title', ''),
            'ground_truth_tags': ground_truth_tags_normalized,
            'predicted_tags': top_10_predictions,
            'semantic_hits': semantic_hits,
            'semantic_matches': semantic_matches,
            'exact_hits': exact_hits,
            'num_semantic_hits': len(semantic_hits),
            'num_exact_hits': len(exact_hits),
            'precision_at_10': precision,
            'recall': recall,
            'f1_score': f1
        }

        # Add original prediction scores if available (CLAP has these, RankST doesn't)
        if 'prediction_scores' in result:
            enhanced_result['prediction_scores'] = result['prediction_scores'][:10]
        elif 'scores' in result:
            enhanced_result['prediction_scores'] = result['scores'][:10]

        evaluated_results.append(enhanced_result)

    return evaluated_results

# Apply SBERT evaluation to all systems
sbert_evaluated_results = {}

print("=== Starting SBERT Evaluation ===")
for system_name, results_data in existing_results.items():
    # A system with nothing loaded still gets an entry, holding an empty list.
    if not results_data:
        print(f"No data for {system_name}, skipping...")
        sbert_evaluated_results[system_name] = []
        continue

    print(f"\nStarting evaluation for {system_name} with {len(results_data)} results...")
    try:
        evaluated = evaluate_system_with_sbert(
            system_name, results_data, tag_to_sbert, SIMILARITY_THRESHOLD
        )
        sbert_evaluated_results[system_name] = evaluated
        print(f"Successfully evaluated {len(evaluated)} results for {system_name}")
    except Exception as e:
        # Any failure is reported and the system is recorded as having no results.
        print(f"Error evaluating {system_name}: {e}")
        sbert_evaluated_results[system_name] = []

print(f"\nSBERT evaluation completed!")
print(f"Systems with results: {[name for name, outcome in sbert_evaluated_results.items() if outcome]}")
print(f"Systems without results: {[name for name, outcome in sbert_evaluated_results.items() if not outcome]}")


In [None]:
def calculate_system_metrics(system_results):
    """
    Aggregate per-clip evaluation results into system-level metrics.

    Returns an empty dict for empty input. Otherwise the dict holds hit and
    ground truth totals, per-clip averages of precision@10 / recall / F1,
    "overall" precision / recall / F1 computed from the pooled totals (with
    10 prediction slots counted per clip), and the number and share of clips
    that have at least one semantic or exact hit.
    """
    if not system_results:
        return {}

    clip_count = len(system_results)

    # Per-clip hit counts are gathered once and reused for totals and clip counts.
    semantic_counts = [entry['num_semantic_hits'] for entry in system_results]
    exact_counts = [entry['num_exact_hits'] for entry in system_results]
    semantic_total = sum(semantic_counts)
    exact_total = sum(exact_counts)
    prediction_slots = clip_count * 10
    ground_truth_total = sum(len(entry['ground_truth_tags']) for entry in system_results)

    mean_precision = np.mean([entry['precision_at_10'] for entry in system_results])
    mean_recall = np.mean([entry['recall'] for entry in system_results])
    mean_f1 = np.mean([entry['f1_score'] for entry in system_results])

    # Overall metrics, from pooled totals rather than per-clip averages
    if prediction_slots > 0:
        pooled_precision = semantic_total / prediction_slots
    else:
        pooled_precision = 0
    if ground_truth_total > 0:
        pooled_recall = semantic_total / ground_truth_total
    else:
        pooled_recall = 0
    if (pooled_precision + pooled_recall) > 0:
        pooled_f1 = 2 * (pooled_precision * pooled_recall) / (pooled_precision + pooled_recall)
    else:
        pooled_f1 = 0

    # Count clips with hits
    clips_semantic = sum(1 for count in semantic_counts if count > 0)
    clips_exact = sum(1 for count in exact_counts if count > 0)

    summary = {}
    summary['num_clips'] = clip_count
    summary['total_semantic_hits'] = semantic_total
    summary['total_exact_hits'] = exact_total
    summary['total_predictions'] = prediction_slots
    summary['total_ground_truth'] = ground_truth_total
    summary['avg_precision_at_10'] = mean_precision
    summary['avg_recall'] = mean_recall
    summary['avg_f1_score'] = mean_f1
    summary['overall_precision'] = pooled_precision
    summary['overall_recall'] = pooled_recall
    summary['overall_f1_score'] = pooled_f1
    summary['clips_with_semantic_hits'] = clips_semantic
    summary['clips_with_exact_hits'] = clips_exact
    summary['semantic_hit_rate'] = clips_semantic / clip_count
    summary['exact_hit_rate'] = clips_exact / clip_count
    return summary

# Calculate metrics for all systems
system_metrics = {}

print("=== Calculating System Metrics ===")
for system_name, results in sbert_evaluated_results.items():
    # Systems without evaluated results keep an entry, with empty metrics.
    if not results:
        print(f"No results for {system_name}, creating empty metrics...")
        system_metrics[system_name] = {}
        continue

    print(f"Calculating metrics for {system_name} with {len(results)} results...")
    try:
        metrics = calculate_system_metrics(results)
        system_metrics[system_name] = metrics
        print(f"Success! {system_name} metrics: P@10={metrics['avg_precision_at_10']:.3f}, F1={metrics['avg_f1_score']:.3f}")
    except Exception as e:
        # A failure is reported and the system falls back to empty metrics.
        print(f"Error calculating metrics for {system_name}: {e}")
        system_metrics[system_name] = {}

print(f"\nMetrics calculation completed!")
print(f"Systems with metrics: {[name for name, values in system_metrics.items() if values]}")
print(f"Systems without metrics: {[name for name, values in system_metrics.items() if not values]}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set up the plotting style: matplotlib defaults, seaborn "husl" palette, then
# the default figure size and font size in a single rcParams update.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (16, 12),
    'font.size': 10,
})

# Prepare data for plotting

# `systems_order` and `system_display_names` are not assigned in any of the
# cells visible above this one, so on a fresh kernel this cell would stop with
# a NameError. Define fallbacks only when the names are absent, so values set
# elsewhere in the notebook still win.
# NOTE(review): the fallback display names are a guess at the intended labels.
# They only need to contain 'CLAP' / 'RankST', which this cell filters on.
if 'systems_order' not in globals():
    systems_order = list(system_metrics.keys())
if 'system_display_names' not in globals():
    system_display_names = {
        'clap_baseline': 'CLAP Baseline',
        'clap_df': 'CLAP + DF',
        'rankst_k1': 'RankST k=1',
        'rankst_k2': 'RankST k=2',
        'rankst_k3': 'RankST k=3',
    }

systems_data = []
for system_name in systems_order:
    # Systems that are unknown or have empty metrics are left out of the plots.
    if system_name in system_metrics and system_metrics[system_name]:
        metrics = system_metrics[system_name]
        display_name = system_display_names.get(system_name, system_name)

        # Calculate exact matching metrics (10 prediction slots per clip)
        exact_precision = metrics['total_exact_hits'] / (metrics['num_clips'] * 10) if metrics['num_clips'] > 0 else 0
        exact_recall = metrics['total_exact_hits'] / metrics['total_ground_truth'] if metrics['total_ground_truth'] > 0 else 0
        exact_f1 = 2 * (exact_precision * exact_recall) / (exact_precision + exact_recall) if (exact_precision + exact_recall) > 0 else 0

        # Calculate improvements: relative change of semantic over exact, in
        # percent, reported as 0 when the exact baseline is 0.
        hits_improvement = ((metrics['total_semantic_hits'] / metrics['total_exact_hits']) - 1) * 100 if metrics['total_exact_hits'] > 0 else 0
        clips_improvement = ((metrics['clips_with_semantic_hits'] / metrics['clips_with_exact_hits']) - 1) * 100 if metrics['clips_with_exact_hits'] > 0 else 0
        f1_improvement = ((metrics['overall_f1_score'] / exact_f1) - 1) * 100 if exact_f1 > 0 else 0

        systems_data.append({
            'system': display_name,
            'system_type': 'Zero-Shot CLAP' if 'CLAP' in display_name else 'Traditional RankST',
            'exact_precision': exact_precision,
            'exact_recall': exact_recall,
            'exact_f1': exact_f1,
            'semantic_precision': metrics['overall_precision'],
            'semantic_recall': metrics['overall_recall'],
            'semantic_f1': metrics['overall_f1_score'],
            'exact_hit_rate': metrics['exact_hit_rate'],
            'semantic_hit_rate': metrics['semantic_hit_rate'],
            'hits_improvement': hits_improvement,
            'clips_improvement': clips_improvement,
            'f1_improvement': f1_improvement,
            'total_exact_hits': metrics['total_exact_hits'],
            'total_semantic_hits': metrics['total_semantic_hits'],
            'clips_exact': metrics['clips_with_exact_hits'],
            'clips_semantic': metrics['clips_with_semantic_hits']
        })

# Create the comprehensive visualization: a 2x3 grid of axes. This part of the
# cell fills the whole top row (axes[0, 0..2]) and the first bottom panel
# (axes[1, 0]), reading everything from `systems_data`.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Audio Tag Recommendation: Zero-Shot CLAP vs Traditional RankST\nSemantic Evaluation with SBERT', 
             fontsize=16, fontweight='bold', y=0.95)

# Extract data for plotting
systems = [d['system'] for d in systems_data]
# NOTE(review): `colors` is not referenced by panels 1-4 below; verify whether
# anything else still needs it.
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57']

# 1. Overall Performance Comparison (Semantic)
# Three bars per system (F1, precision, recall), each `width` wide and centred
# on the system's x position.
ax1 = axes[0, 0]
x_pos = np.arange(len(systems))
width = 0.25

f1_scores = [d['semantic_f1'] for d in systems_data]
precision_scores = [d['semantic_precision'] for d in systems_data]
recall_scores = [d['semantic_recall'] for d in systems_data]

bars1 = ax1.bar(x_pos - width, f1_scores, width, label='F1 Score', alpha=0.8)
bars2 = ax1.bar(x_pos, precision_scores, width, label='Precision@10', alpha=0.8)
bars3 = ax1.bar(x_pos + width, recall_scores, width, label='Recall', alpha=0.8)

ax1.set_xlabel('System')
ax1.set_ylabel('Score')
ax1.set_title('Overall Performance (Semantic Matching)', fontweight='bold')
ax1.set_xticks(x_pos)
ax1.set_xticklabels(systems, rotation=45, ha='right')
ax1.legend()
ax1.grid(axis='y', alpha=0.3)

# Add value labels on bars (placed 0.005 above each bar top, in data units)
for bars in [bars1, bars2, bars3]:
    for bar in bars:
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                f'{height:.3f}', ha='center', va='bottom', fontsize=8)

# 2. Exact vs Semantic F1 Comparison
# Two bars per system, side by side around the system's x position.
ax2 = axes[0, 1]
x_pos = np.arange(len(systems))
width = 0.35

# These lists rebind the names `exact_f1` / `semantic_f1` at notebook level.
exact_f1 = [d['exact_f1'] for d in systems_data]
semantic_f1 = [d['semantic_f1'] for d in systems_data]

bars1 = ax2.bar(x_pos - width/2, exact_f1, width, label='Exact Matching', alpha=0.7, color='lightcoral')
bars2 = ax2.bar(x_pos + width/2, semantic_f1, width, label='Semantic Matching', alpha=0.7, color='lightblue')

ax2.set_xlabel('System')
ax2.set_ylabel('F1 Score')
ax2.set_title('Exact vs Semantic Matching Performance', fontweight='bold')
ax2.set_xticks(x_pos)
ax2.set_xticklabels(systems, rotation=45, ha='right')
ax2.legend()
ax2.grid(axis='y', alpha=0.3)

# Add value labels (0.002 above each bar top)
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2., height + 0.002,
                f'{height:.3f}', ha='center', va='bottom', fontsize=8)

# 3. Improvement Percentages
# Percent change of semantic over exact matching for F1, total hits and the
# number of clips with at least one hit, as stored in `systems_data`.
ax3 = axes[0, 2]
improvements_data = {
    'F1 Improvement': [d['f1_improvement'] for d in systems_data],
    'Total Hits': [d['hits_improvement'] for d in systems_data],
    'Clips+ Improvement': [d['clips_improvement'] for d in systems_data]
}

x_pos = np.arange(len(systems))
width = 0.25

bars1 = ax3.bar(x_pos - width, improvements_data['F1 Improvement'], width, 
               label='F1+%', alpha=0.8, color='gold')
bars2 = ax3.bar(x_pos, improvements_data['Total Hits'], width, 
               label='% More Hits', alpha=0.8, color='lightgreen')
bars3 = ax3.bar(x_pos + width, improvements_data['Clips+ Improvement'], width, 
               label='Clips+%', alpha=0.8, color='plum')

ax3.set_xlabel('System')
ax3.set_ylabel('Improvement (%)')
ax3.set_title('Semantic Matching Improvements', fontweight='bold')
ax3.set_xticks(x_pos)
ax3.set_xticklabels(systems, rotation=45, ha='right')
ax3.legend()
ax3.grid(axis='y', alpha=0.3)

# Add value labels (1 percentage point above each bar top)
for bars in [bars1, bars2, bars3]:
    for bar in bars:
        height = bar.get_height()
        ax3.text(bar.get_x() + bar.get_width()/2., height + 1,
                f'{height:.0f}%', ha='center', va='bottom', fontsize=8)

# 4. Hit Rate Analysis
# Hit rates are stored as fractions and converted to percentages for display.
ax4 = axes[1, 0]
exact_rates = [d['exact_hit_rate'] * 100 for d in systems_data]
semantic_rates = [d['semantic_hit_rate'] * 100 for d in systems_data]

x_pos = np.arange(len(systems))
width = 0.35

bars1 = ax4.bar(x_pos - width/2, exact_rates, width, label='Exact Hit Rate', alpha=0.7, color='lightcoral')
bars2 = ax4.bar(x_pos + width/2, semantic_rates, width, label='Semantic Hit Rate', alpha=0.7, color='lightblue')

ax4.set_xlabel('System')
ax4.set_ylabel('Hit Rate (%)')
ax4.set_title('Clips Getting ≥1 Hit (%)', fontweight='bold')
ax4.set_xticks(x_pos)
ax4.set_xticklabels(systems, rotation=45, ha='right')
ax4.legend()
ax4.grid(axis='y', alpha=0.3)

# Add value labels (1 percentage point above each bar top)
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax4.text(bar.get_x() + bar.get_width()/2., height + 1,
                f'{height:.1f}%', ha='center', va='bottom', fontsize=8)

# 5. RankST Scaling Analysis
ax5 = axes[1, 1]
rankst_systems = [d for d in systems_data if 'RankST' in d['system']]


def rankst_k_value(entry, position):
    """
    Return the number of input tags (k) for one RankST entry of systems_data.

    The value is parsed from a 'k<digits>' / 'k=<digits>' fragment of the
    display name; when the name has no such fragment, the entry's 1-based
    position among the RankST systems is used instead.
    """
    found = re.search(r'k\s*=?\s*(\d+)', entry['system'], flags=re.IGNORECASE)
    return int(found.group(1)) if found else position


# One x value per RankST system actually present. A fixed [1, 2, 3] fails with
# a shape mismatch as soon as one of the RankST result files is missing.
k_values = [rankst_k_value(d, i) for i, d in enumerate(rankst_systems, start=1)]
rankst_f1 = [d['semantic_f1'] for d in rankst_systems]
rankst_precision = [d['semantic_precision'] for d in rankst_systems]
rankst_recall = [d['semantic_recall'] for d in rankst_systems]

ax5.plot(k_values, rankst_f1, 'o-', linewidth=2, markersize=8, label='F1 Score', color='blue')
ax5.plot(k_values, rankst_precision, 's-', linewidth=2, markersize=8, label='Precision@10', color='green')
ax5.plot(k_values, rankst_recall, '^-', linewidth=2, markersize=8, label='Recall', color='red')

ax5.set_xlabel('Input Tags (k)')
ax5.set_ylabel('Score')
ax5.set_title('RankST Performance Scaling', fontweight='bold')
ax5.set_xticks(k_values)
ax5.legend()
ax5.grid(True, alpha=0.3)

# Add value labels (0.01 above each marker)
for k, f1, prec, rec in zip(k_values, rankst_f1, rankst_precision, rankst_recall):
    ax5.text(k, f1 + 0.01, f'{f1:.3f}', ha='center', va='bottom', fontsize=8)
    ax5.text(k, prec + 0.01, f'{prec:.3f}', ha='center', va='bottom', fontsize=8)
    ax5.text(k, rec + 0.01, f'{rec:.3f}', ha='center', va='bottom', fontsize=8)

# 6. Zero-Shot vs Traditional Comparison
ax6 = axes[1, 2]
clap_systems = [d for d in systems_data if 'CLAP' in d['system']]
# max() raises on an empty sequence, so the best RankST system is optional.
rankst_best = max(rankst_systems, key=lambda x: x['semantic_f1']) if rankst_systems else None

# Bars are built from the systems that exist instead of indexing
# clap_systems[0] / clap_systems[1], and labels come from the data so the
# RankST bar names the k that actually scored best.
clap_bar_colors = ['#FF6B6B', '#4ECDC4']
comparison_data = {'System Type': [], 'F1 Score': [], 'Training': [], 'Colors': []}
for index, entry in enumerate(clap_systems):
    # First space becomes a line break so the two-word labels stay narrow.
    comparison_data['System Type'].append(entry['system'].replace(' ', '\n', 1))
    comparison_data['F1 Score'].append(entry['semantic_f1'])
    comparison_data['Training'].append('Zero-Shot')
    comparison_data['Colors'].append(clap_bar_colors[index % len(clap_bar_colors)])
if rankst_best is not None:
    best_k = rankst_k_value(rankst_best, rankst_systems.index(rankst_best) + 1)
    comparison_data['System Type'].append(f'RankST\n(Best k={best_k})')
    comparison_data['F1 Score'].append(rankst_best['semantic_f1'])
    comparison_data['Training'].append('Requires Training')
    comparison_data['Colors'].append('#45B7D1')

ax6.set_ylabel('F1 Score (Semantic)')
ax6.set_title('Zero-Shot vs Traditional Approaches', fontweight='bold')
ax6.grid(axis='y', alpha=0.3)

if comparison_data['System Type']:
    bars = ax6.bar(comparison_data['System Type'], comparison_data['F1 Score'],
                   color=comparison_data['Colors'], alpha=0.8)

    # Add value labels and training info
    for bar, training in zip(bars, comparison_data['Training']):
        height = bar.get_height()
        ax6.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                f'{height:.3f}', ha='center', va='bottom', fontweight='bold', fontsize=10)
        ax6.text(bar.get_x() + bar.get_width()/2., height/2,
                training, ha='center', va='center', fontsize=8,
                bbox=dict(boxstyle="round,pad=0.3", facecolor='white', alpha=0.8))
else:
    ax6.text(0.5, 0.5, 'No systems to compare', ha='center', va='center',
             transform=ax6.transAxes)

plt.tight_layout()
plt.subplots_adjust(top=0.91)
plt.show()

# Caption for the six panels above. These lines are fixed text: nothing here is
# computed from `system_metrics`, so they need a manual check whenever the
# evaluation inputs change.
visualization_summary_lines = [
    "📊 Visualization Summary:",
    "─" * 50,
    "1. Overall Performance: RankST k=3 leads, but requires training data",
    "2. Semantic vs Exact: All systems benefit significantly from semantic matching",
    "3. Improvements: CLAP systems show larger relative improvements",
    "4. Hit Rates: RankST reaches more clips, but CLAP improves more with semantics",
    "5. RankST Scaling: Performance increases consistently with more input tags",
    "6. Zero-Shot Advantage: CLAP works immediately without training data",
]
print("\n".join(visualization_summary_lines))


In [None]:
# Two separate comparison graphs for clearer visualization

# Inputs are taken from `systems_data`, built by the preceding plotting cell.
# Without this the cell depends on lists that are only assigned further down
# the notebook, so it fails with NameError when cells run top to bottom.
systems_names = [d['system'] for d in systems_data]
exact_f1_scores = [d['exact_f1'] for d in systems_data]
semantic_f1_scores = [d['semantic_f1'] for d in systems_data]
exact_total_hits = [d['total_exact_hits'] for d in systems_data]
semantic_total_hits = [d['total_semantic_hits'] for d in systems_data]


def percent_change(new_value, base_value):
    """Return the change of new_value over base_value in percent, or 0 when base_value is not positive."""
    return ((new_value / base_value) - 1) * 100 if base_value > 0 else 0


def plot_exact_vs_semantic(ax, labels, exact_values, semantic_values, ylabel, title,
                           value_format, value_offset, badge_offset, bar_width=0.35):
    """
    Draw grouped exact/semantic bars for every system on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        labels: One x tick label per system.
        exact_values, semantic_values: Bar heights, parallel to ``labels``.
        ylabel, title: Axis label and panel title.
        value_format: ``str.format`` template for the number above each bar.
        value_offset: Vertical gap (data units) between a bar and its number.
        badge_offset: Vertical gap between the taller bar and the percent badge.
        bar_width: Width of a single bar.
    """
    positions = np.arange(len(labels))

    ax.bar(positions - bar_width/2, exact_values, bar_width,
           label='Exact Matching', alpha=0.8, color='lightcoral',
           edgecolor='darkred', linewidth=1.5)
    ax.bar(positions + bar_width/2, semantic_values, bar_width,
           label='Semantic Matching', alpha=0.8, color='lightblue',
           edgecolor='darkblue', linewidth=1.5)

    ax.set_xlabel('System', fontsize=12, fontweight='bold')
    ax.set_ylabel(ylabel, fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.legend(fontsize=10)
    ax.grid(axis='y', alpha=0.3)

    for position, exact_value, semantic_value in zip(positions, exact_values, semantic_values):
        ax.text(position - bar_width/2, exact_value + value_offset,
                value_format.format(exact_value), ha='center', va='bottom',
                fontsize=9, fontweight='bold')
        ax.text(position + bar_width/2, semantic_value + value_offset,
                value_format.format(semantic_value), ha='center', va='bottom',
                fontsize=9, fontweight='bold')
        # The '+' format flag writes the sign itself, so a drop reads "-5%"
        # rather than "+-5%".
        ax.text(position, max(exact_value, semantic_value) + badge_offset,
                f'{percent_change(semantic_value, exact_value):+.0f}%',
                ha='center', va='bottom', fontsize=10, fontweight='bold',
                bbox=dict(boxstyle="round,pad=0.2", facecolor='gold', alpha=0.8))


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# GRAPH 1: F1 Score Comparison
plot_exact_vs_semantic(
    ax1, systems_names, exact_f1_scores, semantic_f1_scores,
    ylabel='F1 Score', title='F1 Score: Exact vs Semantic Matching',
    value_format='{:.3f}', value_offset=0.003, badge_offset=0.015,
)

# GRAPH 2: Total Hits Comparison
# Label gaps scale with the tallest bar so they stay proportionate whatever the
# number of hits (a fixed gap in hit counts only suits one data size).
tallest_hits_bar = max(exact_total_hits + semantic_total_hits, default=0)
plot_exact_vs_semantic(
    ax2, systems_names, exact_total_hits, semantic_total_hits,
    ylabel='Total Hits', title='Total Hits: Exact vs Semantic Matching',
    value_format='{:.0f}', value_offset=0.0125 * tallest_hits_bar,
    badge_offset=0.0375 * tallest_hits_bar,
)

plt.tight_layout()
plt.show()

# Print comparison summary
print("\n📊 F1 Score & Total Hits Comparison Summary:")
print("=" * 85)
print(f"{'System':<15} {'F1 Exact':<10} {'F1 Semantic':<12} {'F1 Δ%':<8} {'Hits Exact':<12} {'Hits Semantic':<14} {'Hits Δ%':<8}")
print("-" * 85)

f1_changes = [percent_change(sem, exact) for sem, exact in zip(semantic_f1_scores, exact_f1_scores)]
hits_changes = [percent_change(sem, exact) for sem, exact in zip(semantic_total_hits, exact_total_hits)]

for i, system_name in enumerate(systems_names):
    # Build "value%" first and pad the whole cell, so columns line up with the header.
    f1_change_cell = f"{f1_changes[i]:+.0f}%"
    hits_change_cell = f"{hits_changes[i]:+.0f}%"
    print(f"{system_name:<15} {exact_f1_scores[i]:<10.3f} {semantic_f1_scores[i]:<12.3f} {f1_change_cell:<8} "
          f"{exact_total_hits[i]:<12d} {semantic_total_hits[i]:<14d} {hits_change_cell:<8}")

print("\n💡 Key Insights from Separate Graphs:")
print("─" * 50)
if systems_names:
    # The first two insights name systems and a score, so they are derived from
    # the plotted data rather than typed in.
    system_indices = range(len(systems_names))
    best_f1_index = max(system_indices, key=lambda i: semantic_f1_scores[i])
    best_gain_index = max(system_indices, key=lambda i: f1_changes[i])
    most_hits_index = max(system_indices, key=lambda i: semantic_total_hits[i])
    print(f"• F1 Quality: {systems_names[best_f1_index]} best absolute "
          f"({semantic_f1_scores[best_f1_index]:.3f}), {systems_names[best_gain_index]} best improvement")
    print(f"• Total Volume: {systems_names[most_hits_index]} achieves highest semantic hits")
print("• Zero-Shot Advantage: CLAP systems work immediately without training data")
print("• Semantic Benefit: All systems improve significantly with semantic matching")
print("• Research Value: Zero-shot shows higher improvement potential")


In [None]:
## 9. System Comparison Graphs

# Sanity-check pass: print the raw inputs behind every percentage shown in the graphs
print("🔍 DEBUGGING PERCENTAGE CALCULATIONS:")
print("=" * 60)

# Display names of the systems that have non-empty metrics, in plotting order
systems_names = [
    system_display_names[key]
    for key in systems_order
    if key in system_metrics and system_metrics[key]
]
exact_f1_scores, semantic_f1_scores = [], []
exact_total_hits, semantic_total_hits = [], []

for system_name in systems_order:
    # Skip systems that are absent from system_metrics or have empty metrics
    if system_name not in system_metrics or not system_metrics[system_name]:
        continue
    metrics = system_metrics[system_name]

    # Exact-match precision: hits over (clips * 10) — presumably 10 predicted tags per clip (P@10)
    if metrics['num_clips'] > 0:
        exact_precision = metrics['total_exact_hits'] / (metrics['num_clips'] * 10)
    else:
        exact_precision = 0

    # Exact-match recall: hits over the total number of ground-truth tags
    if metrics['total_ground_truth'] > 0:
        exact_recall = metrics['total_exact_hits'] / metrics['total_ground_truth']
    else:
        exact_recall = 0

    # Harmonic mean of the two, guarded against a zero denominator
    if (exact_precision + exact_recall) > 0:
        exact_f1 = 2 * (exact_precision * exact_recall) / (exact_precision + exact_recall)
    else:
        exact_f1 = 0

    exact_f1_scores.append(exact_f1)
    semantic_f1_scores.append(metrics['overall_f1_score'])
    exact_total_hits.append(metrics['total_exact_hits'])
    semantic_total_hits.append(metrics['total_semantic_hits'])

    # Per-system dump; a zero baseline is reported as 0 here
    system_display = system_display_names.get(system_name, system_name)
    if exact_f1 > 0:
        f1_improvement = ((metrics['overall_f1_score'] / exact_f1) - 1) * 100
    else:
        f1_improvement = 0
    if metrics['total_exact_hits'] > 0:
        hits_improvement = ((metrics['total_semantic_hits'] / metrics['total_exact_hits']) - 1) * 100
    else:
        hits_improvement = 0

    print(f"{system_display}:")
    print(f"  F1: {exact_f1:.6f} → {metrics['overall_f1_score']:.6f} = +{f1_improvement:.1f}%")
    print(f"  Hits: {metrics['total_exact_hits']} → {metrics['total_semantic_hits']} = +{hits_improvement:.1f}%")
    print()

print("=" * 60)


In [None]:
## 9. System Comparison Graphs

# Side-by-side panels: F1 on the left axis (ax1), total hits on the right (ax2)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Display names of the systems that have non-empty metrics, in plotting order
systems_names = [
    system_display_names[key]
    for key in systems_order
    if key in system_metrics and system_metrics[key]
]
exact_f1_scores, semantic_f1_scores = [], []
exact_total_hits, semantic_total_hits = [], []

for system_name in systems_order:
    # Skip systems that are absent from system_metrics or have empty metrics
    if system_name not in system_metrics or not system_metrics[system_name]:
        continue
    metrics = system_metrics[system_name]

    # Exact-match precision: hits over (clips * 10) — presumably 10 predicted tags per clip (P@10)
    if metrics['num_clips'] > 0:
        exact_precision = metrics['total_exact_hits'] / (metrics['num_clips'] * 10)
    else:
        exact_precision = 0

    # Exact-match recall: hits over the total number of ground-truth tags
    if metrics['total_ground_truth'] > 0:
        exact_recall = metrics['total_exact_hits'] / metrics['total_ground_truth']
    else:
        exact_recall = 0

    # Harmonic mean of the two, guarded against a zero denominator
    if (exact_precision + exact_recall) > 0:
        exact_f1 = 2 * (exact_precision * exact_recall) / (exact_precision + exact_recall)
    else:
        exact_f1 = 0

    exact_f1_scores.append(exact_f1)
    semantic_f1_scores.append(metrics['overall_f1_score'])
    exact_total_hits.append(metrics['total_exact_hits'])
    semantic_total_hits.append(metrics['total_semantic_hits'])

# Grouped-bar geometry: one slot per system, two bars of this width per slot
width = 0.35
x_pos = np.arange(len(systems_names))

# GRAPH 1: F1 Score Comparison
f1_exact_bars = ax1.bar(x_pos - width/2, exact_f1_scores, width, 
                       label='Exact Matching', alpha=0.8, color='lightcoral', 
                       edgecolor='darkred', linewidth=1.5)
f1_semantic_bars = ax1.bar(x_pos + width/2, semantic_f1_scores, width, 
                          label='Semantic Matching', alpha=0.8, color='lightblue',
                          edgecolor='darkblue', linewidth=1.5)

ax1.set_xlabel('System', fontsize=12, fontweight='bold')
ax1.set_ylabel('F1 Score', fontsize=12, fontweight='bold')
ax1.set_title('F1 Score: Exact vs Semantic Matching', fontsize=14, fontweight='bold')
ax1.set_xticks(x_pos)
ax1.set_xticklabels(systems_names, rotation=45, ha='right')
ax1.legend(fontsize=10)
ax1.grid(axis='y', alpha=0.3)

# Add value labels and improvement percentages for F1
for i, (bar_exact, bar_semantic) in enumerate(zip(f1_exact_bars, f1_semantic_bars)):
    # F1 exact (vertical offsets are in data units of the y axis)
    f1_height_exact = bar_exact.get_height()
    ax1.text(bar_exact.get_x() + bar_exact.get_width()/2., f1_height_exact + 0.003,
            f'{f1_height_exact:.3f}', ha='center', va='bottom', fontsize=9, fontweight='bold')
    
    # F1 semantic
    f1_height_semantic = bar_semantic.get_height()
    ax1.text(bar_semantic.get_x() + bar_semantic.get_width()/2., f1_height_semantic + 0.003,
            f'{f1_height_semantic:.3f}', ha='center', va='bottom', fontsize=9, fontweight='bold')
    
    # Improvement badge (using original F1 data arrays). A zero exact baseline has no
    # finite relative gain, so it is shown as "∞" instead of a misleading "+0%".
    # The sign comes from the format spec so a decrease renders as "-N%", not "+-N%".
    if exact_f1_scores[i] > 0:
        f1_improvement = ((semantic_f1_scores[i] / exact_f1_scores[i]) - 1) * 100
        f1_improvement_label = f'{f1_improvement:+.0f}%'
    elif semantic_f1_scores[i] > 0:
        f1_improvement = float('inf')
        f1_improvement_label = '∞'
    else:
        f1_improvement = 0
        f1_improvement_label = '+0%'

    max_f1_height = max(f1_height_exact, f1_height_semantic)
    ax1.text(x_pos[i], max_f1_height + 0.015, f1_improvement_label, 
            ha='center', va='bottom', fontsize=10, fontweight='bold', 
            bbox=dict(boxstyle="round,pad=0.2", facecolor='gold', alpha=0.8))

# GRAPH 2: Total Hits Comparison
hits_exact_bars = ax2.bar(x_pos - width/2, exact_total_hits, width, 
                         label='Exact Matching', alpha=0.8, color='lightcoral',
                         edgecolor='darkred', linewidth=1.5)
hits_semantic_bars = ax2.bar(x_pos + width/2, semantic_total_hits, width, 
                            label='Semantic Matching', alpha=0.8, color='lightblue',
                            edgecolor='darkblue', linewidth=1.5)

ax2.set_xlabel('System', fontsize=12, fontweight='bold')
ax2.set_ylabel('Total Hits', fontsize=12, fontweight='bold')
ax2.set_title('Total Hits: Exact vs Semantic Matching', fontsize=14, fontweight='bold')
ax2.set_xticks(x_pos)
ax2.set_xticklabels(systems_names, rotation=45, ha='right')
ax2.legend(fontsize=10)
ax2.grid(axis='y', alpha=0.3)

# Add value labels and improvement percentages for Total Hits
for i, (bar_exact, bar_semantic) in enumerate(zip(hits_exact_bars, hits_semantic_bars)):
    # Hits exact (vertical offsets are in data units of the y axis)
    hits_height_exact = bar_exact.get_height()
    ax2.text(bar_exact.get_x() + bar_exact.get_width()/2., hits_height_exact + 50,
            f'{int(hits_height_exact)}', ha='center', va='bottom', fontsize=9, fontweight='bold')
    
    # Hits semantic
    hits_height_semantic = bar_semantic.get_height()
    ax2.text(bar_semantic.get_x() + bar_semantic.get_width()/2., hits_height_semantic + 50,
            f'{int(hits_height_semantic)}', ha='center', va='bottom', fontsize=9, fontweight='bold')
    
    # Improvement badge (using original hits data arrays). A zero exact baseline has no
    # finite relative gain, so it is shown as "∞" instead of a misleading "+0%".
    # The sign comes from the format spec so a decrease renders as "-N%", not "+-N%".
    if exact_total_hits[i] > 0:
        hits_improvement = ((semantic_total_hits[i] / exact_total_hits[i]) - 1) * 100
        hits_improvement_label = f'{hits_improvement:+.0f}%'
    elif semantic_total_hits[i] > 0:
        hits_improvement = float('inf')
        hits_improvement_label = '∞'
    else:
        hits_improvement = 0
        hits_improvement_label = '+0%'

    max_hits_height = max(hits_height_exact, hits_height_semantic)
    ax2.text(x_pos[i], max_hits_height + 150, hits_improvement_label, 
            ha='center', va='bottom', fontsize=10, fontweight='bold', 
            bbox=dict(boxstyle="round,pad=0.2", facecolor='gold', alpha=0.8))

plt.tight_layout()
plt.show()

# Print comparison summary
def _pct_gain(new_value, old_value):
    """Relative gain of new_value over old_value, in percent.

    Returns float('inf') when the baseline is zero but the new value is positive,
    and 0.0 when both are zero, so a zero baseline never causes a division error.
    """
    if old_value > 0:
        return ((new_value / old_value) - 1) * 100
    return float('inf') if new_value > 0 else 0.0


def _pct_gain_label(gain):
    """Format a gain from _pct_gain as a signed percentage, or "∞" when unbounded."""
    return "∞" if gain == float('inf') else f"{gain:+.0f}%"


print("\n📊 F1 Score & Total Hits Comparison Summary:")
print("=" * 85)
print(f"{'System':<15} {'F1 Exact':<10} {'F1 Semantic':<12} {'F1 Δ%':<8} {'Hits Exact':<12} {'Hits Semantic':<14} {'Hits Δ%':<8}")
print("-" * 85)

# One gain per plotted system, index-aligned with systems_names
f1_gains = [_pct_gain(sem, exact) for sem, exact in zip(semantic_f1_scores, exact_f1_scores)]
hits_gains = [_pct_gain(sem, exact) for sem, exact in zip(semantic_total_hits, exact_total_hits)]

for i, system_name in enumerate(systems_names):
    f1_improvement = f1_gains[i]
    hits_improvement = hits_gains[i]

    # The percent sign is part of the padded cell so the columns line up with the header;
    # int() keeps the 'd' format valid even if a hit count arrives as a float.
    print(f"{system_name:<15} {exact_f1_scores[i]:<10.3f} {semantic_f1_scores[i]:<12.3f} {_pct_gain_label(f1_improvement):<8} "
          f"{int(exact_total_hits[i]):<12d} {int(semantic_total_hits[i]):<14d} {_pct_gain_label(hits_improvement):<8}")

print("\n💡 Key Insights from Separate Graphs:")
print("─" * 50)
if systems_names:
    # Derive the headline numbers from the data instead of hard-coding results of
    # one past run, so they stay correct when the threshold or inputs change.
    indices = range(len(systems_names))
    best_f1_idx = max(indices, key=lambda k: semantic_f1_scores[k])
    best_gain_idx = max(indices, key=lambda k: f1_gains[k])
    top_hits_idx = max(indices, key=lambda k: semantic_total_hits[k])
    low_hits_idx = min(indices, key=lambda k: semantic_total_hits[k])
    print(f"• F1 Quality: {systems_names[best_f1_idx]} best absolute ({semantic_f1_scores[best_f1_idx]:.3f}), "
          f"{systems_names[best_gain_idx]} best improvement ({_pct_gain_label(f1_gains[best_gain_idx])})")
    print(f"• Total Volume: {systems_names[top_hits_idx]} achieves {int(semantic_total_hits[top_hits_idx]):,} hits "
          f"vs {systems_names[low_hits_idx]}'s {int(semantic_total_hits[low_hits_idx]):,} hits")
else:
    print("• No systems with metrics available for comparison")
print("• Zero-Shot Advantage: CLAP systems work immediately without training data")
print("• Semantic Benefit: All systems improve significantly with semantic matching")
print("• Research Value: Zero-shot shows higher improvement potential")


In [None]:
# Banner for the exact-vs-semantic comparison table
print(
    "=" * 160,
    "🎯 EXACT vs SEMANTIC MATCHING COMPARISON",
    "=" * 160,
    f"SBERT Model: all-MiniLM-L6-v2 | Similarity Threshold: {SIMILARITY_THRESHOLD}",
    "Legend: Hits=Total hits, Clips+=Clips with ≥1 hit, F1+%=F1 improvement %, % More=% more total hits, Clips+%=% more clips with hits, HitsPC=% improvement in avg hits per successful clip",
    "=" * 160,
    sep="\n",
)

# Row order of the table, and the human-readable label of each system key
systems_order = ['clap_baseline', 'clap_df', 'rankst_k1', 'rankst_k2', 'rankst_k3']
system_display_names = dict(zip(
    systems_order,
    ['CLAP Baseline', 'CLAP with DF', 'RankST k=1', 'RankST k=2', 'RankST k=3'],
))

# Systems that are present in system_metrics with a non-empty metrics entry
valid_systems = [
    key for key in systems_order
    if key in system_metrics and system_metrics[key]
]

def _format_relative_gain(new_value, old_value):
    """Format the relative change of new_value over old_value as a signed percent string.

    Returns "∞" when the baseline is zero but the new value is positive and "+0%" when
    both are zero, so a zero baseline never triggers a division. The sign is produced by
    the format spec, so a decrease renders as "-N%" rather than "+-N%".
    """
    if old_value > 0:
        return f"{((new_value / old_value) - 1) * 100:+.0f}%"
    return "∞" if new_value > 0 else "+0%"


if not valid_systems:
    print("❌ No valid system metrics found. Please check that the evaluation files exist and are properly formatted.")
    print(f"Expected files: {list(evaluation_files.values())}")
else:
    # Header for comparison table  
    print(f"{'System':<15} {'Clips':<6} │ {'EXACT MATCHING':<50} │ {'SEMANTIC MATCHING':<50} │ {'IMPROVEMENT':<42}")
    print(f"{'':15} {'':6} │ {'Hits':<6} {'Clips+':<7} {'P@10':<8} {'Recall':<8} {'F1':<8} │ {'Hits':<6} {'Clips+':<7} {'P@10':<8} {'Recall':<8} {'F1':<8} │ {'F1+%':<6} {'% More':<7} {'Clips+%':<8} {'HitsPC':<8}")
    print("─" * 160)

    for system_name in systems_order:
        if system_name in system_metrics and system_metrics[system_name]:
            metrics = system_metrics[system_name]
            display_name = system_display_names.get(system_name, system_name)
            
            # Exact-matching precision/recall/F1, recomputed from the raw hit counts.
            # The precision denominator is clips * 10 — presumably 10 predicted tags per clip (P@10).
            exact_precision = metrics['total_exact_hits'] / (metrics['num_clips'] * 10) if metrics['num_clips'] > 0 else 0
            exact_recall = metrics['total_exact_hits'] / metrics['total_ground_truth'] if metrics['total_ground_truth'] > 0 else 0
            exact_f1 = 2 * (exact_precision * exact_recall) / (exact_precision + exact_recall) if (exact_precision + exact_recall) > 0 else 0
            
            # Relative gains of semantic over exact matching
            f1_improvement_str = _format_relative_gain(metrics['overall_f1_score'], exact_f1)
            hits_improvement_str = _format_relative_gain(metrics['total_semantic_hits'], metrics['total_exact_hits'])
            clips_improvement_str = _format_relative_gain(metrics['clips_with_semantic_hits'], metrics['clips_with_exact_hits'])
            
            # Hits per clip improvement - for clips that actually have hits
            # This shows: "For clips that get hits, how much better is the average hits per clip?"
            exact_avg_hits_per_successful_clip = metrics['total_exact_hits'] / metrics['clips_with_exact_hits'] if metrics['clips_with_exact_hits'] > 0 else 0
            semantic_avg_hits_per_successful_clip = metrics['total_semantic_hits'] / metrics['clips_with_semantic_hits'] if metrics['clips_with_semantic_hits'] > 0 else 0
            hits_pc_str = _format_relative_gain(semantic_avg_hits_per_successful_clip, exact_avg_hits_per_successful_clip)
            
            print(f"{display_name:<15} {metrics['num_clips']:<6} │ "
                  f"{metrics['total_exact_hits']:<6} {metrics['clips_with_exact_hits']:<7} {exact_precision:<8.3f} {exact_recall:<8.3f} {exact_f1:<8.3f} │ "
                  f"{metrics['total_semantic_hits']:<6} {metrics['clips_with_semantic_hits']:<7} {metrics['overall_precision']:<8.3f} {metrics['overall_recall']:<8.3f} {metrics['overall_f1_score']:<8.3f} │ "
                  f"{f1_improvement_str:<6} {hits_improvement_str:<7} {clips_improvement_str:<8} {hits_pc_str:<8}")
        else:
            display_name = system_display_names.get(system_name, system_name)
            print(f"{display_name:<15} {'N/A':<6} │ {'N/A':<50} │ {'N/A':<50} │ {'N/A':<42}")

    print("=" * 160)

    # Performance ranking
    print("\n🏆 PERFORMANCE RANKING (by Semantic F1 Score)")
    print("─" * 60)
    
    # Sort systems by semantic F1 score (every non-empty entry of system_metrics,
    # including keys that are not listed in systems_order)
    ranked_systems = [(name, metrics) for name, metrics in system_metrics.items() if metrics]
    ranked_systems.sort(key=lambda x: x[1]['overall_f1_score'], reverse=True)
    
    for i, (system_name, metrics) in enumerate(ranked_systems, 1):
        display_name = system_display_names.get(system_name, system_name)
        medal = "🥇" if i == 1 else "🥈" if i == 2 else "🥉" if i == 3 else f"{i}."
        print(f"{medal:<3} {display_name:<15} F1: {metrics['overall_f1_score']:.4f} | P@10: {metrics['overall_precision']:.4f} | Recall: {metrics['overall_recall']:.4f}")
    
    print("=" * 160)

    # Hit rate comparison
    print("\n📊 HIT RATE COMPARISON")
    print("─" * 80)
    print(f"{'System':<15} {'Exact Hit Rate':<15} {'Semantic Hit Rate':<18} {'Improvement':<12}")
    print("─" * 80)
    
    for system_name in systems_order:
        if system_name in system_metrics and system_metrics[system_name]:
            metrics = system_metrics[system_name]
            display_name = system_display_names.get(system_name, system_name)
            
            # Absolute difference in hit rate; the sign is emitted by the format spec
            improvement = metrics['semantic_hit_rate'] - metrics['exact_hit_rate']
            improvement_str = f"{improvement:+.1%}"
            
            print(f"{display_name:<15} {metrics['exact_hit_rate']:<15.1%} {metrics['semantic_hit_rate']:<18.1%} {improvement_str:<12}")

    print("=" * 160)

    # Summary insights
    print("\n💡 KEY INSIGHTS")
    print("─" * 50)
    best_system = max(ranked_systems, key=lambda x: x[1]['overall_f1_score'])
    best_improvement_system = max(ranked_systems, key=lambda x: (x[1]['total_semantic_hits'] / x[1]['total_exact_hits']) if x[1]['total_exact_hits'] > 0 else 0)
    
    # ranked_systems may contain keys without a display name, so fall back to the raw
    # key exactly like the ranking loop does instead of risking a KeyError.
    best_display_name = system_display_names.get(best_system[0], best_system[0])
    print(f"• Best Overall Performance: {best_display_name} (F1: {best_system[1]['overall_f1_score']:.4f})")
    
    if best_improvement_system[1]['total_exact_hits'] > 0:
        improvement_pct = ((best_improvement_system[1]['total_semantic_hits'] / best_improvement_system[1]['total_exact_hits']) - 1) * 100
        best_improvement_name = system_display_names.get(best_improvement_system[0], best_improvement_system[0])
        print(f"• Largest Improvement: {best_improvement_name} ({improvement_pct:+.0f}% more hits)")
    
    # Average improvement across all systems that have a non-zero exact baseline
    hit_gains = [
        ((m['total_semantic_hits'] / m['total_exact_hits']) - 1) * 100 
        for m in system_metrics.values() 
        if m and m['total_exact_hits'] > 0
    ]
    if hit_gains:
        avg_improvement = np.mean(hit_gains)
        print(f"• Average Improvement: Semantic matching finds {avg_improvement:.0f}% more hits than exact matching")
    else:
        # np.mean([]) would yield nan; say explicitly that no baseline exists
        print("• Average Improvement: n/a (no system has exact hits to compare against)")
    
    total_semantic = sum(m['total_semantic_hits'] for m in system_metrics.values() if m)
    total_exact = sum(m['total_exact_hits'] for m in system_metrics.values() if m)
    print(f"• Overall: {total_semantic} semantic hits vs {total_exact} exact hits across all systems")
    
    print("=" * 160)


In [None]:
def show_system_examples(system_name, results, num_examples=5):
    """
    Print the best-scoring evaluated clips of one system.

    Args:
        system_name: System key; mapped to a label via system_display_names,
            falling back to the key itself.
        results: List of per-clip result dicts. An empty/None value prints a
            "No Results Available" banner and returns.
        num_examples: How many clips to print, taken from the top after sorting
            by 'num_semantic_hits' in descending order.
    """
    display_name = system_display_names.get(system_name, system_name)

    if not results:
        print(f"\n=== {display_name} - No Results Available ===")
        return

    print(f"\n=== {display_name} - Top {num_examples} Examples ===")

    # Clips with the most semantic hits first, truncated to the requested count
    top_results = sorted(
        results, key=lambda entry: entry['num_semantic_hits'], reverse=True
    )[:num_examples]

    for example_no, result in enumerate(top_results, start=1):
        print(f"\n--- Example {example_no} (Sound ID: {result['sound_id']}) ---")
        print(f"Title: {result['title']}")
        print(f"Ground Truth Tags: {result['ground_truth_tags']}")
        print("Predicted Tags:")

        for rank, tag in enumerate(result['predicted_tags'], 1):
            # Hit lists are compared against the lowercased, hyphen-free tag
            normalized = tag.lower().replace('-', ' ')
            if normalized in result['semantic_hits']:
                semantic_hit_marker = "🔮"
            else:
                semantic_hit_marker = " "
            if normalized in result['exact_hits']:
                exact_hit_marker = "★"
            else:
                exact_hit_marker = " "

            # Append the score at the same position, when the result carries one
            score_str = ""
            if 'prediction_scores' in result and rank <= len(result['prediction_scores']):
                score_str = f" ({result['prediction_scores'][rank - 1]:.4f})"

            print(f"  {rank:2d}.{semantic_hit_marker}{exact_hit_marker} {tag:<20}{score_str}")

        print(f"\nSemantic Matches (threshold={SIMILARITY_THRESHOLD}):")
        if not result['semantic_matches']:
            print("  No semantic matches found")
        else:
            for pairing in result['semantic_matches']:
                print(f"  '{pairing['predicted']}' ↔ '{pairing['matched_gt']}' (similarity: {pairing['similarity']:.3f})")

        print(f"\nExact hits: {result['exact_hits']} ({result['num_exact_hits']} hits)")
        print(f"Semantic hits: {result['semantic_hits']} ({result['num_semantic_hits']} hits)")
        print(f"Precision@10: {result['precision_at_10']:.3f}, Recall: {result['recall']:.3f}, F1: {result['f1_score']:.3f}")

# Walk the systems in table order and print examples for each one that has results
for system_name in systems_order:
    if system_name not in sbert_evaluated_results or not sbert_evaluated_results[system_name]:
        # Nothing evaluated for this system: emit the placeholder banner instead
        display_name = system_display_names.get(system_name, system_name)
        print(f"\n=== {display_name} - No Results Available ===")
        continue
    show_system_examples(system_name, sbert_evaluated_results[system_name], num_examples=5)


In [None]:
# Save comprehensive results
def convert_numpy_types(obj):
    """Recursively convert numpy values into plain Python types for JSON output.

    Args:
        obj: Any value; dicts, lists and tuples are traversed recursively.

    Returns:
        A structure of the same shape in which numpy integers, floats and booleans
        are replaced by int/float/bool, other numpy scalars by their ``.item()``
        value, and arrays by nested lists. Dict keys that are numpy scalars are
        converted as well, since ``json.dump`` rejects them. Tuples stay tuples.
        Every other object is returned unchanged.
    """
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.bool_):
        # np.bool_ is not an np.integer subclass and is not JSON serializable
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Remaining numpy scalar kinds (e.g. np.str_) -> closest Python object
        return obj.item()
    if isinstance(obj, dict):
        return {convert_numpy_types(key): convert_numpy_types(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(convert_numpy_types(item) for item in obj)
    return obj

# Bundle run metadata, aggregate metrics and per-clip results into one payload
comparison_data = dict(
    metadata=dict(
        sbert_model='all-MiniLM-L6-v2',
        similarity_threshold=float(SIMILARITY_THRESHOLD),
        evaluation_date=pd.Timestamp.now().isoformat(),
        normalization='lowercase + hyphens to spaces',
    ),
    system_metrics=convert_numpy_types(system_metrics),
    detailed_results=convert_numpy_types(sbert_evaluated_results),
)

# Full dump (metadata + metrics + per-clip details); the threshold is part of the file name
os.makedirs('eval', exist_ok=True)
output_file = f'eval/rankst_clap_sbert_comparison_threshold{SIMILARITY_THRESHOLD}.json'

with open(output_file, 'w') as full_handle:
    json.dump(comparison_data, full_handle, indent=2)

print(f"\nComprehensive results saved to {output_file}")

# Lightweight dump: same metadata and metrics, without the per-clip details
summary_data = {
    section: comparison_data[section]
    for section in ('metadata', 'system_metrics')
}

summary_file = f'eval/rankst_clap_sbert_summary_threshold{SIMILARITY_THRESHOLD}.json'
with open(summary_file, 'w') as summary_handle:
    json.dump(summary_data, summary_handle, indent=2)

print(f"Summary saved to {summary_file}")


In [None]:
print("\n" + "="*80)
print("FINAL SUMMARY - SBERT Semantic Evaluation")
print("="*80)
print(f"Similarity Threshold: {SIMILARITY_THRESHOLD}")

# Keep only the systems whose metrics entry is non-empty
valid_metrics = {name: entry for name, entry in system_metrics.items() if entry}
print(f"Evaluation completed for {len(valid_metrics)} systems")

if not valid_metrics:
    # Nothing to rank: list which of the expected evaluation files are present
    print("\nNo valid system metrics found. Please ensure the evaluation files exist:")
    for system_name, file_path in evaluation_files.items():
        exists = "✓" if os.path.exists(file_path) else "✗"
        print(f"  {exists} {file_path}")
else:
    # Winner per metric as a (system key, metrics dict) pair; all four are resolved before printing
    best_f1_system, best_precision_system, best_recall_system, best_hit_rate_system = [
        max(valid_metrics.items(), key=lambda item, metric=metric: item[1][metric])
        for metric in ('avg_f1_score', 'avg_precision_at_10', 'avg_recall', 'semantic_hit_rate')
    ]

    f1_name = system_display_names.get(best_f1_system[0], best_f1_system[0])
    print(f"\nBest F1 Score: {f1_name} ({best_f1_system[1]['avg_f1_score']:.4f})")

    precision_name = system_display_names.get(best_precision_system[0], best_precision_system[0])
    print(f"Best Precision@10: {precision_name} ({best_precision_system[1]['avg_precision_at_10']:.4f})")

    recall_name = system_display_names.get(best_recall_system[0], best_recall_system[0])
    print(f"Best Recall: {recall_name} ({best_recall_system[1]['avg_recall']:.4f})")

    hit_rate_name = system_display_names.get(best_hit_rate_system[0], best_hit_rate_system[0])
    print(f"Best Hit Rate: {hit_rate_name} ({best_hit_rate_system[1]['semantic_hit_rate']:.1%})")

print("\nEvaluation complete!")


In [None]:
# Debug the F1 improvement calculation for CLAP Baseline
print("=== DEBUGGING F1 IMPROVEMENT CALCULATION ===")

# .get + emptiness check: the baseline file may be missing or may have produced no metrics
clap_baseline_metrics = system_metrics.get('clap_baseline')

if not clap_baseline_metrics:
    print("No metrics available for 'clap_baseline'; skipping the F1 improvement debug.")
else:
    print(f"System: CLAP Baseline")
    print(f"Raw metrics object keys: {list(clap_baseline_metrics.keys())}")

    # Calculate exact matching F1 (same way as in the table display, including its
    # zero-denominator guards so an empty evaluation cannot raise ZeroDivisionError)
    exact_precision = clap_baseline_metrics['total_exact_hits'] / (clap_baseline_metrics['num_clips'] * 10) if clap_baseline_metrics['num_clips'] > 0 else 0
    exact_recall = clap_baseline_metrics['total_exact_hits'] / clap_baseline_metrics['total_ground_truth'] if clap_baseline_metrics['total_ground_truth'] > 0 else 0
    exact_f1 = 2 * (exact_precision * exact_recall) / (exact_precision + exact_recall) if (exact_precision + exact_recall) > 0 else 0

    print(f"\nExact Matching Calculation:")
    print(f"  exact_precision = {clap_baseline_metrics['total_exact_hits']} / ({clap_baseline_metrics['num_clips']} * 10) = {exact_precision:.6f}")
    print(f"  exact_recall = {clap_baseline_metrics['total_exact_hits']} / {clap_baseline_metrics['total_ground_truth']} = {exact_recall:.6f}")
    print(f"  exact_f1 = 2 * ({exact_precision:.6f} * {exact_recall:.6f}) / ({exact_precision:.6f} + {exact_recall:.6f}) = {exact_f1:.6f}")

    print(f"\nSemantic Matching:")
    print(f"  semantic F1 = {clap_baseline_metrics['overall_f1_score']:.6f}")

    # Calculate improvement; with a zero exact F1 the ratio is undefined and reported as inf
    if exact_f1 > 0:
        f1_ratio = clap_baseline_metrics['overall_f1_score'] / exact_f1
        f1_improvement = (f1_ratio - 1) * 100
    else:
        f1_ratio = float('inf')
        f1_improvement = float('inf')
    print(f"\nF1 Improvement Calculation:")
    print(f"  f1_improvement = (({clap_baseline_metrics['overall_f1_score']:.6f} / {exact_f1:.6f}) - 1) * 100")
    print(f"  f1_improvement = ({f1_ratio:.6f} - 1) * 100")
    print(f"  f1_improvement = {f1_improvement:.2f}%")

    # Manual verification with the 3-decimal values copied from an earlier table run
    # NOTE(review): 0.016, 0.007 and the "+131%" below are hard-coded from a past run — confirm they still apply
    manual_calc = (0.016 / 0.007 - 1) * 100
    print(f"\nManual verification (using displayed rounded values):")
    print(f"  (0.016 / 0.007 - 1) * 100 = {manual_calc:.2f}%")

    print(f"\nDISCREPANCY ANALYSIS:")
    print(f"  Table shows: +131%")
    print(f"  Code calculates: +{f1_improvement:.0f}%") 
    print(f"  Manual calc with displayed values: +{manual_calc:.0f}%")
    print(f"  Difference between code and manual: {abs(f1_improvement - manual_calc):.2f} percentage points")
