In [None]:
import json
import pandas as pd
import numpy as np
import os
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import warnings
from collections import Counter
import math
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")


In [None]:
# Load existing rankst results for k=1,2,3
def load_rankst_results(k_values, eval_dir='eval'):
    """
    Load the per-k rankst evaluation files that are present on disk.

    Args:
        k_values: Iterable of k values (number of input tags) to look for.
        eval_dir: Directory holding the ``rankst_k{k}_eval.json`` files.

    Returns:
        dict mapping every k whose file exists to the parsed JSON content.
        Missing files only produce a warning and are left out of the dict.
    """
    loaded = {}
    for k in k_values:
        results_file = f'{eval_dir}/rankst_k{k}_eval.json'
        if not os.path.exists(results_file):
            print(f"Warning: {results_file} not found")
            continue

        print(f"Loading rankst results for k={k}...")
        # Read explicitly as UTF-8 so non-ASCII titles/tags do not depend on
        # the platform's default text encoding.
        with open(results_file, 'r', encoding='utf-8') as f:
            loaded[k] = json.load(f)
        print(f"Loaded {len(loaded[k]['evaluation_details'])} test cases for k={k}")
    return loaded


k_values = [1, 2, 3]
rankst_results = load_rankst_results(k_values)

print(f"\nLoaded results for k values: {list(rankst_results.keys())}")


In [None]:
# Display current metrics summary
print("=== Current Rankst Results Summary (Exact Matching) ===")
print("k | Precision@10 | Recall@10 | F1@10")
print("--|-------------|-----------|-------")
for k in sorted(rankst_results.keys()):
    metrics = rankst_results[k]['metrics']
    p = metrics['precision_10']['mean']
    r = metrics['recall_10']['mean']
    f = metrics['f1_10']['mean']
    print(f"{k} | {p:.4f}      | {r:.4f}    | {f:.4f}")

# Show example result structure.
# The loader tolerates missing files, so k=1 is not guaranteed to be present:
# take the example from the smallest k that actually has evaluation details.
example_k = next(
    (k_avail for k_avail in sorted(rankst_results.keys())
     if rankst_results[k_avail]['evaluation_details']),
    None,
)

if example_k is None:
    print("\nNo evaluation details available to show an example result.")
else:
    example_result = rankst_results[example_k]['evaluation_details'][0]
    print("\n=== Example Result Structure ===")
    print(f"Sound ID: {example_result['sound_id']}")
    print(f"Title: {example_result['title']}")
    print(f"Input tags (k={example_result['k']}): {example_result['input_tags_k']}")
    print(f"Recommended tags: {example_result['recommended_tags'][:5]}...")
    print(f"Ground truth tags: {example_result['ground_truth_tags']}")
    print(f"Exact hits: {example_result['hits']}")
    print(f"Exact Precision@10: {example_result['precision_10']:.3f}")


In [None]:
# Instantiate the sentence-embedding model used for tag-to-tag similarity
print("Loading SBERT model...")
sbert_model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')
print("SBERT model loaded successfully!")

# Cosine-similarity cut-off at or above which a predicted tag counts as a
# semantic hit (tune here; it is also embedded in the output file names)
SIMILARITY_THRESHOLD = 0.7
print(f"Semantic similarity threshold: {SIMILARITY_THRESHOLD}")


In [None]:
# Gather the lowercased tag vocabulary (predictions + ground truth, over all
# loaded k values) that needs an SBERT embedding
print("Collecting all unique tags for SBERT encoding...")

all_predicted_tags = set()
all_ground_truth_tags = set()

for k_results in rankst_results.values():
    for case in k_results['evaluation_details']:
        # Recommended tags of this test case
        all_predicted_tags |= {tag.lower() for tag in case['recommended_tags']}
        # Reference tags of this test case
        all_ground_truth_tags |= {tag.lower() for tag in case['ground_truth_tags']}

# A tag that occurs on both sides is encoded only once
all_unique_tags = list(all_predicted_tags | all_ground_truth_tags)
print(f"Total unique predicted tags: {len(all_predicted_tags)}")
print(f"Total unique ground truth tags: {len(all_ground_truth_tags)}")
print(f"Total unique tags to encode: {len(all_unique_tags)}")


In [None]:
# Embed the whole tag vocabulary in a single SBERT call
print("Encoding all tags with SBERT...")
tag_sbert_embeddings = sbert_model.encode(all_unique_tags, show_progress_bar=True)
print(f"SBERT embeddings shape: {tag_sbert_embeddings.shape}")

# Lookup table: tag string -> its embedding (rows are in all_unique_tags order)
tag_to_sbert = dict(zip(all_unique_tags, tag_sbert_embeddings))
print("SBERT encoding completed!")


In [None]:
def _l2_normalize_rows(matrix):
    """Scale every row of a 2-D array to unit L2 norm (all-zero rows stay zero)."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0  # avoid 0/0 -> NaN; a zero row then yields similarity 0
    return matrix / norms


def compute_semantic_hits(predicted_tags, ground_truth_tags, tag_to_sbert, similarity_threshold=0.7):
    """
    Find predicted tags that are semantically close to a ground-truth tag.

    Tags are lowercased before lookup, and tags without an entry in
    ``tag_to_sbert`` are ignored. A predicted tag is a hit when its highest
    cosine similarity to any ground-truth tag is greater than or equal to
    ``similarity_threshold``.

    Args:
        predicted_tags: Sequence of predicted tag strings (order is preserved).
        ground_truth_tags: Sequence of reference tag strings.
        tag_to_sbert: Mapping from lowercased tag to its embedding vector.
        similarity_threshold: Minimum cosine similarity for a hit (inclusive).

    Returns:
        Tuple ``(hits, semantic_matches)``:
        ``hits`` is the list of lowercased predicted tags that matched;
        ``semantic_matches`` holds one dict per hit with the keys
        ``'predicted'``, ``'matched_gt'`` (best-matching ground-truth tag; on a
        tie the first one in ``ground_truth_tags`` order) and ``'similarity'``
        (a Python float).
    """
    pred_tags = [tag for tag in (t.lower() for t in predicted_tags) if tag in tag_to_sbert]
    gt_tags = [tag for tag in (t.lower() for t in ground_truth_tags) if tag in tag_to_sbert]

    # Without a usable tag on both sides there is nothing to match; returning
    # early also prevents reporting a "hit" that has no ground-truth partner.
    if not pred_tags or not gt_tags:
        return [], []

    # One matrix product replaces a separate similarity call per tag pair:
    # rows are unit-normalised, so the dot product is the cosine similarity.
    pred_matrix = _l2_normalize_rows(np.asarray([tag_to_sbert[t] for t in pred_tags], dtype=float))
    gt_matrix = _l2_normalize_rows(np.asarray([tag_to_sbert[t] for t in gt_tags], dtype=float))
    similarities = pred_matrix @ gt_matrix.T  # shape: (len(pred_tags), len(gt_tags))
    best_gt_index = similarities.argmax(axis=1)  # argmax returns the first maximum

    hits = []
    semantic_matches = []
    for row, pred_tag in enumerate(pred_tags):
        col = int(best_gt_index[row])
        best_similarity = float(similarities[row, col])  # plain float keeps it JSON-friendly
        if best_similarity >= similarity_threshold:
            hits.append(pred_tag)
            semantic_matches.append({
                'predicted': pred_tag,
                'matched_gt': gt_tags[col],
                'similarity': best_similarity
            })

    return hits, semantic_matches

# Smoke-test compute_semantic_hits on a small hand-picked drum example
test_pred = ['percussion', 'beat', 'rhythm']
test_gt = ['drum', 'drums', 'drumming']
test_hits, test_matches = compute_semantic_hits(
    predicted_tags=test_pred,
    ground_truth_tags=test_gt,
    tag_to_sbert=tag_to_sbert,
    similarity_threshold=SIMILARITY_THRESHOLD,
)
print(f"Test semantic hits: {test_hits}")
print(f"Test matches: {test_matches}")


In [None]:
# Apply semantic evaluation to all rankst results
print("Applying SBERT semantic evaluation to rankst results...")

rankst_sbert_results = {}

for k in sorted(rankst_results.keys()):
    print(f"\nProcessing k={k}...")

    original_results = rankst_results[k]['evaluation_details']
    enhanced_results = []

    total_semantic_hits = 0
    total_exact_hits = 0

    for result in tqdm(original_results, desc=f"Processing k={k}"):
        # Get original data
        predicted_tags = result['recommended_tags'][:10]  # Top 10 predictions
        ground_truth_tags = result['ground_truth_tags']
        exact_hits = result['hits']

        # Calculate semantic hits using SBERT
        semantic_hits, semantic_matches = compute_semantic_hits(
            predicted_tags, ground_truth_tags, tag_to_sbert, SIMILARITY_THRESHOLD
        )

        # Precision@10: matched predictions over a fixed denominator of 10
        semantic_precision = len(semantic_hits) / 10.0

        # Recall@10: several predictions can match the SAME ground-truth tag,
        # so count distinct matched ground-truth tags rather than predicted
        # hits -- otherwise recall can exceed 1.0.
        matched_gt_tags = {
            match['matched_gt'] for match in semantic_matches
            if match['matched_gt'] is not None
        }
        semantic_recall = len(matched_gt_tags) / len(ground_truth_tags) if ground_truth_tags else 0.0

        precision_plus_recall = semantic_precision + semantic_recall
        semantic_f1 = (
            2 * (semantic_precision * semantic_recall) / precision_plus_recall
            if precision_plus_recall > 0 else 0.0
        )

        # Create enhanced result (shallow copy: the original entry is not modified)
        enhanced_result = result.copy()
        enhanced_result.update({
            'semantic_hits': semantic_hits,
            'semantic_matches': semantic_matches,
            'num_semantic_hits': len(semantic_hits),
            'num_matched_gt_tags': len(matched_gt_tags),
            'num_exact_hits': len(exact_hits),
            'semantic_precision_10': semantic_precision,
            'semantic_recall_10': semantic_recall,
            'semantic_f1_10': semantic_f1,
            # Keep original metrics for comparison
            'exact_precision_10': result['precision_10'],
            'exact_recall_10': result['recall_10'],
            'exact_f1_10': result['f1_10']
        })

        enhanced_results.append(enhanced_result)
        total_semantic_hits += len(semantic_hits)
        total_exact_hits += len(exact_hits)

    improvement = (total_semantic_hits / total_exact_hits - 1) * 100 if total_exact_hits > 0 else 0
    print(f"k={k}: {total_semantic_hits} semantic hits vs {total_exact_hits} exact hits ({improvement:.1f}% improvement)")

    # Store enhanced results (aggregate metrics are computed in a later cell)
    rankst_sbert_results[k] = {
        'enhanced_results': enhanced_results,
        'total_semantic_hits': total_semantic_hits,
        'total_exact_hits': total_exact_hits
    }

print("\nSBERT evaluation completed for all k values!")


In [None]:
# Compute final metrics for each k
def _describe_scores(values):
    """Return mean/median/std/min/max of a sequence of per-test-case scores."""
    return {
        'mean': np.mean(values),
        'median': np.median(values),
        'std': np.std(values),
        'min': np.min(values),
        'max': np.max(values)
    }


def _aggregate_metrics(results, prefix):
    """Summarise the ``{prefix}_precision_10/_recall_10/_f1_10`` fields of ``results``."""
    return {
        metric: _describe_scores([r[f'{prefix}_{metric}'] for r in results])
        for metric in ('precision_10', 'recall_10', 'f1_10')
    }


for k in sorted(rankst_sbert_results.keys()):
    enhanced_results = rankst_sbert_results[k]['enhanced_results']

    # Update results structure with complete metrics
    rankst_sbert_results[k].update({
        'k_input_tags': k,
        'sbert_threshold': SIMILARITY_THRESHOLD,
        'sbert_model': 'all-MiniLM-L6-v2',
        'total_test_cases': len(enhanced_results),
        'semantic_metrics': _aggregate_metrics(enhanced_results, 'semantic'),
        'exact_metrics': _aggregate_metrics(enhanced_results, 'exact'),
        # Same list object as 'enhanced_results', exposed under the key the
        # later display/save cells read
        'evaluation_details': enhanced_results
    })

print("Metrics computation completed!")


In [None]:
# Display comparison of exact vs semantic metrics
def _improvement_percent(exact_value, semantic_value):
    """Percent change of the semantic value relative to the exact one (0 if exact is not positive)."""
    return (semantic_value / exact_value - 1) * 100 if exact_value > 0 else 0


print("=== Rankst Results: Exact vs Semantic Matching Comparison ===")
print(f"SBERT Threshold: {SIMILARITY_THRESHOLD}")

# The three per-metric tables share one layout; only title and metric key differ
for section_title, metric_key in (
    ("PRECISION@10", 'precision_10'),
    ("RECALL@10", 'recall_10'),
    ("F1@10", 'f1_10'),
):
    print("\n" + "="*80)
    print(section_title)
    print("="*80)
    print("k | Exact      | Semantic   | Improvement")
    print("--|------------|------------|------------")
    for k in sorted(rankst_sbert_results.keys()):
        exact_mean = rankst_sbert_results[k]['exact_metrics'][metric_key]['mean']
        semantic_mean = rankst_sbert_results[k]['semantic_metrics'][metric_key]['mean']
        improvement = _improvement_percent(exact_mean, semantic_mean)
        # '+' format flag prints the real sign, so a decrease is shown as
        # "-x.x%" instead of the malformed "+-x.x%"
        print(f"{k} | {exact_mean:.4f}     | {semantic_mean:.4f}     | {improvement:+.1f}%")

print("\n" + "="*80)
print("TOTAL HITS")
print("="*80)
print("k | Exact Hits | Semantic Hits | Improvement")
print("--|------------|---------------|------------")
for k in sorted(rankst_sbert_results.keys()):
    exact_hits = rankst_sbert_results[k]['total_exact_hits']
    semantic_hits = rankst_sbert_results[k]['total_semantic_hits']
    improvement = _improvement_percent(exact_hits, semantic_hits)
    print(f"{k} | {exact_hits:10d} | {semantic_hits:13d} | {improvement:+.1f}%")


In [None]:
# Print, for every k, the two test cases with the most semantic hits
print("=== Example Results with SBERT Semantic Analysis ===")

section_rule = '=' * 60

for k in sorted(rankst_sbert_results.keys()):
    results_k = rankst_sbert_results[k]['evaluation_details']
    # Stable ascending sort on the negated count gives descending order
    # while keeping the original order among ties
    best_results = sorted(results_k, key=lambda x: -x['num_semantic_hits'])[:2]

    print(f"\n{section_rule}")
    print(f"BEST EXAMPLES FOR k={k}")
    print(f"{section_rule}")

    for i, result in enumerate(best_results, 1):
        print(f"\n--- Example {i} (Sound ID: {result['sound_id']}) ---")
        print(f"Title: {result['title']}")
        print(f"Input tags (k={k}): {result['input_tags_k']}")
        print(f"Ground truth tags: {result['ground_truth_tags']}")
        print(f"Predicted tags (top 10): {result['recommended_tags'][:10]}")

        # One line per predicted tag that cleared the threshold
        print(f"\nSemantic Matches (threshold={SIMILARITY_THRESHOLD}):")
        for match in result['semantic_matches']:
            pred_tag = match['predicted']
            gt_tag = match['matched_gt']
            score = match['similarity']
            print(f"  '{pred_tag}' ↔ '{gt_tag}' (similarity: {score:.3f})")

        # Exact and semantic figures side by side
        exact_p, exact_r, exact_f1 = (
            result['exact_precision_10'], result['exact_recall_10'], result['exact_f1_10']
        )
        sem_p, sem_r, sem_f1 = (
            result['semantic_precision_10'], result['semantic_recall_10'], result['semantic_f1_10']
        )
        print("\nResults:")
        print(f"  Exact hits: {result['hits']} ({result['num_exact_hits']} hits)")
        print(f"  Semantic hits: {result['semantic_hits']} ({result['num_semantic_hits']} hits)")
        print(f"  Exact    - P@10: {exact_p:.3f}, R@10: {exact_r:.3f}, F1@10: {exact_f1:.3f}")
        print(f"  Semantic - P@10: {sem_p:.3f}, R@10: {sem_r:.3f}, F1@10: {sem_f1:.3f}")


In [None]:
# Tabulate how many test cases obtained 0, 1, 2, ... hits under each matching scheme
print("\n=== Hit Distribution Analysis ===")

for k in sorted(rankst_sbert_results.keys()):
    results_k = rankst_sbert_results[k]['evaluation_details']

    # Histogram: number of hits -> number of test cases with that many hits
    exact_hit_counts = Counter(case['num_exact_hits'] for case in results_k)
    semantic_hit_counts = Counter(case['num_semantic_hits'] for case in results_k)

    print(f"\nk={k} Hit Distribution:")
    print("Hits | Exact Count | Semantic Count")
    print("-----|-------------|---------------")

    # Rows run from 0 up to the largest hit count seen in either histogram
    max_hits = max(max(exact_hit_counts.keys()), max(semantic_hit_counts.keys()))
    for hits in range(0, max_hits + 1):
        exact_count = exact_hit_counts.get(hits, 0)
        semantic_count = semantic_hit_counts.get(hits, 0)
        print(f"{hits:4d} | {exact_count:11d} | {semantic_count:14d}")


In [None]:
# Convert numpy values to Python types for JSON serialization
def convert_numpy_types(obj):
    """
    Recursively replace numpy values with plain Python equivalents.

    Handles numpy booleans, integers and floats, numpy arrays (converted with
    ``tolist()``), and dicts, lists and tuples containing any of those. Dict
    keys are converted as well, because ``json.dump`` rejects numpy-typed
    keys. Any other object is returned unchanged.

    Args:
        obj: Arbitrary (possibly nested) value.

    Returns:
        A structure of the same shape built from Python scalars and
        containers; lists stay lists and tuples stay tuples.
    """
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {
            convert_numpy_types(key): convert_numpy_types(value)
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(convert_numpy_types(item) for item in obj)
    return obj

# Strip numpy types from the whole results structure before writing anything
json_safe_results = convert_numpy_types(rankst_sbert_results)

# Output directory may not exist yet
os.makedirs('eval', exist_ok=True)

for k in sorted(rankst_sbert_results.keys()):
    results_for_k = json_safe_results[k]

    # Full dump, including the per-test-case details
    output_file = f'eval/rankst_k{k}_sbert_threshold{SIMILARITY_THRESHOLD}_results.json'
    with open(output_file, 'w') as f:
        json.dump(results_for_k, f, indent=2)
    print(f"Detailed results for k={k} saved to {output_file}")

    # Lightweight companion file: everything except the per-test-case details
    summary_k = dict(results_for_k)
    summary_k.pop('evaluation_details', None)
    summary_file = f'eval/rankst_k{k}_sbert_threshold{SIMILARITY_THRESHOLD}_summary.json'
    with open(summary_file, 'w') as f:
        json.dump(summary_k, f, indent=2)
    print(f"Summary for k={k} saved to {summary_file}")


In [None]:
# Create combined summary comparing all k values
available_k_values = sorted(rankst_sbert_results.keys())

# The loader tolerates missing result files, so k=1 is not guaranteed to
# exist: report the test-case count of the smallest available k (0 if none).
if available_k_values:
    reference_total_cases = rankst_sbert_results[available_k_values[0]]['total_test_cases']
else:
    reference_total_cases = 0

combined_summary = {
    'description': f'Rankst tag recommendation evaluation with SBERT semantic similarity (threshold={SIMILARITY_THRESHOLD})',
    'sbert_model': 'all-MiniLM-L6-v2',
    'sbert_threshold': float(SIMILARITY_THRESHOLD),
    'total_test_cases': reference_total_cases,
    'k_values_tested': available_k_values,
    'semantic_results_by_k': {},
    'exact_results_by_k': {},
    'improvements_by_k': {}
}

for k in available_k_values:
    k_str = str(k)  # used as the per-k key in the summary sections

    exact_metrics = rankst_sbert_results[k]['exact_metrics']
    semantic_metrics = rankst_sbert_results[k]['semantic_metrics']

    # Semantic results
    combined_summary['semantic_results_by_k'][k_str] = convert_numpy_types(semantic_metrics)

    # Exact results (for comparison)
    combined_summary['exact_results_by_k'][k_str] = convert_numpy_types(exact_metrics)

    # Relative improvement of the mean of each metric (0 when the exact mean is not positive)
    improvements = {}
    for metric in ['precision_10', 'recall_10', 'f1_10']:
        exact_val = exact_metrics[metric]['mean']
        semantic_val = semantic_metrics[metric]['mean']
        improvement = (semantic_val / exact_val - 1) * 100 if exact_val > 0 else 0
        improvements[metric + '_improvement_percent'] = float(improvement)

    # Hit count improvements
    exact_hits = rankst_sbert_results[k]['total_exact_hits']
    semantic_hits = rankst_sbert_results[k]['total_semantic_hits']
    hit_improvement = (semantic_hits / exact_hits - 1) * 100 if exact_hits > 0 else 0
    improvements['total_hits_improvement_percent'] = float(hit_improvement)

    combined_summary['improvements_by_k'][k_str] = improvements

# Save combined summary
combined_file = f'eval/rankst_k_sbert_threshold{SIMILARITY_THRESHOLD}_combined_summary.json'
with open(combined_file, 'w') as f:
    json.dump(combined_summary, f, indent=2)

print(f"\nCombined summary saved to {combined_file}")
print("\n=== SBERT Evaluation Complete! ===")
print(f"Results saved for k values: {sorted(list(rankst_sbert_results.keys()))}")
print(f"SBERT threshold used: {SIMILARITY_THRESHOLD}")
