In [None]:
import json
import pandas as pd
import numpy as np
import os
from pathlib import Path
import laion_clap
import torch
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings
# Install a global 'ignore' filter: every warning category is suppressed from here on.
# Note that this also hides numerical warnings raised by later cells.
warnings.filterwarnings('ignore')

# Confirmation that the import cell ran to completion.
print("Libraries imported successfully!")


In [None]:
# Load input/ground truth pairs
# JSON text is UTF-8; pass the encoding explicitly so non-ASCII titles/tags decode
# the same way on every platform instead of depending on the locale default.
with open('data/input_ground_truth_pairs.json', 'r', encoding='utf-8') as f:
    input_gt_pairs = json.load(f)

print(f"Loaded {len(input_gt_pairs)} sound clips with input/ground truth pairs")
# Only preview an entry when there is one: indexing [0] on an empty list raises IndexError.
if input_gt_pairs:
    print(f"Example entry: {input_gt_pairs[0]}")


In [None]:
# Read the filtered BSD10k metadata table and preview its first rows
metadata_csv = 'data/BSD10k/BSD10K_metadata_filtered.csv'
metadata_df = pd.read_csv(metadata_csv)

num_metadata_rows = len(metadata_df)
print(f"Loaded metadata for {num_metadata_rows} sound clips")
print(metadata_df.head())


In [None]:
# Load tagset
# The file holds one tag per line. Blank lines are skipped: an empty tag would
# otherwise be embedded as prompts such as "The sound of ." and compete with real
# tags during recommendation.
with open('data/tagset_clap_normalized_hyphen_unique.txt', 'r', encoding='utf-8') as f:
    stripped_lines = (line.strip() for line in f)
    clap_tags = [tag for tag in stripped_lines if tag]

print(f"Loaded {len(clap_tags)} unique CLAP tags")
print(f"First 10 tags: {clap_tags[:10]}")


In [None]:
# Initialize CLAP model for text encoding
# The model is built with enable_fusion=False and is used further down only through
# model.get_text_embedding(...) to embed the tag prompts.
# NOTE(review): the audio embeddings read from disk later were presumably produced with
# the same CLAP checkpoint/configuration; text and audio vectors are only comparable
# if that holds — confirm.
model = laion_clap.CLAP_Module(enable_fusion=False)
model.load_ckpt()  # Load default pretrained weights

print("CLAP model loaded successfully!")


In [None]:
def create_tag_sentences(tag):
    """
    Build the four text prompts used to embed a tag.

    Variants, in order:
    - "{Tag}." (only the first character is upper-cased, the rest is untouched)
    - "The sound of {tag}."
    - "A recording of {tag}."
    - "An audio clip of {tag}."

    Args:
        tag (str): Tag text; inserted verbatim into the last three prompts.

    Returns:
        list[str]: The four prompts in the order listed above.
    """
    # str.capitalize() would also lower-case everything after the first character
    # ("DJ-set" -> "Dj-set"), which contradicts "first letter capitalized".
    # Upper-case only character 0; slicing keeps this safe for an empty string.
    capitalized_tag = tag[:1].upper() + tag[1:]
    return [
        f"{capitalized_tag}.",
        f"The sound of {tag}.",
        f"A recording of {tag}.",
        f"An audio clip of {tag}.",
    ]

# Quick sanity check: print the prompts generated for one sample tag
test_tag = "guitar"
test_sentences = create_tag_sentences(test_tag)

print(f"Example sentences for '{test_tag}':")
for position, prompt in enumerate(test_sentences, start=1):
    print(f"{position}. {prompt}")


In [None]:
# Create text embeddings for all tags
print("Creating text embeddings for all tags...")

tag_embeddings = {}
batch_size = 32  # Number of tags per model call (each tag contributes several prompts)

for i in tqdm(range(0, len(clap_tags), batch_size)):
    batch_tags = clap_tags[i:i+batch_size]

    # Flatten the prompts of every tag in the batch into one list, remembering which
    # positions of that list belong to which tag.
    all_sentences = []
    tag_to_sentences_map = {}

    for tag in batch_tags:
        sentences = create_tag_sentences(tag)
        start_idx = len(all_sentences)
        all_sentences.extend(sentences)
        # Derive the span from the prompts actually produced rather than assuming 4,
        # so this loop stays correct if create_tag_sentences changes its variant count.
        tag_to_sentences_map[tag] = list(range(start_idx, start_idx + len(sentences)))

    # One model call per batch; gradients are not needed for inference.
    # Assumes the returned object supports integer-list indexing along axis 0
    # (numpy array or tensor) — TODO confirm against the installed laion_clap version.
    with torch.no_grad():
        text_embed = model.get_text_embedding(all_sentences)

    # Store embeddings for each tag (keep all sentence variants)
    for tag in batch_tags:
        tag_embeddings[tag] = text_embed[tag_to_sentences_map[tag]]

print(f"Created text embeddings for {len(tag_embeddings)} tags")
# Guard the summary: with an empty tagset there is no first embedding to inspect.
if tag_embeddings:
    first_tag_embedding = next(iter(tag_embeddings.values()))
    print(f"Each tag has {len(first_tag_embedding)} sentence variants")
    print(f"Embedding dimension: {first_tag_embedding.shape[1]}")


In [None]:
def save_tag_embeddings(tag_embeddings, save_dir, filename_suffix="", normalization="hyphen-normalized"):
    """
    Save all tag embeddings to disk in a single file, plus tag order and metadata.

    Three files are written into ``save_dir`` (created if missing):
    - ``tag_embeddings{suffix}.npy``: array of shape
      (num_tags, num_sentence_variants, embedding_dim)
    - ``tag_names{suffix}.json``: tag names in the same order as axis 0 of the array
    - ``metadata{suffix}.json``: counts, shape, normalization label and file names

    Args:
        tag_embeddings (dict): Dictionary with tag names as keys and embeddings as values.
            Values may be torch tensors (anything exposing ``detach``) or array-likes;
            all of them must share one 2-D shape.
        save_dir (str or Path): Directory to save the embeddings
        filename_suffix (str): Suffix to add to filenames for differentiation
        normalization (str): Label stored in the metadata file describing how the
            tags were normalized. Defaults to the previously hard-coded value.

    Raises:
        ValueError: If ``tag_embeddings`` is empty, the per-tag arrays have different
            shapes, or the stacked array is not 3-dimensional.
    """
    # Fail early with a clear message instead of an opaque np.stack error.
    if not tag_embeddings:
        raise ValueError("tag_embeddings is empty; nothing to save")

    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    print(f"Saving tag embeddings to {save_dir}...")

    # Prepare data for saving; the key order defines axis 0 of the saved array.
    tags_list = list(tag_embeddings.keys())
    embeddings_list = []

    for tag in tqdm(tags_list, desc="Preparing tag embeddings"):
        embeddings = tag_embeddings[tag]

        # Torch tensors must be detached and moved to CPU before conversion.
        if hasattr(embeddings, 'detach'):
            embeddings_np = embeddings.detach().cpu().numpy()
        else:
            embeddings_np = np.asarray(embeddings)

        embeddings_list.append(embeddings_np)

    # Stack all embeddings into a single array
    # Shape: (num_tags, num_sentence_variants, embedding_dim)
    all_embeddings = np.stack(embeddings_list, axis=0)

    # Validate before writing anything so a bad input never leaves partial output
    # behind (the metadata below indexes shape[1] and shape[2]).
    if all_embeddings.ndim != 3:
        raise ValueError(
            "expected each tag embedding to be 2-D (variants, dim); "
            f"stacked shape is {all_embeddings.shape}"
        )

    # Save embeddings and tag list
    embeddings_filename = f'tag_embeddings{filename_suffix}.npy'
    tags_filename = f'tag_names{filename_suffix}.json'
    embeddings_path = save_dir / embeddings_filename
    tags_path = save_dir / tags_filename

    np.save(embeddings_path, all_embeddings)

    with open(tags_path, 'w') as f:
        json.dump(tags_list, f, indent=2)

    print(f"Saved {len(tag_embeddings)} tag embeddings to {embeddings_path}")
    print(f"Saved tag names to {tags_path}")
    print(f"Embeddings shape: {all_embeddings.shape}")

    # Save a metadata file with tag information
    metadata = {
        'num_tags': len(tag_embeddings),
        'embedding_dimension': all_embeddings.shape[2],
        'num_sentence_variants': all_embeddings.shape[1],
        'embeddings_shape': list(all_embeddings.shape),
        'normalization': normalization,
        'files': {
            'embeddings': embeddings_filename,
            'tag_names': tags_filename
        }
    }

    metadata_path = save_dir / f'metadata{filename_suffix}.json'
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f"Saved metadata to {metadata_path}")

# Persist the hyphen-normalized tag embeddings under the BSD10k embeddings folder
tags_embeddings_dir = 'data/BSD10k/embeddings/tags'
save_tag_embeddings(
    tag_embeddings=tag_embeddings,
    save_dir=tags_embeddings_dir,
    filename_suffix="_hyphen",
)


In [None]:
# Check available audio embeddings
embeddings_dir = Path('data/BSD10k/embeddings/clap')

# Embedding files are named "<sound_id>.npy". Skip any .npy file whose stem is not
# an integer id instead of letting int() abort the whole cell with a ValueError.
available_embeddings = set()
skipped_embedding_files = []
for f in embeddings_dir.glob('*.npy'):
    try:
        available_embeddings.add(int(f.stem))
    except ValueError:
        skipped_embedding_files.append(f.name)

print(f"Found {len(available_embeddings)} audio embeddings")
if skipped_embedding_files:
    print(f"Skipped {len(skipped_embedding_files)} .npy files with non-numeric names")

# Filter input_gt_pairs to only include clips with available embeddings
# NOTE(review): the membership test compares against integer ids, so it assumes
# pair['sound_id'] is stored as an int in the JSON — confirm; string ids would never match.
filtered_pairs = [pair for pair in input_gt_pairs if pair['sound_id'] in available_embeddings]

print(f"Using {len(filtered_pairs)} clips with both ground truth and audio embeddings")


In [None]:
# Read the stored audio embedding of every retained clip into memory
print("Loading audio embeddings...")

audio_embeddings = {}

for entry in tqdm(filtered_pairs):
    clip_id = entry['sound_id']
    npy_file = embeddings_dir / f"{clip_id}.npy"

    # Clips whose file is missing on disk are left out of the dictionary
    if not npy_file.exists():
        continue
    audio_embeddings[clip_id] = np.load(npy_file)

print(f"Loaded audio embeddings for {len(audio_embeddings)} sound clips")
if len(audio_embeddings) > 0:
    # Report the shape of the first loaded embedding as a representative sample
    sample_embedding = next(iter(audio_embeddings.values()))
    print(f"Audio embedding dimension: {sample_embedding.shape}")


In [None]:
def get_tag_recommendations(audio_embedding, tag_embeddings, top_k=10):
    """
    Get top-k tag recommendations for a given audio embedding.

    A tag's score is the maximum cosine similarity between the audio embedding and
    any of that tag's sentence-variant embeddings. All similarities are computed in
    a single vectorised pass instead of one library call per tag.

    Args:
        audio_embedding (array-like): One embedding, either 1-D ``(dim,)`` or with a
            leading singleton axis ``(1, dim)``.
        tag_embeddings (dict): Tag name -> array-like of shape ``(num_variants, dim)``.
        top_k (int): Number of tags to return.

    Returns:
        tuple[list[str], list[float]]: Tag names and their scores, best first. Tags
        with equal scores keep the iteration order of ``tag_embeddings``.

    Raises:
        ValueError: If a tag has no variant embeddings, or the embedding dimensions
            of audio and text do not match.
    """
    if not tag_embeddings:
        return [], []

    # Flatten so both (dim,) and (1, dim) inputs are accepted.
    audio_vec = np.asarray(audio_embedding, dtype=np.float64).reshape(-1)

    tags = list(tag_embeddings)
    variant_blocks = [
        np.atleast_2d(np.asarray(tag_embeddings[tag], dtype=np.float64)) for tag in tags
    ]
    if any(len(block) == 0 for block in variant_blocks):
        raise ValueError("every tag needs at least one sentence-variant embedding")

    # One (total_variants, dim) matrix plus the row at which each tag's variants start.
    all_variants = np.concatenate(variant_blocks, axis=0)
    block_starts = np.cumsum([0] + [len(block) for block in variant_blocks[:-1]])

    # Cosine similarity; a zero-norm vector gets similarity 0 rather than a division by zero.
    denom = np.linalg.norm(all_variants, axis=1) * np.linalg.norm(audio_vec)
    denom[denom == 0.0] = 1.0
    similarities = (all_variants @ audio_vec) / denom

    # Maximum over each tag's contiguous run of variant rows.
    tag_scores = np.maximum.reduceat(similarities, block_starts)

    # Stable descending sort so ties keep dictionary order.
    ranking = np.argsort(-tag_scores, kind="stable")[:top_k]
    top_tags = [tags[idx] for idx in ranking]
    top_scores = [float(tag_scores[idx]) for idx in ranking]

    return top_tags, top_scores

# Smoke test: recommend five tags for the first retained clip
test_sound_id = filtered_pairs[0]['sound_id']
test_audio_embedding = audio_embeddings[test_sound_id]
test_recommendations, test_scores = get_tag_recommendations(
    test_audio_embedding,
    tag_embeddings,
    top_k=5,
)

print(f"Test recommendations for sound {test_sound_id}:")
rank = 0
for tag_name, similarity in zip(test_recommendations, test_scores):
    rank += 1
    print(f"{rank}. {tag_name} (similarity: {similarity:.4f})")


In [None]:
# Generate recommendations for all clips
print("Generating recommendations for all clips...")

TOP_K = 10  # Cut-off used both for the number of predictions and for Precision@10
results = []

for pair in tqdm(filtered_pairs):
    sound_id = pair['sound_id']

    # Clips without a loaded embedding cannot be scored
    if sound_id not in audio_embeddings:
        continue

    # Get audio embedding
    audio_embedding = audio_embeddings[sound_id]

    # Get top-k recommendations
    predicted_tags, prediction_scores = get_tag_recommendations(
        audio_embedding, tag_embeddings, top_k=TOP_K
    )

    # Normalize ground truth tags to lowercase for comparison. Duplicates (including
    # ones that only collide after lower-casing) are dropped, keeping first-seen order,
    # so they cannot inflate the recall denominator.
    ground_truth_tags = list(dict.fromkeys(tag.lower() for tag in pair['ground_truth_tags']))
    ground_truth_lookup = set(ground_truth_tags)

    # Hits are listed in prediction-rank order. A plain set intersection would give an
    # order that changes between runs (string hashing is randomized per process),
    # making the saved JSON non-reproducible.
    predicted_tags_lower = list(dict.fromkeys(tag.lower() for tag in predicted_tags))
    hits = [tag for tag in predicted_tags_lower if tag in ground_truth_lookup]

    # Calculate metrics; precision is always divided by the cut-off k
    precision = len(hits) / float(TOP_K)
    recall = len(hits) / len(ground_truth_tags) if ground_truth_tags else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    result = {
        'sound_id': sound_id,
        'title': pair['title'],
        'ground_truth_tags': ground_truth_tags,
        'predicted_tags': predicted_tags,
        'prediction_scores': prediction_scores,
        'hits': hits,
        'num_hits': len(hits),
        'precision_at_10': precision,
        'recall': recall,
        'f1_score': f1
    }

    results.append(result)

print(f"Generated recommendations for {len(results)} clips")


In [None]:
# Calculate overall metrics
# Every ratio below falls back to 0.0 when its denominator is zero (no evaluated
# clips, or no ground-truth tags at all) instead of raising ZeroDivisionError.
total_hits = sum(result['num_hits'] for result in results)
total_predictions = len(results) * 10
total_ground_truth = sum(len(result['ground_truth_tags']) for result in results)

# Macro averages: mean of the per-clip values
avg_precision_at_10 = np.mean([result['precision_at_10'] for result in results]) if results else 0.0
avg_recall = np.mean([result['recall'] for result in results]) if results else 0.0
avg_hits_per_clip = np.mean([result['num_hits'] for result in results]) if results else 0.0

# Micro (pooled) precision / recall / F1 over all clips
overall_precision = total_hits / total_predictions if total_predictions else 0.0
overall_recall = total_hits / total_ground_truth if total_ground_truth else 0.0
overall_f1 = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0

# Average F1 score per clip: reuse the value already stored with each result rather
# than recomputing the same formula a second time.
f1_scores = [result['f1_score'] for result in results]
avg_f1 = np.mean(f1_scores) if f1_scores else 0.0

print("=== CLAP Zero-Shot Tag Recommendation Results ===")
print(f"Number of clips evaluated: {len(results)}")
print(f"Total hits: {total_hits}")
print(f"Total predictions: {total_predictions}")
print(f"Total ground truth tags: {total_ground_truth}")
print()
print(f"Average Precision@10: {avg_precision_at_10:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1:.4f}")
print(f"Average hits per clip: {avg_hits_per_clip:.2f}")
print()
print(f"Overall Precision: {overall_precision:.4f}")
print(f"Overall Recall: {overall_recall:.4f}")
print(f"Overall F1 Score: {overall_f1:.4f}")


In [None]:
# Print the five clips with the most hits as worked examples
print("=== Example Results ===")

# Descending by hit count; clips with equal counts keep their original order
results_sorted = sorted(results, key=lambda entry: -entry['num_hits'])

for example_no, result in enumerate(results_sorted[:5], start=1):
    sound_id_shown = result['sound_id']
    p_at_10 = result['precision_at_10']
    rec = result['recall']
    f1_shown = result['f1_score']

    print(f"\n--- Example {example_no} (Sound ID: {sound_id_shown}) ---")
    print(f"Title: {result['title']}")
    print(f"Ground Truth Tags: {result['ground_truth_tags']}")
    print(f"Predicted Tags: {result['predicted_tags']}")
    print(f"Hits: {result['hits']} ({result['num_hits']} hits)")
    print(f"Precision@10: {p_at_10:.3f}, Recall: {rec:.3f}, F1: {f1_shown:.3f}")


In [None]:
# Histogram: number of hits -> how many clips achieved exactly that many
hits_distribution = {}
for hit_count in (entry['num_hits'] for entry in results):
    hits_distribution.setdefault(hit_count, 0)
    hits_distribution[hit_count] += 1

print("\n=== Distribution of Hits ===")
num_clips_total = len(results)
for hit_count, clip_count in sorted(hits_distribution.items()):
    share = clip_count / num_clips_total * 100
    print(f"{hit_count} hits: {clip_count} clips ({share:.1f}%)")


In [None]:
# Convert numpy values to Python types for JSON serialization
def convert_numpy_types(obj):
    """
    Recursively convert numpy types to Python types.

    Conversions:
    - numpy bool / integer / floating scalars -> ``bool`` / ``int`` / ``float``
    - ``np.ndarray`` -> nested Python lists (via ``tolist()``)
    - dicts -> new dict with both keys and values converted
    - lists and tuples -> same container kind with every item converted

    Anything else is returned unchanged.

    Args:
        obj: Arbitrary (possibly nested) object.

    Returns:
        A structure of the same layout in which numpy scalars and arrays have been
        replaced by built-in Python values, suitable for ``json.dump``.
    """
    # np.bool_ is neither np.integer nor JSON-serializable, so it needs its own case.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        # Keys are converted too: json.dump rejects numpy scalars used as keys.
        return {convert_numpy_types(key): convert_numpy_types(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    if isinstance(obj, tuple):
        # Tuples may hide numpy scalars as well; keep the container a tuple.
        return tuple(convert_numpy_types(item) for item in obj)
    return obj

# Make the per-clip results JSON-serializable
json_safe_results = convert_numpy_types(results)

# Destination of the detailed report; the folder is created on demand
output_file = 'eval/clap_baseline_results.json'
os.makedirs('eval', exist_ok=True)

# Aggregate figures, cast to built-in types so json can write them
summary = dict(
    model='CLAP Zero-Shot',
    num_clips=len(results),
    total_hits=int(total_hits),
    total_predictions=int(total_predictions),
    total_ground_truth=int(total_ground_truth),
    avg_precision_at_10=float(avg_precision_at_10),
    avg_recall=float(avg_recall),
    avg_f1_score=float(avg_f1),
    avg_hits_per_clip=float(avg_hits_per_clip),
    overall_precision=float(overall_precision),
    overall_recall=float(overall_recall),
    overall_f1_score=float(overall_f1),
    hits_distribution=convert_numpy_types(hits_distribution),
)

# Detailed report: summary plus every per-clip record
full_report = {
    'summary': summary,
    'detailed_results': json_safe_results
}
with open(output_file, 'w') as f:
    json.dump(full_report, f, indent=2)

print(f"Results saved to {output_file}")

# Stand-alone copy of the summary
summary_file = 'eval/clap_baseline_summary.json'
with open(summary_file, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"Summary saved to {summary_file}")
