# Audio Fingerprinting Exploration with PyAcoustID

This notebook explores using PyAcoustID for audio fingerprinting and duplicate detection in our Artlist and MotionArray catalogs.

## Goals:
1. Understand how PyAcoustID works with our audio files
2. Generate fingerprints for sample audio files
3. Compare fingerprints to detect duplicates
4. Experiment with similarity thresholds
5. Evaluate performance and accuracy

## Setup and Imports


In [1]:
import os
import acoustid
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import time
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')

# Set up paths (relative to the notebook's working directory).
artlist_dir = Path("downloads/artlist")
motionarray_dir = Path("downloads/motionarray")

print("Available functions in acoustid:")
print([f for f in dir(acoustid) if not f.startswith('_')])
print(f"\nArtlist directory exists: {artlist_dir.exists()}")
print(f"MotionArray directory exists: {motionarray_dir.exists()}")

# Bind both lists unconditionally so the names exist even when a catalog
# directory is missing (a fresh "Restart & Run All" must not depend on it).
artlist_files = []
motionarray_files = []

if artlist_dir.exists():
    # glob() yields entries in filesystem order; sort each group so the file
    # list is the same on every run. MP3s still come before WAVs.
    artlist_files = sorted(artlist_dir.glob("*.mp3")) + sorted(artlist_dir.glob("*.wav"))
    print(f"Found {len(artlist_files)} Artlist audio files")

if motionarray_dir.exists():
    motionarray_files = sorted(motionarray_dir.glob("*.mp3")) + sorted(motionarray_dir.glob("*.wav"))
    print(f"Found {len(motionarray_files)} MotionArray audio files")


ModuleNotFoundError: No module named 'librosa'

## Understanding PyAcoustID

PyAcoustID is a Python library for generating acoustic fingerprints and looking up metadata from the AcoustID service. Let's explore its key functions:


In [None]:
# Test basic functionality
print("Testing PyAcoustID functionality:")
# have_chromaprint / have_audioread are module-level boolean flags, not
# functions, so they must be read without calling them.
print(f"Have chromaprint: {acoustid.have_chromaprint}")
print(f"Have audioread: {acoustid.have_audioread}")

# Key functions we'll use:
print("\nKey functions for our use case:")
print("1. acoustid.fingerprint_file() - Generate fingerprint from audio file")
print("2. acoustid.compare_fingerprints() - Compare two fingerprints for similarity")
print("3. acoustid.fingerprint() - Generate fingerprint from raw audio data")

# Let's check if we have any audio files to work with.
# Up to 3 MP3s then up to 3 WAVs are taken from each catalog, Artlist first.
# Each glob result is sorted so the chosen samples are the same on every run.
sample_files = []
for catalog_dir in (artlist_dir, motionarray_dir):
    if catalog_dir.exists():
        for pattern in ("*.mp3", "*.wav"):
            sample_files.extend(sorted(catalog_dir.glob(pattern))[:3])

print(f"\nSample files for testing: {len(sample_files)}")
for i, sample_path in enumerate(sample_files[:5]):  # Show first 5
    size_mb = sample_path.stat().st_size / (1024*1024)
    print(f"{i+1}. {sample_path.name} ({size_mb:.1f} MB)")


## Generating Audio Fingerprints

Let's generate fingerprints for our sample audio files and see what they look like:


In [None]:
def generate_fingerprint_safe(file_path):
    """Fingerprint one audio file without letting a failure propagate.

    Args:
        file_path: Location of the audio file, as a ``str`` or ``pathlib.Path``.

    Returns:
        A dict with the keys ``file`` (base name), ``duration``, ``fingerprint``,
        ``processing_time`` (seconds spent fingerprinting), ``fingerprint_length``,
        ``success`` and ``error``. On failure ``success`` is False, ``error``
        holds the exception text, and the measurement fields are None / 0.
    """
    # Normalise first so plain strings work and `.name` is safe to use in
    # both the success and the failure record.
    file_path = Path(file_path)

    result = {
        'file': file_path.name,
        'duration': None,
        'fingerprint': None,
        'processing_time': None,
        'fingerprint_length': 0,
        'success': False,
        'error': None
    }

    try:
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the wall clock is adjusted mid-run.
        started = time.perf_counter()
        duration, fingerprint = acoustid.fingerprint_file(str(file_path))
        elapsed = time.perf_counter() - started
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort the batch.
        result['error'] = str(e)
        return result

    result.update({
        'duration': duration,
        'fingerprint': fingerprint,
        'processing_time': elapsed,
        'fingerprint_length': len(fingerprint) if fingerprint else 0,
        'success': True,
    })
    return result

# Generate fingerprints for sample files
print("Generating fingerprints for sample files...")
fingerprint_results = []

files_to_process = sample_files[:5]  # Test with first 5 files
for i, file_path in enumerate(files_to_process, start=1):
    # Use the real batch size so the progress line is right when fewer
    # than 5 sample files are available.
    print(f"\nProcessing {i}/{len(files_to_process)}: {file_path.name}")
    result = generate_fingerprint_safe(file_path)
    fingerprint_results.append(result)

    if result['success']:
        print(f"  ✓ Duration: {result['duration']:.1f}s")
        print(f"  ✓ Fingerprint length: {result['fingerprint_length']} characters")
        print(f"  ✓ Processing time: {result['processing_time']:.2f}s")
        print(f"  ✓ Fingerprint preview: {result['fingerprint'][:50]}...")
    else:
        print(f"  ✗ Error: {result['error']}")

# Create a summary DataFrame
df_fingerprints = pd.DataFrame(fingerprint_results)
print(f"\nSummary:")
if df_fingerprints.empty:
    # An empty result list gives a frame with no columns; indexing
    # 'success' on it would raise KeyError.
    print("No sample files were available to fingerprint")
else:
    successful_rows = df_fingerprints[df_fingerprints['success']]
    print(f"Successful fingerprints: {len(successful_rows)}/{len(df_fingerprints)}")
    if successful_rows.empty:
        print("No file was fingerprinted successfully, so no averages are available")
    else:
        print(f"Average processing time: {successful_rows['processing_time'].mean():.2f}s")
        print(f"Average fingerprint length: {successful_rows['fingerprint_length'].mean():.0f} chars")


## Comparing Fingerprints for Similarity

Now let's test the fingerprint comparison functionality to understand how similarity detection works:


In [None]:
# Get successful fingerprints for comparison
successful_results = [r for r in fingerprint_results if r['success']]
print(f"We have {len(successful_results)} successful fingerprints to compare")

if len(successful_results) >= 2:
    print("\nTesting fingerprint comparison...")

    # Compare each fingerprint with every other fingerprint (each unordered pair once)
    comparison_results = []

    for i in range(len(successful_results)):
        for j in range(i + 1, len(successful_results)):
            file1 = successful_results[i]
            file2 = successful_results[j]

            try:
                # compare_fingerprints() expects the (duration, fingerprint)
                # pairs returned by fingerprint_file(), not bare fingerprints.
                score = acoustid.compare_fingerprints(
                    (file1['duration'], file1['fingerprint']),
                    (file2['duration'], file2['fingerprint']),
                )

                comparison_results.append({
                    'file1': file1['file'],
                    'file2': file2['file'],
                    'similarity_score': score,
                    'duration1': file1['duration'],
                    'duration2': file2['duration'],
                    'duration_diff': abs(file1['duration'] - file2['duration'])
                })

                print(f"  {file1['file'][:30]:<30} vs {file2['file'][:30]:<30} = {score:.4f}")

            except Exception as e:
                # Report and continue so one bad pair does not hide the rest.
                print(f"  Error comparing {file1['file']} vs {file2['file']}: {e}")

    # Analyze comparison results
    if comparison_results:
        # Later cells test for the existence of df_comparisons, so it is
        # only bound when at least one comparison succeeded.
        df_comparisons = pd.DataFrame(comparison_results)

        print(f"\nComparison Statistics:")
        print(f"Total comparisons: {len(df_comparisons)}")
        print(f"Average similarity: {df_comparisons['similarity_score'].mean():.4f}")
        print(f"Max similarity: {df_comparisons['similarity_score'].max():.4f}")
        print(f"Min similarity: {df_comparisons['similarity_score'].min():.4f}")
        print(f"Std deviation: {df_comparisons['similarity_score'].std():.4f}")

        # Show most similar pairs
        print(f"\nMost similar pairs:")
        top_similar = df_comparisons.nlargest(3, 'similarity_score')
        for _, row in top_similar.iterrows():
            print(f"  {row['similarity_score']:.4f}: {row['file1']} vs {row['file2']}")

        # Show least similar pairs
        print(f"\nLeast similar pairs:")
        least_similar = df_comparisons.nsmallest(3, 'similarity_score')
        for _, row in least_similar.iterrows():
            print(f"  {row['similarity_score']:.4f}: {row['file1']} vs {row['file2']}")

else:
    print("Not enough successful fingerprints to perform comparisons")


## Visualizing Similarity Scores

Let's create some visualizations to better understand the similarity score distribution:


In [None]:
# Draw four views of the pairwise similarity scores and print summary
# statistics. When the comparison cell produced no data, only a notice is printed.
if 'df_comparisons' not in locals() or len(df_comparisons) == 0:
    print("No comparison data available for visualization")
else:
    similarity_scores = df_comparisons['similarity_score']

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    (ax_hist, ax_box), (ax_scatter, ax_sorted) = axes

    # Histogram of similarity scores, with the mean marked
    ax_hist.hist(similarity_scores, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
    ax_hist.set_title('Distribution of Similarity Scores')
    ax_hist.set_xlabel('Similarity Score')
    ax_hist.set_ylabel('Frequency')
    ax_hist.axvline(similarity_scores.mean(), color='red', linestyle='--', label='Mean')
    ax_hist.legend()

    # Box plot of similarity scores
    ax_box.boxplot(similarity_scores)
    ax_box.set_title('Similarity Score Box Plot')
    ax_box.set_ylabel('Similarity Score')

    # Scatter plot: duration difference vs similarity
    ax_scatter.scatter(df_comparisons['duration_diff'], similarity_scores, alpha=0.6)
    ax_scatter.set_title('Duration Difference vs Similarity Score')
    ax_scatter.set_xlabel('Duration Difference (seconds)')
    ax_scatter.set_ylabel('Similarity Score')

    # Scores in descending order against their rank (to see patterns)
    descending_scores = sorted(similarity_scores, reverse=True)
    ax_sorted.plot(range(len(df_comparisons)), descending_scores, 'o-')
    ax_sorted.set_title('Similarity Scores (Sorted)')
    ax_sorted.set_xlabel('Comparison Index')
    ax_sorted.set_ylabel('Similarity Score')

    plt.tight_layout()
    plt.show()

    # Print some insights
    lowest_score = similarity_scores.min()
    highest_score = similarity_scores.max()
    print("Insights from the analysis:")
    print(f"1. Score range: {lowest_score:.4f} to {highest_score:.4f}")
    print(f"2. Most comparisons have scores around {similarity_scores.median():.4f} (median)")

    # Candidate thresholds taken from the upper quantiles of the observed scores
    q75 = similarity_scores.quantile(0.75)
    q90 = similarity_scores.quantile(0.90)
    q95 = similarity_scores.quantile(0.95)

    print(f"3. Potential similarity thresholds:")
    for label, threshold in (
        ("Conservative (top 25%)", q75),
        ("Moderate (top 10%)", q90),
        ("Strict (top 5%)", q95),
    ):
        print(f"   - {label}: {threshold:.4f}")


## Testing Different Audio Formats

Let's see how PyAcoustID handles different audio formats (MP3 vs. WAV) and whether the format affects processing time, fingerprint length, or similarity scores:


In [None]:
# Analyze by file format: group the successful fingerprints by extension,
# print per-format averages, then compare a few MP3/WAV pairs.
if len(successful_results) > 0:
    format_analysis = {'MP3': [], 'WAV': []}

    for result in successful_results:
        if result['file'].lower().endswith('.mp3'):
            format_analysis['MP3'].append(result)
        elif result['file'].lower().endswith('.wav'):
            format_analysis['WAV'].append(result)

    print("Format Analysis:")
    for format_type, files in format_analysis.items():
        if files:
            avg_duration = np.mean([f['duration'] for f in files])
            avg_processing_time = np.mean([f['processing_time'] for f in files])
            avg_fingerprint_length = np.mean([f['fingerprint_length'] for f in files])

            print(f"\n{format_type} files ({len(files)} files):")
            print(f"  Average duration: {avg_duration:.1f}s")
            print(f"  Average processing time: {avg_processing_time:.2f}s")
            print(f"  Average fingerprint length: {avg_fingerprint_length:.0f} chars")
            print(f"  Processing speed: {avg_duration/avg_processing_time:.1f}x realtime")

    # Compare MP3 vs WAV if we have both
    if format_analysis['MP3'] and format_analysis['WAV']:
        print(f"\nCross-format comparison (MP3 vs WAV):")
        cross_format_scores = []

        for mp3_file in format_analysis['MP3'][:2]:  # Limit to avoid too many comparisons
            for wav_file in format_analysis['WAV'][:2]:
                try:
                    # compare_fingerprints() expects the (duration, fingerprint)
                    # pairs returned by fingerprint_file(), not bare fingerprints.
                    score = acoustid.compare_fingerprints(
                        (mp3_file['duration'], mp3_file['fingerprint']),
                        (wav_file['duration'], wav_file['fingerprint']),
                    )
                    cross_format_scores.append(score)
                    print(f"  {mp3_file['file'][:25]:<25} vs {wav_file['file'][:25]:<25} = {score:.4f}")
                except Exception as e:
                    # Report and continue so one bad pair does not hide the rest.
                    print(f"  Error comparing {mp3_file['file']} vs {wav_file['file']}: {e}")

        if cross_format_scores:
            print(f"  Average cross-format similarity: {np.mean(cross_format_scores):.4f}")
else:
    print("No successful results to analyze by format")


## Performance Analysis and Scalability

Let's analyze the performance characteristics for scaling to our full catalog:


In [None]:
# Performance analysis: measured throughput on the sample files, then a
# linear extrapolation to the assumed catalog size.
if not successful_results:
    print("No successful results for performance analysis")
else:
    n_successful = len(successful_results)
    total_audio_duration = sum(r['duration'] for r in successful_results)
    total_processing_time = sum(r['processing_time'] for r in successful_results)

    print("Performance Analysis:")
    print(f"Total audio analyzed: {total_audio_duration:.1f} seconds ({total_audio_duration/60:.1f} minutes)")
    print(f"Total processing time: {total_processing_time:.1f} seconds ({total_processing_time/60:.1f} minutes)")
    print(f"Processing speed: {total_audio_duration/total_processing_time:.1f}x realtime")

    # Estimate for full catalog
    print(f"\nScaling estimates:")

    # Assumption: 100 files from each catalog (200 total)
    estimated_files = 200
    avg_duration = total_audio_duration / n_successful
    avg_processing_time = total_processing_time / n_successful

    estimated_total_duration = avg_duration * estimated_files
    estimated_processing_time = avg_processing_time * estimated_files

    # Number of unordered pairs among estimated_files items
    total_comparisons = estimated_files * (estimated_files - 1) // 2

    # Assumed (not measured) cost of a single comparison, in seconds
    comparison_time_per_pair = 0.001
    total_comparison_time = comparison_time_per_pair * total_comparisons

    print(f"For {estimated_files} files:")
    print(f"  Estimated fingerprinting time: {estimated_processing_time/60:.1f} minutes")
    print(f"  Estimated total audio: {estimated_total_duration/3600:.1f} hours")
    print(f"  Total pairwise comparisons needed: {total_comparisons:,}")
    print(f"  Estimated comparison time: {total_comparison_time/60:.1f} minutes")
    print(f"  Total processing time: {(estimated_processing_time + total_comparison_time)/60:.1f} minutes")

    # Memory requirements, based on the length of each stored fingerprint
    fingerprint_sizes = [len(r['fingerprint']) for r in successful_results]
    avg_fingerprint_size = np.mean(fingerprint_sizes)
    total_fingerprint_memory = avg_fingerprint_size * estimated_files

    print(f"\nMemory estimates:")
    print(f"  Average fingerprint size: {avg_fingerprint_size:.0f} characters")
    print(f"  Total fingerprint storage: {total_fingerprint_memory/1024:.1f} KB")


## Conclusions and Recommendations

Based on our exploration, here are the key findings and recommendations for implementing duplicate detection:


In [None]:
# Print the closing report. It reads totals and estimates computed by the
# performance-analysis cell, so that cell must have run first.
print("=== CONCLUSIONS AND RECOMMENDATIONS ===")
print()

if len(successful_results) > 0:
    # Only claim support for formats that were actually fingerprinted in this run.
    processed_formats = sorted(
        {Path(r['file']).suffix.lstrip('.').upper() for r in successful_results} - {''}
    )

    print("✅ PyAcoustID Successfully Working:")
    print(f"   - Successfully processed {len(successful_results)} audio files")
    print(f"   - Average processing speed: {total_audio_duration/total_processing_time:.1f}x realtime")
    if {'MP3', 'WAV'} <= set(processed_formats):
        print(f"   - Works with both MP3 and WAV formats")
    else:
        print(f"   - Formats verified in this run: {', '.join(processed_formats)}")
    print()

    if 'df_comparisons' in locals() and len(df_comparisons) > 0:
        print("📊 Similarity Analysis:")
        print(f"   - Similarity scores range from {df_comparisons['similarity_score'].min():.4f} to {df_comparisons['similarity_score'].max():.4f}")
        print(f"   - Average similarity: {df_comparisons['similarity_score'].mean():.4f}")
        print(f"   - Recommended thresholds:")
        print(f"     * High confidence duplicates: > {df_comparisons['similarity_score'].quantile(0.95):.4f}")
        print(f"     * Potential duplicates: > {df_comparisons['similarity_score'].quantile(0.90):.4f}")
        print(f"     * Similar tracks: > {df_comparisons['similarity_score'].quantile(0.75):.4f}")
        print()

    # The file count comes from estimated_files so the text always matches
    # the figure the time estimates were computed for.
    print("🚀 Implementation Strategy:")
    print(f"   1. Batch process all {estimated_files} files to generate fingerprints")
    print("   2. Store fingerprints in database with metadata")
    print("   3. Implement efficient comparison algorithm")
    print("   4. Use similarity thresholds to classify matches")
    print("   5. Manual review for borderline cases")
    print()

    print("⚡ Performance Considerations:")
    print(f"   - Fingerprinting: ~{estimated_processing_time/60:.0f} minutes for {estimated_files} files")
    print(f"   - Comparisons: ~{total_comparison_time/60:.0f} minutes for all pairs")
    print(f"   - Total time: ~{(estimated_processing_time + total_comparison_time)/60:.0f} minutes")
    print("   - Memory usage: Very low (fingerprints are compact)")
    print()

    print("🔍 Next Steps:")
    print("   1. Create duplicate detection script")
    print("   2. Process full catalog (100 files each)")
    print("   3. Implement database storage for fingerprints")
    print("   4. Build comparison and reporting system")
    print("   5. Add manual review interface")

else:
    print("❌ Issues Found:")
    print("   - Could not process audio files successfully")
    print("   - Check audio file formats and pyacoustid installation")
    print("   - Verify chromaprint binary is available")

print()
print("=== END OF ANALYSIS ===")

# Collect the headline numbers into `analysis_summary` for later cells.
# Nothing is written to disk; the dict only lives in the kernel namespace.
if successful_results:
    n_successful = len(successful_results)

    # Throughput figures from the fingerprinting run
    analysis_summary = dict(
        successful_files=n_successful,
        total_files_tested=len(fingerprint_results),
        average_processing_time=total_processing_time / n_successful,
        average_duration=total_audio_duration / n_successful,
        processing_speed_ratio=total_audio_duration / total_processing_time,
    )

    # Similarity statistics are added only when pairwise comparisons exist
    if 'df_comparisons' in locals() and len(df_comparisons) > 0:
        similarity_scores = df_comparisons['similarity_score']
        analysis_summary['total_comparisons'] = len(df_comparisons)
        analysis_summary['avg_similarity'] = similarity_scores.mean()
        analysis_summary['max_similarity'] = similarity_scores.max()
        analysis_summary['min_similarity'] = similarity_scores.min()
        analysis_summary['threshold_95'] = similarity_scores.quantile(0.95)
        analysis_summary['threshold_90'] = similarity_scores.quantile(0.90)
        analysis_summary['threshold_75'] = similarity_scores.quantile(0.75)

    print(f"\n📁 Analysis summary saved to 'analysis_summary' variable")
    print("   Use this data to inform the duplicate detection script implementation")
