In [None]:
import itertools
import json
import os
import pickle
import time
from typing import Dict, List, Optional

import numpy as np
import pandas as pd
import requests

# Fix the seed of NumPy's global random generator so reruns are repeatable
SEED = 42
np.random.seed(SEED)

print("Libraries imported successfully!")


In [None]:
# Load the input/ground truth pairs
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source (e.g. this project's own pipeline).
DATA_PATH = 'data/input_ground_truth_pairs.pkl'
MIN_INPUT_TAGS = 3  # a case needs at least this many input tags to be kept

print("Loading input/ground truth pairs...")
with open(DATA_PATH, 'rb') as f:
    test_data = pickle.load(f)

print(f"Loaded {len(test_data)} test cases")

# Fail fast with a readable message instead of the opaque "min() arg is an
# empty sequence" / IndexError the statistics below would otherwise raise.
if len(test_data) == 0:
    raise ValueError(f"{DATA_PATH} contains no test cases; nothing to evaluate")

print(f"Average ground truth tags per sound: {np.mean([len(item['ground_truth_tags']) for item in test_data]):.2f}")

# Analyze input tag distribution
input_tag_counts = [len(item['input_tags']) for item in test_data]
print("Input tags distribution:")
print(f"  Min tags: {min(input_tag_counts)}")
print(f"  Max tags: {max(input_tag_counts)}")
print(f"  Average tags: {np.mean(input_tag_counts):.2f}")

# Filter to only cases with at least MIN_INPUT_TAGS input tags for fair comparison
test_data_3plus = [item for item in test_data if len(item['input_tags']) >= MIN_INPUT_TAGS]
print(f"\nFiltered to {len(test_data_3plus)} cases with {MIN_INPUT_TAGS}+ input tags")

# Later cells index test_data_3plus[0]; stop here if the filter removed everything.
if not test_data_3plus:
    raise ValueError(
        f"No test case has at least {MIN_INPUT_TAGS} input tags; cannot run the evaluation"
    )

# Show first test case as example
example_case = test_data_3plus[0]
print("\nExample test case:")
print(f"Sound ID: {example_case['sound_id']}")
print(f"Title: {example_case['title']}")
print(f"Input tags: {example_case['input_tags']}")
print(f"Ground truth tags: {example_case['ground_truth_tags']}")


In [None]:
def get_freesound_recommendations_k10(input_tags, host="http://localhost:8011"):
    """
    Call the tagrecommendation system API and return its recommended tags.

    Sends a GET request to ``{host}/tagrecommendation/recommend_tags/`` with the
    input tags joined by commas in the ``input_tags`` query parameter, then
    reads the list stored under ``result -> tags`` in the JSON response.

    Args:
        input_tags: Iterable of tag strings to send as the query.
        host: Base URL of the recommendation service.

    Returns:
        The list of recommended tags exactly as sent by the service (it is not
        truncated here). An empty list is returned when the request fails, the
        status code is not 200, the body is not valid JSON, or the JSON does
        not have the expected ``{"result": {"tags": [...]}}`` shape.
    """
    url = f"{host}/tagrecommendation/recommend_tags/"
    params = {"input_tags": ",".join(input_tags)}

    try:
        response = requests.get(url, params=params, timeout=10)
        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            return []
        payload = response.json()
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return []
    except ValueError as e:
        # Depending on the installed requests version, an invalid JSON body can
        # surface as a plain ValueError subclass rather than a RequestException;
        # without this clause one bad response would abort a whole evaluation run.
        print(f"Invalid JSON in response: {e}")
        return []

    # Walk the payload defensively: a JSON list, or "result": null, would make
    # a chained .get() raise AttributeError.
    result = payload.get("result", {}) if isinstance(payload, dict) else None
    if not isinstance(result, dict):
        print("Unexpected response format: missing 'result' object")
        return []

    tags = result.get("tags", [])
    # Callers slice and iterate the return value, so always hand back a list.
    return tags if isinstance(tags, list) else []

# Smoke-test the API on a single case before launching the full evaluation
print("Testing API connection...")
sample_case = test_data_3plus[0]
sample_input_tags = sample_case['input_tags'][:3]
print(f"Sample input tags (all 3): {sample_input_tags}")

recommended_tags = get_freesound_recommendations_k10(sample_input_tags)
print(f"Recommended tags: {recommended_tags}")
print(f"Ground truth tags: {sample_case['ground_truth_tags']}")

# An empty recommendation list is treated as "could not reach the service"
connection_message = (
    "✅ API connection successful!"
    if recommended_tags
    else "❌ API connection failed. Please check if the server is running on localhost:8011"
)
print(connection_message)


In [None]:
def generate_input_tag_subsets(input_tags, k):
    """
    Return the first ``k`` input tags.

    The leading tags are always used (rather than a random subset) so that the
    same case yields the same query on every run. When fewer than ``k`` tags
    are available, all of them are returned.

    Args:
        input_tags: Sequence of tags supporting slicing (e.g. a list).
        k: Number of leading tags to keep; must be non-negative.

    Returns:
        A new sequence holding at most ``k`` leading tags. The result is always
        a fresh slice, so mutating it never alters ``input_tags``.

    Raises:
        ValueError: If ``k`` is negative (a negative slice bound would silently
            drop tags from the end instead of selecting a prefix).
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    # Slicing already clamps to the sequence length and always copies.
    return input_tags[:k]

def _score_top_10(recommended_tags, ground_truth_tags):
    """Score the first 10 recommended tags against the ground-truth tags.

    Returns a dict with ``precision_10``, ``recall_10``, ``f1_10`` and ``hits``.
    Precision is computed over the distinct tags among the first 10
    recommendations; every metric is 0.0 when its denominator would be zero.
    """
    top_recommended = list(recommended_tags[:10])
    recommended_set = set(top_recommended)
    ground_truth = set(ground_truth_tags)

    # Keep hits in recommendation order (de-duplicated) so the stored list is
    # identical on every run; iterating a set of strings is not order-stable.
    hits = [tag for tag in dict.fromkeys(top_recommended) if tag in ground_truth]

    precision_10 = len(hits) / len(recommended_set) if recommended_set else 0.0
    recall_10 = len(hits) / len(ground_truth) if ground_truth else 0.0

    # F1@10: harmonic mean of precision and recall
    if precision_10 + recall_10 > 0:
        f1_10 = 2 * (precision_10 * recall_10) / (precision_10 + recall_10)
    else:
        f1_10 = 0.0

    return {
        'precision_10': precision_10,
        'recall_10': recall_10,
        'f1_10': f1_10,
        'hits': hits,
    }


def evaluate_recommendations_k(test_data: List[Dict], k: int,
                               max_tests: Optional[int] = None,
                               delay: float = 0.1):
    """
    Evaluate the tagrecommendation system on test data with k input tags.

    For each case, the first ``k`` input tags are sent to the recommendation
    API and the first 10 returned tags are scored against the case's
    ground-truth tags.

    Args:
        test_data: Cases to evaluate; each is a dict with the keys
            ``sound_id``, ``title``, ``input_tags`` and ``ground_truth_tags``.
        k: Number of leading input tags to send per case.
        max_tests: Evaluate only the first ``max_tests`` cases; ``None``
            evaluates all of them.
        delay: Seconds to pause after each request so the API is not
            overwhelmed; values <= 0 disable the pause.

    Returns:
        One dict per evaluated case with the keys ``sound_id``, ``title``,
        ``k``, ``original_input_tags``, ``input_tags_k``, ``recommended_tags``,
        ``ground_truth_tags``, ``precision_10``, ``recall_10``, ``f1_10`` and
        ``hits``. A case whose API call returned nothing is recorded with empty
        recommendations and all metrics at 0.0.
    """
    cases = test_data if max_tests is None else test_data[:max_tests]
    total = len(cases)

    results = []

    print(f"Evaluating with k={k} input tags on {total} test cases...")

    for i, case in enumerate(cases):
        if i % 50 == 0:
            print(f"Progress: {i}/{total}")

        # Generate k-sized subset of input tags
        input_tags_k = generate_input_tag_subsets(case['input_tags'], k)

        # De-duplicate while keeping the stored order, so the saved list is
        # reproducible across runs (list(set(...)) order depends on hashing).
        ground_truth_tags = list(dict.fromkeys(case['ground_truth_tags']))

        # Get recommendations; an empty list means the call failed or the
        # service had nothing to suggest, and scores as all zeros below.
        recommended_tags = get_freesound_recommendations_k10(input_tags_k) or []
        scores = _score_top_10(recommended_tags, ground_truth_tags)

        results.append({
            'sound_id': case['sound_id'],
            'title': case['title'],
            'k': k,
            'original_input_tags': case['input_tags'],
            'input_tags_k': input_tags_k,
            'recommended_tags': recommended_tags[:10],
            'ground_truth_tags': ground_truth_tags,
            'precision_10': scores['precision_10'],
            'recall_10': scores['recall_10'],
            'f1_10': scores['f1_10'],
            'hits': scores['hits'],
        })

        # Small delay to avoid overwhelming the API
        if delay > 0:
            time.sleep(delay)

    return results

print("Evaluation function defined. Ready to run evaluation.")


In [None]:
# Evaluate the recommender once per input-tag count (k=1, 2, 3)
k_values = [1, 2, 3]
all_results = {}

banner = '=' * 60
for k in k_values:
    # One multi-line banner per k keeps the long progress log easy to scan
    print(f"\n{banner}\nStarting evaluation for k={k}\n{banner}")

    evaluation_results = evaluate_recommendations_k(test_data_3plus, k)
    all_results[k] = evaluation_results

    print(f"\nEvaluation for k={k} complete! Processed {len(evaluation_results)} cases.")

print("\n🎉 All evaluations complete!")


In [None]:
# Analyze evaluation results for each k
METRIC_KEYS = ['precision_10', 'recall_10', 'f1_10']
METRIC_LABELS = {
    'precision_10': 'Precision@10',
    'recall_10': 'Recall@10',
    'f1_10': 'F1@10',
}
# Statistics stored per metric; np.std is the population standard deviation.
STAT_FUNCS = {
    'mean': np.mean,
    'median': np.median,
    'std': np.std,
    'min': np.min,
    'max': np.max,
}

print("\n" + "="*80)
print("EVALUATION RESULTS SUMMARY")
print("="*80)

# summary_stats_all[k][metric][stat] -> plain float (JSON-serializable)
summary_stats_all = {}

for k in k_values:
    results = all_results.get(k)
    if not results:
        # k was not evaluated or produced no cases; shown as N/A in the table
        continue

    stats_k = {}
    for metric in METRIC_KEYS:
        scores = [r[metric] for r in results]
        stats_k[metric] = {name: float(fn(scores)) for name, fn in STAT_FUNCS.items()}
    summary_stats_all[k] = stats_k

    print(f"\nResults for k={k} input tags:")
    print("-" * 40)
    for stat_label, stat_name in [('Average', 'mean'), ('Median', 'median'), ('Std', 'std')]:
        for metric in METRIC_KEYS:
            print(f"{stat_label} {METRIC_LABELS[metric]}: {stats_k[metric][stat_name]:.3f}")

# Comparison table: one column per entry of k_values, so the layout follows
# k_values instead of assuming exactly three columns.
print("\n\nCOMPARISON ACROSS k VALUES:")
print("=" * 60)
header_cells = [f"{'k=' + str(k):<10}" for k in k_values]
print(f"{'Metric':<15} " + " ".join(header_cells))
print("-" * 50)
for metric in METRIC_KEYS:
    cells = [
        f"{summary_stats_all[k][metric]['mean']:.3f}" if k in summary_stats_all else "N/A"
        for k in k_values
    ]
    print(f"{metric.replace('_10', '@10'):<15} " + " ".join(f"{cell:<10}" for cell in cells))


In [None]:
# Print one worked example per k
print("\n" + "="*80)
print("EXAMPLE RESULTS FOR EACH k")
print("="*80)

# The same case index is shown for every k so the effect of k is visible
example_idx = 0

for k in k_values:
    if not all_results.get(k):
        # Nothing was evaluated for this k
        continue

    result = all_results[k][example_idx]
    print(f"\nk={k} - Sound ID: {result['sound_id']}")

    # Fields printed verbatim
    plain_fields = [
        ("Title", 'title'),
        ("Original input tags", 'original_input_tags'),
        (f"Input tags used (k={k})", 'input_tags_k'),
        ("Recommended", 'recommended_tags'),
        ("Ground Truth", 'ground_truth_tags'),
        ("Hits", 'hits'),
    ]
    for label, key in plain_fields:
        print(f"  {label}: {result[key]}")

    # Metrics printed with three decimals
    metric_fields = [
        ("Precision@10", 'precision_10'),
        ("Recall@10", 'recall_10'),
        ("F1@10", 'f1_10'),
    ]
    for label, key in metric_fields:
        print(f"  {label}: {result[key]:.3f}")

    print("-" * 60)


In [None]:
# Save evaluation results for each k
def _write_json(payload, path):
    """Write ``payload`` to ``path`` as indented JSON."""
    with open(path, 'w') as f:
        json.dump(payload, f, indent=2)


print("\nSaving evaluation results...")

# open(..., 'w') does not create missing directories, so make sure eval/
# exists before the first write instead of failing with FileNotFoundError.
os.makedirs('eval', exist_ok=True)

for k in k_values:
    if k in all_results and all_results[k]:
        results = all_results[k]

        # Compact summary: metrics only, without the per-case details
        compact_summary = {
            'k_input_tags': k,
            'total_test_cases': len(results),
            'metrics': summary_stats_all[k]
        }

        # Detailed file: the same summary plus every per-case result
        summary_data = dict(compact_summary, evaluation_details=results)

        # Save detailed results
        output_file = f'eval/rankst_k{k}_eval.json'
        _write_json(summary_data, output_file)
        print(f"✅ k={k} results saved to {output_file}")

        # Save compact summary
        compact_file = f'eval/rankst_k{k}_eval_summary.json'
        _write_json(compact_summary, compact_file)
        print(f"✅ k={k} summary saved to {compact_file}")

# Save combined summary across all k values
# (json.dump writes the integer k keys of summary_stats_all as strings)
combined_summary = {
    'description': 'Tag recommendation evaluation with varying input tag counts (k=1,2,3)',
    'total_test_cases': len(test_data_3plus),
    'k_values_tested': k_values,
    'results_by_k': summary_stats_all
}

combined_file = 'eval/rankst_k_combined_summary.json'
_write_json(combined_summary, combined_file)

print(f"✅ Combined summary saved to {combined_file}")
print("\n🎉 All evaluation results saved successfully!")

# Print final summary
print("\nFINAL SUMMARY:")
print(f"- Evaluated {len(test_data_3plus)} test cases")
print(f"- Tested with k={k_values} input tags")
print("- Results saved to eval/ directory with k-specific filenames")
print("- No conflicts with previous rankst_eval.json files")
