**(CD_All for ESC50 dataset)**
# Audio Classification with Sound Descriptions

This notebook reproduces results for the ESC-50 and FSD50K datasets shown in the paper **"A SOUND DESCRIPTION: EXPLORING PROMPT TEMPLATES AND CLASS DESCRIPTIONS TO ENHANCE ZERO-SHOT AUDIO CLASSIFICATION"** by Michel Olvera, Paraskevas Stamatiadis and Slim Essid.

## Overview

The experiment evaluates different class description methods for zero-shot audio classification using CLAP (Contrastive Language-Audio Pre-training) models. The notebook compares:

- **CLS**: Standard class names
- **Context**: Contextual descriptions
- **Ontology**: Ontology-based descriptions  
- **Base**: Basic descriptions
- **Dictionary**: Dictionary-style descriptions

## Methodology

The evaluation uses a cross-validation approach where for each class, the system selects the best-performing description type (CLS vs. definition-based) during training, then applies this mapping to test data. Performance is measured using class-wise accuracy for single-label datasets or mean Average Precision (mAP) for multi-label datasets.


# Import libraries

In [1]:
import os
import argparse
import pandas as pd
from msclap import CLAP
import torch.nn.functional as F
import numpy as np
import pickle
import torch
import os
import argparse
import json
from torch.utils.data import DataLoader
from config import conf, common_parameters
from pprint import pprint
from utilities import merge_dicts
from metrics_helper import compute_metrics, compute_class_wise_accuracy
from sklearn.model_selection import KFold, StratifiedKFold

Running on slurm with job id: 698390


# Set experiment parameters

In [2]:
conf_id = "001" # CD Methods for ESC50 Dataset
# conf_id = "002" # CD Methods for FSD50K Dataset

conf = merge_dicts(common_parameters, conf[conf_id])
conf


{'job_id': '698390',
 'output_folder': '/tsi/audiosig/audible/dcase/studies/016_CLAP_prompting_with_descriptors/003_evaluate_prompts/results',
 'similarities_folder': '/tsi/audiosig/audible/dcase/studies/016_CLAP_prompting_with_descriptors/003_evaluate_prompts/similarities',
 'audio_embeddings_folder': '/tsi/audiosig/audible/dcase/studies/016_CLAP_prompting_with_descriptors/001_extract_audio_embeddings/embeddings',
 'text_embeddings_folder': '/tsi/audiosig/audible/dcase/studies/016_CLAP_prompting_with_descriptors/002_extract_text_embeddings/embeddings',
 'model_name': 'CLAP-MS-23',
 'definition_type': 'CLS',
 'test_dataset': 'ESC50',
 'evaluation_mode': 'CLS'}

In [3]:
# Class-wise model selection through cross-validation

model_name = conf['model_name']
test_dataset = conf['test_dataset']
definition_type = conf['definition_type']
model_name = conf['model_name']
evaluation_mode = conf['evaluation_mode']

# Load dataset
audio_embeddings_path = os.path.join(conf['audio_embeddings_folder'],
                                        model_name,
                                        test_dataset + '.pt')

if test_dataset != 'TUT2017':
    definition_types = ['CLS', 'context', 'ontology', 'base', 'dictionary']
else:
    definition_types = ['CLS', 'ontology', 'base', 'dictionary']


# Load text embeddings for each definition type
text_embeddings_paths = []
for definition_type in definition_types:
    text_embeddings_paths.append(os.path.join(conf['text_embeddings_folder'],
                                        'CLAP-MS-23',
                                        test_dataset + '_' + definition_type + '.pkl'))
    
# Load CLAP model
if model_name == 'CLAP-MS-23':
    clap_model = CLAP(version = '2023', use_cuda=True)
else:
    raise ValueError('Please specify a valid model')

In [4]:
# Read embeddings
audio_embeddings = torch.load(audio_embeddings_path)
print("Audio embeddings shape: ", audio_embeddings.shape)

# Read ground-truth labels
labels = torch.load(audio_embeddings_path.replace('.pt', '_labels.pt'))
print("Labels shape: ", labels.shape)

# Labels are one-hot encoded. Convert them to integers
if test_dataset == 'FSD50K' or test_dataset == 'AudioSet' or test_dataset == 'DCASE2017':
    labels_1D = labels.detach().cpu().numpy()
else:
    labels_1D = torch.argmax(labels, dim=1) 

# Read text embeddings dictionaries
prompts_dictionary_list = []
for text_embeddings_path in text_embeddings_paths:
    with open(text_embeddings_path, 'rb') as f:
        prompts_dictionary_list.append(pickle.load(f))

# Select the text embeddings from the key '' in the dictionaries
text_embeddings_list = []
for prompts_dictionary in prompts_dictionary_list:
    text_embeddings_list.append(prompts_dictionary['']['embeddings'])



Audio embeddings shape:  torch.Size([2000, 1024])
Labels shape:  torch.Size([2000, 50])


## Cross-validation Setup

In [5]:
# Cross-validation. Generate deterministic folds of audio embeddings, text embeddings and labels
# using scikit-learn's StratifiedKFold

# Print shape of data
print("Audio embeddings shape: ", audio_embeddings.shape)
print("Labels shape: ", labels.shape)
print("Text embeddings", text_embeddings_list[0].shape)

audio_embeddings_train_folds = []
labels_train_folds = []

audio_embeddings_test_folds = []
labels_test_folds = []

# Define the number of folds
n_folds = 5

# Define the seed for reproducibility
seed = 1200

# Define the stratified k-fold object
if test_dataset == 'FSD50K' or test_dataset == 'AudioSet' or test_dataset == 'DCASE2017':
    # Cross validation suitable for multi-label classification
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

else:
    # Cross validation suitable for single-label classification
    # StratifiedKFold is used to ensure that the proportion of classes is the same in each fold
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)


# Generate the folds
for train_index, test_index in kf.split(audio_embeddings, labels_1D):

    # print("TRAIN:", train_index, "TEST:", test_index)

    audio_embeddings_train_folds.append(audio_embeddings[train_index])
    labels_train_folds.append(labels[train_index])

    audio_embeddings_test_folds.append(audio_embeddings[test_index])
    labels_test_folds.append(labels[test_index])
    

Audio embeddings shape:  torch.Size([2000, 1024])
Labels shape:  torch.Size([2000, 50])
Text embeddings torch.Size([50, 1024])


# Evaluation

In [6]:
results_df = pd.DataFrame()

if evaluation_mode == 'CLS':

    # We'll take the max accuracy/mAP between CLS and the other definition types

    for fold in range(n_folds):
        print("\nFold: ", fold)

        metric_results = [] # List to store the dictionary of results of each definition type

        for text_embeddings in text_embeddings_list:
            # Compute similarity
            y_labels = labels_train_folds[fold].detach().cpu().numpy()
            similarity = clap_model.compute_similarity(audio_embeddings_train_folds[fold], text_embeddings)

            if test_dataset == 'FSD50K' or test_dataset == 'AudioSet' or test_dataset == 'DCASE2017':
                # Process similarities and compute mAP
                y_pred = similarity.detach().cpu().numpy()

                _, _, class_wise_metrics_dict = compute_metrics(y_labels, y_pred, normalize_scores=False)
                # metric_result = sum(ap.values())/len(ap.values())
                # print('mAP: {}'.format(metric_result))
            
            else:

                # Process similarities and compute accuracy
                y_pred = F.softmax(similarity, dim=1).detach().cpu().numpy()
                
                class_wise_metrics_dict = compute_class_wise_accuracy(
                    np.argmax(y_labels, axis=1),
                    np.argmax(y_pred, axis=1))
                
                # print('Accuracy: {}'.format(metric_result))
            
            metric_results.append(class_wise_metrics_dict)


        # # Take the max accuracy/mAP between CLS and the other definition types
        CLS_metrics_dict = metric_results[0]
        definitions_metrics_dicts = metric_results[1:]

        # print lenghts
        # print("CLS_metrics_dict: ", len(CLS_metrics_dict)) # 1 x50
        # print("definitions_metrics_dict: ", len(definitions_metrics_dicts)) # 4 x 50


        # Model-class mapping
        mapping = dict() 
        metrics_dict = CLS_metrics_dict.copy()

        # CD-ALL configuration. An ensemble among all definition types
        # The ensemble is composed of the best predictor for each class among the definition types
        for class_index in CLS_metrics_dict.keys():
            def_index = 0
            for i, definition_dict in enumerate(definitions_metrics_dicts):
                if definition_dict[class_index] > metrics_dict[class_index]:
                    metrics_dict[class_index] = definition_dict[class_index]
                    def_index = i+1
                else:
                    def_index = def_index
            mapping[class_index] = def_index

        # Print mapping
        # print("Mapping: ", mapping)
                    
        metric_result_training = sum(metrics_dict.values()) / len(metrics_dict.values())
        print('Training accuracy: {}'.format(metric_result_training))


        test_cval = True

        if test_cval:
            # Test the mapping in the test set of the fold
            metric_results = [] # List to store the dictionary of results of each definition type


            for text_embeddings in text_embeddings_list:
                # Compute similarity
                y_labels = labels_test_folds[fold].detach().cpu().numpy()

                similarity = clap_model.compute_similarity(audio_embeddings_test_folds[fold], text_embeddings)

                if test_dataset == 'FSD50K' or test_dataset == 'AudioSet' or test_dataset == 'DCASE2017':
                    # Process similarities and compute mAP
                    # y_pred = F.sigmoid(similarity).detach().cpu().numpy()
                    y_pred = similarity.detach().cpu().numpy()

                    _, _, class_wise_metrics_dict = compute_metrics(y_labels, y_pred, normalize_scores=False)
                    # metric_result = sum(ap.values())/len(ap.values())
                    # print('mAP: {}'.format(metric_result))
                
                else:

                    # Process similarities and compute accuracy
                    y_pred = F.softmax(similarity, dim=1).detach().cpu().numpy()
                    
                    class_wise_metrics_dict = compute_class_wise_accuracy(
                        np.argmax(y_labels, axis=1),
                        np.argmax(y_pred, axis=1))
                    
                    # print('Accuracy: {}'.format(metric_result))
                
                metric_results.append(class_wise_metrics_dict)


            # # Take the max accuracy/mAP between CLS and the other definition types
            CLS_metrics_dict = metric_results[0]
            definitions_metrics_dicts = metric_results[1:]

            metrics_dict = CLS_metrics_dict.copy()
            for j, definition_dict in enumerate(definitions_metrics_dicts):
                # Definition type index
                metrics_dict = CLS_metrics_dict.copy()
                for class_index in definition_dict.keys():
                    # Use the mapping to select the best predictor for each class
                    if mapping[class_index] == j+1:
                        # print("Class index: definition", class_index, j+1)
                        metrics_dict[class_index] = definition_dict[class_index]
                    

            metric_result_test = sum(metrics_dict.values()) / len(metrics_dict.values())
            print('Test Accuracy: {}'.format(metric_result_test))



            # Model-class mapping Oracle test (upper bound)
            mapping = dict() 
            metrics_dict = CLS_metrics_dict.copy()

            # Oracle configuration:This represents the theoretical maximum performance achievable with perfect hindsight about which description works best per class. 
            # The gap between actual test accuracy (using training-based selection) and oracle accuracy reveals potential room for improvement in the model selection strategy.
            # The oracle ensemble is composed of the best predictor for each class among the definition types
            for class_index in CLS_metrics_dict.keys():
                def_index = 0
                for i, definition_dict in enumerate(definitions_metrics_dicts):
                    if definition_dict[class_index] > metrics_dict[class_index]:
                        metrics_dict[class_index] = definition_dict[class_index]
                        def_index = i+1
                    else:
                        def_index = def_index
                mapping[class_index] = def_index

            # Print the oracle mapping
            # print("Oracle Mapping: ", mapping)
                        
            metric_result_oracle_test = sum(metrics_dict.values()) / len(metrics_dict.values())
            print('Oracle test accuracy: {}'.format(metric_result_oracle_test))
            print("---"*10)


            # Compose a dataframe with results and save it to a CSV file
            # Columns: model_name, test_dataset, fold, definition_type, training_result, test_result
            results = pd.DataFrame({
                'model_name': [model_name],
                'test_dataset': [test_dataset],
                'fold': [fold],
                'training_result': [metric_result_training],
                'test_result': [metric_result_test],
                'oracle_test_result': [metric_result_oracle_test]
            })
            results_df = pd.concat([results_df, results], ignore_index=True)

else:
    raise ValueError('Please specify a valid evaluation mode')





Fold:  0
Training accuracy: 0.973125
Test Accuracy: 0.9075
Oracle test accuracy: 0.985
------------------------------

Fold:  1
Training accuracy: 0.9775
Test Accuracy: 0.885
Oracle test accuracy: 0.975
------------------------------

Fold:  2
Training accuracy: 0.975
Test Accuracy: 0.895
Oracle test accuracy: 0.9825
------------------------------

Fold:  3
Training accuracy: 0.975625
Test Accuracy: 0.885
Oracle test accuracy: 0.98
------------------------------

Fold:  4
Training accuracy: 0.973125
Test Accuracy: 0.9025
Oracle test accuracy: 0.99
------------------------------


# Compute final results

In [7]:
# Compute final results across folds from results dataframe

# Compute mean and std of training and test results across folds
training_mean = results_df['training_result'].mean()
training_std = results_df['training_result'].std()

test_mean = results_df['test_result'].mean()
test_std = results_df['test_result'].std()

test_oracle_mean = results_df['oracle_test_result'].mean()
test_oracle_std = results_df['oracle_test_result'].std()

print("\nDefinition type: ", definition_type)
print("Training result: {:.4f} +/- {:.4f}".format(training_mean, training_std))
print("Test result: {:.4f} +/- {:.4f}".format(test_mean, test_std))
print("Oracle Test result: {:.4f} +/- {:.4f}".format(test_oracle_mean, test_oracle_std))


Definition type:  dictionary
Training result: 0.9749 +/- 0.0018
Test result: 0.8950 +/- 0.0102
Oracle Test result: 0.9825 +/- 0.0056
