In [None]:
# Import various dependencies, including the relevant modules from the Perch
# repository. Note that "chirp" is the old name that the Perch team used, so any
# chirp modules imported here were installed as part of the Perch repository in
# one of the previous cells.

import numpy as np
import pandas as pd
import tensorflow as tf
import tqdm
import json
import os

from chirp import audio_utils

## Create ground truth data
The following section creates a dataframe of fixed-length audio windows (e.g. 5 seconds) and their associated labels from wav files and Raven Pro annotation txt files, using a sliding window approach.

First, define the functions needed to create the ground truth data using a sliding window approach.

In [None]:
# TODO - modify to remove deprecation warning from pandas

def determine_label(annotations_in_window, start_window, end_window, wind_dur, annot_coverage_threshold=0.8, win_coverage_threshold=0.2):
    '''
    Decide the label of one audio window from the annotations that overlap it.

    Rules, applied in this order:
    1. No annotation at all -> 'NEG'.
    2. If the summed overlap of all annotations exceeds win_coverage_threshold * wind_dur,
       inspect the annotation with the longest overlap: if one of its fields equals 'POS',
       the name of the first such column is returned immediately; if one of its fields
       equals 'UNK', the provisional label becomes 'UNK'.
    3. For each annotation (in row order), if the fraction of the annotation that lies inside
       the window reaches annot_coverage_threshold, its 'Type' value is returned.
    4. Otherwise the provisional label ('NEG' or 'UNK') is returned.

    Inputs
    annotations_in_window: annotations overlapping the window (pandas.DataFrame with
        'Begin Time (s)' and 'End Time (s)' columns; 'Type' is read in rule 3)
    start_window: start time of the window in seconds (float)
    end_window: end time of the window in seconds (float)
    wind_dur: duration of the window in seconds (float)
    annot_coverage_threshold: minimum fraction of an annotation that must be inside the window (float)
    win_coverage_threshold: minimum fraction of the window that must be covered by annotations (float)

    Outputs:
    label: 'NEG', 'UNK', a column name (rule 2) or a 'Type' value (rule 3)
    '''
    if annotations_in_window.empty:
        return 'NEG'

    label = 'NEG'

    # Work on a copy so the caller's dataframe does not get a 'Duration' column
    annotations_in_window = annotations_in_window.copy()
    # Part of each annotation that falls inside the window (vectorized)
    annotations_in_window['Duration'] = (
        np.minimum(end_window, annotations_in_window['End Time (s)'])
        - np.maximum(start_window, annotations_in_window['Begin Time (s)'])
    )

    total_annotation_duration = annotations_in_window['Duration'].sum()

    # 1e-5 tolerance absorbs floating point error on the window boundaries
    if total_annotation_duration + 1e-5 > win_coverage_threshold * wind_dur:
        # Positional lookup: stays a single row even if index labels are duplicated
        # (e.g. annotation tables that were concatenated without resetting the index)
        longest_annotation = annotations_in_window.iloc[annotations_in_window['Duration'].argmax()]

        # Compare as objects so numeric fields are simply "not equal" to the markers
        fields = longest_annotation.astype(object)
        is_pos = (fields == 'POS').to_numpy()
        if is_pos.any():
            return longest_annotation.index[is_pos][0]
        elif (fields == 'UNK').to_numpy().any():
            label = 'UNK'

    for _, annotation in annotations_in_window.iterrows():
        # Return type of annotation as label if proportion of the annotation (defined by annot_coverage_threshold) is in the window
        # TODO - Work for binary labelling now, should be adapted for multi label (one column per label/hot labels)
        annotation_length = annotation['End Time (s)'] - annotation['Begin Time (s)']
        if annotation_length <= 0:
            # Zero-length (point) annotations have no measurable coverage; skipping them
            # also avoids a division by zero
            continue
        if annotation['Duration'] / annotation_length >= annot_coverage_threshold - 1e-5:
            return annotation['Type']

    return label

def sliding_window_cuting(waveform, df_annot=None, sr=16000, wind_dur=1.0, win_coverage_threshold=0.5, annot_coverage_threshold=0.5, overlap_ratio=0.0, verbose=0):
    '''
    Cut a waveform into windows of wind_dur * sr samples and return the chunks in a dataframe. If a dataframe of
    annotations in Raven Pro format is provided, a label is associated to each chunk depending on the
    presence/absence of annotations and on the coverage parameters (see determine_label).

    Inputs
    waveform: waveform (np.array)
    df_annot: annotations with 'Begin Time (s)' and 'End Time (s)' columns (pandas.DataFrame), or None for no labelling
    sr: sampling rate (int)
    wind_dur: duration of the chunk in seconds (float)
    win_coverage_threshold: minimum proportion of the window that must be covered by annotations (float between 0.0 and 1.0)
    annot_coverage_threshold: minimum proportion of an annotation that must be contained in the chunk (float between 0.0 and 1.0)
    overlap_ratio: fraction of a window shared with the next one; the hop is wind_dur * (1 - overlap_ratio) (float, 0.0 <= value < 1.0)
    verbose: print the number of chunks per label if = 1

    Outputs:
    df_chunks: pandas.DataFrame with keys (Audio, Starttime, Endtime) plus Label when df_annot is given

    Raises:
    ValueError: if the window or the hop between windows is shorter than one sample
    '''
    # Calculate frame length and hop in samples
    frame_length_sample = int(wind_dur * sr)
    # Round instead of truncating: 1 - 0.9 is 0.09999999999999998 in floating point,
    # so truncation would silently shorten the hop by one sample (or make it 0)
    step_size = int(round(frame_length_sample * (1 - overlap_ratio)))
    if frame_length_sample <= 0 or step_size <= 0:
        raise ValueError(
            f"Window ({frame_length_sample} samples) and hop ({step_size} samples) must be at least one sample; "
            "check wind_dur, sr and overlap_ratio."
        )

    # Start index of every full window (an incomplete trailing window is dropped)
    indices = np.arange(0, len(waveform) - frame_length_sample + 1, step_size)
    start_times = indices / sr
    end_times = start_times + wind_dur

    # Extract chunks
    chunks = [waveform[i:i+frame_length_sample] for i in indices]
    df_chunks = pd.DataFrame({'Audio': chunks, 'Starttime': start_times, 'Endtime': end_times})

    if df_annot is not None:
        labels = []
        df_annot_start = df_annot['Begin Time (s)'].values
        df_annot_end = df_annot['End Time (s)'].values

        for start, end in zip(start_times, end_times):
            # Two intervals overlap when each one starts before the other ends. This also
            # catches annotations that begin exactly at the window start or end exactly at
            # the window end, while annotations that merely touch a boundary are excluded.
            mask = (df_annot_start < end) & (df_annot_end > start)
            annotations_in_window = df_annot[mask]

            # Determine the label for the current chunk
            label = determine_label(annotations_in_window, start, end, wind_dur, annot_coverage_threshold, win_coverage_threshold)
            labels.append(label)

        # Assign labels to the DataFrame
        df_chunks['Label'] = labels

        if verbose == 1:
            print('Number of samples in each of the class:')
            print(df_chunks['Label'].value_counts())

    return df_chunks

## Load paths and model selection from config dictionary 

In [None]:
path_to_config = './config_dict.json'
# Read the config through a context manager so the file handle is closed right after parsing
with open(path_to_config, 'r') as config_file:
    paths_dictionary = json.load(config_file)

# Define paths from the config dictionary
sample_data_folder = os.path.join(paths_dictionary['working_repo'], paths_dictionary['sample_data_folder'])
output_directory = os.path.join(paths_dictionary['working_repo'], paths_dictionary['output_directory'])
dataset_folder = paths_dictionary['deployment_folder']
model_folder = os.path.join(paths_dictionary['working_repo'], paths_dictionary['embedding_models_folder'])
model_name = paths_dictionary['model_name']
# Annotation value to keep and the Raven column it is read from
sound_name = paths_dictionary['sound_name']
raven_annotation_column = paths_dictionary['raven_annotation_column']

# Glob pattern matching .wav or .WAV files of the deployment
unlabeled_audio_pattern = os.path.join(sample_data_folder, dataset_folder, 'raw_audio/*.[wW][aA][vV]')

print("Path to dataset: ", sample_data_folder)
print("Output data folder: ", output_directory)
print(f"Working on dataset: {dataset_folder} using {model_name} model.")

# Folder holding the test wav files and their annotation txt files.
# The trailing slash is kept because later cells append file names to this string.
dir_path = os.path.join(sample_data_folder, dataset_folder, 'test_set/')

In [None]:
# List every file of the test set folder
test_files_list = os.listdir(dir_path)

# Split into audio files (.wav) and annotation files (.txt). The extension check is
# case-insensitive so that .WAV files are not silently ignored. Both lists are sorted
# so that the i-th wav file is expected to correspond to the i-th annotation file.
wav_files = sorted(file for file in test_files_list if file.lower().endswith('.wav'))
annot_files = sorted(file for file in test_files_list if file.lower().endswith('.txt'))

# Display and check number of files and annotations
print(f"Number of wav files: {len(wav_files)}")
print(f"Number of annotations: {len(annot_files)}")
print("Checking files order\n", wav_files[:3])
print(annot_files[:3])

In [None]:
from utils_agile_model import choose_embedding_model

# Choose embeddings model (birdnet/surfperch/perch)
embed_fn, config = choose_embedding_model(model_name)

# For readability later in the code
model_config = config.embed_fn_config.model_config
sample_rate = model_config.sample_rate
hop_size_s = model_config.hop_size_s
window_size_s = model_config.window_size_s

print(f"Ready to use Agile Modeling with '{model_name}' model")
print(f"Sampling rate:{sample_rate}Hz, Window size:{window_size_s}sec, Hop size:{hop_size_s}sec")

# Specify a glob pattern matching any number of wave files.
# Use [wW][aA][vV] to match .wav or .WAV files
unlabeled_audio_pattern = os.path.join(sample_data_folder, dataset_folder, 'raw_audio/*.[wW][aA][vV]')

print("Working on dataset: ", dataset_folder)
print("Input data folder: ", sample_data_folder)
print("Output data folder: ", output_directory)

# Trailing slash kept: a later cell appends the pickle file name to this string
output_path = os.path.join(output_directory, dataset_folder, model_name, 'test_set/')

# Create test_set folder if not existing. exist_ok=True avoids failing if the folder
# appears between the check and the creation; the message is only printed when the
# folder was actually missing.
output_path_missing = not os.path.isdir(output_path)
os.makedirs(output_path, exist_ok=True)
if output_path_missing:
    print(f"Created output directory for test set: {output_path}")

## Generate a label for each audio window, based on the sampling rate and window length of the selected model

/!\ Important: set the annotation column name (`raven_annotation_column`) and the annotation value to keep (`sound_name`) in the config file so that they match your Raven annotation files.

In [None]:
# Load wav and annotation files and create a GT dataframe with waveform, start and end time, label and filename
total_detection = 0  # total number of kept annotations over all files
idx_list = []        # positions (in wav_files) of the files holding at least one kept annotation
label_column_name = raven_annotation_column
annotation_name = sound_name

# Files are paired by position in the two sorted lists, so a different number of wav and
# annotation files would attach annotations to the wrong recording.
if len(wav_files) != len(annot_files):
    raise ValueError(
        f"Found {len(wav_files)} wav files but {len(annot_files)} annotation files in {dir_path}; "
        "each wav file needs exactly one annotation file."
    )

# Per-file results are collected in lists and concatenated once after the loop: growing a
# DataFrame with pd.concat inside the loop is quadratic and, when starting from an empty
# DataFrame, raises a pandas FutureWarning.
chunk_frames = []
annot_frames = []

for i, (wav_file, annot_file) in enumerate(tqdm.tqdm(zip(wav_files, annot_files), total=len(wav_files))):
    waveform = audio_utils.load_audio(os.path.join(dir_path, wav_file), sample_rate)
    df_annot = pd.read_csv(os.path.join(dir_path, annot_file), sep='\t')
    # Only keep the annotations of the sound of interest
    df_annot = df_annot[df_annot[label_column_name] == annotation_name]
    df_chunks = sliding_window_cuting(waveform, df_annot, wind_dur=window_size_s, win_coverage_threshold=0.2, annot_coverage_threshold=0.5, sr=sample_rate)
    df_chunks['filename'] = wav_file

    chunk_frames.append(df_chunks)
    annot_frames.append(df_annot)

    if len(df_annot) > 0:
        idx_list.append(i)
        total_detection += len(df_annot)

# Empty frames are left out of the concatenation (they add no rows and trigger the
# pandas FutureWarning about empty entries).
non_empty_chunks = [frame for frame in chunk_frames if not frame.empty]
if non_empty_chunks:
    # ignore_index gives a clean 0..n-1 index over all files
    df_chunks_list = pd.concat(non_empty_chunks, ignore_index=True)
else:
    df_chunks_list = pd.DataFrame(columns=['Audio', 'Starttime', 'Endtime', 'Label', 'filename'])

non_empty_annots = [frame for frame in annot_frames if not frame.empty]
if non_empty_annots:
    # Original per-file row indices are kept for the annotations
    df_annot_list = pd.concat(non_empty_annots)
elif annot_frames:
    # No annotation kept at all: empty frame that still carries the Raven columns
    df_annot_list = annot_frames[0].copy()
else:
    df_annot_list = pd.DataFrame()

# Number of windows per label
df_chunks_list['Label'].value_counts()

## Generate embeddings of the test set and save them with the labels

In [None]:
# Compute one embedding per audio window, one window at a time.
# TODO - Batch the creation of embeddings to improve perf (using embedding_model.batch_embed)
embedding_list = []
for audio_chunk in tqdm.tqdm(df_chunks_list['Audio'], total=len(df_chunks_list)):
    output = embed_fn.embedding_model.embed(audio_chunk)
    embeddings = output.embeddings
    # Collapse all leading axes so that each window yields a 2-D tensor whose last axis
    # is the embedding dimension
    embedding_list.append(tf.reshape(embeddings, [-1, embeddings.shape[-1]]))

# Store the embeddings next to the labels of their window
df_chunks_list['Embedding'] = embedding_list

In [None]:
# Drop the raw audio (it dominates the file size) and pickle labels + embeddings
pickle_filename = 'test_set.pkl'
embed_label_list = df_chunks_list.drop(columns='Audio')
embed_label_list.to_pickle(os.path.join(output_path, pickle_filename))