In [1]:
import pandas as pd
import numpy as np
import os
import glob
from datetime import datetime
import soundfile as sf
import librosa
from IPython.utils import io

# Codes for functions and augmentation are available at:
# https://github.com/plauha/BSG_classifier_builder/tree/main/Train%20%20BSG%20models
from functions import butter_bandpass, butter_bandpass_filter
from augmentation import pad

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Input/output locations. The '---/' values are placeholders to be replaced before running.
path_to_birdsound_library = '---/' # path where created clips are saved
bsg_metadata_out = '---/' # path where clips labels are saved
root_audio_directory = '---/' # Base directory for audio files

# Load the species list (columns used later: 'species_code', 'scientific_name').
# bsg_species.csv is available at:
# https://github.com/plauha/BSG_classifier_builder/tree/main/Train%20%20BSG%20models
species_list = pd.read_csv('BSG_results/bsg_species.csv')

# Directory containing the CSV files
csv_directory = 'PMs'  # Replace with the actual directory path containing the CSV files
# glob returns matches in a file-system dependent order; sort them so the files are
# processed (and the label rows written) in the same order on every run and machine.
csv_files = sorted(glob.glob(os.path.join(csv_directory, '*.csv')))

In [4]:
# Build the clip library.
#
# For every CSV file in `csv_files`, keep all rows validated as "present" plus a random
# sample of "not present" rows (one third as many as the "present" rows), locate the source
# recording of each row, cut a clip around the annotated box and save it twice:
#   * with the original background, and
#   * a "cleaned" version produced by `pad` + `butter_bandpass_filter`.
# One label row per saved clip is collected and written to BSG_labels.csv at the end.

SOURCE_SR = 48000      # sample rate the recordings are loaded at
CLIP_SR = 24000        # sample rate the clips are written at
full_rec_length = 300  # original recording length = 300 seconds
LABEL_COLUMNS = ["file_name", "species", "occurrence"]


def _find_recording(search_root, file_name):
    """Return the path of the first file named `file_name` below `search_root`, or None."""
    for folder, _subdirs, files_in_folder in os.walk(search_root):
        if file_name in files_in_folder:
            # Stop walking as soon as the recording is found.
            return os.path.join(folder, file_name)
    return None


# Label rows are gathered in a plain list and turned into a DataFrame once at the end;
# concatenating DataFrames inside the loop is quadratic in the number of clips.
label_records = []

# Process each CSV file in the directory
for csv_file in csv_files:
    df = pd.read_csv(csv_file)

    print("Reading:", csv_file)

    # Filter the DataFrame to include only rows with "present" or "not present" in the "validated" column
    validated = df['validated'].str.strip().str.lower()
    df_present = df[validated == 'present']
    df_not_present = df[validated == 'not present']

    # Check if there are 0 "present" rows
    if df_present.empty:
        print(f"Error: No 'present' rows in file {csv_file}. Skipping this file.")
        continue

    # Check if there are 0 "not present" rows
    if df_not_present.empty:
        print(f"Warning: No 'not present' rows in file {csv_file}. Continuing with 'present' rows only.")
        df_combined = df_present
    else:
        # Aim for one third of the number of "present" rows, but never ask for more rows
        # than exist: DataFrame.sample raises ValueError when n exceeds the population.
        num_present = len(df_present)
        num_not_present_to_select = min(num_present // 3, len(df_not_present))

        # Randomly select the calculated number of "not present" rows (fixed seed)
        df_not_present_selected = df_not_present.sample(n=num_not_present_to_select, random_state=1)

        # Combine the "present" rows with the selected "not present" rows
        df_combined = pd.concat([df_present, df_not_present_selected])

    # Process each row in the combined DataFrame
    for index, row in df_combined.iterrows():
        x1 = row['x1']
        x2 = row['x2']
        y1 = row['y1']
        y2 = row['y2']
        recording_name = row['recording']
        minute = int(row['minute'])
        site_name = row['site']
        hour = row['hour'] * 60
        species = row['species'].replace(' ', '-')  # Replace spaces with dashes in species name

        # If the site name ends in a single digit, add a leading zero to the site number.
        # The slice [-2:-1] is empty for a one-character name, so this cannot raise IndexError.
        if site_name[-1].isdigit() and not site_name[-2:-1].isdigit():
            site_name = site_name[:-1] + '0' + site_name[-1]
        root_directory = os.path.join(root_audio_directory, site_name)

        # Two-character fields taken from the end of the recording name
        # (presumably the minutes and hours of the recording start time - confirm).
        starttime = int(recording_name[-8:-6])
        starthour = int(recording_name[-10:-8]) * 60
        # NOTE(review): calculated_x1 / calculated_x2 are computed but never used below;
        # the clip offset is derived from x1 / x2 only. Confirm that this is intended.
        calculated_x1 = ((hour + minute) - (starttime + starthour)) * 60 + x1
        calculated_x2 = ((hour + minute) - (starttime + starthour)) * 60 + x2
        match_id = row['id']
        datetime_str = recording_name[-19:-4]

        # Choose the clip window: boxes longer than 3 s are used as they are, shorter ones
        # are replaced by a 4 s window centred on the box.
        if x2 - x1 > 3:
            start = x1
            stop = x2
        else:
            mean_x = np.mean([x1, x2])
            start = mean_x - 2
            stop = mean_x + 2
        # Clamp the window to the recording while keeping it at least 3 s long.
        if start < 0:
            start = 0
            stop = np.max([stop, 3])
        if stop > full_rec_length:
            stop = full_rec_length
            start = np.min([start, full_rec_length - 3])

        # Search within all folders of the site directory for the audio file
        audio_filename = _find_recording(root_directory, recording_name)
        if not audio_filename:
            print(f"Error: Audio file '{recording_name}' not found in directory '{root_directory}'.")
            continue

        # "not present" rows get an extra _NB_ tag in the file name and the 'nobird' label
        bird_present = row['validated'].strip().lower() != 'not present'
        if bird_present:
            output_filename = f"x-9994_{datetime_str}_{species}_{match_id}.WAV"
        else:
            output_filename = f"x-9994_{datetime_str}_{species}_NB_{match_id}.WAV"

        # Anything librosa prints while loading is captured instead of shown in the notebook
        with io.capture_output() as captured:
            sig, sr = librosa.load(audio_filename, sr=SOURCE_SR, offset=start, duration=stop - start)

        # save clip with original background
        sig1 = librosa.resample(sig, orig_sr=SOURCE_SR, target_sr=CLIP_SR)
        sf.write(os.path.join(path_to_birdsound_library, output_filename), sig1, CLIP_SR)
        if bird_present:
            # Look up the species code by scientific name (dashes turned back into spaces)
            sp_code = species_list['species_code'].loc[species_list['scientific_name'] == species.replace('-', ' ')].iloc[0]
        else:
            sp_code = 'nobird'
        label_records.append({"file_name": output_filename, "species": sp_code, "occurrence": 1})

        # save clip with cleaned background
        sig2 = pad(sig, x1 - start, x2 - start, snr=10, target_len=len(sig), sr=SOURCE_SR)  # time- and frequency cropped version
        sig2 = butter_bandpass_filter(sig2, [y1, y2], SOURCE_SR, 12)
        sig2 = librosa.resample(sig2, orig_sr=SOURCE_SR, target_sr=CLIP_SR)
        output_filename2 = output_filename.replace('.WAV', '_cleaned.WAV')
        sf.write(os.path.join(path_to_birdsound_library, output_filename2), sig2, CLIP_SR)
        label_records.append({"file_name": output_filename2, "species": sp_code, "occurrence": 1})

# Explicit columns keep the CSV header intact even when no clip was produced.
bsg_labels = pd.DataFrame(label_records, columns=LABEL_COLUMNS)
bsg_labels.to_csv(os.path.join(bsg_metadata_out, 'BSG_labels.csv'), index=False)

Reading: PMs/pm-amazona_albifrons-simple_call-2640 (1).csv
