In [3]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
from IPython.display import Audio, display
import h5py
import numpy as np
from sklearn.preprocessing import LabelEncoder


# Defining bird class species for classification

The classifier targets 10 specific bird species plus one 'others' class containing birdcall audio from 397 different birds.

In [1]:
# Species codes used as class labels; each one is also the name of that
# species' sub-directory of recordings. 'others' is the catch-all class and is
# deliberately kept last.
class_names = [
    'redcro', 'norcar', 'comrav', 'houspa', 'barswa',
    'houwre', 'sonspa', 'gbwwre1', 'eursta', 'spotow',
] + ['others']

# class name -> list of audio file paths (filled in by the file-discovery cell).
class_label_files = dict()

In [None]:
# Root folder holding one sub-directory of recordings per class name.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the dataset before running.
base_directory = r'C:/Users/B.B GHOSH//Desktop/kaggle_input/kaggle/input/birdclef-2021/train_short_audio/'

# Fill class_label_files (declared next to class_names) with the recordings of
# every class that has a directory on disk; classes without one are skipped.
for class_name in class_names:
    class_directory = os.path.join(base_directory, class_name)
    if not os.path.isdir(class_directory):
        continue
    # os.listdir returns entries in arbitrary order; sorting keeps positional
    # look-ups such as files[75] reproducible across machines and runs.
    class_label_files[class_name] = [
        os.path.join(class_directory, file_name)
        for file_name in sorted(os.listdir(class_directory))
    ]

# Alias kept for cells that still refer to the mapping by its older name.
bird_label_files = class_label_files
class_label_files.keys()

## Birdcall extraction

The raw recordings capture background environmental sounds along with the periods of bird call. We use percentile-based SNR thresholding to isolate the time segments that contain bird calls.

In [2]:
def get_audio_info(file_path, window_duration=0.03):
    """Load an audio file and work out how it splits into analysis windows.

    The file is read with ``librosa.load`` using that function's defaults (no
    sample-rate or channel overrides are passed).

    Args:
        file_path: Path of the audio file to read.
        window_duration: Length of one analysis window, in seconds.

    Returns:
        A 7-tuple ``(y, sr, duration, num_samples, window_duration,
        window_samples, num_windows)`` where ``window_samples`` is
        ``int(sr * window_duration)`` and ``num_windows`` is the number of
        whole windows that fit in the signal. If anything goes wrong while
        loading or computing these values, the error is printed and a tuple of
        seven ``None`` values is returned instead.
    """
    try:
        signal, sample_rate = librosa.load(file_path)
        clip_seconds = librosa.get_duration(y=signal, sr=sample_rate)
        total_samples = len(signal)
        samples_per_window = int(sample_rate * window_duration)
        # Floor division: a trailing partial window is not counted.
        window_count = total_samples // samples_per_window
    except Exception as e:
        # Best-effort loader: report the problem and let the caller skip the file.
        print("Error:", e)
        return (None,) * 7

    return (
        signal,
        sample_rate,
        clip_seconds,
        total_samples,
        window_duration,
        samples_per_window,
        window_count,
    )

def calculate_windowed_snr(y, sr, num_samples, num_windows, window_samples, threshold=0.2):
    """Score each fixed-length window of a signal with an SNR-like value in dB.

    Each window is tapered with a Hann window. The "noise" level is the mean
    magnitude of the tapered samples whose magnitude is below ``threshold``;
    the "signal" level is the variance of the tapered window. The score is
    ``20 * log10(signal / noise)``.

    Degenerate windows are kept finite so later averaging is not poisoned by
    NaN or inf:
      * a silent window (zero variance, zero noise) scores the floor of -200 dB;
      * if no sample lies below ``threshold``, the mean magnitude of the whole
        window is used as the noise estimate;
      * a slice shorter than ``window_samples`` is tapered with the matching
        prefix of the Hann window; an empty slice scores the floor.

    Args:
        y: 1-D audio signal.
        sr: Sample rate. Unused; kept so existing callers keep working.
        num_samples: Number of valid samples in ``y``.
        num_windows: Number of windows to score.
        window_samples: Samples per window.
        threshold: Magnitude below which a tapered sample counts as noise.

    Returns:
        List with one finite float per window (for finite input).
    """
    eps = 1e-10                      # lower bound that keeps ratio and log finite
    floor_db = 20 * np.log10(eps)    # score given to silent or empty windows
    taper = np.hanning(window_samples)

    SNR = []
    for i in range(num_windows):
        start_index = i * window_samples
        end_index = min(start_index + window_samples, num_samples)
        segment = y[start_index:end_index]
        if len(segment) == 0:
            SNR.append(floor_db)
            continue

        window_signal = segment * taper[:len(segment)]
        magnitudes = np.abs(window_signal)
        quiet = magnitudes[magnitudes < threshold]
        # No sub-threshold samples: fall back to the whole window's mean magnitude.
        noise_level = quiet.mean() if quiet.size else magnitudes.mean()
        signal_power = np.var(window_signal)

        ratio = signal_power / max(noise_level, eps)
        SNR.append(20 * np.log10(max(ratio, eps)))
    return SNR

def find_better_snr_indices(SNR, keep_fraction=0.8):
    """Return the indices of windows whose SNR beats a data-driven cutoff.

    The cutoff is the mean of the highest ``keep_fraction`` share of the
    finite SNR values; a window is selected when its SNR is strictly greater
    than that cutoff.

    Args:
        SNR: Sequence of per-window SNR values.
        keep_fraction: Share of the highest values averaged to form the
            cutoff. At least one value is always used, so short inputs do not
            average an empty slice.

    Returns:
        List of integer window indices in ascending order. Empty when ``SNR``
        is empty or has no finite values. NaN entries are never selected.
    """
    snr = np.asarray(SNR, dtype=float)
    finite = snr[np.isfinite(snr)]
    if finite.size == 0:
        return []

    # Descending order, same ranking as sorted(..., reverse=True).
    ranked = -np.sort(-finite)
    top_count = max(1, int(ranked.size * keep_fraction))
    cutoff = ranked[:top_count].mean()

    # NaN compares False against the cutoff; silence any invalid-value warning.
    with np.errstate(invalid='ignore'):
        selected = snr > cutoff
    return np.flatnonzero(selected).tolist()

def find_better_sample_indices(num_samples, window_samples, better_SNR_indices):
    """Expand selected window indices into the sample indices they cover.

    Window ``k`` covers samples ``[k * window_samples, (k + 1) * window_samples)``,
    clipped to ``num_samples`` so a trailing partial window never reaches past
    the end of the signal.

    Args:
        num_samples: Total number of samples in the signal.
        window_samples: Samples per window; must be positive.
        better_SNR_indices: Iterable of selected window indices.

    Returns:
        List of sample indices, in the order the windows were given. Empty
        when no windows are selected or the signal has no samples.

    Raises:
        ValueError: If ``window_samples`` is not positive.
    """
    if window_samples <= 0:
        raise ValueError("window_samples must be a positive integer")

    better_SNR_samples_indices = []
    for window_index in better_SNR_indices:
        start_index = int(window_index) * window_samples
        # Clip to the signal length; a window starting past the end adds nothing.
        end_index = min(start_index + window_samples, num_samples)
        better_SNR_samples_indices.extend(range(start_index, end_index))

    return better_SNR_samples_indices

def extract_audio_with_better_snr(y, better_SNR_samples_indices):
    """Pick the selected samples out of a signal.

    Args:
        y: Signal to sample from; indexed directly with the given indices.
        better_SNR_samples_indices: Sample indices to keep, in output order.

    Returns:
        The result of indexing ``y`` with ``better_SNR_samples_indices``.
    """
    return y[better_SNR_samples_indices]

def get_audio(file_path, window_duration=0.03):
    """Load a recording and return a version reduced to its higher-SNR windows.

    Pipeline: load and frame the file, score each window, select the windows
    that beat the cutoff, expand them to sample indices, and slice those
    samples out of the original signal.

    Args:
        file_path: Path of the audio file to process.
        window_duration: Analysis window length in seconds.

    Returns:
        ``(new_audio, y, sr)``: the reduced signal, the original signal and
        the sample rate. ``(None, None, None)`` if the file could not be
        loaded.
    """
    (y, sr, _duration, num_samples, window_duration,
     window_samples, num_windows) = get_audio_info(file_path, window_duration)
    if y is None:
        return None, None, None

    window_snr = calculate_windowed_snr(y, sr, num_samples, num_windows, window_samples)
    kept_windows = find_better_snr_indices(window_snr)
    kept_samples = find_better_sample_indices(num_samples, window_samples, kept_windows)

    return extract_audio_with_better_snr(y, kept_samples), y, sr

In [None]:
# Demo: compare one recording before and after birdcall extraction.
# class_label_files is the mapping declared alongside class_names.
file_path = class_label_files['barswa'][75]
new_audio, y, sr = get_audio(file_path)
if y is None:
    # get_audio returns (None, None, None) when the file cannot be loaded.
    raise RuntimeError(f"Could not load audio file: {file_path}")

print("Playing original audio:")
display(Audio(data=y, rate=sr))
print("Playing cleaned audio:")
display(Audio(data=new_audio, rate=sr))

# Same plot recipe for both signals; only the data and the title differ.
for audio, title in (
    (y, 'Waveform Plot of original Audio File'),
    (new_audio, 'Waveform Plot of cleaner Audio File'),
):
    plt.figure(figsize=(10, 4))
    librosa.display.waveshow(audio, sr=sr, color="green")
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.title(title)
    plt.show()

Log-mel spectrogram creation for processed audio files

In [None]:
def get_logmelspec(file_path, segment_duration=1.0, n_fft=1024, n_mels=64):
    """Turn a recording into a stack of log-mel spectrograms, one per segment.

    The recording is first reduced to its higher-SNR windows with
    ``get_audio``. The result is cut into consecutive, non-overlapping
    segments of ``segment_duration`` seconds, and each full segment becomes a
    transposed log-mel spectrogram. A trailing remainder shorter than one
    segment is dropped.

    The segment count is derived from the same ``segment_length`` used for
    slicing, so every full segment in the audio is used.

    Args:
        file_path: Path of the audio file to process.
        segment_duration: Segment length in seconds (1.0 gives ``sr`` samples).
        n_fft: FFT size passed to ``librosa.feature.melspectrogram``.
        n_mels: Number of mel bands passed to ``librosa.feature.melspectrogram``.

    Returns:
        Array stacking the transposed log-mel spectrogram of every segment
        along the first axis, or ``None`` if the file could not be loaded or
        is shorter than one segment.
    """
    new_audio, _, sr = get_audio(file_path)
    if new_audio is None or sr is None:
        return None

    segment_length = int(sr * segment_duration)
    if segment_length <= 0:
        return None
    num_segments = len(new_audio) // segment_length

    logmelspec_list = []
    for i in range(num_segments):
        start_sample = i * segment_length
        segment = new_audio[start_sample:start_sample + segment_length]
        mel_spec = librosa.feature.melspectrogram(y=segment, sr=sr, n_fft=n_fft, n_mels=n_mels)
        # Transpose each spectrogram before stacking, as the original code does.
        logmelspec_list.append(librosa.power_to_db(mel_spec).T)
    return np.array(logmelspec_list) if logmelspec_list else None

# Spot-check the spectrogram pipeline on a single recording.
file_path = class_label_files['comrav'][75]
logmelspec = get_logmelspec(file_path)
if logmelspec is None:
    # get_logmelspec returns None when it produces no spectrograms.
    shape_report = "Invalid audio"
else:
    shape_report = logmelspec.shape
print("Shape of log-mel spectrogram array:", shape_report)

Generating logmelspec for all files post extraction

In [None]:
from tqdm import tqdm

# class name -> list of per-file spectrogram stacks returned by get_logmelspec
class_logmelspec_dict = {}
n_files = 500 #Max number of files in one class

for class_name, file_paths in class_label_files.items():
    class_logmelspecs = []

    for file_path in tqdm(file_paths[:n_files],
                         desc=f"Processing {class_name}",
                         mininterval=0.5,  # Update every 0.5 seconds minimum
                         maxinterval=1.0,  # Maximum time between updates
                         smoothing=0.1):   # Smoothing factor for speed estimation
        logmelspec = get_logmelspec(file_path)
        # Files for which get_logmelspec returns None are skipped.
        if logmelspec is not None:
            class_logmelspecs.append(logmelspec)
    class_logmelspec_dict[class_name] = class_logmelspecs

# Results summary: report against the number of files actually attempted,
# which is smaller than n_files for classes with fewer recordings.
for class_name, logmelspec_list in class_logmelspec_dict.items():
    attempted = min(n_files, len(class_label_files[class_name]))
    print(f"Processed {len(logmelspec_list)}/{attempted} files for {class_name}")

In [None]:
# Merge each class's per-file spectrogram stacks into one array along the
# first axis; classes with no spectrograms map to None.
stacked_melspec_dict = {
    class_name: (np.vstack(per_file_stacks) if per_file_stacks else None)
    for class_name, per_file_stacks in class_logmelspec_dict.items()
}

# Report the resulting shape for every class.
for class_name, stacked_array in stacked_melspec_dict.items():
    if stacked_array is None:
        print(f"Class '{class_name}' has no valid spectrograms")
        continue
    print(f"Class '{class_name}' stacked shape: {stacked_array.shape}")

In [None]:
# Pair every class that has spectrograms with its integer label (its position
# in class_names). .get() is used because a class listed in class_names may
# have no entry at all, e.g. when its directory was missing on disk.
labelled_stacks = [
    (label, stacked_melspec_dict.get(cls))
    for label, cls in enumerate(class_names)
]
labelled_stacks = [(label, stack) for label, stack in labelled_stacks if stack is not None]
if not labelled_stacks:
    raise ValueError("No spectrograms were generated for any class; nothing to stack.")

# X: all spectrograms concatenated along the first axis.
# y: one (N, 1) column of integer labels aligned row-for-row with X.
X = np.vstack([stack for _, stack in labelled_stacks])
y = np.concatenate([np.full((stack.shape[0], 1), label) for label, stack in labelled_stacks])

print(f"Combined spectrograms shape: {X.shape}")
print(f"Label array shape: {y.shape}")

Adding extra channel for 3D CNN processing

In [None]:
# Destination HDF5 file for the assembled dataset.
output_path = '/kaggle/working/melspec_data.h5'

# Append a trailing singleton axis to X.
X_cnn = X[..., np.newaxis]

# Write features, labels and the class-name lookup (stored as byte strings).
with h5py.File(output_path, 'w') as hf:
    hf.create_dataset('X_train', data=X_cnn)
    hf.create_dataset('y_train', data=y)
    class_name_bytes = np.array(class_names, dtype='S')
    hf.create_dataset('classes', data=class_name_bytes)