In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#import duckdb as dd
import polars as pl
import os
import glob
import shutil
import zipfile
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns
import plotly.express as px
import librosa
from IPython.display import Audio
import pickle
from joblib import dump, load
from pathlib import Path

In [2]:
# Nominal clip length (presumably seconds); in the visible cells it is only
# mentioned by the disabled snippet kept as a comment inside extract_features.
audio_duration = 5


def extract_features(file_path, offset=0.0, duration=5.0):
    """Return the averaged MFCC vector of one audio excerpt.

    Parameters
    ----------
    file_path : path of the audio file, handed to ``librosa.load``.
    offset : start position of the excerpt, forwarded to ``librosa.load``.
    duration : length of the excerpt, forwarded to ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        Mean over axis 0 of the transposed MFCC matrix, i.e. one value per
        coefficient (40 values, assuming librosa's (n_mfcc, frames) layout).
    """
    signal, rate = librosa.load(path=file_path, offset=offset, duration=duration)

    # Disabled experiment, kept for reference only:
    #   total_duration = librosa.get_duration(y=audio, sr=sample_rate)
    #   num_segments = int(total_duration // audio_duration)

    # 40 Mel-Frequency Cepstral Coefficients for the excerpt.
    coefficients = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40)

    # Collapse the matrix to a 1D vector by averaging (not flattening).
    averaged = np.mean(coefficients.T, axis=0)
    return averaged

In [3]:
def audio_classification(file_path, offset=0.0, duration=5.0):
    """Predict class probabilities for one audio excerpt.

    The feature vector produced by ``extract_features`` is reshaped into a
    single-row matrix and passed to the module-level ``random_forest_model``.

    Parameters
    ----------
    file_path, offset, duration : forwarded unchanged to ``extract_features``.

    Returns
    -------
    Whatever ``random_forest_model.predict_proba`` returns for that one-row input.
    """
    feature_vector = extract_features(file_path, offset=offset, duration=duration)
    # predict_proba is given a 2D input: one row for the single excerpt.
    single_sample = feature_vector.reshape(1, -1)
    return random_forest_model.predict_proba(single_sample)

In [4]:
# Load the serialized classifier that audio_classification reads as a global.
# NOTE(review): joblib.load unpickles arbitrary Python objects - only load
# model files that come from a trusted source.
random_forest_model = load('/kaggle/input/trained-audio-model-v-01/audio_classifier_model.joblib')

In [5]:
# Training metadata, read with polars.
# NOTE(review): this is the only relative path ('../input/...') in the notebook;
# every other read uses an absolute '/kaggle/input/...' path.
meta_data = pl.read_csv('../input/birdclef-2024/train_metadata.csv', low_memory=True)

In [6]:
# Sorted list of the distinct primary_label values; used as the probability
# column names of the submission frame.
# NOTE(review): assumes this order matches the column order returned by
# random_forest_model.predict_proba - confirm against the model's classes_.
bird_cols = list(meta_data['primary_label'].unique().sort())

In [7]:
# Directory holding the soundscapes to score (alternative kept for local experiments).
# test_soundscapes = '/kaggle/input/birdclef-2024/unlabeled_soundscapes'
test_soundscapes = '/kaggle/input/birdclef-2024/test_soundscapes'

# glob returns matches in arbitrary order; sorting makes the order of the
# files (and therefore of the submission rows) reproducible between runs.
filenames_with_path = sorted(glob.glob(f"{test_soundscapes}/*.ogg"))
# filenames = [('soundscape_' + os.path.basename(filename)) for filename in filenames_with_path]
# Base names, index-aligned with filenames_with_path.
filenames = [os.path.basename(filename) for filename in filenames_with_path]

print(len(filenames))

0


In [8]:
"""audio, sample_rate = librosa.load(path=test_soundscapes, offset=0.0, duration=5.0)
audio_full, sample_rate_full = librosa.load(path=test_soundscapes)"""

'audio, sample_rate = librosa.load(path=test_soundscapes, offset=0.0, duration=5.0)\naudio_full, sample_rate_full = librosa.load(path=test_soundscapes)'

In [9]:
# Audio(audio, rate=sample_rate)

In [10]:
"""duration = len(audio) / sample_rate
time = np.arange(0, duration, 1/sample_rate)
plt.figure(figsize=(30, 4))
plt.plot(time, audio, color='blue')
plt.title('Audio Waveform')
plt.xlabel('Time (s)')
plt.ylabel('Amplitude')
plot = plt.show()"""

"duration = len(audio) / sample_rate\ntime = np.arange(0, duration, 1/sample_rate)\nplt.figure(figsize=(30, 4))\nplt.plot(time, audio, color='blue')\nplt.title('Audio Waveform')\nplt.xlabel('Time (s)')\nplt.ylabel('Amplitude')\nplot = plt.show()"

In [11]:
"""n_fft = 500  # Number of FFT points 2048
hop_length = 50  # Hop length for STFT 512
stft = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
# Convert the magnitude spectrogram to decibels (log scale)
spectrogram = librosa.amplitude_to_db(np.abs(stft))
# Plot the spectrogram
plt.figure(figsize=(30, 6))
librosa.display.specshow(spectrogram, sr=sample_rate, hop_length=hop_length, x_axis='time', y_axis='linear')
plt.colorbar(format='%+2.0f dB')
plt.title('Spectrogram')
plt.xlabel('Time (s)')
plt.ylabel('Frequency (Hz)')
plt.tight_layout()
plot = plt.show()"""

"n_fft = 500  # Number of FFT points 2048\nhop_length = 50  # Hop length for STFT 512\nstft = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)\n# Convert the magnitude spectrogram to decibels (log scale)\nspectrogram = librosa.amplitude_to_db(np.abs(stft))\n# Plot the spectrogram\nplt.figure(figsize=(30, 6))\nlibrosa.display.specshow(spectrogram, sr=sample_rate, hop_length=hop_length, x_axis='time', y_axis='linear')\nplt.colorbar(format='%+2.0f dB')\nplt.title('Spectrogram')\nplt.xlabel('Time (s)')\nplt.ylabel('Frequency (Hz)')\nplt.tight_layout()\nplot = plt.show()"

In [12]:
def audio_waveframe(file_path, offset=0.0, duration=5.0):
    """Plot the waveform (amplitude over time) of an audio excerpt.

    Parameters
    ----------
    file_path : path of the audio file, handed to ``librosa.load``.
    offset : start position of the excerpt, forwarded to ``librosa.load``.
    duration : length of the excerpt, forwarded to ``librosa.load``.

    Returns
    -------
    The return value of ``plt.show()`` (None with standard pyplot); the figure
    itself is the useful result.
    """
    # Load the requested excerpt.
    audio_data, sampling_rate = librosa.load(file_path, offset=offset, duration=duration)
    # One timestamp per sample, built from integer sample indices. A float-step
    # np.arange(0, length_in_seconds, 1/sampling_rate) can come out one element
    # longer or shorter than audio_data, which makes plt.plot raise on the
    # mismatched x/y lengths.
    time = np.arange(len(audio_data)) / sampling_rate
    # Plot the waveform.
    plt.figure(figsize=(30, 4))
    plt.plot(time, audio_data, color='blue')
    plt.title('Audio Waveform')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    return plt.show()

def spectrogram(file_path, offset=0.0, duration=5.0):
    """Plot a decibel-scaled STFT magnitude spectrogram of an audio excerpt.

    Parameters
    ----------
    file_path : path of the audio file, handed to ``librosa.load``.
    offset : start position of the excerpt, forwarded to ``librosa.load``.
    duration : length of the excerpt, forwarded to ``librosa.load``.

    Returns
    -------
    The return value of ``plt.show()`` (None with standard pyplot); the figure
    itself is the useful result.
    """
    # Explicit submodule import: librosa releases that do not lazy-load their
    # submodules only expose ``librosa.display`` after it has been imported.
    import librosa.display

    # Short-time Fourier transform (STFT) settings.
    n_fft = 500      # FFT window size in samples (librosa's default is 2048)
    hop_length = 50  # samples between successive frames (512 with librosa's defaults)
    audio_data, sampling_rate = librosa.load(file_path, offset=offset, duration=duration)
    stft = librosa.stft(audio_data, n_fft=n_fft, hop_length=hop_length)
    # Convert the magnitude spectrogram to decibels (log scale). The result is
    # named spectrogram_db so it does not shadow this function's own name.
    spectrogram_db = librosa.amplitude_to_db(np.abs(stft))
    # Plot the spectrogram.
    plt.figure(figsize=(30, 6))
    librosa.display.specshow(spectrogram_db, sr=sampling_rate, hop_length=hop_length, x_axis='time', y_axis='linear')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Spectrogram')
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.tight_layout()
    return plt.show()

def audio_analysis(file_path, offset=0.0, duration=5.0):
    """Show the waveform plot, then the spectrogram plot, of one excerpt.

    ``file_path``, ``offset`` and ``duration`` are passed positionally to
    ``audio_waveframe`` and then to ``spectrogram``.

    Returns
    -------
    tuple
        ``(audio_waveframe result, spectrogram result)``, in that order.
    """
    plotters = (audio_waveframe, spectrogram)
    return tuple(plotter(file_path, offset, duration) for plotter in plotters)

In [13]:
"""test_soundscapes = '/kaggle/input/birdclef-2024/unlabeled_soundscapes/100350238.ogg'
for j in range(0,241,5):
    predicted = audio_classification(test_soundscapes, offset=float(j), duration=5.0)
    filename_noext = test_soundscapes.replace('.ogg','')
    df['row_id'] = f'{filename_noext}_{j}'
    df[bird_cols] = predicted
    submission_df = pd.concat([submission_df,df]).reset_index(drop=True)"""

"test_soundscapes = '/kaggle/input/birdclef-2024/unlabeled_soundscapes/100350238.ogg'\nfor j in range(0,241,5):\n    predicted = audio_classification(test_soundscapes, offset=float(j), duration=5.0)\n    filename_noext = test_soundscapes.replace('.ogg','')\n    df['row_id'] = f'{filename_noext}_{j}'\n    df[bird_cols] = predicted\n    submission_df = pd.concat([submission_df,df]).reset_index(drop=True)"

In [14]:
# Read the competition's sample submission; only its column names are used below.
sample_submit = pd.read_csv("/kaggle/input/birdclef-2024/sample_submission.csv")
# Zero-row frame that carries the same column names as the sample submission.
submit = pd.DataFrame(columns=sample_submit.columns)

# Bare expression: displays the (empty) frame as the cell output.
submit

Unnamed: 0,row_id,asbfly,ashdro1,ashpri1,ashwoo2,asikoe2,asiope1,aspfly1,aspswi1,barfly1,...,whbwoo2,whcbar1,whiter2,whrmun,whtkin2,woosan,wynlau1,yebbab1,yebbul3,zitcis1


In [15]:
# Build the submission table: one row per 5-second window of every soundscape.
# For a quick smoke test, iterate over filenames_with_path[:1] instead.
SEGMENT_SECONDS = 5       # length of each scored window
SOUNDSCAPE_SECONDS = 240  # windows end at 5, 10, ..., 240

submission_columns = ['row_id'] + bird_cols
submission_df = pd.DataFrame(columns=submission_columns)
print(submission_df.shape)

# Rows are collected in a plain list and the frame is built once at the end.
# This avoids growing a DataFrame with pd.concat inside the loop and avoids
# assigning into a shared zero-row template frame, where a scalar assignment
# to 'row_id' has no rows to fill.
submission_rows = []
for file_path, filename in zip(filenames_with_path, filenames):
    filename_noext = filename.replace('.ogg', '')
    for end_second in range(SEGMENT_SECONDS, SOUNDSCAPE_SECONDS + 1, SEGMENT_SECONDS):
        predicted = audio_classification(
            file_path,
            offset=float(end_second - SEGMENT_SECONDS),
            duration=float(SEGMENT_SECONDS),
        )
        # predicted holds a single row (one window); the row id is the file
        # name without extension, suffixed with the window's end time.
        submission_rows.append([filename_noext + '_' + str(end_second), *predicted[0].round(5)])

# With no input files the empty, correctly-labelled frame created above is kept.
if submission_rows:
    submission_df = pd.DataFrame(submission_rows, columns=submission_columns)

(0, 183)


In [16]:
# submission_df.sum(axis=1, numeric_only=True)

In [17]:
# Validate the submission layout against the competition's sample file.
sample_submission = pd.read_csv("/kaggle/input/birdclef-2024/sample_submission.csv")
expected_columns = set(sample_submission.columns)
actual_columns = set(submission_df.columns)
# Report exactly which columns differ instead of failing with a bare assert.
assert expected_columns == actual_columns, (
    "submission columns differ from sample_submission: "
    f"missing={sorted(expected_columns - actual_columns)}, "
    f"unexpected={sorted(actual_columns - expected_columns)}"
)
# The set comparison ignores order, so also align the column order with the sample file.
submission_df = submission_df[sample_submission.columns]

In [18]:
# Write the predictions to submission.csv in the current working directory,
# without the DataFrame index column.
submission_df.to_csv('submission.csv', index=False)