In [1]:
!pip install duckdb --no-index --find-links=file:///kaggle/input/birdclef24-duckdb-polars/kaggle/working/mysitepackages/duck_pkg

Looking in links: file:///kaggle/input/birdclef24-duckdb-polars/kaggle/working/mysitepackages/duck_pkg
Processing /kaggle/input/birdclef24-duckdb-polars/kaggle/working/mysitepackages/duck_pkg/duckdb-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: duckdb
Successfully installed duckdb-0.8.1


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import duckdb as dd
import polars as pl
import pyarrow
import os
import glob
import shutil
import zipfile
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns
import plotly.express as px
import librosa
from IPython.display import Audio
import pickle
from joblib import dump, load
from pathlib import Path
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the competition's training metadata CSV into a polars DataFrame.
train_metadata = pl.read_csv(source='/kaggle/input/birdclef-2024/train_metadata.csv')

In [None]:
# Show up to 100 characters of string columns in rendered tables.
pl.Config(fmt_str_lengths=100)

# Preview four metadata columns for the rows that have a longitude.
(
    train_metadata
    .select('primary_label', 'filename', 'latitude', 'longitude')
    .filter(pl.col('longitude').is_not_null())
    .head(5)
)

In [3]:
# Root folder of the audio dataset; each sub-directory name is used as the label of its files
dataset_dir = '/kaggle/input/birdclef-2024/train_audio'

# audio file path -> label (name of the containing sub-directory)
label_mapping = {}
for label in os.listdir(dataset_dir):
    label_dir = os.path.join(dataset_dir, label)
    if not os.path.isdir(label_dir):
        # entries directly under the root that are not directories carry no label
        continue
    for audio_file in os.listdir(label_dir):
        audio_file_path = os.path.join(label_dir, audio_file)
        label_mapping[audio_file_path] = label

# label_mapping

# One (audio_file_path, label) record per file, in dictionary insertion order
data = list(label_mapping.items())
annotated_data = pd.DataFrame(data=data, columns=['audio_file_path', 'label'])

# Integer class id for every label
label_encoder = LabelEncoder()
annotated_data['encoded_label'] = label_encoder.fit_transform(annotated_data['label'])

annotated_data.head(5)

Unnamed: 0,audio_file_path,label,encoded_label
0,/kaggle/input/birdclef-2024/train_audio/redspu...,redspu1,137
1,/kaggle/input/birdclef-2024/train_audio/redspu...,redspu1,137
2,/kaggle/input/birdclef-2024/train_audio/redspu...,redspu1,137
3,/kaggle/input/birdclef-2024/train_audio/redspu...,redspu1,137
4,/kaggle/input/birdclef-2024/train_audio/redspu...,redspu1,137


In [None]:
# polars copy of the pandas frame; the SQL below refers to both frames by their variable names
annotated_data_polar = pl.from_pandas(annotated_data)

# Metadata columns needed for the join, restricted to rows that have a longitude
train_metadata_short = (
    train_metadata
    .select('primary_label', 'filename', 'latitude', 'longitude')
    .filter(pl.col('longitude').is_not_null())
)

# Attach latitude/longitude to each audio file by matching the label and the path relative to train_audio/
annotated_data_w_lat_lon = dd.sql("select a.*, b.latitude, b.longitude from annotated_data_polar a \
join train_metadata_short b on a.label = b.primary_label \
and replace(a.audio_file_path,'/kaggle/input/birdclef-2024/train_audio/','') = b.filename").pl()

In [None]:
# First five joined rows
annotated_data_w_lat_lon.head(n=5)

In [None]:
# Number of distinct labels that survived the latitude/longitude join
annotated_data_w_lat_lon.get_column('label').n_unique()

#### Trying to combine MFCC and mel-spectrogram features

In [4]:
def get_file_duration(full_file_path):
    """Return the duration of an audio file in seconds, rounded to two decimals.

    Parameters
    ----------
    full_file_path : str
        Path of the audio file to inspect.

    Returns
    -------
    float
        Duration reported by ``librosa.get_duration`` for the file, rounded to 2 decimals.
    """
    # NOTE(review): librosa documents `sr` as ignored when `path` is given; kept as-is — confirm.
    duration = librosa.get_duration(path=full_file_path, sr=32000)
    return round(duration, 2)


# otypes fixes the output dtype up front. Without it np.vectorize makes one extra call on the
# first element just to infer the dtype (reading that file twice) and raises on empty input.
vect_func = np.vectorize(get_file_duration, otypes=[float])

# annotated_data = annotated_data_w_lat_lon.to_pandas()

annotated_data['file_duration'] = vect_func(annotated_data['audio_file_path'])
print(annotated_data.shape)

(24459, 4)


In [None]:
# Disabled experiment kept as a bare string literal (nothing is executed): join annotated_data
# with train_metadata on the relative file name and keep recordings with rating >= 3.
"""annotated_data_rated = dd.sql("select t1.audio_file_path \
, t1.label, t1.encoded_label, t1.file_duration, t2.rating, t2.type, t2.primary_label, t2.secondary_labels \
from annotated_data t1 join train_metadata t2 \
on replace(t1.audio_file_path,'/kaggle/input/birdclef-2024/train_audio/','') = t2.filename \
and t2.rating >= 3.00").pl()"""

In [None]:
# Min / max / mean file duration per label, ordered by the mean
dur_stats_per_species = (
    dd.sql("select t.label, min(t.file_duration) as min_duration, max(t.file_duration) as max_duration \
, avg(t.file_duration) as avg_duration from annotated_data t group by t.label")
    .pl()
    .sort('avg_duration')
)

In [None]:
# Distribution of the per-label average duration
fig_duration = px.histogram(data_frame=dur_stats_per_species, x="avg_duration")
fig_duration.show()

In [6]:
# Labels that have at least one file lasting between 10 and 80 seconds
list1 = (
    dd.sql(" select distinct label from annotated_data ad where ad.file_duration >= 10 and ad.file_duration <= 80 ")
    .pl()
    .get_column('label')
    .to_list()
)
print(len(list1))

# All labels, without any duration filter
list2 = dd.sql(" select distinct label from annotated_data ad ").pl().get_column('label').to_list()
print(len(list2))

# Labels present in only one of the two lists (an empty set means the filter drops no label)
set(list1).symmetric_difference(list2)

182
182


set()

In [None]:
# Disabled experiment kept as a bare string literal (nothing is executed): duration statistics
# restricted to the labels 'darter2' and 'integr'.
"""dur_stats_darter2_integr = dd.sql("select t.label, min(t.file_duration) as min_duration, max(t.file_duration) as max_duration \
, avg(t.file_duration) as avg_duration from annotated_data t where t.label in ('darter2','integr') group by t.label")\
.pl().sort(by='avg_duration')

dur_stats_darter2_integr"""

In [None]:
# Number of distinct files per label among files lasting 10-70 seconds, ordered by that count
files_per_species = (
    dd.sql("select label, count(distinct(audio_file_path)) as files from annotated_data \
where file_duration >= 10 and file_duration <= 70 group by label")
    .pl()
    .sort('files')
)

fig_file_count = px.histogram(data_frame=files_per_species, x="files")
fig_file_count.show()

In [7]:
# Rank the 5-80 second files of every label from longest (rn = 1) to shortest
files_per_species_w_rnk = (
    dd.sql(" select *, row_number()over(partition by label, encoded_label order by file_duration desc) as rn \
from annotated_data where file_duration >= 5 and file_duration <= 80")
    .pl()
    .sort(['encoded_label', 'rn'])
)

# How many labels are still represented when only the 10 longest files per label are kept
files_per_species_w_rnk.filter(pl.col('rn') <= 10).get_column('label').n_unique()

182

In [8]:
# Keep at most the 15 longest files of every label
rank_within_label = pl.col('rn')
files_per_species_final = files_per_species_w_rnk.filter(rank_within_label <= 15)
# files_per_species_final

# File count and mean duration per label for the retained files
file_count_avg_dur = dd.sql("select label, count(distinct(audio_file_path)) as files, avg(file_duration) as avg_file_duration \
from files_per_species_final group by label").pl()

In [9]:
# Show up to 100 characters of string columns, then preview the first 20 retained files
pl.Config(fmt_str_lengths=100)
files_per_species_final.head(n=20)

audio_file_path,label,encoded_label,file_duration,rn
str,str,i64,f64,i64
"""/kaggle/input/birdclef-2024/train_audio/asbfly/XC827640.ogg""","""asbfly""",0,76.49,1
"""/kaggle/input/birdclef-2024/train_audio/asbfly/XC825177.ogg""","""asbfly""",0,69.51,2
"""/kaggle/input/birdclef-2024/train_audio/asbfly/XC643642.ogg""","""asbfly""",0,69.05,3
"""/kaggle/input/birdclef-2024/train_audio/asbfly/XC756431.ogg""","""asbfly""",0,67.54,4
"""/kaggle/input/birdclef-2024/train_audio/asbfly/XC846616.ogg""","""asbfly""",0,66.91,5
…,…,…,…,…
"""/kaggle/input/birdclef-2024/train_audio/ashdro1/XC265674.ogg""","""ashdro1""",1,76.46,1
"""/kaggle/input/birdclef-2024/train_audio/ashdro1/XC429029.ogg""","""ashdro1""",1,74.02,2
"""/kaggle/input/birdclef-2024/train_audio/ashdro1/XC639829.ogg""","""ashdro1""",1,69.41,3
"""/kaggle/input/birdclef-2024/train_audio/ashdro1/XC289070.ogg""","""ashdro1""",1,68.14,4


In [None]:
# Distribution of the number of retained files per label
fig_file_count_avg_dur1 = px.histogram(data_frame=file_count_avg_dur, x="files")
fig_file_count_avg_dur1.show()

In [None]:
# Distribution of the mean retained-file duration per label
fig_file_count_avg_dur2 = px.histogram(data_frame=file_count_avg_dur, x="avg_file_duration")
fig_file_count_avg_dur2.show()

In [None]:
# Single-cell lookup: row 180, column index 4
files_per_species_final.item(row=180, column=4)

In [None]:
# Scratch cell: extract combined mel-spectrogram + MFCC features for one file (row 1).
labels = []
features = []

probe_row = 1
frame_columns = files_per_species_final.columns

# Columns are read by name. Positional indices 3 and 4 are file_duration and rn in this frame,
# so latitude/longitude are appended only when the frame actually carries those columns.
file_path = files_per_species_final.item(probe_row, 'audio_file_path')
encoded_label = files_per_species_final.item(probe_row, 'encoded_label')
if 'latitude' in frame_columns and 'longitude' in frame_columns:
    lat = files_per_species_final.item(probe_row, 'latitude')
    lon = files_per_species_final.item(probe_row, 'longitude')
    geo_features = np.array([lat, lon])
else:
    lat = lon = None
    geo_features = np.array([])

audio, sample_rate = librosa.load(file_path, sr=32000)
samples_per_segment = sample_rate * 5
# Use at most the first 7,680,000 samples (240 s at 32 kHz)
total_samples = min(len(audio), 7680000)

# Walk over complete, non-overlapping 5-second windows; a trailing partial window is skipped
for j in range(0, total_samples - samples_per_segment + 1, samples_per_segment):
    segment = audio[j:j + samples_per_segment]
    melspec = librosa.feature.melspectrogram(y=segment, sr=32000, n_fft=500, hop_length=50)
    mfccs = librosa.feature.mfcc(y=segment, sr=32000, n_mfcc=40)
    # Average every coefficient over time, then concatenate both feature families
    flattened_melspec_features = np.mean(melspec.T, axis=0)
    flattened_mfcc_features = np.mean(mfccs.T, axis=0)
    flattened_features = np.append(flattened_melspec_features, flattened_mfcc_features)
    features.append(np.append(flattened_features, geo_features))
    labels.append(encoded_label)

In [10]:
# Spectrogram settings
sr = 32000            # sampling rate in Hz
fmin, fmax = 20, 15000
slice_duration = 5    # seconds per slice
n_mels = 128
n_fft = 8 * n_mels
size_x = 512

# Hop length derived from the slice length in samples divided by size_x (truncated to an integer)
hop_length = (sr * slice_duration) // size_x
print(hop_length)

312


In [11]:
from tqdm import tqdm

# Read the two needed columns by name instead of by position, so the loop keeps working if
# columns are added to or reordered in files_per_species_final.
audio_paths = files_per_species_final.get_column('audio_file_path').to_list()
encoded_labels = files_per_species_final.get_column('encoded_label').to_list()

labels = []
features = []

# One pass per audio file; every complete 5-second window contributes one feature vector
for file_path, encoded_label in tqdm(zip(audio_paths, encoded_labels),
                                     total=len(audio_paths), desc='Outer Loop'):
    audio, sample_rate = librosa.load(file_path, sr=32000)
    samples_per_segment = sample_rate * 5
    # Use at most the first 7,680,000 samples (240 s at 32 kHz)
    total_samples = min(len(audio), 7680000)

    # Non-overlapping windows; a trailing partial window is skipped
    for j in range(0, total_samples - samples_per_segment + 1, samples_per_segment):
        segment = audio[j:j + samples_per_segment]
        mfccs = librosa.feature.mfcc(y=segment, sr=32000, n_mfcc=40)
        # Mean of each of the 40 MFCCs over time -> one 40-dimensional vector per window
        flattened_features = np.mean(mfccs.T, axis=0)
        features.append(flattened_features)
        labels.append(encoded_label)

Outer Loop: 100%|██████████| 2565/2565 [16:00<00:00,  2.67it/s]


In [12]:
# Convert the accumulated Python lists into NumPy arrays
labels_five_sec = np.array(labels)
extracted_training_features_five_sec = np.array(features)

In [13]:
# Persist the feature matrix and the label vector as two pickle files in the working directory
for pickle_name, payload in (
    ("extracted_train_feat_five_sec_v5", extracted_training_features_five_sec),
    ("labels_five_sec_v5", labels_five_sec),
):
    with open(pickle_name, "wb") as file:   #Pickling
        pickle.dump(payload, file)

In [3]:
# Previously pickled features / labels, attached as a Kaggle input dataset
feature_file_path = '/kaggle/input/labels-features-v5-04/extracted_train_feat_five_sec_v5'
label_file_path = '/kaggle/input/labels-features-v5-04/labels_five_sec_v5'

# pickle.load executes whatever the file encodes; only point these paths at trusted files
with open(feature_file_path, "rb") as feature_file:
    pickled_extracted_features_five_sec = pickle.load(feature_file)

with open(label_file_path, "rb") as label_file:
    labels_five_sec = pickle.load(label_file)

In [4]:
# Stack the unpickled feature vectors into one 2-D matrix; labels are used as loaded
x_five_sec = np.vstack(pickled_extracted_features_five_sec)
y_five_sec = labels_five_sec

# Alternative when the features were computed in this session instead of loaded from disk:
# x_five_sec = np.vstack(extracted_training_features_five_sec)
# y_five_sec = labels_five_sec

for array in (x_five_sec, y_five_sec):
    print(array.shape)

(25532, 40)
(25532,)


In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

# Log-spaced search ranges: C from 1e-2 to 1e10, gamma from 1e-9 to 1e3 (13 values each)
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
# Hyper-parameters of a pipeline step are addressed as "<step name>__<parameter>"
param_grid = dict(svm_clf__gamma=gamma_range, svm_clf__C=C_range, svm_clf__kernel=['rbf'])
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# The scaler is part of the estimator so that, for every CV split, it is fitted on the training
# portion only. Scaling the whole matrix before the search would let statistics of the held-out
# rows influence training and bias the reported score.
search_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC()),
])

grid = GridSearchCV(search_pipeline, param_grid=param_grid, cv=cv)
grid.fit(x_five_sec, y_five_sec)

print(
    "The best parameters are %s with a score of %0.2f"
    % (grid.best_params_, grid.best_score_)
)

In [5]:
# 13 log-spaced candidates each: C spans 1e-2..1e10, gamma spans 1e-9..1e3
C_range = np.logspace(start=-2, stop=10, num=13)
gamma_range = np.logspace(start=-9, stop=3, num=13)

In [6]:
# List every candidate value of C, one per line
for c_value in C_range:
    print(c_value)

0.01
0.1
1.0
10.0
100.0
1000.0
10000.0
100000.0
1000000.0
10000000.0
100000000.0
1000000000.0
10000000000.0


In [7]:
# List every candidate value of gamma, one per line
for gamma_value in gamma_range:
    print(gamma_value)

1e-09
1e-08
1e-07
1e-06
1e-05
0.0001
0.001
0.01
0.1
1.0
10.0
100.0
1000.0


In [8]:
from sklearn.preprocessing import LabelBinarizer

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Standardise, then RBF-kernel SVC with C = C_range[5] and gamma = gamma_range[4]
rbf_kernel_svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="rbf", C=C_range[5], gamma=gamma_range[4], probability=True, class_weight="balanced"))
])

# Encoded class ids 0..181
sample_labels = list(range(182))
# Fitted once on the full id list so that one-hot column k always means class id k,
# independent of which labels happen to be present in a given training fold.
label_binarizer = LabelBinarizer().fit(sample_labels)
# Per-fold one-hot test labels and predicted probabilities, kept for the ROC analysis
y_onehot_tests, y_scores = dict(), dict()

# NOTE(review): rows are 5-second segments and the split is per row, so segments of one
# recording can land in both train and test — consider a group-aware split; confirm intent.
for fold, (train, test) in enumerate(skf.split(x_five_sec, y_five_sec)):
    print("fold = {0}".format(fold))
    x_train = np.ascontiguousarray(x_five_sec[train])
    y_train = np.ascontiguousarray(y_five_sec[train])
    x_test = np.ascontiguousarray(x_five_sec[test])
    y_test = np.ascontiguousarray(y_five_sec[test])
    print("train feature shape & label shape = {0} & {1}".format(x_train.shape, y_train.shape))
    print("test feature shape & label shape = {0} & {1}".format(x_test.shape, y_test.shape))
    # fit() refits the same pipeline object from scratch on every fold
    rbf_kernel_svm_model = rbf_kernel_svm_clf.fit(x_train, y_train)
    y_predict = rbf_kernel_svm_model.predict_proba(x_test)
    print("predictions shape = {0}".format(y_predict.shape))
    ras_ovr = roc_auc_score(y_test, y_predict, multi_class='ovr', average='macro', labels=sample_labels)
    ras_ovo = roc_auc_score(y_test, y_predict, multi_class='ovo', average='macro', labels=sample_labels)
    print("Macro-averaged One-vs-Rest ROC AUC score: ", round(ras_ovr,2))
    print("Macro-averaged One-vs-One ROC AUC score: ", round(ras_ovo,2))
    y_scores[fold] = y_predict
    y_onehot_test = label_binarizer.transform(y_test)
    y_onehot_tests[fold] = y_onehot_test

fold = 0
train feature shape & label shape = (20425, 40) & (20425,)
test feature shape & label shape = (5107, 40) & (5107,)
predictions shape = (5107, 182)
Macro-averaged One-vs-Rest ROC AUC score:  0.96
Macro-averaged One-vs-One ROC AUC score:  0.96
fold = 1
train feature shape & label shape = (20425, 40) & (20425,)
test feature shape & label shape = (5107, 40) & (5107,)
predictions shape = (5107, 182)
Macro-averaged One-vs-Rest ROC AUC score:  0.95
Macro-averaged One-vs-One ROC AUC score:  0.95
fold = 2
train feature shape & label shape = (20426, 40) & (20426,)
test feature shape & label shape = (5106, 40) & (5106,)
predictions shape = (5106, 182)
Macro-averaged One-vs-Rest ROC AUC score:  0.96
Macro-averaged One-vs-One ROC AUC score:  0.96
fold = 3
train feature shape & label shape = (20426, 40) & (20426,)
test feature shape & label shape = (5106, 40) & (5106,)
predictions shape = (5106, 182)
Macro-averaged One-vs-Rest ROC AUC score:  0.96
Macro-averaged One-vs-One ROC AUC score:  0

In [None]:
import matplotlib.colors as mcolors

# Values of the first 18 entries of matplotlib's CSS4 colour table, in table order
color_keys = list(mcolors.CSS4_COLORS.keys())
colors = [mcolors.CSS4_COLORS[color_key] for color_key in color_keys[:18]]

# The ten colours (positions 8-17) used for the ROC curves
colors[8:18]

In [None]:
from sklearn.metrics import RocCurveDisplay

# bird_list[k] must be the species whose encoded label is k, because it is indexed with class
# ids further down. label_encoder.classes_ is exactly that mapping; the order in which labels
# first appear in annotated_data is not (its first row is 'redspu1', encoded as 137).
bird_list = label_encoder.classes_.tolist()

In [None]:
from sklearn.metrics import auc, roc_curve

# Per fold: split the classes into "poor" (one-vs-rest AUC, rounded to 2 decimals, below 0.70)
# and "good" scorers. bird_list[class_id] is expected to name the class with that encoded id.
scores_dict_all_fold = dict()

for fold in range(n_splits):
    y_onehot_test = y_onehot_tests[fold]
    y_score = y_scores[fold]

    poor_scorers_list = []
    poor_scorers_labels_list = []
    good_scorers_list = []
    good_scorers_labels_list = []

    # One ROC curve per probability column (= class id)
    for class_id in range(y_score.shape[1]):
        fpr, tpr, _ = roc_curve(y_onehot_test[:, class_id], y_score[:, class_id])
        auc_val = round(auc(fpr, tpr),2)
        if auc_val < 0.70:
            poor_scorers_list.append(bird_list[class_id])
            poor_scorers_labels_list.append(class_id)
        else:
            good_scorers_list.append(bird_list[class_id])
            good_scorers_labels_list.append(class_id)

    scores_dict = {
        'fold': fold,
        'poor_scorers_list': poor_scorers_list,
        'poor_scorers_labels_list': poor_scorers_labels_list,
        'good_scorers_list': good_scorers_list,
        'good_scorers_labels_list': good_scorers_labels_list,
    }

    print("Fold {0} - number of poor scoring classes are {1}".format(fold,len(scores_dict['poor_scorers_list'])))
    print("Fold {0} - number of good scoring classes are {1}".format(fold,len(scores_dict['good_scorers_list'])))

    scores_dict_all_fold[fold] = scores_dict

In [None]:
# Species that are poor scorers in every one of folds 0-4
set.intersection(
    *(set(scores_dict_all_fold[fold_id]['poor_scorers_list']) for fold_id in range(5))
)

In [None]:
# Chained symmetric difference over folds 0-4: keeps the species that are poor scorers
# in an odd number of folds
poor_scorers_xor = set()
for fold_id in range(5):
    poor_scorers_xor = poor_scorers_xor ^ set(scores_dict_all_fold[fold_id]['poor_scorers_list'])
poor_scorers_xor

In [None]:
# Show up to 100 characters of string columns, then list the retained files of 'redspu1'
pl.Config(fmt_str_lengths=100)
files_per_species_final.filter(pl.col('label').eq('redspu1'))

In [None]:
# Add a boolean column marking the rows whose label is among the fold-3 poor scorers
fold3_poor_scorers = scores_dict_all_fold[3]['poor_scorers_list']
files_per_species_final = files_per_species_final.with_columns(
    pl.col("label").is_in(fold3_poor_scorers).alias("poor_scorer")
)

In [None]:
# Rows flagged as poor scorers
files_per_species_final_poor_scorers = files_per_species_final.filter(pl.col('poor_scorer'))
print(files_per_species_final_poor_scorers.shape)
files_per_species_final_poor_scorers.head(n=10)

In [None]:
# File count / mean duration of the labels that scored poorly in fold 3
file_count_avg_dur.filter(
    pl.col("label").is_in(scores_dict_all_fold[3]['poor_scorers_list'])
)

In [None]:
# Duration statistics and file count over all files labelled 'ashwoo2'
ashwoo2_stats_query = "select t.label, min(t.file_duration) as min_duration, max(t.file_duration) as max_duration, count(audio_file_path) as files \
, avg(t.file_duration) as avg_duration from annotated_data t where t.label = 'ashwoo2' group by t.label"
dd.sql(ashwoo2_stats_query).pl()

In [None]:
# Reload the competition metadata
train_metadata = pl.read_csv(source='/kaggle/input/birdclef-2024/train_metadata.csv')

# Enrich the poor-scorer files with rating, type and primary/secondary labels from the metadata,
# matching on the path relative to train_audio/
train_metadata_poor_scorers = dd.sql("select replace(t1.audio_file_path,'/kaggle/input/birdclef-2024/train_audio/','') as audio_file_path \
, t1.label, t1.encoded_label, t1.file_duration, t1.rn, t2.rating, t2.type, t2.primary_label, t2.secondary_labels \
from files_per_species_final_poor_scorers t1 \
join train_metadata t2 on replace(t1.audio_file_path,'/kaggle/input/birdclef-2024/train_audio/','') = t2.filename").pl()

In [None]:
# Metadata-enriched poor-scorer rows for 'ashwoo2'
train_metadata_poor_scorers.filter(pl.col('label').eq('ashwoo2'))

In [None]:
annotated_data_pl = pl.from_pandas(annotated_data)

# NOTE: despite "ashdro1" in the variable names, the rows selected here are those labelled 'ashwoo2'
annotated_data_ashdro1 = (
    annotated_data_pl
    .filter(pl.col('label').eq('ashwoo2'))
    .select('audio_file_path', 'file_duration')
)

# Distribution of file durations for that label
ashdro1_fig = px.histogram(data_frame=annotated_data_ashdro1, x="file_duration")
ashdro1_fig.show()

In [None]:
# Per-class one-vs-rest ROC curves for one fold, ten classes per figure
i=0
# print("Macro-averaged One-vs-Rest\nReceiver Operating Characteristic for fold {0}".format(i))
plot_title = "Macro-averaged One-vs-Rest\nReceiver Operating Characteristic for fold {0}".format(i)
y_onehot_test = y_onehot_tests[i]
y_score = y_scores[i]

curve_colors = colors[8:18]
n_classes = y_score.shape[1]

for j in range(0, n_classes, len(curve_colors)):
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_facecolor('xkcd:grey')

    # The last figure holds fewer curves when n_classes is not a multiple of the palette size
    for class_id, color in zip(range(j, min(j + len(curve_colors), n_classes)), curve_colors):
        RocCurveDisplay.from_predictions(
            y_onehot_test[:, class_id],
            y_score[:, class_id],
            name=f"ROC curve for {bird_list[class_id]}",
            color=color,
            ax=ax,
        )

    # Axis labels and title are set once per figure, after all its curves are drawn
    _ = ax.set(
        xlabel="False Positive Rate",
        ylabel="True Positive Rate",
        title=plot_title,
    )

In [6]:
# Final model: same pipeline configuration as in cross-validation, fitted on all rows
final_svc = SVC(kernel="rbf", C=C_range[5], gamma=gamma_range[4], probability=True, class_weight="balanced")
rbf_kernel_svm_clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svm_clf", final_svc),
])

rbf_kernel_svm_model = rbf_kernel_svm_clf.fit(x_five_sec, y_five_sec)

In [9]:
# Serialise the fitted pipeline into the working directory
dump(value=rbf_kernel_svm_model, filename='rbf_kernel_svm_model_mfcc_c1000_1e_05.joblib')

['rbf_kernel_svm_model_mfcc_c1000_1e_05.joblib']

In [None]:
# Rebinds rbf_kernel_svm_model to a previously saved artifact; note the file name differs from
# the one written by the dump cell above
rbf_kernel_svm_model = load(filename='/kaggle/input/rbf-kernel-svc-model/rbf_kernel_svm_model.joblib')