In [2]:
!pip install duckdb --no-index --find-links=file:///kaggle/input/birdclef24-duckdb-polars/kaggle/working/mysitepackages/duck_pkg
!pip install polars --no-index --find-links=/kaggle/input/birdclef24-duckdb-polars/kaggle/working/mysitepackages/polars_pkg

Looking in links: file:///kaggle/input/birdclef24-duckdb-polars/kaggle/working/mysitepackages/duck_pkg
Looking in links: /kaggle/input/birdclef24-duckdb-polars/kaggle/working/mysitepackages/polars_pkg


In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import duckdb as dd
import polars as pl
import pyarrow
import os
import glob
import shutil
import zipfile
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns
import plotly.express as px
import librosa
from IPython.display import Audio
import pickle
from joblib import dump, load
from pathlib import Path
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [4]:
# Path to the directory containing your audio dataset
dataset_dir = '/kaggle/input/birdclef-2024/train_audio'
# Initialize an empty dictionary to store the mapping between audio files and labels
label_mapping = {}
# Iterate over subdirectories (classes) in the dataset directory
for label in os.listdir(dataset_dir):
    label_dir = os.path.join(dataset_dir, label)
    # Check if the item in the dataset directory is a directory
    if os.path.isdir(label_dir):
        # Iterate over audio files in the subdirectory (class)
        for audio_file in os.listdir(label_dir):
            # Add the mapping between audio file path and label to the dictionary
            audio_file_path = os.path.join(label_dir, audio_file)
            label_mapping[audio_file_path] = label
            
# label_mapping

# Create a list of tuples containing the audio file paths and labels
data = [(audio_file_path, label) for audio_file_path, label in label_mapping.items()]
# Create a Pandas DataFrame from the list of tuples
annotated_data = pd.DataFrame(data, columns=['audio_file_path', 'label'])

label_encoder = LabelEncoder()
annotated_data['encoded_label'] = label_encoder.fit_transform(annotated_data['label'])

annotated_data.head(5)

Unnamed: 0,audio_file_path,label,encoded_label
0,/kaggle/input/birdclef-2024/train_audio/redspu...,redspu1,137
1,/kaggle/input/birdclef-2024/train_audio/redspu...,redspu1,137
2,/kaggle/input/birdclef-2024/train_audio/redspu...,redspu1,137
3,/kaggle/input/birdclef-2024/train_audio/redspu...,redspu1,137
4,/kaggle/input/birdclef-2024/train_audio/redspu...,redspu1,137


In [7]:
annotated_data.shape

(24459, 3)

In [9]:
# Path to the directory containing your audio dataset
dataset_dir = '/kaggle/input/birdclef-2024/unlabeled_soundscapes'
# Initialize an empty dictionary to store the mapping between audio files and labels
label_mapping = {}

for audio_file in os.listdir(dataset_dir):
    # Add the mapping between audio file path and label to the dictionary
    audio_file_path = os.path.join(dataset_dir, audio_file)
    label_mapping[audio_file_path] = 'unlabelled'
            
# label_mapping

# Create a list of tuples containing the audio file paths and labels
data = [(audio_file_path, label) for audio_file_path, label in label_mapping.items()]
# Create a Pandas DataFrame from the list of tuples
unannotated_data = pd.DataFrame(data, columns=['audio_file_path', 'label'])

# label_encoder = LabelEncoder()
unannotated_data['encoded_label'] = 999

unannotated_data.head(5)

Unnamed: 0,audio_file_path,label,encoded_label
0,/kaggle/input/birdclef-2024/unlabeled_soundsca...,unlabelled,999
1,/kaggle/input/birdclef-2024/unlabeled_soundsca...,unlabelled,999
2,/kaggle/input/birdclef-2024/unlabeled_soundsca...,unlabelled,999
3,/kaggle/input/birdclef-2024/unlabeled_soundsca...,unlabelled,999
4,/kaggle/input/birdclef-2024/unlabeled_soundsca...,unlabelled,999


In [10]:
unannotated_data.shape

(8444, 3)

In [13]:
combined_data = dd.sql("select * from annotated_data union select * from unannotated_data").pl()
combined_data.shape

(32903, 3)

In [14]:
def get_file_duration(full_file_path):
    duration = librosa.get_duration(path=full_file_path, sr=32000)
    return round(duration,2)

vect_func = np.vectorize(get_file_duration)

combined_data_pd = combined_data.to_pandas()

combined_data_pd['file_duration'] = vect_func(combined_data_pd['audio_file_path'])
print(combined_data_pd.shape)

(32903, 4)


In [17]:
list1 = dd.sql(" select distinct label from combined_data_pd ad where ad.file_duration >= 10 and ad.file_duration <= 80 ")\
.pl().to_series().to_list()
print(len(list1))

list2 = dd.sql(" select distinct label from combined_data_pd ad ").pl().to_series().to_list()
print(len(list2))

set(list1) ^ set(list2)

183
183


set()

In [18]:
files_per_species_w_rnk = dd.sql(" select *, row_number()over(partition by label, encoded_label order by file_duration desc) as rn \
from combined_data_pd where file_duration >= 5 and file_duration <= 80").pl().sort(by=['encoded_label','rn'])

len(files_per_species_w_rnk.filter(pl.col('rn')<=10).select(pl.col('label')).unique().to_series().to_list())

183

In [19]:
files_per_species_final = files_per_species_w_rnk.filter(pl.col('rn')<=15)
# files_per_species_final

file_count_avg_dur = dd.sql("select label, count(distinct(audio_file_path)) as files, avg(file_duration) as avg_file_duration \
from files_per_species_final group by label").pl()

In [53]:
pl.Config(fmt_str_lengths=100)
files_per_species_final.tail(10)

audio_file_path,label,encoded_label,file_duration,rn
str,str,i64,f64,i64
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/876155123.ogg""","""unlabelled""",999,62.12,6
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/1336735839.ogg""","""unlabelled""",999,59.39,7
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/1097804416.ogg""","""unlabelled""",999,59.39,8
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/607492618.ogg""","""unlabelled""",999,57.68,9
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/133070011.ogg""","""unlabelled""",999,57.68,10
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/873334531.ogg""","""unlabelled""",999,57.0,11
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/1167345793.ogg""","""unlabelled""",999,56.32,12
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/784009014.ogg""","""unlabelled""",999,53.59,13
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/159694475.ogg""","""unlabelled""",999,53.59,14
"""/kaggle/input/birdclef-2024/unlabeled_soundscapes/1379663308.ogg""","""unlabelled""",999,51.88,15


In [25]:
dd.sql("select label, count(distinct(audio_file_path)) as files, avg(file_duration) as avg_file_duration \
from files_per_species_final where label = 'unlabelled' group by label order by 2 desc").pl()

label,files,avg_file_duration
str,i64,f64
"""unlabelled""",15,60.367333


In [20]:
fig_file_count_avg_dur1 = px.histogram(file_count_avg_dur, x="files")
fig_file_count_avg_dur1.show()

In [21]:
fig_file_count_avg_dur2 = px.histogram(file_count_avg_dur, x="avg_file_duration")
fig_file_count_avg_dur2.show()

In [62]:
from tqdm import tqdm

#labels = []
features = []
#feature_dict = {}

# Loop through each audio file in the dataset directory
for i in tqdm(range(files_per_species_final.shape[0]), desc = 'Outer Loop'):
    # labels.append(annotated_data['label'].iloc[i])
    file_path = files_per_species_final.item(i,0)
    # lat = files_per_species_final.item(i,3)
    # lon = files_per_species_final.item(i,4)
    audio, sample_rate = librosa.load(file_path, sr=32000)
    samples_per_segment = sample_rate * 5
    if len(audio) > 7680000:
        total_samples = 7680000
    else:
        total_samples = len(audio)

    for j in range(0, total_samples+160000, samples_per_segment):
        if j + samples_per_segment <= total_samples:
            segment = audio[j:j + samples_per_segment]
            # chunk = cur_file+'_'+str(int((j/32000)+5))
            # mfccs = librosa.feature.mfcc(y=segment, sr=32000, n_mfcc=40)
            # flattened_features = (np.mean(mfccs.T, axis=0))
            # melspec = librosa.feature.melspectrogram(y=segment, sr=32000, n_fft=500, hop_length = 50)
            mfccs = librosa.feature.mfcc(y=segment, sr=32000, n_mfcc=40)
            # melspec = librosa.feature.melspectrogram(y=segment, sr=sr, n_fft=n_fft, n_mels=n_mels, hop_length=hop_length, fmin=fmin, fmax=fmax)
            # mfccs = librosa.feature.mfcc(y=segment, sr=sr, n_fft=n_fft, n_mels=n_mels, hop_length=hop_length, fmin=fmin, fmax=fmax)
            # flattened_melspec_features = (np.mean(melspec.T, axis=0))
            """flattened_mfcc_features = (np.mean(mfccs.T, axis=0))
            flattened_features = np.append(flattened_melspec_features, flattened_mfcc_features)"""
            # features.append(np.append(flattened_features, np.array([lat,lon])))
            flattened_features = (np.mean(mfccs.T, axis=0))
            #features.append(flattened_features)
            #labels.append(files_per_species_final.item(i,2))
            features.append({'file_path': file_path, 'label': files_per_species_final.item(i,2), 'feature_vector': flattened_features})
            

Outer Loop: 100%|██████████| 2580/2580 [16:25<00:00,  2.62it/s]


In [63]:
features_df = pd.DataFrame.from_dict(features)
features_df.head(5)

Unnamed: 0,file_path,label,feature_vector
0,/kaggle/input/birdclef-2024/train_audio/asbfly...,0,"[-306.90805, 98.6112, 2.0697806, 41.759277, 2...."
1,/kaggle/input/birdclef-2024/train_audio/asbfly...,0,"[-306.5587, 105.97594, 11.0418415, 46.07964, 2..."
2,/kaggle/input/birdclef-2024/train_audio/asbfly...,0,"[-298.4591, 107.41491, 6.918604, 46.13292, 6.2..."
3,/kaggle/input/birdclef-2024/train_audio/asbfly...,0,"[-297.68295, 115.09483, 15.601984, 47.37494, 9..."
4,/kaggle/input/birdclef-2024/train_audio/asbfly...,0,"[-304.0982, 107.606606, 12.915395, 43.217976, ..."


In [64]:
features_df.tail(5)

Unnamed: 0,file_path,label,feature_vector
25700,/kaggle/input/birdclef-2024/unlabeled_soundsca...,999,"[-390.21472, 0.8100739, -18.268923, 30.429401,..."
25701,/kaggle/input/birdclef-2024/unlabeled_soundsca...,999,"[-387.2265, -2.5688446, -14.331874, 28.479193,..."
25702,/kaggle/input/birdclef-2024/unlabeled_soundsca...,999,"[-380.15137, -8.402964, -13.71992, 24.43091, 2..."
25703,/kaggle/input/birdclef-2024/unlabeled_soundsca...,999,"[-384.30136, 0.30289546, -25.428608, 30.471283..."
25704,/kaggle/input/birdclef-2024/unlabeled_soundsca...,999,"[-378.0383, -10.175715, -12.689404, 25.161488,..."


In [65]:
feature_matrix = np.vstack(features_df.feature_vector.values)

In [66]:
feature_matrix.shape

(25705, 40)

In [67]:
from sklearn.metrics import silhouette_score

n_clusters = [20, 50, 70, 100, 182, 210, 250, 300, 350, 400]
metrics_tracking = {}
metrics_tracking['n_clusters'] = n_clusters
list_inertia = []
list_sh_score = []

In [68]:
for cluster in n_clusters:
    print("starting kmeans with {0} clusters".format(cluster))
    kmeans = KMeans(n_clusters = cluster, init='k-means++', random_state=42)
    kmeans.fit(feature_matrix)
    
    print("inertia = {0}".format(kmeans.inertia_))
    list_inertia.append(kmeans.inertia_)
    
    sh_score = silhouette_score(feature_matrix, kmeans.labels_)
    print("silhouette score = {0}".format(sh_score))
    list_sh_score.append(sh_score)
    
metrics_tracking['inertia'] = list_inertia
metrics_tracking['silhouette_score'] = list_sh_score

test_df = pd.DataFrame(metrics_tracking)

starting kmeans with 20 clusters






inertia = 141328288.0
silhouette score = 0.13992908596992493
starting kmeans with 50 clusters






inertia = 105109696.0
silhouette score = 0.11707963049411774
starting kmeans with 70 clusters






inertia = 94893448.0
silhouette score = 0.11561127752065659
starting kmeans with 100 clusters






inertia = 84646880.0
silhouette score = 0.1162625178694725
starting kmeans with 182 clusters






inertia = 68741312.0
silhouette score = 0.12570340931415558
starting kmeans with 210 clusters






inertia = 65116560.0
silhouette score = 0.13350431621074677
starting kmeans with 250 clusters






inertia = 60910796.0
silhouette score = 0.13809634745121002
starting kmeans with 300 clusters






inertia = 56520984.0
silhouette score = 0.1463240683078766
starting kmeans with 350 clusters






inertia = 52986656.0
silhouette score = 0.15187428891658783
starting kmeans with 400 clusters






inertia = 49891792.0
silhouette score = 0.16133439540863037


In [69]:
test_df

Unnamed: 0,n_clusters,inertia,silhouette_score
0,20,141328288.0,0.139929
1,50,105109696.0,0.11708
2,70,94893448.0,0.115611
3,100,84646880.0,0.116263
4,182,68741312.0,0.125703
5,210,65116560.0,0.133504
6,250,60910796.0,0.138096
7,300,56520984.0,0.146324
8,350,52986656.0,0.151874
9,400,49891792.0,0.161334


In [70]:
fig = px.line(test_df, x='n_clusters', y='silhouette_score', markers=True)
fig.show()

In [71]:
fig = px.line(test_df, x='n_clusters', y='inertia', markers=True)
fig.show()

In [72]:
kmeans = KMeans(n_clusters = 182, init='k-means++', random_state=42)
kmeans.fit(feature_matrix)





In [73]:
features_df['cluster'] = kmeans.labels_

In [84]:
features_df.head(10)

Unnamed: 0,file_path,label,feature_vector,cluster,centroid,ftr_w_centroid,cosine_similarity
0,/kaggle/input/birdclef-2024/train_audio/asbfly...,0,"[-306.90805, 98.6112, 2.0697806, 41.759277, 2....",1,"[-304.57397, 86.43449, 4.06686, 24.36016, 6.27...","[[-306.90805, 98.6112, 2.0697806, 41.759277, 2...",1.0
1,/kaggle/input/birdclef-2024/train_audio/asbfly...,0,"[-306.5587, 105.97594, 11.0418415, 46.07964, 2...",1,"[-304.57397, 86.43449, 4.06686, 24.36016, 6.27...","[[-306.5587, 105.97594, 11.0418415, 46.07964, ...",1.0
2,/kaggle/input/birdclef-2024/train_audio/asbfly...,0,"[-298.4591, 107.41491, 6.918604, 46.13292, 6.2...",1,"[-304.57397, 86.43449, 4.06686, 24.36016, 6.27...","[[-298.4591, 107.41491, 6.918604, 46.13292, 6....",0.99
3,/kaggle/input/birdclef-2024/train_audio/asbfly...,0,"[-297.68295, 115.09483, 15.601984, 47.37494, 9...",164,"[-273.1562, 104.131, 10.2164, 26.945103, 2.040...","[[-297.68295, 115.09483, 15.601984, 47.37494, ...",1.0
4,/kaggle/input/birdclef-2024/train_audio/asbfly...,0,"[-304.0982, 107.606606, 12.915395, 43.217976, ...",1,"[-304.57397, 86.43449, 4.06686, 24.36016, 6.27...","[[-304.0982, 107.606606, 12.915395, 43.217976,...",0.99
5,/kaggle/input/birdclef-2024/train_audio/asbfly...,0,"[-306.79626, 110.423485, 14.066338, 43.47894, ...",1,"[-304.57397, 86.43449, 4.06686, 24.36016, 6.27...","[[-306.79626, 110.423485, 14.066338, 43.47894,...",0.99
6,/kaggle/input/birdclef-2024/train_audio/asbfly...,0,"[-306.33368, 106.73989, 6.858093, 40.576057, 5...",1,"[-304.57397, 86.43449, 4.06686, 24.36016, 6.27...","[[-306.33368, 106.73989, 6.858093, 40.576057, ...",1.0
7,/kaggle/input/birdclef-2024/train_audio/asbfly...,0,"[-242.05905, 110.46366, 9.991466, 37.122097, 1...",43,"[-238.71002, 109.91652, -6.9458275, 30.02649, ...","[[-242.05905, 110.46366, 9.991466, 37.122097, ...",0.99
8,/kaggle/input/birdclef-2024/train_audio/asbfly...,0,"[-281.13608, 111.969315, 8.903453, 34.231068, ...",164,"[-273.1562, 104.131, 10.2164, 26.945103, 2.040...","[[-281.13608, 111.969315, 8.903453, 34.231068,...",1.0
9,/kaggle/input/birdclef-2024/train_audio/asbfly...,0,"[-311.87302, 101.62594, 3.1791575, 38.48516, 2...",1,"[-304.57397, 86.43449, 4.06686, 24.36016, 6.27...","[[-311.87302, 101.62594, 3.1791575, 38.48516, ...",1.0


In [78]:
dd.sql("select cluster, count(distinct(label)) as data_count from features_df group by cluster").pl()
# features_df.tail(10)

cluster,data_count
i32,i64
1,66
164,60
43,50
110,63
148,28
…,…
152,1
99,1
146,19
60,8


In [80]:
def get_cluster_centroids(cluster_id):
    return kmeans.cluster_centers_[cluster_id]

features_df['centroid'] = features_df['cluster'].apply(get_cluster_centroids)

def get_ftr_w_centroid(feature, cluster_id):
    list_of_ftr_n_cc = [feature, kmeans.cluster_centers_[cluster_id]]
    return list_of_ftr_n_cc

features_df['ftr_w_centroid'] = features_df.apply(lambda x: get_ftr_w_centroid(x['feature_vector'], x['cluster']), axis=1)

In [83]:
def calc_cos_sim(ftr_w_centroid):
    v = ftr_w_centroid[0]
    w = ftr_w_centroid[1]
    
    cos_before_norm = np.dot(v, w) / (np.linalg.norm(v) * np.linalg.norm(w))
    rounded_cos_sim = round(cos_before_norm, 2)
    return rounded_cos_sim

features_df['cosine_similarity'] = features_df['ftr_w_centroid'].apply(calc_cos_sim)

In [85]:
fig_cluster = px.histogram(features_df, x="cosine_similarity")
fig_cluster.show()

In [95]:
dd.sql("select cluster, count(1) as records, count(distinct(file_path)) as files from features_df where label = 999 group by cluster").pl()

cluster,records,files
i32,i64,i64
72,173,15


In [100]:
# dd.sql("select distinct label, cosine_similarity from features_df where cluster = 72").pl().

features_df_pl = pl.from_pandas(features_df)

list_72 = features_df_pl.filter(pl.col('cluster')==72).select(pl.col('label')).unique().to_series().to_list()

#len(files_per_species_w_rnk.filter(pl.col('rn')<=10).select(pl.col('label')).unique().to_series().to_list())

list_72

[127, 24, 181, 156, 70, 999, 9, 14, 62, 28, 110, 134, 179, 44, 72, 2, 61]

In [50]:
extracted_train_feat_five_sec_mix_match = np.array(features)

with open("extracted_train_feat_five_sec_mix_match", "wb") as file:   #Pickling
    pickle.dump(extracted_train_feat_five_sec_mix_match, file)

In [None]:
feature_file_path = '/kaggle/input/for-kmeans/extracted_train_feat_five_sec_mix_match'

with open(feature_file_path, "rb") as file:
    pickled_extracted_features_five_sec_mix_match = pickle.load(file)
    
vecs_for_clustering = np.vstack(pickled_extracted_features_five_sec_mix_match)