In [None]:
# Install duckdb and polars without contacting a package index (--no-index);
# pip resolves each package from the local directory passed via --find-links.
# NOTE(review): the first command uses a file:// URL and the second a plain path
# for --find-links — both point under the same attached Kaggle input dataset.
!pip install duckdb --no-index --find-links=file:///kaggle/input/birdclef24-duckdb-polars/kaggle/working/mysitepackages/duck_pkg
!pip install polars --no-index --find-links=/kaggle/input/birdclef24-duckdb-polars/kaggle/working/mysitepackages/polars_pkg

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import duckdb as dd
import polars as pl
import pyarrow
import os
import glob
import shutil
import zipfile
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns
import plotly.express as px
import librosa
from IPython.display import Audio
import pickle
from joblib import dump, load
from pathlib import Path
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [None]:
# Root of the labelled training audio; each sub-folder name is used as the class label
dataset_dir = '/kaggle/input/birdclef-2024/train_audio'

# Map every audio file path to the name of the class folder that contains it.
# Entries of dataset_dir that are not directories are ignored.
label_mapping = {
    os.path.join(dataset_dir, class_name, file_name): class_name
    for class_name in os.listdir(dataset_dir)
    if os.path.isdir(os.path.join(dataset_dir, class_name))
    for file_name in os.listdir(os.path.join(dataset_dir, class_name))
}

# One (audio_file_path, label) tuple per file, in the mapping's insertion order
data = list(label_mapping.items())
annotated_data = pd.DataFrame(data, columns=['audio_file_path', 'label'])

# Integer-encode the class names into a separate column
label_encoder = LabelEncoder()
annotated_data['encoded_label'] = label_encoder.fit_transform(annotated_data['label'])

print(annotated_data.head(5))
print(annotated_data.shape)

In [None]:
# Number of distinct audio files per label, ordered by that count (ascending)
files_per_label_query = "select label, count(distinct(audio_file_path)) as files from annotated_data group by label order by 2"
dd.sql(files_per_label_query).pl()

In [None]:
# Folder holding the soundscape recordings that come without labels
dataset_dir = '/kaggle/input/birdclef-2024/unlabeled_soundscapes'

# Every entry of the folder is mapped to the placeholder label 'unlabelled'
label_mapping = {
    os.path.join(dataset_dir, file_name): 'unlabelled'
    for file_name in os.listdir(dataset_dir)
}

# One (audio_file_path, label) tuple per file, then a DataFrame built from them
data = list(label_mapping.items())
unannotated_data = pd.DataFrame(data, columns=['audio_file_path', 'label'])

# Constant placeholder code for these rows; no LabelEncoder is involved here
unannotated_data['encoded_label'] = 999

print(unannotated_data.head(5))
print(unannotated_data.shape)

In [None]:
# Combine the labelled 'niwpig1' rows with every unlabelled row.
# SQL `union` (as opposed to `union all`) also removes duplicate rows.
niwpig1_plus_unlabelled_query = "select * from annotated_data where label = 'niwpig1' union select * from unannotated_data"
combined_data = dd.sql(niwpig1_plus_unlabelled_query).pl()
combined_data.shape

In [None]:
def get_file_duration(full_file_path):
    """Return the duration reported by librosa for one audio file, rounded to 2 decimals.

    Parameters
    ----------
    full_file_path : path of the audio file, passed to ``librosa.get_duration`` as ``path``.

    Returns
    -------
    The value returned by ``librosa.get_duration`` after ``round(..., 2)``.
    """
    return round(librosa.get_duration(path=full_file_path, sr=32000), 2)

# Work on a pandas copy of the combined (polars) table
combined_data_pd = combined_data.to_pandas()

# Element-wise wrapper so the per-file helper can be called on the whole path column
vect_func = np.vectorize(get_file_duration)
file_durations = vect_func(combined_data_pd['audio_file_path'])

combined_data_pd['file_duration'] = file_durations
print(combined_data_pd.shape)

In [None]:
# Rank the files inside each (label, encoded_label) group from longest to shortest,
# keeping only files whose file_duration is at most 240.
ranked_files_query = " select *, row_number()over(partition by label, encoded_label order by file_duration desc) as rn \
from combined_data_pd where file_duration <= 240"
ranked_files = dd.sql(ranked_files_query).pl()
files_per_species_w_rnk = ranked_files.sort(by=['encoded_label','rn'])

# Keep at most the 35 top-ranked (longest) files of every group
files_per_species_final = files_per_species_w_rnk.filter(pl.col('rn') <= 35)

print(files_per_species_final.shape)

In [None]:
from tqdm import tqdm

# One record per 5-second segment: the source file, the file's value from column 2
# (the encoded label) and the segment's time-averaged MFCC vector.
features = []

# Walk over the selected files row by row
for row in tqdm(files_per_species_final.iter_rows(), total=files_per_species_final.shape[0], desc = 'Outer Loop'):
    file_path = row[0]      # column 0: audio file path
    row_label = row[2]      # column 2: encoded label
    audio, sample_rate = librosa.load(file_path, sr=32000)
    samples_per_segment = sample_rate * 5

    # Never look beyond the first 7,680,000 samples of a recording
    total_samples = min(len(audio), 7680000)

    # Consecutive, non-overlapping segments; only segments that fit completely
    # inside total_samples are used, so a shorter trailing piece is skipped.
    for start in range(0, total_samples - samples_per_segment + 1, samples_per_segment):
        segment = audio[start:start + samples_per_segment]
        mfccs = librosa.feature.mfcc(y=segment, sr=32000, n_mfcc=40)
        # Average over the first axis of the transposed MFCC matrix
        flattened_features = np.mean(mfccs.T, axis=0)
        features.append({'file_path': file_path, 'label': row_label, 'feature_vector': flattened_features})
            

In [None]:
# Turn the list of per-segment records into a table and print both ends of it
features_df = pd.DataFrame(features)
for preview in (features_df.head(5), features_df.tail(5)):
    print(preview)

In [None]:
# Stack the per-segment feature vectors row-wise into a single 2-D array
feature_matrix = np.vstack(features_df['feature_vector'].to_numpy())

In [None]:
# silhouette_score is not covered by the notebook's top-level sklearn.metrics import,
# so it is imported here to keep this cell runnable on a fresh kernel.
from sklearn.metrics import silhouette_score

# Candidate numbers of clusters to compare.
# NOTE(review): the original sweep values are not defined anywhere in the notebook;
# this range was chosen to bracket the k=35 used for the final model — adjust as needed.
# Counts that are not smaller than the number of samples are dropped, because neither
# KMeans nor silhouette_score can be evaluated for them.
n_clusters = [k for k in range(5, 65, 5) if k < feature_matrix.shape[0]]

# Re-created on every run so that re-executing the cell never appends to stale lists
# (which would make the column lengths below disagree).
list_inertia = []
list_sh_score = []
# 'n_clusters' is the x-axis column that the plots of test_df rely on
metrics_tracking = {'n_clusters': n_clusters}

for cluster in n_clusters:
    print("starting kmeans with {0} clusters".format(cluster))
    kmeans = KMeans(n_clusters = cluster, init='k-means++', random_state=42)
    kmeans.fit(feature_matrix)

    # Within-cluster sum of squared distances of the fitted model
    print("inertia = {0}".format(kmeans.inertia_))
    list_inertia.append(kmeans.inertia_)

    # Mean silhouette coefficient of the cluster assignment
    sh_score = silhouette_score(feature_matrix, kmeans.labels_)
    print("silhouette score = {0}".format(sh_score))
    list_sh_score.append(sh_score)

metrics_tracking['inertia'] = list_inertia
metrics_tracking['silhouette_score'] = list_sh_score

# One row per candidate cluster count
test_df = pd.DataFrame(metrics_tracking)

In [None]:
# Silhouette score plotted against the number of clusters
fig = px.line(data_frame=test_df, x='n_clusters', y='silhouette_score', markers=True)
fig.show()

In [None]:
# Inertia plotted against the number of clusters
fig = px.line(data_frame=test_df, x='n_clusters', y='inertia', markers=True)
fig.show()

In [None]:
# Final clustering with a fixed number of 35 clusters and a fixed random seed
final_kmeans_params = {'n_clusters': 35, 'init': 'k-means++', 'random_state': 42}
kmeans = KMeans(**final_kmeans_params).fit(feature_matrix)
# fit() returns the estimator itself, so this displays the same fitted model
kmeans

In [None]:
# Cluster index assigned by the fitted model to every segment
features_df['cluster'] = kmeans.labels_


def get_cluster_centroids(cluster_id):
    """Return the row of ``kmeans.cluster_centers_`` selected by ``cluster_id``."""
    centers = kmeans.cluster_centers_
    return centers[cluster_id]


# Attach to every segment the centre of the cluster it was assigned to
features_df['centroid'] = features_df['cluster'].apply(get_cluster_centroids)

def get_ftr_w_centroid(feature, cluster_id):
    """Pair a feature vector with the centre of the given cluster.

    Returns a two-element list: ``[feature, kmeans.cluster_centers_[cluster_id]]``.
    """
    centroid = kmeans.cluster_centers_[cluster_id]
    return [feature, centroid]


# Row-wise: combine each segment's feature vector with its cluster centre
features_df['ftr_w_centroid'] = features_df.apply(
    lambda row: get_ftr_w_centroid(row['feature_vector'], row['cluster']),
    axis=1,
)

def calc_cos_sim(ftr_w_centroid):
    """Cosine similarity between the first two entries of ``ftr_w_centroid``.

    Parameters
    ----------
    ftr_w_centroid : sequence whose items 0 and 1 are the two vectors to compare.

    Returns
    -------
    The dot product of the two vectors divided by the product of their norms,
    rounded to 2 decimals.
    """
    first, second = ftr_w_centroid[0], ftr_w_centroid[1]
    norm_product = np.linalg.norm(first) * np.linalg.norm(second)
    return round(np.dot(first, second) / norm_product, 2)

# Cosine similarity between every segment's feature vector and its cluster centre
features_df['cosine_similarity'] = features_df['ftr_w_centroid'].map(calc_cos_sim)

In [None]:
# Per cluster: count distinct labels and distinct files, rank the clusters by the
# number of distinct files (descending) and keep the clusters ranked 1 to 5.
top_clusters_query = " select distinct cluster from ( select *, row_number()over(order by files desc) as rn from ( select cluster, count(distinct(label)) as data_count, count(distinct(file_path)) as files \
from features_df group by cluster )t1 )t2 where rn <= 5"
list_of_clusters = dd.sql(top_clusters_query).pl().to_series().to_list()

print(list_of_clusters)

In [None]:
# Segments that fall into one of the selected clusters get label 119; all others keep theirs.
# NOTE(review): 119 is presumably the encoded label of 'niwpig1' — confirm against label_encoder.
in_selected_cluster = features_df['cluster'].isin(list_of_clusters)
features_df['label'] = np.where(in_selected_cluster, 119, features_df['label'])

In [None]:
# Distinct files that own at least one segment carrying label 119
relabelled_files_query = "select distinct file_path from features_df where label = 119 "
final_list_of_files = dd.sql(relabelled_files_query).pl().to_series().to_list()

print(final_list_of_files)

In [None]:
# Rows of the combined table whose file appears in the final file list
is_selected_file = combined_data_pd['audio_file_path'].isin(final_list_of_files)

# Selected files become 'niwpig1' / 119; every other row keeps its current values
combined_data_pd['label'] = np.where(is_selected_file, 'niwpig1', combined_data_pd['label'])
combined_data_pd['encoded_label'] = np.where(is_selected_file, 119, combined_data_pd['encoded_label'])

# Show the rows that were just relabelled
combined_data_pd[is_selected_file]

In [None]:
# Sanity check: list any file that appears with more than one distinct encoded label
duplicate_label_query = " select audio_file_path, count(distinct(encoded_label)) as encoded_labels \
from combined_data_pd group by audio_file_path having encoded_labels > 1 "
dd.sql(duplicate_label_query).pl()

In [None]:
# Keep only the rows of files in the final list, with a fresh 0..n-1 index
is_final_file = combined_data_pd['audio_file_path'].isin(final_list_of_files)
combined_data_pd_niwpig1 = combined_data_pd.loc[is_final_file].reset_index(drop=True)

In [None]:
# Write the selected rows to a CSV file in the working directory, without the index column
combined_data_pd_niwpig1.to_csv(path_or_buf='combined_data_pd_niwpig1.csv', index=False)