# Exploring a new dataset

One of the most important steps in ecological machine learning is defining the problem and understanding the data. In this notebook I look at general strategies for successful machine learning. 

1. What does the data look like?
2. What existing models can help?
3. How can we fine-tune existing models to customize to our data?
4. How can we better clean the data to reduce noise and focus on the key elements?
5. How can we organize and track train and test splits to increase predictive accuracy?

To highlight these concepts, I intentionally chose a dataset I have no experience with.

In [None]:
from opensoundscape.ml.cnn import load_model
from opensoundscape import Audio

In [None]:
# Library imports
import pandas as pd
import matplotlib.pyplot as plt

def analizar_distribucion_especies(train_df):
    """
    Summarize and plot how recordings are distributed across species.

    Prints the number of unique species together with the most and least
    common ones, and builds two bar charts: the 20 most common species and
    the full distribution. The figures are created but not shown here.

    Args:
        train_df (DataFrame): Training metadata with a ``primary_label``
            column.

    Returns:
        None
    """
    print("\n=== Análisis de Distribución de Especies ===")

    # Number of records per species label
    species_counts = train_df['primary_label'].value_counts()

    # With no records there is nothing to plot, and reading the first/last
    # entry further down would fail, so stop early.
    if species_counts.empty:
        print("\nNo hay registros para analizar.")
        return

    # Bar chart of the 20 most common species
    plt.figure(figsize=(12, 6))
    species_counts.head(20).plot(kind='bar')
    plt.title('Top 20 Especies más Comunes')
    plt.xlabel('Especie')
    plt.ylabel('Cantidad de Registros')
    plt.xticks(rotation=45)
    plt.tight_layout()

    print(f"\nTotal de especies únicas: {len(species_counts)}")
    print(f"Especie más común: {species_counts.index[0]} ({species_counts.iloc[0]} registros)")
    print(f"Especie menos común: {species_counts.index[-1]} ({species_counts.iloc[-1]} registros)")

    # Bar chart of every species (the full distribution, not just the top 20)
    plt.figure(figsize=(12, 6))
    species_counts.plot(kind='bar')
    plt.title('Distribución de Todas las Especies')
    plt.xlabel('Especie')
    plt.ylabel('Cantidad de Registros')
    plt.tight_layout()

def cargar_datos(data_dir="../birdclef-2025"):
    """
    Load the main files of the dataset.

    Args:
        data_dir (str or Path): Directory containing ``train.csv``,
            ``taxonomy.csv``, ``sample_submission.csv`` and
            ``recording_location.txt``. Defaults to the directory this
            notebook originally read from.

    Returns:
        tuple: ``(train_df, taxonomy_df, sample_submission,
        recording_location)`` -- three DataFrames followed by the full text
        of the recording-location file.
    """
    from pathlib import Path

    print("Cargando datos...")
    data_dir = Path(data_dir)

    # Main CSV files
    train_df = pd.read_csv(data_dir / "train.csv")
    taxonomy_df = pd.read_csv(data_dir / "taxonomy.csv")
    sample_submission = pd.read_csv(data_dir / "sample_submission.csv")

    # Location metadata, kept as raw text. The encoding is explicit so the
    # result does not depend on the platform's default locale.
    recording_location = (data_dir / "recording_location.txt").read_text(encoding="utf-8")

    print(f"Datos de entrenamiento: {train_df.shape}")
    print(f"Datos taxonómicos: {taxonomy_df.shape}")
    print(f"Archivo de muestra: {sample_submission.shape}")

    return train_df, taxonomy_df, sample_submission, recording_location

# Load the dataset files once; these names are reused by later cells
train_df, taxonomy_df, sample_submission, recording_location = cargar_datos()

# Print the species summary and build the figures, then render them
analizar_distribucion_especies(train_df)
plt.show()

## Load model

https://github.com/kitzeslab/bioacoustics-model-zoo?tab=readme-ov-file

In [None]:
# Bioacoustics model zoo (see the repository linked above)
import bioacoustics_model_zoo as bmz

# Enumerate the models the zoo provides
bmz.utils.list_models()

# Instantiate the BirdSet EfficientNetB1 model used in the rest of the notebook
m = bmz.BirdSetEfficientNetB1()

Let's choose a fairly common bird species with a distinctive call.

https://ebird.org/species/crebob1



In [None]:
# Keep only the rows whose common name is "Crested Bobwhite" and preview them
is_crested_bobwhite = train_df["common_name"] == "Crested Bobwhite"
Crested_Bobwhite = train_df.loc[is_crested_bobwhite]
Crested_Bobwhite.head()

In [None]:
from matplotlib import pyplot as plt


def plot_spectrum(frequencies, fft_spectrum):
    """Plot a spectrum against frequency and show the figure."""
    plt.plot(frequencies, fft_spectrum)
    plt.ylabel('Fast Fourier Transform (V**2/Hz)')
    plt.xlabel('Frequency (Hz)')
    plt.show()


# Load one recording and look at its raw spectrum
file_path = "../birdclef-2025/train_audio/crebob1/XC148253.ogg"
audio = Audio.from_file(file_path)
fft_spectrum, frequencies = audio.spectrum()
plot_spectrum(frequencies, fft_spectrum)

# Clean the clip: noise reduction, then a band-pass built from a high-pass at
# 1000 and a low-pass at 5000, then normalization
clean_audio = (
    audio.reduce_noise()
    .highpass(1000, order=8)
    .lowpass(5000, order=8)
    .normalize()
)

# Spectrum of the cleaned clip, for comparison with the raw one
fft_spectrum, frequencies = clean_audio.spectrum()
plot_spectrum(frequencies, fft_spectrum)
clean_audio.show_widget()

In [None]:
from opensoundscape import Audio, Spectrogram
# Build a spectrogram from the cleaned audio and plot it
spectrogram_object = Spectrogram.from_audio(clean_audio)
spectrogram_object.plot()
plt.show()

## Predict using the pretrained model

In [None]:
# Write the cleaned clip to disk, then score that file with the pretrained model
clean_audio_path = "clean_audio.wav"
clean_audio.save(clean_audio_path)
scores = m.predict(clean_audio_path, activation_layer="sigmoid")

## Analyze the predictions

In [None]:
from opensoundscape.metrics import predict_multi_target_labels

# Turn the scores into per-class 0/1 labels using a 0.5 threshold
predicted_labels = predict_multi_target_labels(scores, threshold=0.5)

# Number of detections per class, keeping only classes detected at least once
detection_counts = predicted_labels.sum(axis=0)
detections = detection_counts[detection_counts > 0]
print(detections)

# Taxonomy rows whose primary_label matches a detected class
detected_mask = taxonomy_df["primary_label"].isin(detections.index)
taxonomy_df.loc[detected_mask]

# Lessons learned

1. Data formats and installation are the hardest part of getting started. Most projects fail here.
2. Taxonomy is a persistent challenge.
3. Generalization across time and space limits open-source models.

# Next steps
1. Aligning taxonomies of existing models and our dataset
2. Intelligent preprocessing to focus on the target species. 
3. Training our own classifier starting from this existing model.

____________________

# Fine-tuning

If there is one overarching lesson from the last 10 years of ecological machine learning research, it's that starting from an existing backbone is the most common strategy for success. Data from a surprisingly wide array of sources can be useful. Let's retrain this backbone with our classes.

In [None]:
# Display the model object so its structure can be inspected
m

By looking at this model summary, we can see that the embeddings have size 1280 and that we need to strip off the top layer. We can keep the bottom layers by freezing the backbone, following the README: https://github.com/kitzeslab/bioacoustics-model-zoo

In [None]:
# Freeze the model's feature extractor (the backbone) before retraining
m.freeze_feature_extractor()

In [None]:
# Create train/test split with 1 random sample per class for test.
#
# groupby(...).sample keeps each sampled row's original index label, which is
# what allows exactly those rows to be dropped from the training frame.
# Resetting the index before the drop would instead remove rows
# 0..n_classes-1 and leave the held-out recordings inside the training set.
# The seed is fixed so the split can be reproduced.
#
# NOTE: this rebinds train_df, so re-running the cell holds out further rows.
# NOTE(review): a class with a single recording ends up only in test_df and is
# therefore missing from `labels` below -- confirm whether that is acceptable.
test_df = train_df.groupby('primary_label').sample(n=1, random_state=42)
train_df = train_df.drop(index=test_df.index)
test_df = test_df.reset_index(drop=True)

print(f"Train samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")

# Replace the model's classes with the labels present in the training split
labels = train_df.primary_label.unique()
m.change_classes(labels)

In [None]:
# Show the model's `embed` attribute (referenced here without calling it)
m.embed

In [None]:
# Label format used here: a DataFrame indexed by audio file path with one 0/1
# column per class (see the OpenSoundscape documentation on training labels).

# Create a mini dataset for testing, one train sample for each species.
# groupby(...).sample is used instead of groupby(...).apply, which newer pandas
# versions deprecate when the callable also receives the grouping column.
train_df = (
    train_df.groupby('primary_label')
    .sample(n=1, random_state=0)
    .reset_index(drop=True)
)

# Assumes `filename` in train.csv is relative to the train_audio directory
# (e.g. "crebob1/XC148253.ogg") -- TODO confirm. The prefix turns each entry
# into a path that resolves from the notebook's working directory.
audio_dir = "../birdclef-2025/train_audio"


def one_hot_labels(df):
    """Return a 0/1 class-indicator frame indexed by audio file path."""
    audio_paths = audio_dir + "/" + df["filename"]
    return pd.get_dummies(df.set_index(audio_paths)["primary_label"], dtype=int)


formatted_train_df = one_hot_labels(train_df)

# Give the test frame exactly the training columns, in the same order, so both
# frames match the classifier's output size. A test class that is absent from
# training becomes an all-zero row instead of an extra column.
formatted_test_df = one_hot_labels(test_df).reindex(
    columns=formatted_train_df.columns, fill_value=0
)

formatted_train_df

In [None]:
from opensoundscape.ml.shallow_classifier import MLPClassifier, quick_fit, fit_classifier_on_embeddings
import torch

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Classifier with no hidden layers: 1280 inputs, one output per label column
n_classes = formatted_train_df.shape[1]
clf = MLPClassifier(input_size=1280, output_size=n_classes, hidden_layer_sizes=())

# Fit `clf` with `m` as the embedding model; a single step keeps this a quick
# end-to-end check of the pipeline
emb_train, label_train, emb_val, label_val = fit_classifier_on_embeddings(
    embedding_model=m,
    classifier_model=clf,
    train_df=formatted_train_df,
    validation_df=formatted_test_df,
    steps=1,
    embedding_batch_size=128,
    embedding_num_workers=2,
    device=device,
)