In [1]:
import umap.umap_ as umap
import pandas as pd
import numpy as np
import sklearn.preprocessing
import seaborn as sns
import librosa
import librosa.display
from pathlib import Path
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [2]:
# Starting values for the interactive sliders built in the interact_manual cell;
# each one is forwarded unchanged to the spectrogram pipeline.

# Target sample rate handed to librosa.load (clips are resampled to this rate).
DEFAULT_SR = 21000
# Passed as n_fft to librosa.feature.melspectrogram.
DEFAULT_N_FFT = 2048
# Passed as hop_length; together with the sample rate it fixes the frame count.
DEFAULT_HOP_LENGTH = 512
# Passed as n_mels: number of rows of the mel spectrogram.
DEFAULT_N_MELS = 128

In [3]:
def audio_to_melspectrogram(audio_path, sr, n_fft, hop_length, n_mels, duration=1.0):
    """Turn one audio file into a fixed-length, flattened log-mel feature vector.

    The clip is loaded at ``sr``, limited to ``duration`` seconds and
    zero-padded (silence) if shorter, so that every clip processed with the
    same parameters yields a vector of identical length.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        sr: Target sample rate the clip is resampled to.
        n_fft: Forwarded to ``librosa.feature.melspectrogram`` as ``n_fft``.
        hop_length: Hop between frames, in samples.
        n_mels: Number of mel bands (rows of the spectrogram).
        duration: Length in seconds of the analysis window. Defaults to 1.0,
            the value that was previously hard-coded.

    Returns:
        A 1-D array of ``n_mels * (int(sr * duration) // hop_length + 1)``
        dB values, or ``None`` if anything goes wrong (the error is printed
        so that one bad file does not abort a whole batch).
    """
    try:
        # Target geometry for the current parameters.
        n_samples = int(sr * duration)
        n_frames = n_samples // hop_length + 1

        # Load at most `duration` seconds; short clips get trailing silence.
        y, _ = librosa.load(audio_path, sr=sr, duration=duration)
        if len(y) < n_samples:
            y = np.pad(y, (0, n_samples - len(y)))

        S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft,
                                           hop_length=hop_length,
                                           n_mels=n_mels)
        log_S = librosa.power_to_db(S, ref=np.max)

        # Force exactly n_frames columns: drop extras, then fill what is missing.
        log_S = log_S[:, :n_frames]
        missing = n_frames - log_S.shape[1]
        if missing > 0:
            # With ref=np.max the loudest bin is 0 dB, so zero-padding would
            # insert maximum-energy frames. Pad with the quietest observed
            # value instead; -80 dB mirrors power_to_db's default top_db floor.
            floor_db = log_S.min() if log_S.size else -80.0
            log_S = np.pad(log_S, ((0, 0), (0, missing)),
                           constant_values=floor_db)

        return log_S.flatten()
    except Exception as e:
        # Deliberate best-effort: report and let the caller skip this file.
        print(f"Error processing {audio_path}: {str(e)}")
        return None

In [4]:
def create_umap_visualization(sr, n_fft, hop_length, n_mels):
    """Project a per-keyword sample of audio clips to 2-D with UMAP and plot it.

    Collects every ``*.opus`` file below ``mswc_microset/mswc_microset/en/clips``
    (relative to the working directory), keeps at most 15 clips per keyword
    (the keyword is the name of the clip's parent directory), converts each
    clip to a flattened log-mel vector, standardizes the features, embeds
    them with UMAP and draws a scatter plot colored by keyword.

    Args:
        sr: Sample rate forwarded to ``audio_to_melspectrogram``.
        n_fft: FFT size forwarded to ``audio_to_melspectrogram``.
        hop_length: Hop length forwarded to ``audio_to_melspectrogram``.
        n_mels: Number of mel bands forwarded to ``audio_to_melspectrogram``.

    Returns:
        None. Progress is printed and the figure is shown; if no clip could
        be converted a message is printed and nothing is plotted.
    """
    max_clips_per_keyword = 15

    # rglob yields entries in directory-listing order, which depends on the
    # filesystem. Sorting makes the "first N per keyword" subset, and hence
    # the seeded embedding below, the same on every machine.
    wavs = sorted(Path("mswc_microset/mswc_microset/en/clips").rglob("*.opus"))
    print(f"Found {len(wavs)} files")

    # Group clips by keyword, capping each group.
    keyword_dict = defaultdict(list)
    for wav in wavs:
        label = wav.parts[-2]
        if len(keyword_dict[label]) < max_clips_per_keyword:
            keyword_dict[label].append(wav)

    # Featurize; clips that fail to load return None and are skipped, so the
    # label list is built in lock-step with the feature list.
    spectrograms = []
    valid_labels = []
    for label, paths in keyword_dict.items():
        for path in paths:
            spec = audio_to_melspectrogram(str(path), sr, n_fft, hop_length, n_mels)
            if spec is not None:
                spectrograms.append(spec)
                valid_labels.append(label)

    if not spectrograms:
        print("No valid spectrograms generated!")
        return

    spectrograms = np.stack(spectrograms)
    print(f"Final array shape: {spectrograms.shape}")

    # Standardize and project
    scaled = StandardScaler().fit_transform(spectrograms)
    embedding = umap.UMAP(random_state=42).fit_transform(scaled)

    # Plot
    df = pd.DataFrame({
        'x': embedding[:, 0],
        'y': embedding[:, 1],
        'keyword': valid_labels
    })

    fig, ax = plt.subplots(figsize=(15, 15))
    sns.scatterplot(
        x='x', y='y',
        hue='keyword',
        data=df,
        palette=sns.color_palette('hls', len(set(valid_labels))),
        s=100,
        alpha=0.7,
        ax=ax
    )
    ax.set_title(f"UMAP of {len(valid_labels)} audio samples (SR={sr}, N_FFT={n_fft}, Hop={hop_length}, Mels={n_mels})")
    ax.legend(bbox_to_anchor=(1.05, 1))
    fig.tight_layout()
    plt.show()

In [5]:
# Interactive explorer: one integer slider per argument of
# create_umap_visualization (keyword names must match the function's parameters).
# interact_manual is used rather than interact so the pipeline is presumably
# triggered on demand instead of on every slider move — see ipywidgets docs.
interact_manual(
    create_umap_visualization,
    sr=widgets.IntSlider(
        description="Sample Rate",
        value=DEFAULT_SR,
        min=8000, max=44100, step=1000,
    ),
    n_fft=widgets.IntSlider(
        description="N_FFT",
        value=DEFAULT_N_FFT,
        min=256, max=4096, step=256,
    ),
    hop_length=widgets.IntSlider(
        description="Hop Length",
        value=DEFAULT_HOP_LENGTH,
        min=64, max=1024, step=64,
    ),
    n_mels=widgets.IntSlider(
        description="N Mels",
        value=DEFAULT_N_MELS,
        min=32, max=256, step=32,
    ),
)

interactive(children=(IntSlider(value=21000, description='Sample Rate', max=44100, min=8000, step=1000), IntSl…

<function __main__.create_umap_visualization(sr, n_fft, hop_length, n_mels)>