#### Project WAVs into UMAP space and cluster using HDBSCAN
- for each individual, for each WAV, extract syllables as spectrograms of equal length
- project the spectrograms into UMAP space
- cluster the UMAP projections

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm.autonotebook import tqdm
from joblib import Parallel, delayed
import umap
import pandas as pd



In [3]:
from avgn.utils.hparams import HParams
from avgn.dataset import DataSet
from avgn.utils.paths import DATA_DIR, most_recent_subdirectory, ensure_dir

In [4]:
from avgn.signalprocessing.create_spectrogram_dataset import create_syllable_df

In [5]:
from avgn.visualization.projections import scatter_projections
from avgn.visualization.quickplots import draw_projection_plots

### Collect data

In [6]:
# Spectrogram hyperparameters; only the number of mel bins is overridden here.
hparams = HParams(num_mel_bins=32)

In [7]:
# Identifier of the dataset to analyse; it is reused later when building the
# folder the per-individual dataframes are pickled into.
DATASET_ID = "bengalese_finch_sober"

# Dataset handle configured with the hyperparameters chosen above.
# NOTE(review): the recorded run failed with FileNotFoundError for
# data/processed/<DATASET_ID>, so the processed data apparently has to exist
# on disk before this cell can run - confirm against DataSet.
dataset = DataSet(DATASET_ID, hparams=hparams)

FileNotFoundError: [Errno 2] No such file or directory: '/home/AD/tsainbur/github_repos/avgn_paper/data/processed/bengalese_finch_sober'

In [None]:
def flatten_spectrograms(specs):
    """Flatten every item of a stack into a single feature vector.

    Parameters
    ----------
    specs : array-like
        Stack of equally shaped items indexed along the first axis,
        e.g. ``(n_items, height, width)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_items, n_features)`` where ``n_features`` is the
        product of the remaining dimensions. A 1-D input becomes a single
        column of shape ``(n_items, 1)``.
    """
    specs = np.asarray(specs)
    n_items = specs.shape[0]
    # dtype=int matters: the product of an empty shape tuple (1-D input) is
    # otherwise the float 1.0, which reshape rejects. An explicit product is
    # used instead of -1 so that an empty stack (n_items == 0) still reshapes.
    n_features = int(np.prod(specs.shape[1:], dtype=int))
    return np.reshape(specs, (n_items, n_features))

#### cluster and plot

In [None]:
# Cap on how many syllables per individual are embedded (lower it for a quick
# visualisation). The value is applied as a slice end, so None keeps every
# row, whereas -1 would silently drop the last syllable.
nex = None

In [None]:
# Slice end for the per-individual frames: None or a negative `nex` means
# "use every syllable". Slicing with -1 directly would drop the last row.
n_fit = nex if (nex is not None and nex >= 0) else None

syllable_dfs = {}
for indv in tqdm(dataset._unique_indvs, desc="indvs"):
    print(indv)
    # Build the syllable dataframe for this individual. The slice is copied so
    # the "umap" column below is written to an independent frame rather than
    # to a view of the original (avoids pandas' SettingWithCopyWarning).
    syllable_df = create_syllable_df(dataset, indv, log_scaling_factor=8)[:n_fit].copy()

    # one row per syllable, each spectrogram flattened into a feature vector
    specs_flattened = flatten_spectrograms(
        np.array(list(syllable_df.syllables_spec.values))
    )

    # embed the flattened spectrograms; one embedding row is stored per syllable
    fit = umap.UMAP(min_dist=0.25)
    syllable_df["umap"] = list(fit.fit_transform(specs_flattened))
    syllable_dfs[indv] = syllable_df

    # plot data
    draw_projection_plots(syllable_df)
    plt.show()
    # No early `break`: every individual is processed, so each one ends up in
    # `syllable_dfs` for the save step.

### save dataframes

In [12]:
# Show which individuals currently have a syllable dataframe in memory.
syllable_dfs.keys()

dict_keys(['bl26lb16', 'gr41rd51', 'gy6or6', 'or60yw70'])

In [15]:
# Pickle each individual's dataframe to
# DATA_DIR / syllable_dfs / <DATASET_ID> / <indv>.pickle
for indv, syllable_df in tqdm(syllable_dfs.items()):
    save_loc = DATA_DIR / 'syllable_dfs' / DATASET_ID / (indv + '.pickle')
    # project helper; presumably creates the missing parent folders - TODO confirm
    ensure_dir(save_loc)
    syllable_df.to_pickle(save_loc)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))