#### Project wavs into UMAP and cluster using HDBSCAN
- for each individual, for each WAV, grab syllables as spectrograms of equal length
- project the spectrograms into a low-dimensional space using UMAP
- cluster UMAP projections

In [1]:
# autoreload mode 2: re-import modules before each cell runs, so edits to
# project source files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm.autonotebook import tqdm
from joblib import Parallel, delayed
import umap
import pandas as pd



In [3]:
from avgn.utils.hparams import HParams
from avgn.dataset import DataSet
from avgn.utils.paths import DATA_DIR, most_recent_subdirectory, ensure_dir

In [4]:
from avgn.signalprocessing.create_spectrogram_dataset import create_syllable_df

In [5]:
from avgn.visualization.projections import scatter_projections
from avgn.visualization.quickplots import draw_projection_plots

### Collect data

In [6]:
# hyperparameters for this run: only the number of mel bins is set explicitly here
hparams = HParams(num_mel_bins=32)

In [7]:
# name of the dataset to load; reused below when building the save path
DATASET_ID = "koumura_bengalese_finch"
# build the dataset object using the hyperparameters defined above
dataset = DataSet(DATASET_ID, hparams=hparams)

HBox(children=(IntProgress(value=0, description='loading json', max=2964, style=ProgressStyle(description_widt…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 584 tasks      | elapsed:    4.9s





[Parallel(n_jobs=-1)]: Done 2964 out of 2964 | elapsed:    5.2s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=2964, style=ProgressStyle(de…



In [8]:
# transcription types: the keys stored under the first individual of the sample JSON
indvs_json = dataset.sample_json['indvs']
first_indv_key = list(indvs_json.keys())[0]
indvs_json[first_indv_key].keys()

odict_keys(['notes'])

#### cluster and plot

In [9]:
# For quick viz: how many data points to fit. This is used as a slice bound
# (`[:nex]`), so None keeps every row, whereas -1 would drop the last one.
nex = None

In [10]:
# Build one syllable dataframe per individual, embed its spectrograms with UMAP,
# store the embedding in an "umap" column and plot the projection.
syllable_dfs = {}
for indv in tqdm(dataset._unique_indvs, desc="indvs"):
    print(indv)
    # create dataframe
    indv_df = create_syllable_df(dataset, indv, unit="notes", log_scaling_factor=8)
    # `nex` limits the number of rows for quick visualisation. None or a negative
    # value means "use everything": a bare `[:-1]` would silently drop the last row.
    if nex is not None and nex >= 0:
        indv_df = indv_df[:nex]
    syllable_dfs[indv] = indv_df

    # Stack the spectrograms and flatten each one into a single feature vector.
    # Done with numpy directly because no `flatten_spectrograms` helper is imported
    # in this notebook. Assumes all spectrograms share one shape (the notebook
    # header states they are of equal length).
    specs = np.array(list(indv_df.syllables_spec.values))
    specs_flattened = specs.reshape((specs.shape[0], -1))

    # NOTE: UMAP is stochastic; pass `random_state=` here if the saved embeddings
    # need to be reproducible across runs.
    fit = umap.UMAP(min_dist=0.25)
    syllable_dfs[indv]["umap"] = list(fit.fit_transform(specs_flattened))

    # plot data
    draw_projection_plots(syllable_dfs[indv])
    plt.show()

HBox(children=(IntProgress(value=0, description='indvs', max=2656, style=ProgressStyle(description_width='init…

odict_keys(['Bird1'])


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, description='getting syllable wavs', max=314, style=ProgressStyle(descript…




PicklingError: Could not pickle the task to send it to the workers.

In [67]:
# preview the first three rows of one individual's dataframe
syllable_dfs['Bird1'].head(3)

Unnamed: 0,syllables_sequence_id,syllables_sequence_pos,syllables_wav,syllables_rate,syllables_labels,syllables_spec,umap
0,0,0,"[0.00010429580874564355, -3.685241476730734e-0...",32000,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[10.845855, -2.2115288]"
1,0,1,"[-0.00025558206636041615, -0.00014670878654945...",32000,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[8.8575945, -2.3941612]"
2,0,2,"[0.00013577303249693684, 0.0003174299934470633...",32000,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[8.885393, -4.024104]"


### save dataframes

In [68]:
# list the individuals whose dataframes will be written to disk below
syllable_dfs.keys()

dict_keys(['Bird0', 'Bird1', 'Bird10', 'Bird2', 'Bird3', 'Bird4', 'Bird5', 'Bird6', 'Bird7', 'Bird8', 'Bird9'])

In [100]:
# write one pickle per individual under DATA_DIR / 'syllable_dfs' / DATASET_ID
output_dir = DATA_DIR / 'syllable_dfs' / DATASET_ID
for indv, indv_df in tqdm(syllable_dfs.items()):
    save_loc = output_dir / (indv + '.pickle')
    # ensure_dir is called on the target path before writing
    ensure_dir(save_loc)
    indv_df.to_pickle(save_loc)

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))