In [1]:
# Re-import edited modules automatically before each cell executes, so changes
# to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
from avgn.utils.paths import DATA_DIR, most_recent_subdirectory, ensure_dir
from sklearn.preprocessing import scale
#from pyclustertend import hopkins, vat, ivat
import matplotlib.pyplot as plt
from tqdm.autonotebook import tqdm



In [None]:
from avgn.clusterability.hopkins import hopkins_statistic

In [None]:
# Recursively collect every pickle file found below DATA_DIR/embeddings.
datasets = [*DATA_DIR.glob('embeddings/**/*.pickle')]
len(datasets)

296

In [None]:
# Leave out every file whose containing folder is on the exclusion list below.
datasets = list(filter(
    lambda path: path.parent.stem not in (
        'buckeye',
        'BIRD_DB_Vireo_cassinii',
        'swamp_sparrow',
        'batsong_segmented',
    ),
    datasets,
))

In [None]:
# Number of dataset files currently held in `datasets`.
len(datasets)

291

In [None]:
def remove_outliers(z, pct = 99.5):
    """Clip every column of ``z`` to its central percentile range.

    The GPU-based UMAP implementation is much faster than the reference one
    but produces some outliers that the reference does not; this is a quick
    fix that pulls those points back to the percentile bounds instead of
    dropping them.

    Parameters
    ----------
    z : array-like
        Points to clean, one row per sample and one column per dimension.
        The input is left untouched; a clipped copy is returned.
    pct : float, default 99.5
        Upper percentile to keep. Within each column, values below the
        ``100 - pct`` percentile or above the ``pct`` percentile are replaced
        by that bound.

    Returns
    -------
    numpy.ndarray
        Clipped array with the same shape and dtype as ``z``.
    """
    z = np.asarray(z)
    # Per-column bounds, cast back to the input dtype so the result keeps it.
    lower = np.percentile(z, 100 - pct, axis=0).astype(z.dtype)
    upper = np.percentile(z, pct, axis=0).astype(z.dtype)
    # np.clip broadcasts the bounds over the rows and returns a new array,
    # so the caller's data is never overwritten.
    return np.clip(z, lower, upper)

In [None]:
# Output folder that gen_clusterability_df saves its per-file results into;
# ensure_dir (from avgn.utils.paths) is expected to create it if missing.
ensure_dir(DATA_DIR / 'clusterability' / 'convex_sample_indvs')

In [None]:
def gen_clusterability_df(dataset):
    """Compute Hopkins statistics for one dataset's UMAP projection and save them.

    Reads the pickled dataframe at ``dataset``, clips outliers in its ``umap``
    column, standardizes the projection, draws a scatter plot of the first two
    dimensions, and calls ``hopkins_statistic`` twice with the
    ``"uniform_convex_hull"`` distribution: once with ``m_prop_n=0.01`` and
    once with ``m_prop_n=0.1``. ``m_prop_n`` is the fraction of the samples the
    Hopkins statistic is evaluated on (0.1 is recommended).

    The one-row result is pickled to
    ``DATA_DIR / 'clusterability' / 'convex_sample_indvs'``. Existing results
    are not reused: the file is always recomputed and overwritten.

    Parameters
    ----------
    dataset : path object
        Location of a pickled dataframe that has a ``umap`` column (e.g. an
        entry produced by ``DATA_DIR.glob``).

    Returns
    -------
    pandas.DataFrame
        One-row frame with columns ``df``, ``dataset``, ``indv``,
        ``umap_hopkins_1``, ``umap_hopkins_10`` and ``nex`` (number of rows in
        the projection). The same frame is written to disk.
    """
    # NOTE(review): the '_ ' separator (with a space) looks like a typo; it is
    # kept so the names of the output files do not change.
    save_loc = DATA_DIR / 'clusterability' / 'convex_sample_indvs' / \
        (dataset.parent.stem + '_ ' + dataset.stem + '.pickle')

    # read_pickle unpickles arbitrary objects: only use on trusted project files.
    ds = pd.read_pickle(dataset)

    # One row per sample; clip extreme points, then standardize each dimension.
    umap_proj = np.vstack(ds['umap'].values)
    umap_proj = remove_outliers(umap_proj, pct=99.5)
    umap_proj = scale(umap_proj)

    nex = len(umap_proj)

    print((dataset, np.shape(umap_proj)))
    fig, ax = plt.subplots()
    ax.scatter(umap_proj[:, 0], umap_proj[:, 1], s=1, color='k', alpha=0.1)
    plt.show()
    # Release the figure explicitly so repeated calls in a long-lived worker
    # process do not accumulate open figures.
    plt.close(fig)

    # Evaluated in this order (0.01 first, then 0.1), one call per fraction.
    umap_hopkins = {
        m_prop_n: hopkins_statistic(
            umap_proj,
            m_prop_n=m_prop_n,
            n_neighbors=1,
            distribution="uniform_convex_hull",
        )
        for m_prop_n in (0.01, 0.1)
    }

    # Files stored in an 'indvs' folder take their dataset name from the
    # folder one level further up.
    dsname = dataset.parent.parent.stem if dataset.parent.stem == 'indvs' else dataset.parent.stem

    clusterability_df = pd.DataFrame(
        [[dataset, dsname, dataset.stem,
          umap_hopkins[0.01], umap_hopkins[0.1], nex]],
        columns=[
            'df', 'dataset', 'indv',
            'umap_hopkins_1', 'umap_hopkins_10', 'nex'
        ],
    )

    clusterability_df.to_pickle(save_loc)
    # Also hand the row back so callers collecting results get the data.
    return clusterability_df

In [None]:
# NOTE(review): nothing visible in this notebook writes to 'convex_sample';
# gen_clusterability_df saves under 'convex_sample_indvs'. Confirm this folder
# is still needed.
ensure_dir(DATA_DIR / 'clusterability' / 'convex_sample')

In [None]:
from joblib import Parallel, delayed

In [None]:
# Run gen_clusterability_df once per dataset across all available cores.
# tqdm advances as jobs are handed to joblib; clust_data holds one entry per
# dataset with whatever gen_clusterability_df returned.
clust_data = Parallel(n_jobs=-1, verbose=10)(
    map(delayed(gen_clusterability_df), tqdm(datasets))
)

HBox(children=(IntProgress(value=0, max=291), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
