In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from avgn.utils.paths import DATA_DIR, most_recent_subdirectory, ensure_dir
from sklearn.preprocessing import scale
from pyclustertend import hopkins, vat, ivat
import matplotlib.pyplot as plt
from tqdm.autonotebook import tqdm



In [3]:
from avgn.clusterability.hopkins import hopkins_statistic

In [4]:
# Gather every *.pickle file found one directory below DATA_DIR/embeddings.
datasets = [pickle_path for pickle_path in DATA_DIR.glob('embeddings/*/*.pickle')]
len(datasets)

84

In [5]:
def remove_outliers(z, pct = 99.5):
    """Clamp every column of ``z`` to its ``[100 - pct, pct]`` percentile range.

    The GPU based UMAP algorithm produces some outliers that UMAP does not
    (but is much faster); this is a quick fix for that.

    Parameters
    ----------
    z : array-like, 2-D
        Values to clean up. Percentiles are taken along axis 0, i.e. one
        lower/upper bound per column.
    pct : float
        Upper percentile; the lower percentile is ``100 - pct``.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape as ``z`` in which values below the
        lower bound / above the upper bound are replaced by that bound.
        The input array is left untouched.
    """
    z = np.asarray(z)
    if z.size == 0:
        # Nothing to compute percentiles from; hand back an (empty) copy.
        return z.copy()

    lower = np.percentile(z, (100 - pct), axis=0)
    upper = np.percentile(z, pct, axis=0)

    # Clamp low first, then high. Comparisons against NaN are False, so NaN
    # entries (or NaN bounds) leave the affected values unchanged.
    clamped = np.where(z < lower, lower, z)
    return np.where(clamped > upper, upper, clamped)

In [29]:
def gen_clusterability_df(dataset):
    """Compute (or load from cache) the Hopkins statistics for one embedding pickle.

    The pickle at ``dataset`` is read with pandas and must provide a
    ``spectrogram`` and a ``umap`` column. Spectrograms are flattened to one
    row each and standardised with ``scale``; the UMAP projection is passed
    through ``remove_outliers`` and standardised as well. ``hopkins_statistic``
    is then called on both representations with ``m_prop_n`` of 0.01 and 0.1.

    Parameters
    ----------
    dataset : pathlib.Path
        Path of the embedding pickle. ``dataset.parent.stem`` and
        ``dataset.stem`` are stored as the ``dataset`` and ``indv`` columns.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with columns ``df``, ``dataset``, ``indv``,
        ``umap_hopkins_1``, ``umap_hopkins_10``, ``spec_hopkins_1`` and
        ``spec_hopkins_10``. The frame is also pickled under
        ``DATA_DIR / 'clusterability'``; if that file already exists it is
        loaded and returned instead of being recomputed.
    """
    # NOTE(review): the '_ ' separator (with a space) looks unintentional; it is
    # kept unchanged so that result files written earlier are still found.
    save_loc = DATA_DIR / 'clusterability'/ (dataset.parent.stem + '_ '+ dataset.stem + '.pickle')

    if save_loc.exists():
        # Cache hit: return the stored row so callers always get a frame.
        return pd.read_pickle(save_loc)

    ds = pd.read_pickle(dataset)

    # One flattened, standardised row per spectrogram.
    specs = np.stack(ds['spectrogram'].values)
    specs = scale(specs.reshape(len(specs), -1))

    umap_proj = np.vstack(ds['umap'].values)
    umap_proj = scale(remove_outliers(umap_proj, pct=99.5))

    print(dataset, np.shape(umap_proj))
    fig, ax = plt.subplots()
    ax.scatter(umap_proj[:, 0], umap_proj[:, 1], s=1, color='k', alpha=0.1)
    plt.show()
    # Release the figure; otherwise one figure per processed dataset stays open.
    plt.close(fig)

    # Same call order as before: UMAP (0.01, 0.1) first, then spectrograms.
    umap_hopkins_1 = hopkins_statistic(umap_proj, m_prop_n=0.01, n_neighbors = 1)
    umap_hopkins_10 = hopkins_statistic(umap_proj, m_prop_n=0.1, n_neighbors = 1)
    spec_hopkins_1 = hopkins_statistic(specs, m_prop_n=0.01, n_neighbors = 1)
    spec_hopkins_10 = hopkins_statistic(specs, m_prop_n=0.1, n_neighbors = 1)

    clusterability_df = pd.DataFrame(
        [[
            dataset, dataset.parent.stem, dataset.stem,
            umap_hopkins_1, umap_hopkins_10,
            spec_hopkins_1, spec_hopkins_10,
        ]],
        columns=[
            'df', 'dataset', 'indv',
            'umap_hopkins_1', 'umap_hopkins_10', 'spec_hopkins_1', 'spec_hopkins_10',
        ],
    )

    # Do not rely on another cell having created the output directory.
    save_loc.parent.mkdir(parents=True, exist_ok=True)
    clusterability_df.to_pickle(save_loc)
    return clusterability_df

In [30]:
# gen_clusterability_df pickles its results under DATA_DIR / 'clusterability';
# presumably ensure_dir creates that directory when it is missing (helper not shown here).
ensure_dir(DATA_DIR / 'clusterability')

In [31]:
from joblib import Parallel, delayed

In [32]:
# Queue one gen_clusterability_df call per embedding file, then let joblib
# run the queue with n_jobs=-1 and progress logging at verbosity 10.
clusterability_tasks = (
    delayed(gen_clusterability_df)(dataset) for dataset in tqdm(datasets)
)
clust_data = Parallel(n_jobs=-1, verbose=10)(clusterability_tasks)

HBox(children=(IntProgress(value=0, max=84), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   43.4s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.1min





[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  46 out of  84 | elapsed:  1.5min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  55 out of  84 | elapsed:  2.1min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  64 out of  84 | elapsed:  5.1min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done  73 out of  84 | elapsed: 10.2min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  82 out of  84 | elapsed: 25.7min remaining:   37.6s
[Parallel(n_jobs=-1)]: Done  84 out of  84 | elapsed: 1290.2min finished


In [12]:
# Inspect whatever is currently bound to clust_data.
clust_data

Parallel(n_jobs=<generator object <genexpr> at 0x7ff9ed5b2f10>)

In [None]:
# Serial computation of the Hopkins statistics for every embedding file,
# collected into a single table with one row per file.
clusterability_columns = [
    'df', 'dataset', 'indv',
    'umap_hopkins_1', 'umap_hopkins_10', 'spec_hopkins_1', 'spec_hopkins_10',
]
clusterability_rows = []
for dataset in tqdm(datasets):
    ds = pd.read_pickle(dataset)

    # One flattened, standardised row per spectrogram.
    specs = np.stack(ds['spectrogram'].values)
    specs = scale(specs.reshape(len(specs), -1))

    umap_proj = np.vstack(ds['umap'].values)
    umap_proj = scale(remove_outliers(umap_proj, pct=99.5))

    print(dataset, np.shape(umap_proj))
    fig, ax = plt.subplots()
    ax.scatter(umap_proj[:, 0], umap_proj[:, 1], s=1, color='k', alpha=0.1)
    plt.show()
    # Release the figure so open figures do not pile up across iterations.
    plt.close(fig)

    # Same call order as before: UMAP (0.01, 0.1) first, then spectrograms.
    umap_hopkins_1 = hopkins_statistic(umap_proj, m_prop_n=0.01, n_neighbors = 1)
    umap_hopkins_10 = hopkins_statistic(umap_proj, m_prop_n=0.1, n_neighbors = 1)
    spec_hopkins_1 = hopkins_statistic(specs, m_prop_n=0.01, n_neighbors = 1)
    spec_hopkins_10 = hopkins_statistic(specs, m_prop_n=0.1, n_neighbors = 1)

    clusterability_rows.append([
        dataset, dataset.parent.stem, dataset.stem,
        umap_hopkins_1, umap_hopkins_10,
        spec_hopkins_1, spec_hopkins_10,
    ])

# Build the frame once instead of enlarging it row by row with .loc, which
# re-allocates on every insert and starts from untyped (object) columns.
clusterability_df = pd.DataFrame(clusterability_rows, columns=clusterability_columns)

In [None]:
# Preview the first three rows.
clusterability_df.head(3)

In [None]:
# Distinct dataset identifiers present in the table.
clusterability_df['dataset'].unique()

In [None]:
# Display name and group for every dataset identifier.
# Rows are (dataset key, species, group) in the original insertion order.
_species_rows = [
    ('castellucci_mouse_usv_segmented', 'Mouse', 'mammal'),
    ('BIRD_DB_Vireo_cassinii', "Cassin's vireo", 'songbird'),
    ('gibbon_morita_segmented', 'Gibbon', 'mammal'),
    ('bengalese_finch_sober', 'Bengalese finch', 'songbird'),
    ('buckeye', 'Human (English)', 'mammal'),
    ('swamp_sparrow', 'Swamp sparrow', 'songbird'),
    ('mobysound_humpback_whale', 'Humpback whale', 'mammal'),
    ('koumura_bengalese_finch', 'Bengalese finch', 'songbird'),
    ('giant_otter', 'Giant otter', 'mammal'),
    ('BIRD_DB_Toxostoma_redivivum', 'California thrasher', 'songbird'),
    ('zebra_finch_gardner_segmented', 'Zebra finch', 'songbird'),
    ('katahira_white_munia_segmented', 'White-rumped munia', 'songbird'),
    ('european_starling_gentner_segmented', 'European starling', 'songbird'),
]
species_dict = {
    dataset_key: {'species': species_name, 'group': group_name}
    for dataset_key, species_name, group_name in _species_rows
}

In [None]:
# Translate each row's dataset key through species_dict; a key that is not in
# the dictionary raises KeyError.
dataset_keys = clusterability_df['dataset'].values
clusterability_df['species'] = [species_dict[key]['species'] for key in dataset_keys]
clusterability_df['family'] = [species_dict[key]['group'] for key in dataset_keys]

In [None]:
# Preview the first three rows, now including the species / family columns.
clusterability_df.head(3)

In [None]:
import seaborn as sns

In [None]:
from avgn.utils.general import save_fig
from avgn.utils.paths import FIGURE_DIR, ensure_dir

In [None]:
sns.set(style='white', font_scale=2)

metric = 'umap_hopkins_10'

fig, ax = plt.subplots(ncols=1, figsize=(15, 8))

# Median of the metric per (species, family), sorted ascending.
# The values are plotted as-is: no log transform is applied in this cell.
result = (
    clusterability_df.groupby(["species", 'family'])[metric]
    .median()
    .reset_index()
    .sort_values(metric)
)

# create bar chart
bar = sns.barplot(
    x=metric,
    y="species",
    hue='family',
    dodge=False,
    data=result,
    ax=ax,
    linewidth=3,
    edgecolor="0",
    order=result['species'],
    palette = sns.color_palette('Set1')
)
# Raw string: '\it' and '\ ' are mathtext commands, not Python escapes.
# The label names the quantity actually plotted (the untransformed statistic).
ax.set_xlabel(r"Clusterability ($\it{Hopkin's\ statistic}$)", fontsize=24)
ax.set_ylabel('')
ax.get_legend().remove()

# Keep only a thick bottom spine.
for axis in ['top','left','right']:
    ax.spines[axis].set_linewidth(0)
ax.spines['bottom'].set_linewidth(5)

ylim = ax.get_ylim()
ax.set_ylim([ylim[0]+.25, ylim[1]])
ax.set_xlim([0.5,1])
#save_fig(FIGURE_DIR / 'hopkins_comparison', dpi=300, save_jpg=False, save_png=True)

In [None]:
sns.set(style='white', font_scale=2)

metric = 'spec_hopkins_10'

fig, ax = plt.subplots(ncols=1, figsize=(15, 8))

# Median of the metric per (species, family), sorted ascending by the raw
# statistic; the plotted value is then flipped to 1 - median.
result = (
    clusterability_df.groupby(["species", 'family'])[metric]
    .median()
    .reset_index()
    .sort_values(metric)
)
result[metric] = (1 - result[metric])

# create bar chart
bar = sns.barplot(
    x=metric,
    y="species",
    hue='family',
    dodge=False,
    data=result,
    ax=ax,
    linewidth=3,
    edgecolor="0",
    order=result['species'],
    palette = sns.color_palette('Set1')
)
# Raw string: '\it' and '\ ' are mathtext commands, not Python escapes.
ax.set_xlabel(r"Clusterability ($\it{ 1 - Hopkin's\ statistic}$)", fontsize=24)
ax.set_ylabel('')
ax.get_legend().remove()

# Keep only a thick bottom spine.
for axis in ['top','left','right']:
    ax.spines[axis].set_linewidth(0)
ax.spines['bottom'].set_linewidth(5)
# log scale
ax.set_xscale('log')

ylim = ax.get_ylim()
ax.set_ylim([ylim[0]+.25, ylim[1]])

#save_fig(FIGURE_DIR / 'hopkins_comparison', dpi=300, save_jpg=False, save_png=True)

In [None]:
sns.set(style='white', font_scale=2)

metric = 'spec_hopkins_1'

fig, ax = plt.subplots(ncols=1, figsize=(15, 8))

# Median of the metric per (species, family), sorted ascending by the raw
# statistic; the plotted value is then flipped to 1 - median.
result = (
    clusterability_df.groupby(["species", 'family'])[metric]
    .median()
    .reset_index()
    .sort_values(metric)
)
result[metric] = (1 - result[metric])

# create bar chart
bar = sns.barplot(
    x=metric,
    y="species",
    hue='family',
    dodge=False,
    data=result,
    ax=ax,
    linewidth=3,
    edgecolor="0",
    order=result['species'],
    palette = sns.color_palette('Set1')
)
# Raw string: '\it' and '\ ' are mathtext commands, not Python escapes.
ax.set_xlabel(r"Clusterability ($\it{ 1 - Hopkin's\ statistic}$)", fontsize=24)
ax.set_ylabel('')
ax.get_legend().remove()

# Keep only a thick bottom spine.
for axis in ['top','left','right']:
    ax.spines[axis].set_linewidth(0)
ax.spines['bottom'].set_linewidth(5)
# log scale
ax.set_xscale('log')

ylim = ax.get_ylim()
ax.set_ylim([ylim[0]+.25, ylim[1]])

#save_fig(FIGURE_DIR / 'hopkins_comparison', dpi=300, save_jpg=False, save_png=True)

In [None]:
sns.set(style='white', font_scale=2)

# NOTE(review): the cells that build clusterability_df only create
# 'umap_hopkins_1', 'umap_hopkins_10', 'spec_hopkins_1' and 'spec_hopkins_10';
# no 'spec_hopkins_0.1' column is visible, so this lookup presumably raises
# KeyError on a fresh run — confirm the intended column.
metric = 'spec_hopkins_0.1'

fig, ax = plt.subplots(ncols=1, figsize=(15, 8))

# Median of the metric per (species, family), sorted ascending by the raw
# statistic; the plotted value is then flipped to 1 - median.
result = (
    clusterability_df.groupby(["species", 'family'])[metric]
    .median()
    .reset_index()
    .sort_values(metric)
)
result[metric] = (1 - result[metric])

# create bar chart
bar = sns.barplot(
    x=metric,
    y="species",
    hue='family',
    dodge=False,
    data=result,
    ax=ax,
    linewidth=3,
    edgecolor="0",
    order=result['species'],
    palette = sns.color_palette('Set1')
)
# Raw string: '\it' and '\ ' are mathtext commands, not Python escapes.
ax.set_xlabel(r"Clusterability ($\it{ 1 - Hopkin's\ statistic}$)", fontsize=24)
ax.set_ylabel('')
ax.get_legend().remove()

# Keep only a thick bottom spine.
for axis in ['top','left','right']:
    ax.spines[axis].set_linewidth(0)
ax.spines['bottom'].set_linewidth(5)
# log scale
ax.set_xscale('log')

ylim = ax.get_ylim()
ax.set_ylim([ylim[0]+.25, ylim[1]])

#save_fig(FIGURE_DIR / 'hopkins_comparison', dpi=300, save_jpg=False, save_png=True)

In [None]:
sns.set(style='white', font_scale=2)

# NOTE(review): no 'hopkins_statistic' column is created by the visible cells
# that build clusterability_df — confirm where it is expected to come from.
metric = 'hopkins_statistic'

fig, ax = plt.subplots(ncols=1, figsize=(15, 8))

# Median of the metric per (species, family), sorted ascending by the raw
# value; the plotted value is then -log(1 - median).
result = (
    clusterability_df.groupby(["species", 'family'])[metric]
    .median()
    .reset_index()
    .sort_values(metric)
)
result[metric] = -np.log(1 - result[metric])

# create bar chart
bar = sns.barplot(
    x=metric,
    y="species",
    hue='family',
    dodge=False,
    data=result,
    ax=ax,
    linewidth=3,
    edgecolor="0",
    order=result['species'],
    palette = sns.color_palette('Set1')
)
# Raw string: '\it' and '\ ' are mathtext commands, not Python escapes.
ax.set_xlabel(r"Clusterability ($\it{-log(Hopkin's\ metric)}$)", fontsize=24)
ax.set_ylabel('')
ax.get_legend().remove()

# Keep only a thick bottom spine.
for axis in ['top','left','right']:
    ax.spines[axis].set_linewidth(0)
ax.spines['bottom'].set_linewidth(5)

ylim = ax.get_ylim()
ax.set_ylim([ylim[0]+.25, ylim[1]])
#save_fig(FIGURE_DIR / 'hopkins_comparison', dpi=300, save_jpg=False, save_png=True)

In [None]:
sns.set(style='white', font_scale=2)

# NOTE(review): no 'hopkins_statistic_flip' column is created by the visible
# cells that build clusterability_df — confirm where it is expected to come from.
metric = 'hopkins_statistic_flip'

fig, ax = plt.subplots(ncols=1, figsize=(15, 8))

# Median of the metric per (species, family), sorted ascending; plotted
# untransformed on a logarithmic x axis (set below).
result = (
    clusterability_df.groupby(["species", 'family'])[metric]
    .median()
    .reset_index()
    .sort_values(metric)
)

# create bar chart
bar = sns.barplot(
    x=metric,
    y="species",
    hue='family',
    dodge=False,
    data=result,
    ax=ax,
    linewidth=3,
    edgecolor="0",
    order=result['species'],
    palette = sns.color_palette('Set1')
)
# Raw string: '\it' and '\ ' are mathtext commands, not Python escapes.
ax.set_xlabel(r"Clusterability ($\it{-log(Hopkin's\ metric)}$)", fontsize=24)
ax.set_ylabel('')
ax.get_legend().remove()

# Keep only a thick bottom spine.
for axis in ['top','left','right']:
    ax.spines[axis].set_linewidth(0)
ax.spines['bottom'].set_linewidth(5)

ylim = ax.get_ylim()
ax.set_ylim([ylim[0]+.25, ylim[1]])
# The axes object of this cell is `ax`; `axs` is not defined at this point.
ax.set_xscale('log')
#save_fig(FIGURE_DIR / 'hopkins_comparison', dpi=300, save_jpg=False, save_png=True)

In [None]:
sns.set(style='white', font_scale=2)

# NOTE(review): no 'hopkins_statistic_flip' column is created by the visible
# cells that build clusterability_df — confirm where it is expected to come from.
metric = 'hopkins_statistic_flip'

fig, ax = plt.subplots(ncols=1, figsize=(15, 8))

# Median of the metric per (species, family), sorted ascending by the raw
# value; the plotted value is then -log(median).
result = (
    clusterability_df.groupby(["species", 'family'])[metric]
    .median()
    .reset_index()
    .sort_values(metric)
)
result[metric] = -np.log(result[metric])

# create bar chart
bar = sns.barplot(
    x=metric,
    y="species",
    hue='family',
    dodge=False,
    data=result,
    ax=ax,
    linewidth=3,
    edgecolor="0",
    order=result['species'],
    palette = sns.color_palette('Set1')
)
# Raw string: '\it' and '\ ' are mathtext commands, not Python escapes.
ax.set_xlabel(r"Clusterability ($\it{-log(Hopkin's\ metric)}$)", fontsize=24)
ax.set_ylabel('')
ax.get_legend().remove()

# Keep only a thick bottom spine.
for axis in ['top','left','right']:
    ax.spines[axis].set_linewidth(0)
ax.spines['bottom'].set_linewidth(5)

ylim = ax.get_ylim()
ax.set_ylim([ylim[0]+.25, ylim[1]])
#save_fig(FIGURE_DIR / 'hopkins_comparison', dpi=300, save_jpg=False, save_png=True)

In [None]:
# NOTE(review): no 'hopkins_statistic' column is created by the visible cells
# that build clusterability_df — confirm where it is expected to come from.
metric = 'hopkins_statistic'
rows_by_metric = clusterability_df.sort_values(by=metric)
# Per-row bar plot (no aggregation beforehand), coloured by family.
sns.barplot(x=metric, y="species", hue='family', data=rows_by_metric)

In [None]:
# For each metric draw the same bar plot twice: linear x axis on the left,
# logarithmic x axis on the right. Species are ordered by their median value.
# NOTE(review): none of these column names are created by the visible cells
# that build clusterability_df — confirm where they are expected to come from.
for metric in ['hopkins_10','hopkins_100','hopkins_pct_0.1','hopkins_pct_1','hopkins_pct_5','hopkins_pct_10']:
    result = (
        clusterability_df.groupby(["species", 'family'])[metric]
        .median()
        .reset_index()
        .sort_values(metric)
    )
    # Sort once and reuse the frame for both panels.
    rows_by_metric = clusterability_df.sort_values(by=metric)

    fig, axs = plt.subplots(ncols=2, figsize=(20,5))
    for panel in axs:
        sns.barplot(x=metric,
                    y="species",
                    hue='family',
                    data=rows_by_metric,
                    ax=panel,
                    order = result['species']
                   )
    axs[1].set_xscale('log')
    plt.tight_layout()
    plt.show()
    # Release the figure so open figures do not pile up across iterations.
    plt.close(fig)