# Persistent single-linkage (HDBSCAN) clustering on sparse matrices

This is our attempt at running an HDBSCAN-style approach (persistent single linkage) on a sparse distance matrix. 
The sparse distance matrix we use here has the sparsity pattern of the UMAP graph, with each stored entry set to the distance between the two points in UMAP's low-dimensional projection. We want to see how the code behaves on such a partial distance matrix.

In [1]:
!git branch

* [32mmaster[m


In [2]:
execfile('functions/data_specifics.py')
execfile('functions/graph_functions.py')
print(data_set_list)

['pendigits', 'coil', 'mnist', 'usps', 'buildings', 'clusterable']


In [3]:
from IPython.display import display, Markdown, Latex
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, silhouette_score
from sklearn import cluster

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import umap
from collections import Counter
from scipy.stats import mode

from scipy.spatial.distance import euclidean

sns.set()

In [4]:
import hdbscan
import scipy.sparse as sp
import sklearn.cluster
from hdbscan._hdbscan_tree import (
    condense_tree,
    compute_stability,
    get_clusters,
    outlier_scores,
)
from hdbscan.plots import CondensedTree, SingleLinkageTree, MinimumSpanningTree

## Get clusters from a (disconnected) sparse distance matrix

* The 0-values indicate "far away" points

If the matrix has disconnected components (when viewed as a graph), HDBSCAN does not work. To get around this, we have tried clustering each component separately or just adding edges to glue the different parts together.

In [5]:
def clustering_from_sparse(D,
                           connected_component_method='glue',  # 'glue' or 'individual'
                           cluster_selection_method='eom',
                           allow_single_cluster=False,
                           cluster_selection_epsilon=0.0,
                           max_cluster_size=0,
                           min_cluster_size=10,
                           no_split_size=100,
                           alpha=1.2):
    """Cluster a sparse distance matrix, dispatching on its connectivity.

    A matrix whose graph has a single connected component goes straight to
    ``clustering_from_sparse_connected``.  Otherwise
    ``connected_component_method`` selects the strategy: ``'glue'`` calls
    ``clustering_from_sparse_force_connection`` and ``'individual'`` calls
    ``clustering_from_sparse_on_individual_cc``.  Every argument is forwarded
    unchanged to the selected function.

    Returns the label array produced by the selected function.

    Raises
    ------
    ValueError
        If the matrix is disconnected and ``connected_component_method`` is
        neither ``'glue'`` nor ``'individual'``.  The value is not checked
        when the matrix is connected.
    """
    # Every helper shares this signature, so the arguments are passed through
    # wholesale by keyword.
    forwarded = dict(
        D=D,
        connected_component_method=connected_component_method,
        cluster_selection_method=cluster_selection_method,
        allow_single_cluster=allow_single_cluster,
        cluster_selection_epsilon=cluster_selection_epsilon,
        max_cluster_size=max_cluster_size,
        min_cluster_size=min_cluster_size,
        no_split_size=no_split_size,
        alpha=alpha,
    )

    n_components, _ = scipy.sparse.csgraph.connected_components(D)

    if n_components == 1:
        return clustering_from_sparse_connected(**forwarded)

    if connected_component_method == 'glue':
        print("Disconnected distance matrix: \n -Connecting distance matrix")
        return clustering_from_sparse_force_connection(**forwarded)

    if connected_component_method == 'individual':
        print("Disconnected distance matrix: \n -Performing clustering on individual connected components")
        return clustering_from_sparse_on_individual_cc(**forwarded)

    raise ValueError('Parameter should be glue or individual')


# Single linkage on sparse matrix when the initial distance matrix is connected.
def clustering_from_sparse_connected(D,
                                     connected_component_method='glue',
                                     cluster_selection_method='eom',
                                     allow_single_cluster=False,
                                     cluster_selection_epsilon=0.0,
                                     max_cluster_size=0,
                                     min_cluster_size=10,
                                     no_split_size=100,
                                     alpha=1.2):
    """Run the HDBSCAN tree pipeline on the minimum spanning tree of ``D``.

    Steps: minimum spanning tree of ``D`` -> edges sorted by weight ->
    single-linkage tree -> condensed tree (``min_cluster_size``) ->
    stabilities -> flat clusters.

    ``connected_component_method``, ``no_split_size`` and ``alpha`` are
    accepted so that all ``clustering_from_sparse*`` functions share one
    signature; they are not used here.

    Returns only the label array from ``get_clusters``; the probabilities and
    stabilities it also returns are discarded.
    """
    spanning_tree = scipy.sparse.csgraph.minimum_spanning_tree(D).tocoo()

    # One row per edge: (source, target, weight).
    edges = np.vstack((spanning_tree.row,
                       spanning_tree.col,
                       spanning_tree.data)).T

    # Order the edges by weight; mergesort keeps ties in their original order.
    weight_order = np.argsort(edges[:, 2], kind="mergesort")
    edges = edges[weight_order, :]

    # Private scikit-learn helper: sorted edge list -> hierarchical-clustering
    # merge table.
    single_linkage_tree = sklearn.cluster._hierarchical_fast._single_linkage_label(edges)

    # The remaining steps are HDBSCAN internals.
    cd_tree = condense_tree(single_linkage_tree, min_cluster_size)
    stability_dict = compute_stability(cd_tree)
    labels, _probabilities, _stabilities = get_clusters(
        cd_tree,
        stability_dict,
        cluster_selection_method=cluster_selection_method,
        allow_single_cluster=allow_single_cluster,
        cluster_selection_epsilon=cluster_selection_epsilon,
        max_cluster_size=max_cluster_size,
    )
    return labels


# Performing clustering on individual connected components of the sparse distance matrix
def clustering_from_sparse_on_individual_cc(D,
                                            connected_component_method='glue',
                                            cluster_selection_method='eom',
                                            allow_single_cluster=False,
                                            cluster_selection_epsilon=0.0,
                                            max_cluster_size=0,
                                            min_cluster_size=10,
                                            no_split_size=100,
                                            alpha=1.2):
    """Cluster every connected component of ``D`` separately.

    Per component, by size ``m``:

    * ``m < min_cluster_size``: its points keep the label ``-1``.
    * ``min_cluster_size <= m < no_split_size``: the whole component becomes
      one cluster.
    * otherwise: ``clustering_from_sparse_connected`` is run on the component's
      sub-matrix.  If more than half of its points come back labelled ``-1``,
      the whole component becomes one cluster instead.

    Labels from successive components are offset so they never collide.  If
    ``D`` has a single component it is handed directly to
    ``clustering_from_sparse_connected``.

    ``connected_component_method`` and ``alpha`` are accepted for signature
    compatibility with the other ``clustering_from_sparse*`` functions and are
    not used.

    Returns an integer label array of length ``D.shape[0]``.
    """
    clustering_kwargs = dict(
        cluster_selection_method=cluster_selection_method,
        allow_single_cluster=allow_single_cluster,
        cluster_selection_epsilon=cluster_selection_epsilon,
        max_cluster_size=max_cluster_size,
        min_cluster_size=min_cluster_size,
    )

    n_components, component_of = scipy.sparse.csgraph.connected_components(D)
    n = D.shape[0]

    if n_components == 1:
        # Forward the clustering parameters here as well; they used to be
        # dropped silently in this shortcut.
        return clustering_from_sparse_connected(D, **clustering_kwargs)

    labels = np.full(n, -1)
    for comp in range(n_components):
        in_component = (component_of == comp)
        size = int(np.count_nonzero(in_component))

        if size < min_cluster_size:
            continue

        if size < no_split_size:
            # Too small to be worth splitting: one cluster for the component.
            labels_part = np.zeros(size, dtype=labels.dtype)
        else:
            D_part = D[in_component, :][:, in_component]
            labels_part = clustering_from_sparse_connected(D_part, **clustering_kwargs)
            # Mostly-unlabelled result: keep the component as a single cluster.
            if np.count_nonzero(labels_part == -1) > labels_part.size / 2:
                labels_part = np.zeros_like(labels_part)

        clustered = labels_part >= 0
        members = np.flatnonzero(in_component)[clustered]
        # Shift past every label assigned so far (-1 + 1 == 0 at the start).
        labels[members] = labels_part[clustered] + labels.max() + 1
    return labels


# Adding large distance values (alpha * largest) to sparse distance matrix to make it connected
def clustering_from_sparse_force_connection(D,
                                            connected_component_method='glue',
                                            cluster_selection_method='eom',
                                            allow_single_cluster=False,
                                            cluster_selection_epsilon=0.0,
                                            max_cluster_size=0,
                                            min_cluster_size=10,
                                            no_split_size=100,
                                            alpha=1.2):
    """Connect a disconnected distance matrix with long edges, then cluster it.

    One representative point is taken per connected component (the point with
    the lowest index).  Every pair of representatives is joined by a symmetric
    edge of weight ``alpha * max(D.data)``.  The glued matrix is then passed to
    ``clustering_from_sparse_connected`` together with the clustering
    parameters.

    ``connected_component_method`` and ``no_split_size`` are accepted for
    signature compatibility with the other ``clustering_from_sparse*``
    functions and are not used.

    Returns the label array from ``clustering_from_sparse_connected``.
    """
    n_components, component_of = scipy.sparse.csgraph.connected_components(D)

    if n_components > 1:
        glue_weight = alpha * np.max(D.data)

        # First occurrence of each component label == its lowest-index member.
        _, representatives = np.unique(component_of, return_index=True)

        # All unordered pairs of components, mirrored to keep the glue
        # matrix symmetric.
        first, second = np.triu_indices(n_components, k=1)
        src = representatives[first]
        dst = representatives[second]
        rows = np.concatenate([src, dst])
        cols = np.concatenate([dst, src])
        vals = np.full(rows.shape[0], glue_weight)

        D_glue = scipy.sparse.coo_matrix((vals, (rows, cols)), shape=D.shape)
        D_new = D + D_glue
    else:
        D_new = D

    # Forward the clustering parameters; they used to be dropped here, so
    # non-default settings had no effect in 'glue' mode.
    labels = clustering_from_sparse_connected(
        D_new,
        cluster_selection_method=cluster_selection_method,
        allow_single_cluster=allow_single_cluster,
        cluster_selection_epsilon=cluster_selection_epsilon,
        max_cluster_size=max_cluster_size,
        min_cluster_size=min_cluster_size,
    )
    return labels

# Clustering algorithm
These are the steps we use to feed a sparse distance matrix to single-linkage clustering.
We are not using HDBSCAN end-to-end at the moment, but this is where we would like to go.

## Get data, UMAP graph and UMAP low dimensional vectors

In [6]:
# Load one dataset (id 0) together with its ground-truth targets.
dataset_id=0
raw_data, targets, dataset_name = get_dataset(dataset_id=dataset_id)

# Neighbourhood size configured for this dataset.
k = get_dataset_params(dataset_id)['n_neighbors']

# UMAP fuzzy graph on the raw data. return_dists=True additionally returns
# `dists`, which is reused in the "Non-symmetrical matrices" section.
A_umap, sigmas, rhos, dists = umap.umap_.fuzzy_simplicial_set(X=raw_data, 
                                             n_neighbors=k, 
                                             random_state=0, 
                                             metric='euclidean', 
                                             return_dists=True,
                                             set_op_mix_ratio=1)

# kNN adjacency on the raw data (helper defined outside this notebook).
A_knn = knn_adjacency(raw_data, k=k)

# Low-dimensional UMAP embedding of the same points.
umap_rep = get_umap_vectors(dataset_id=dataset_id, raw_data=raw_data)

## Build the partial distance matrix based on the UMAP graph

In [360]:
# Symmetric D: same sparsity pattern as A_umap, each stored entry replaced by
# the Euclidean distance between its two endpoints in the UMAP embedding.
D = A_umap.copy()
# Row index of every stored entry, expanded from the CSR row pointer.
rows = [row
        for row, count in enumerate(np.diff(A_umap.indptr))
        for _ in range(count)]
D.data = np.array([euclidean(umap_rep[row], umap_rep[col])
                   for row, col in zip(rows, A_umap.indices)])

## Compare robust single-linkage (portions of HDBSCAN) on partial distance vs. on low dim projection

In [361]:
labels = clustering_from_sparse(D)

ari = adjusted_rand_score(targets, labels)
ami = adjusted_mutual_info_score(targets, labels)
print(f'ARI = {ari} and AMI = {ami}')

ARI = 0.5937845136423342 and AMI = 0.7459900925803741


In [362]:
hd_umap_labels = h_dbscan(umap_rep, which_algo='hdbscan', dataset_id=dataset_id)

ari = adjusted_rand_score(targets, hd_umap_labels)
ami = adjusted_mutual_info_score(targets, hd_umap_labels)
print(f'ARI = {ari} and AMI = {ami}')

ARI = 0.9185149200427103 and AMI = 0.9320899303214291


The difference between the HDBSCAN results on the low-dimensional dataset and the HDBSCAN results on the partial distance matrix (restricted to the UMAP graph entries) is huge. Is this normal, or is there an issue with my HDBSCAN code?

# Add mutual reachability

In [363]:
D_sparse_lil = D.tolil()
D_mr = hdbscan.hdbscan_.sparse_mutual_reachability(D_sparse_lil)

labels = clustering_from_sparse(D_mr)

ari = adjusted_rand_score(targets, labels)
ami = adjusted_mutual_info_score(targets, labels)
print(f'ARI = {ari} and AMI = {ami}')

ARI = 0.7857591462351462 and AMI = 0.8411578826755054


## Non-symmetrical matrices

In [364]:
D_sparse_lil = dists.tolil()
D_mr = hdbscan.hdbscan_.sparse_mutual_reachability(D_sparse_lil)

labels = clustering_from_sparse(D_mr)

ari = adjusted_rand_score(targets, labels)
ami = adjusted_mutual_info_score(targets, labels)
print(f'ARI = {ari} and AMI = {ami}')

ARI = 0.4139177488218988 and AMI = 0.7048979826398417


In [378]:
# Non-symmetric D: one embedding-space distance per stored entry of the kNN
# adjacency matrix.
rows, cols = A_knn.nonzero()
data = np.array([euclidean(umap_rep[r], umap_rep[c]) for r, c in zip(rows, cols)])
D_knn = sp.coo_matrix((data, (rows, cols)), shape=A_knn.shape)

In [379]:
D_sparse_lil = D_knn.tolil()
D_mr = hdbscan.hdbscan_.sparse_mutual_reachability(D_sparse_lil)

labels = clustering_from_sparse(D_mr)

ari = adjusted_rand_score(targets, labels)
ami = adjusted_mutual_info_score(targets, labels)
print(f'ARI = {ari} and AMI = {ami}')

ARI = 0.8604091972126695 and AMI = 0.8897646915879353


### WRONG!

This should really be close to what we get when we run HDBSCAN on UMAP projections. It's close, but it's not what it should be. For this reason, I still have to make sure my code is doing the right thing.

In [11]:
# Dense sanity check: use ALL pairwise distances in the embedding rather than
# only the entries of a sparse graph.
from sklearn.metrics.pairwise import euclidean_distances
X = euclidean_distances(umap_rep, umap_rep)

# Dense mutual-reachability transform from HDBSCAN.
X_mr = hdbscan.hdbscan_.mutual_reachability(X)

# The same clustering entry point, this time fed a dense matrix.
labels = clustering_from_sparse(X_mr)

ari = adjusted_rand_score(targets, labels)
ami = adjusted_mutual_info_score(targets, labels)
print(f'ARI = {ari} and AMI = {ami}')

ARI = 0.8164678757564839 and AMI = 0.8576043252211757


### Difference between sparse and non-sparse versions

In the HDBSCAN code, I do not get the same results when I use the sparse and non-sparse mutual reachability functions. The code at the bottom was my attempt at writing a sparse version of mutual reachability... before I realized there was one written already.

In [350]:
# Non-symmetric matrix: D_sparse and D_dense hold the same distances, except
# that the zeros of the sparse version are replaced by `large_val` in the
# dense version.
# NOTE(review): `large_val` is not defined in the cells shown here; the printed
# outputs suggest it is 10 — confirm.
subset_filter = (targets == 3)

D = low_dim_distance(A_knn, umap_rep)

# Submatrix
D_sparse = D[subset_filter,:][:, subset_filter].copy()

# Densify
D_dense = D_sparse.toarray()
# Everything below 1e-4 becomes `large_val`: the missing entries (zeros after
# toarray()) and also any stored distance smaller than 1e-4.
D_dense[D_dense<0.0001] = large_val

In [352]:
mr = hdbscan.hdbscan_.mutual_reachability(D_dense)
mr[0]

array([10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        ,  0.15513985, 10.        ,  0.54610425, 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        ,  0.15513985, 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        ,  0.26774687,  0.2630457 ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.  

In [355]:
D_sparse_lil = D_sparse.tolil()
sparse_mr = hdbscan.hdbscan_.sparse_mutual_reachability(D_sparse_lil)
sparse_mr[0].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20584038, 0.        , 0.54610425, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20584038, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.26774687, 0.2630457 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [381]:
def sparse_core_distance(sparse_distance_matrix, min_points, max_val):
    """Per-column core distance of a sparse distance matrix.

    For each column, the stored entries are ranked in ascending order and the
    one at 0-based position ``min_points`` is returned.  A column with fewer
    than ``min_points + 1`` stored entries gets ``max_val`` instead.  Missing
    entries are therefore treated as "far away", not as zero distances.

    Parameters
    ----------
    sparse_distance_matrix : scipy sparse matrix
        Distances; only explicitly stored entries are considered.
    min_points : int
        0-based rank of the entry to return for each column.
    max_val : float
        Value used for columns with too few stored entries.

    Returns
    -------
    numpy.ndarray
        Float array with one value per column.
    """
    # CSC keeps each column's entries contiguous in `data`, so one conversion
    # replaces a sparse column slice per iteration.
    by_column = sparse_distance_matrix.tocsc()
    n_col = by_column.shape[1]
    indptr = by_column.indptr

    core = np.full(n_col, max_val, dtype=float)
    for j in range(n_col):
        stored = by_column.data[indptr[j]:indptr[j + 1]]
        if stored.size > min_points:
            # np.partition puts the rank-`min_points` element in its sorted
            # position without fully sorting the column.
            core[j] = np.partition(stored, min_points)[min_points]
    return core

In [382]:
# NOTE: `D_test` is assigned in cells that appear further down in this export
# (the notebook was executed out of order).
distance_matrix = D_test
min_points = 5
# Dense core distance per column: after partitioning along axis 0, row
# `min_points` holds, for every column, the value of rank `min_points`
# (0-based) in that column.
core_distances = np.partition(distance_matrix,
                              min_points,
                              axis=0)[min_points]

In [384]:
core_test = sparse_partition(D_sparse, min_points, max_val=10)
sum([core_test[i]==core_distancers

array([ 0.15513985,  0.1507993 ,  0.18772864, 10.        ,  0.10765006,
        0.17710201,  0.0762486 ,  0.08286497,  0.07412623,  0.072634  ,
        0.09956087,  0.07378092, 10.        , 10.        ,  0.10927223,
        0.14168802,  0.10851032,  0.21560848,  0.16168611,  0.58037549,
        0.17015436,  0.06285882,  0.13098511,  0.06637947,  0.09284311,
       10.        ,  0.10201158,  0.11199529,  0.17807433,  0.19489038,
        0.26800895,  0.17845109,  0.28063136,  0.13569427,  0.1030286 ,
        0.2211379 ,  0.0862309 ,  0.28688502,  0.12839273,  0.171564  ,
       10.        ,  0.03405412, 10.        ,  0.08702035, 10.        ,
        0.11955469,  0.13806805,  0.07851825,  0.04539296,  0.03049082,
       10.        ,  0.08786056, 10.        ,  0.09154192,  0.04712409,
        0.05771601,  0.05298838,  0.05260412, 10.        ,  0.11792402,
        0.02778008,  0.05000826,  0.03326573,  0.03211796,  0.03779933,
        0.04393745,  0.1695765 ,  0.15168063,  0.25202227, 10.  

In [298]:
def low_dim_distance(M, umap_rep):
    """Replace every stored entry of ``M`` by a distance in the embedding.

    Parameters
    ----------
    M : scipy sparse matrix
        Only its sparsity pattern is used; it is converted to CSR and copied,
        so the input is left untouched.
    umap_rep : array-like, one row per point
        Coordinates of the points, indexed like the rows and columns of ``M``.

    Returns
    -------
    CSR matrix with the same pattern as ``M`` where entry ``(i, j)`` is the
    Euclidean distance between ``umap_rep[i]`` and ``umap_rep[j]``.
    """
    D = M.tocsr().copy()
    points = np.asarray(umap_rep)

    # Row index of every stored entry, expanded from the CSR row pointer.
    row_of_entry = np.repeat(np.arange(D.shape[0]), np.diff(D.indptr))

    # All edge lengths in one vectorised pass instead of a Python-level loop.
    D.data = np.linalg.norm(points[row_of_entry] - points[D.indices], axis=1)
    return D

In [299]:
# Symmetric matrix
D = low_dim_distance(A_umap, umap_rep)

# Submatrix
D_sparse = D[subset_filter,:][:, subset_filter].copy()

# Densify
D_test = D_sparse.toarray()
D_test[D_test<0.0001] = large_val

In [335]:
def sparse_mutual_reachability(matrix, min_points=5, alpha=1.0, max_dist=0., max_val=10):
    """Mutual-reachability version of a sparse distance matrix.

    For every stored entry ``(i, j)`` the result holds
    ``max(core[i], core[j], d(i, j) / alpha)``, where ``core`` comes from
    ``sparse_partition(matrix, min_points, max_val)``.  A non-finite value is
    replaced by ``max_dist`` when ``max_dist > 0`` and is otherwise left out.

    NOTE(review): ``sparse_matrix`` and ``sparse_partition`` are not defined in
    the visible part of the notebook.  ``sparse_partition`` is presumably an
    earlier name of ``sparse_core_distance``, and ``sparse_matrix`` presumably
    a scipy sparse constructor that supports item assignment — confirm.

    Returns the result as a CSR matrix.
    """
    distances = matrix.tolil()
    result = sparse_matrix(distances.shape)

    # Core distances are taken from the unscaled distances, before the alpha
    # division below.
    core_distance = sparse_partition(matrix.tocsr(), min_points, max_val)

    if alpha != 1.0:
        distances = distances / alpha

    for i, j in zip(*distances.nonzero()):
        mr_dist = max(core_distance[i], core_distance[j], distances[i, j])
        if np.isfinite(mr_dist):
            result[i, j] = mr_dist
        elif max_dist > 0:
            result[i, j] = max_dist

    return result.tocsr()

In [341]:
mr = hdbscan.hdbscan_.mutual_reachability(D_test)

In [342]:
mr[0]

array([10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        ,  0.15513985, 10.        ,  0.54610425, 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        ,  0.15513985, 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        ,  0.26774687,  0.2630457 ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.        ,
       10.        , 10.        , 10.        , 10.        , 10.  

In [346]:
D_sparse_lil = D_sparse.tolil()
sparse_mr = hdbscan.hdbscan_.sparse_mutual_reachability(D_sparse_lil)

In [347]:
sparse_mr.todense()[0]

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.20584038, 0.        , 0.54610425, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.20584038, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.26774687, 0.2630457 ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0

In [None]:
sparse_mutual_reachability()

In [330]:
lil_matrix = A_knn.tolil()

In [331]:
nz_row_data, nz_col_data = lil_matrix.nonzero()

In [303]:
core_test = sparse_partition(D_sparse, min_points, max_val=10)
core_test

array([ 0.15513985,  0.1507993 ,  0.11902627,  0.33213294,  0.10765006,
        0.15511011,  0.0762486 ,  0.08286497,  0.07412623,  0.07051768,
        0.09956087,  0.07181428,  0.10064597,  0.08905649,  0.10855202,
        0.08008493,  0.10851032,  0.19406435,  0.10214072,  0.15911663,
        0.17015436,  0.06285882,  0.1053426 ,  0.06637947,  0.09284311,
        0.30223918,  0.10201158,  0.11025267,  0.17807433,  0.19489038,
        0.13454135,  0.17845109,  0.20425074,  0.13569427,  0.1030286 ,
        0.2211379 ,  0.0862309 ,  0.15381028,  0.12839273,  0.171564  ,
        0.25182822,  0.03405412,  0.19693166,  0.08643712,  0.35337457,
        0.09080241,  0.13806805,  0.07851825,  0.04539296,  0.03049082,
        0.05946289,  0.07446273,  0.08857127,  0.09154192,  0.04712409,
        0.05000826,  0.05298838,  0.05260412,  0.50142461,  0.05186893,
        0.02778008,  0.04393745,  0.03178142,  0.03211796,  0.03779933,
        0.04393745,  0.15957074,  0.1283915 ,  0.14526786,  0.22

In [304]:
sum(core_distances > distance_matrix)

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5])

### On asymmetric matrix (kNN)

In [305]:
# Non-symmetric matrix
D = low_dim_distance(A_knn, umap_rep)

# Submatrix
D_sparse = D[subset_filter,:][:, subset_filter].copy()

# Densify
D_test = D_sparse.toarray()
D_test[D_test<0.0001] = large_val

In [306]:
distance_matrix = D_test
min_points = 5
core_distances = np.partition(distance_matrix,
                              min_points,
                              axis=0)[min_points]

In [316]:
sum(D_sparse.data)

683.9788822224364

In [322]:
np.maximum(core_distances[0], x)

array([0.15513985, 0.54610425, 0.15513985, 0.28982359, 0.2630457 ,
       0.15513985, 0.15513985, 0.15513985, 0.21948646, 0.23696177,
       0.18639688, 0.25545377, 0.21606353, 0.18639039, 0.15513985])

In [326]:
x = D_sparse[:,0].data
# NOTE: this assignment does not modify D_sparse. `D_sparse[:,0]` evaluates to
# a separate temporary matrix, so only that temporary's `.data` is replaced.
# The last line therefore shows the original values again (compare the output
# with the np.maximum result above).
D_sparse[:,0].data = np.maximum(core_distances[0], x)
D_sparse[:,0].data

array([0.14072441, 0.54610425, 0.12045969, 0.28982359, 0.2630457 ,
       0.11201889, 0.1147987 , 0.15513985, 0.21948646, 0.23696177,
       0.18639688, 0.25545377, 0.21606353, 0.18639039, 0.14398976])

In [307]:
core_distances

array([ 0.15513985,  0.1507993 ,  0.18772864, 10.        ,  0.10765006,
        0.17710201,  0.0762486 ,  0.08286497,  0.07412623,  0.072634  ,
        0.09956087,  0.07378092, 10.        , 10.        ,  0.10927223,
        0.14168802,  0.10851032,  0.21560848,  0.16168611,  0.58037549,
        0.17015436,  0.06285882,  0.13098511,  0.06637947,  0.09284311,
       10.        ,  0.10201158,  0.11199529,  0.17807433,  0.19489038,
        0.26800895,  0.17845109,  0.28063136,  0.13569427,  0.1030286 ,
        0.2211379 ,  0.0862309 ,  0.28688502,  0.12839273,  0.171564  ,
       10.        ,  0.03405412, 10.        ,  0.08702035, 10.        ,
        0.11955469,  0.13806805,  0.07851825,  0.04539296,  0.03049082,
       10.        ,  0.08786056, 10.        ,  0.09154192,  0.04712409,
        0.05771601,  0.05298838,  0.05260412, 10.        ,  0.11792402,
        0.02778008,  0.05000826,  0.03326573,  0.03211796,  0.03779933,
        0.04393745,  0.1695765 ,  0.15168063,  0.25202227, 10.  

In [308]:
core_test = sparse_partition(D_sparse, min_points, max_val=10)
core_test

array([ 0.15513985,  0.1507993 ,  0.18772864, 10.        ,  0.10765006,
        0.17710201,  0.0762486 ,  0.08286497,  0.07412623,  0.072634  ,
        0.09956087,  0.07378092, 10.        , 10.        ,  0.10927223,
        0.14168802,  0.10851032,  0.21560848,  0.16168611,  0.58037549,
        0.17015436,  0.06285882,  0.13098511,  0.06637947,  0.09284311,
       10.        ,  0.10201158,  0.11199529,  0.17807433,  0.19489038,
        0.26800895,  0.17845109,  0.28063136,  0.13569427,  0.1030286 ,
        0.2211379 ,  0.0862309 ,  0.28688502,  0.12839273,  0.171564  ,
       10.        ,  0.03405412, 10.        ,  0.08702035, 10.        ,
        0.11955469,  0.13806805,  0.07851825,  0.04539296,  0.03049082,
       10.        ,  0.08786056, 10.        ,  0.09154192,  0.04712409,
        0.05771601,  0.05298838,  0.05260412, 10.        ,  0.11792402,
        0.02778008,  0.05000826,  0.03326573,  0.03211796,  0.03779933,
        0.04393745,  0.1695765 ,  0.15168063,  0.25202227, 10.  

In [312]:
np.array([.2, .3, 1.1]) > np.ones((3,3))

array([[False, False,  True],
       [False, False,  True],
       [False, False,  True]])

In [273]:
np.sort(D_test[:,3])[0:6]

array([ 0.17660187,  0.17710201,  0.33213294,  0.6646533 , 10.        ,
       10.        ])

In [263]:
np.partition(distance_matrix,
                              min_points,
                              axis=0)[min_points]

array([ 0.15513985,  0.1507993 ,  0.18772864, 10.        ,  0.10765006,
        0.17710201,  0.0762486 ,  0.08286497,  0.07412623,  0.072634  ,
        0.09956087,  0.07378092, 10.        , 10.        ,  0.10927223,
        0.14168802,  0.10851032,  0.21560848,  0.16168611,  0.58037549,
        0.17015436,  0.06285882,  0.13098511,  0.06637947,  0.09284311,
       10.        ,  0.10201158,  0.11199529,  0.17807433,  0.19489038,
        0.26800895,  0.17845109,  0.28063136,  0.13569427,  0.1030286 ,
        0.2211379 ,  0.0862309 ,  0.28688502,  0.12839273,  0.171564  ,
       10.        ,  0.03405412, 10.        ,  0.08702035, 10.        ,
        0.11955469,  0.13806805,  0.07851825,  0.04539296,  0.03049082,
       10.        ,  0.08786056, 10.        ,  0.09154192,  0.04712409,
        0.05771601,  0.05298838,  0.05260412, 10.        ,  0.11792402,
        0.02778008,  0.05000826,  0.03326573,  0.03211796,  0.03779933,
        0.04393745,  0.1695765 ,  0.15168063,  0.25202227, 10.  

In [256]:
# Sum of each of the first 10 rows of the partitioned matrix.
# NOTE: np.partition only guarantees that row `min_points` is in its sorted
# position. The other rows are merely on the correct side of it and are not
# ordered among themselves, which is consistent with the sums for i < 5 not
# being monotone.
for i in range(10):
    print(i, sum(np.partition(distance_matrix,
                              min_points,
                              axis=0)[i]))

0 111.70232457760721
1 64.53022875078022
2 115.43103753495961
3 156.82369180116802
4 307.82428181916475
5 391.3767988476902
6 554.4709333255887
7 777.4999589584768
8 902.4747921265662
9 1011.686602383852


In [246]:
core_distances

array([ 0.15513985,  0.1507993 ,  0.18772864, 10.        ,  0.10765006,
        0.17710201,  0.0762486 ,  0.08286497,  0.07412623,  0.072634  ,
        0.09956087,  0.07378092, 10.        , 10.        ,  0.10927223,
        0.14168802,  0.10851032,  0.21560848,  0.16168611,  0.58037549,
        0.17015436,  0.06285882,  0.13098511,  0.06637947,  0.09284311,
       10.        ,  0.10201158,  0.11199529,  0.17807433,  0.19489038,
        0.26800895,  0.17845109,  0.28063136,  0.13569427,  0.1030286 ,
        0.2211379 ,  0.0862309 ,  0.28688502,  0.12839273,  0.171564  ,
       10.        ,  0.03405412, 10.        ,  0.08702035, 10.        ,
        0.11955469,  0.13806805,  0.07851825,  0.04539296,  0.03049082,
       10.        ,  0.08786056, 10.        ,  0.09154192,  0.04712409,
        0.05771601,  0.05298838,  0.05260412, 10.        ,  0.11792402,
        0.02778008,  0.05000826,  0.03326573,  0.03211796,  0.03779933,
        0.04393745,  0.1695765 ,  0.15168063,  0.25202227, 10.  

In [275]:
core_test = sparse_partition(D_sparse, min_points, max_val=10)
core_test

IndexError: list index out of range

In [127]:
D_sparse.T.data

array([0.14072441, 0.54610425, 0.12045969, ..., 1.05872238, 0.27734983,
       0.29902574])

In [123]:
D_sparse[0,:].data

array([0.14072441, 0.54610425, 0.12045969, 0.28982359, 0.26774687,
       0.2630457 , 0.11201889, 0.1147987 , 0.15513985, 0.24766865,
       0.21948646, 0.23696177, 0.18639688, 0.35997349, 0.25411701,
       0.20584038, 0.25545377, 0.21606353, 0.18639039, 0.14398976])

In [125]:
# Rebuild the stored values row by row: each stored distance in row i is
# replaced by max(core_distances[i], distance). Only the row's core distance
# is applied here, not the column's. The result is one flat array,
# concatenated in row order.
n_row = D_sparse.shape[0]
new_data = np.array(
    [x for i in range(n_row)
     for x in np.where(core_distances[i] > D_sparse[i,:].data,
                      core_distances[i], D_sparse[i,:].data)
     
    ]
     )


array([0.15513985, 0.54610425, 0.15513985, ..., 1.05872238, 0.30724514,
       0.30724514])

# Run on all datasets

In [45]:
def get_single_linkage_clustering(dataset_id, distance_function, k=None):
    """Cluster one dataset from a partial distance matrix and print scores.

    The partial distance matrix is built with ``distance_function`` and
    clustered with ``clustering_from_sparse``.  ARI/AMI against the ground
    truth are printed for that clustering, and for an HDBSCAN run on the UMAP
    embedding as a baseline.  A Markdown heading with the dataset name is
    displayed first.

    Parameters
    ----------
    dataset_id : int
        Identifier passed to the dataset helpers.
    distance_function : callable
        If its ``__name__`` is ``"get_avg_neighbor_distance"`` it is called as
        ``distance_function(raw_data)``; otherwise as
        ``distance_function(A_umap, umap_rep)``.
    k : int, optional
        Number of neighbours for the UMAP graph; defaults to the dataset's
        configured ``n_neighbors``.

    Returns
    -------
    tuple
        ``(labels, ari, ami, targets)``, where ``labels`` is the
        partial-distance clustering and ``ari``/``ami`` score those labels.
    """
    raw_data, targets, dataset_name = get_dataset(dataset_id=dataset_id)
    display(Markdown(f'### {dataset_name}'))

    if distance_function.__name__ == "get_avg_neighbor_distance":
        D = distance_function(raw_data)
        # The baseline below needs the embedding in this branch too. It used
        # to be undefined here, or silently taken from a stale notebook global.
        umap_rep = get_umap_vectors(dataset_id=dataset_id, raw_data=raw_data)
    else:
        if k is None:
            k = get_dataset_params(dataset_id)['n_neighbors']

        # NOTE: set_op_mix_ratio is 0.5 here, while the exploratory cell
        # earlier in the notebook uses 1.
        A_umap, _sigmas, _rhos, _dists = umap.umap_.fuzzy_simplicial_set(
            X=raw_data,
            n_neighbors=k,
            random_state=0,
            metric='euclidean',
            return_dists=True,
            set_op_mix_ratio=0.5,
        )
        umap_rep = get_umap_vectors(dataset_id=dataset_id, raw_data=raw_data)
        D = distance_function(A_umap, umap_rep)

    labels = clustering_from_sparse(D)

    # Scores of the returned labels. They are kept in their own names so the
    # baseline scores below cannot overwrite them before the return.
    partial_ari = adjusted_rand_score(targets, labels)
    partial_ami = adjusted_mutual_info_score(targets, labels)
    print(f'PARTIAL DISTANCE:\nARI = {partial_ari} and AMI = {partial_ami}\n')

    hd_umap_labels = h_dbscan(umap_rep, which_algo='hdbscan', dataset_id=dataset_id)

    baseline_ari = adjusted_rand_score(targets, hd_umap_labels)
    baseline_ami = adjusted_mutual_info_score(targets, hd_umap_labels)
    print(f'UMAP+HDBSCAN:\nARI = {baseline_ari} and AMI = {baseline_ami}\n\n')

    return (labels, partial_ari, partial_ami, targets)

In [46]:
def low_dim_distance(A_umap, umap_rep):
    """Replace every stored entry of ``A_umap`` by a distance in the embedding.

    Parameters
    ----------
    A_umap : scipy sparse matrix
        Only its sparsity pattern is used.  Any sparse format is accepted; it
        is converted to CSR and copied, so the input is left untouched.
    umap_rep : array-like, one row per point
        Coordinates of the points, indexed like the rows and columns of
        ``A_umap``.

    Returns
    -------
    CSR matrix with the same pattern as ``A_umap`` where entry ``(i, j)`` is
    the Euclidean distance between ``umap_rep[i]`` and ``umap_rep[j]``.
    """
    D = A_umap.tocsr().copy()
    points = np.asarray(umap_rep)

    # Row index of every stored entry, expanded from the CSR row pointer.
    row_of_entry = np.repeat(np.arange(D.shape[0]), np.diff(D.indptr))

    # All edge lengths in one vectorised pass instead of a Python-level loop.
    D.data = np.linalg.norm(points[row_of_entry] - points[D.indices], axis=1)
    return D

In [48]:
# Run the comparison for dataset ids 0..5 (six names are printed for
# data_set_list at the top of the notebook). `res` is overwritten on every
# iteration, so only the last dataset's result is kept; the scores are read
# from the printed output.
for i in range(6):
    res = get_single_linkage_clustering(dataset_id=i, distance_function = low_dim_distance)

### pendigits

PARTIAL DISTANCE:
ARI = 0.5937845136423342 and AMI = 0.7459900925803741

UMAP+HDBSCAN:
ARI = 0.9185149200427103 and AMI = 0.9320899303214291




### coil

  "Graph is not fully connected, spectral embedding may not work as expected."


Disconnected distance matrix: 
 -Connecting distance matrix
PARTIAL DISTANCE:
ARI = 0.7366539128552851 and AMI = 0.8614787111461784

UMAP+HDBSCAN:
ARI = 0.7910813366998825 and AMI = 0.9424028955749346




### mnist

PARTIAL DISTANCE:
ARI = 0.6536270135409726 and AMI = 0.6850632024244153

UMAP+HDBSCAN:
ARI = 0.8987078065717921 and AMI = 0.8868226537248037




### usps

PARTIAL DISTANCE:
ARI = 0.7406192640111551 and AMI = 0.7621950198047123

UMAP+HDBSCAN:
ARI = 0.8824676944734986 and AMI = 0.9004545663772475




### buildings

Disconnected distance matrix: 
 -Connecting distance matrix
PARTIAL DISTANCE:
ARI = 0.162062153104807 and AMI = 0.5937277651507081

UMAP+HDBSCAN:
ARI = 0.2458851298996959 and AMI = 0.6307848503576966




### clusterable

PARTIAL DISTANCE:
ARI = 0.09585077605431681 and AMI = 0.4880254512377565

UMAP+HDBSCAN:
ARI = 0.1478619673703955 and AMI = 0.5132842384271498


