# Persistent single-linkage (HDBSCAN) clustering on sparse matrices

This is our attempt at running an HDBSCAN-style approach (persistent single linkage) on a sparse distance matrix.
The sparse distance matrix we use here has the sparsity pattern of the UMAP graph, with distances computed in the low-dimensional UMAP projection. We want to see how the code behaves on a partial distance matrix.

In [76]:
!git branch

* master


In [2]:
# `execfile` is a Python 2 builtin, while the rest of this notebook relies on
# Python 3 syntax (f-strings). Running the helper scripts through exec() keeps
# the original effect: their definitions land in the notebook's namespace.
for _helper_script in ('functions/data_specifics.py', 'functions/graph_functions.py'):
    with open(_helper_script, encoding='utf-8') as _helper_file:
        exec(compile(_helper_file.read(), _helper_script, 'exec'))
print(data_set_list)

['pendigits', 'coil', 'mnist', 'usps', 'buildings', 'clusterable']


In [6]:
from IPython.display import display, Markdown, Latex
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, silhouette_score
from sklearn import cluster

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import umap
from collections import Counter
from scipy.stats import mode

from scipy.spatial.distance import euclidean

sns.set()

In [4]:
import hdbscan
import scipy.sparse
import sklearn.cluster
from hdbscan._hdbscan_tree import (
    condense_tree,
    compute_stability,
    get_clusters,
    outlier_scores,
)
from hdbscan.plots import CondensedTree, SingleLinkageTree, MinimumSpanningTree

## Get clusters from a (disconnected) sparse distance matrix

* The 0-values indicate "far away" points

If the matrix has disconnected components (when viewed as a graph), HDBSCAN does not work. To get around this, we have tried clustering each component separately or just adding edges to glue the different parts together.

In [5]:
def clustering_from_sparse(D,
                           connected_component_method='glue',  # 'glue' or 'individual'
                           cluster_selection_method='eom',
                           allow_single_cluster=False,
                           cluster_selection_epsilon=0.0,
                           max_cluster_size=0,
                           min_cluster_size=10,
                           no_split_size=100,
                           alpha=1.2):
    """Cluster a sparse distance matrix, whether or not its graph is connected.

    The number of connected components of ``D`` decides which helper runs:

    * one component: ``clustering_from_sparse_connected``;
    * several components and ``connected_component_method == 'glue'``:
      ``clustering_from_sparse_force_connection``;
    * several components and ``connected_component_method == 'individual'``:
      ``clustering_from_sparse_on_individual_cc``.

    Every argument is forwarded unchanged, by keyword, to the selected helper.

    Parameters
    ----------
    D : scipy sparse matrix
        Sparse distance matrix, read as a graph by
        ``scipy.sparse.csgraph.connected_components``.
    connected_component_method : {'glue', 'individual'}
        Strategy used when ``D`` has more than one connected component.
    cluster_selection_method, allow_single_cluster, cluster_selection_epsilon,
    max_cluster_size, min_cluster_size, no_split_size, alpha
        Tuning values passed through to the helper.

    Returns
    -------
    The labels returned by the selected helper.

    Raises
    ------
    ValueError
        If ``D`` is disconnected and ``connected_component_method`` is neither
        'glue' nor 'individual'.
    """
    # Spelled out explicitly rather than captured with locals(), so that a
    # local variable added to this function can never leak into the helper
    # call as an unexpected keyword argument.
    params = dict(
        D=D,
        connected_component_method=connected_component_method,
        cluster_selection_method=cluster_selection_method,
        allow_single_cluster=allow_single_cluster,
        cluster_selection_epsilon=cluster_selection_epsilon,
        max_cluster_size=max_cluster_size,
        min_cluster_size=min_cluster_size,
        no_split_size=no_split_size,
        alpha=alpha,
    )

    n_components, _ = scipy.sparse.csgraph.connected_components(D)

    if n_components == 1:
        labels = clustering_from_sparse_connected(**params)

    elif connected_component_method == 'glue':
        print("Disconnected distance matrix: \n -Connecting distance matrix")
        labels = clustering_from_sparse_force_connection(**params)

    elif connected_component_method == 'individual':
        print("Disconnected distance matrix: \n -Performing clustering on individual connected components")
        labels = clustering_from_sparse_on_individual_cc(**params)

    else:
        raise ValueError('Parameter should be glue or individual')

    return labels


# Single linkage on sparse matrix when the initial distance matrix is connected.
def clustering_from_sparse_connected(D,
                                     connected_component_method='glue',
                                     cluster_selection_method='eom',
                                     allow_single_cluster=False,
                                     cluster_selection_epsilon=0.0,
                                     max_cluster_size=0,
                                     min_cluster_size=10,
                                     no_split_size=100,
                                     alpha=1.2):
    """Label the points of a connected sparse distance matrix.

    Pipeline: minimum spanning tree of ``D`` -> edges sorted by weight ->
    single-linkage tree -> HDBSCAN condensed tree -> cluster stabilities ->
    flat cluster labels.

    ``connected_component_method``, ``no_split_size`` and ``alpha`` are not
    used here; they are accepted so this function shares its signature with
    the other ``clustering_from_sparse_*`` functions.

    Returns the label array produced by hdbscan's ``get_clusters``.
    """
    # Minimum spanning tree as an (n_edges, 3) array of [row, col, weight].
    spanning_tree = scipy.sparse.csgraph.minimum_spanning_tree(D).tocoo()
    edges = np.vstack([spanning_tree.row, spanning_tree.col, spanning_tree.data]).T

    # Order the edges by increasing weight (mergesort keeps ties stable).
    by_weight = np.argsort(edges.T[2], kind="mergesort")
    edges = edges[by_weight, :]

    # Sorted edge list -> standard hierarchical clustering format.
    linkage = sklearn.cluster._hierarchical_fast._single_linkage_label(edges)

    # The remaining steps are HDBSCAN functions.
    condensed = condense_tree(linkage, min_cluster_size)
    stability = compute_stability(condensed)
    labels, _probabilities, _stabilities = get_clusters(
        condensed,
        stability,
        cluster_selection_method=cluster_selection_method,
        allow_single_cluster=allow_single_cluster,
        cluster_selection_epsilon=cluster_selection_epsilon,
        max_cluster_size=max_cluster_size,
    )
    return labels


# Performing clustering on individual connected components of the sparse distance matrix
def clustering_from_sparse_on_individual_cc(D,
                                            connected_component_method='glue',
                                            cluster_selection_method='eom',
                                            allow_single_cluster=False,
                                            cluster_selection_epsilon=0.0,
                                            max_cluster_size=0,
                                            min_cluster_size=10,
                                            no_split_size=100,
                                            alpha=1.2):
    """Cluster each connected component of ``D`` separately.

    If ``D`` is connected it is passed to ``clustering_from_sparse_connected``
    together with the clustering parameters. Otherwise each connected
    component is handled according to its size:

    * fewer than ``min_cluster_size`` points: left as noise (label -1);
    * fewer than ``no_split_size`` points: kept whole as a single cluster;
    * otherwise: clustered with ``clustering_from_sparse_connected``; if more
      than half of its points come back as noise, the whole component is
      kept as a single cluster instead.

    Labels of successive components are offset so that they never collide.

    ``connected_component_method`` and ``alpha`` are not used here; they are
    accepted so this function shares its signature with the other
    ``clustering_from_sparse_*`` functions.

    Returns
    -------
    numpy.ndarray
        One integer label per row of ``D``; -1 marks noise. (In the connected
        case, whatever ``clustering_from_sparse_connected`` returns.)
    """
    n_components, component_of = scipy.sparse.csgraph.connected_components(D)

    # Parameters handed to the connected-matrix clustering. The same set is
    # used in both code paths so a connected input honours the caller's
    # settings too.
    connected_params = dict(
        cluster_selection_method=cluster_selection_method,
        allow_single_cluster=allow_single_cluster,
        cluster_selection_epsilon=cluster_selection_epsilon,
        max_cluster_size=max_cluster_size,
        min_cluster_size=min_cluster_size,
    )

    if n_components == 1:
        return clustering_from_sparse_connected(D, **connected_params)

    labels = np.full(D.shape[0], -1)
    for comp in range(n_components):
        in_comp = component_of == comp
        comp_size = np.count_nonzero(in_comp)

        if comp_size < min_cluster_size:
            # Too small to form a cluster: its points stay labelled -1.
            continue

        if comp_size < no_split_size:
            # Small component: keep it whole as one cluster.
            labels_part = np.zeros(comp_size)
        else:
            D_part = D[in_comp, :][:, in_comp]
            labels_part = clustering_from_sparse_connected(D_part, **connected_params)
            # Mostly noise: fall back to treating the component as one cluster.
            if np.count_nonzero(labels_part == -1) > len(labels_part) / 2:
                labels_part = 0 * labels_part

        # Narrow the mask to the non-noise points of this component, then
        # shift their labels past every label assigned so far.
        clustered = labels_part >= 0
        in_comp[in_comp] = clustered
        labels[in_comp] = labels_part[clustered] + labels.max() + 1
    return labels


# Adding large distance values (alpha * largest) to sparse distance matrix to make it connected
def clustering_from_sparse_force_connection(D,
                                            connected_component_method='glue',
                                            cluster_selection_method='eom',
                                            allow_single_cluster=False,
                                            cluster_selection_epsilon=0.0,
                                            max_cluster_size=0,
                                            min_cluster_size=10,
                                            no_split_size=100,
                                            alpha=1.2):
    """Connect a disconnected distance matrix with long edges, then cluster it.

    When ``D`` has several connected components, one representative point per
    component (the lowest-indexed point of that component) is chosen and every
    pair of representatives is joined, in both directions, by an edge of
    weight ``alpha * max(D.data)``. The resulting matrix is clustered with
    ``clustering_from_sparse_connected`` using the caller's clustering
    parameters.

    ``connected_component_method`` and ``no_split_size`` are not used here;
    they are accepted so this function shares its signature with the other
    ``clustering_from_sparse_*`` functions.

    Returns the labels produced by ``clustering_from_sparse_connected``.
    """
    n_components, component_of = scipy.sparse.csgraph.connected_components(D)

    if n_components > 1:
        glue_weight = alpha * D.data.max()

        # Index of the first point of every component, found in one pass
        # instead of a list search per pair of components.
        _, representatives = np.unique(component_of, return_index=True)

        # Every unordered pair of components, stored in both directions so
        # the added block is symmetric.
        first, second = np.triu_indices(n_components, k=1)
        src = representatives[first]
        dst = representatives[second]
        rows = np.concatenate([src, dst])
        cols = np.concatenate([dst, src])
        vals = np.full(rows.shape[0], glue_weight)

        D_glue = scipy.sparse.coo_matrix((vals, (rows, cols)), shape=D.shape)
        D_new = D + D_glue
    else:
        D_new = D

    # Forward the clustering parameters so caller-supplied values are not
    # silently replaced by the defaults.
    labels = clustering_from_sparse_connected(
        D_new,
        cluster_selection_method=cluster_selection_method,
        allow_single_cluster=allow_single_cluster,
        cluster_selection_epsilon=cluster_selection_epsilon,
        max_cluster_size=max_cluster_size,
        min_cluster_size=min_cluster_size,
    )
    return labels

# Clustering algorithm
These are the steps we use to feed a sparse distance matrix to the single linkage...
We are not using HDBSCAN at the moment, but this is where we would like to go.

## Get data, UMAP graph and UMAP low dimensional vectors

In [35]:
# Dataset to analyse in this section.
dataset_id = 0
raw_data, targets, dataset_name = get_dataset(dataset_id=dataset_id)

# Neighbourhood size configured for this dataset.
k = get_dataset_params(dataset_id)['n_neighbors']

# UMAP graph of the raw data (also returns sigmas, rhos and the distances).
A_umap, sigmas, rhos, dists = umap.umap_.fuzzy_simplicial_set(
    X=raw_data,
    n_neighbors=k,
    random_state=0,
    metric='euclidean',
    return_dists=True,
    set_op_mix_ratio=1,
)

# Low dimensional UMAP vectors for the same data.
umap_rep = get_umap_vectors(dataset_id=dataset_id, raw_data=raw_data)

## Build the partial distance matrix based on the UMAP graph

In [40]:
# Same sparsity pattern as the UMAP graph; only the stored values are replaced.
D = A_umap.copy()
# Row index of every stored entry: row i is repeated once per value stored in
# row i (row lengths are the differences of consecutive indptr entries).
rows = np.repeat(np.arange(A_umap.shape[0]), np.diff(A_umap.indptr))
# Euclidean distance, in the low dimensional UMAP space, between the two
# endpoints of every stored entry -- computed for all entries at once.
_umap_points = np.asarray(umap_rep)
D.data = np.linalg.norm(_umap_points[rows] - _umap_points[A_umap.indices], axis=1)

## Compare robust single-linkage (portions of HDBSCAN) on partial distance vs. on low dim projection

In [38]:
# Cluster the partial distance matrix and score the labels against the targets.
labels = clustering_from_sparse(D)

ari, ami = (
    score(targets, labels)
    for score in (adjusted_rand_score, adjusted_mutual_info_score)
)
print(f'ARI = {ari} and AMI = {ami}')

ARI = 0.5937845136423342 and AMI = 0.7459900925803741


In [39]:
# Baseline: HDBSCAN on the low dimensional UMAP vectors, scored the same way.
hd_umap_labels = h_dbscan(umap_rep, dataset_id=dataset_id, which_algo='hdbscan')

ari, ami = (
    score(targets, hd_umap_labels)
    for score in (adjusted_rand_score, adjusted_mutual_info_score)
)
print(f'ARI = {ari} and AMI = {ami}')

ARI = 0.9185149200427103 and AMI = 0.9320899303214291


# Run on all datasets

In [45]:
def get_single_linkage_clustering(dataset_id, distance_function, k=None):
    """Cluster one dataset from a sparse distance matrix and print its scores.

    The sparse distance matrix is built with ``distance_function``, clustered
    with ``clustering_from_sparse`` and scored against the dataset targets
    (ARI and AMI). For comparison, HDBSCAN on the low dimensional UMAP
    vectors is scored and printed as well.

    Parameters
    ----------
    dataset_id
        Identifier passed to ``get_dataset``, ``get_dataset_params``,
        ``get_umap_vectors`` and ``h_dbscan``.
    distance_function : callable
        A function named ``get_avg_neighbor_distance`` is called as
        ``distance_function(raw_data)``; any other function is called as
        ``distance_function(A_umap, umap_rep)``.
    k : int, optional
        Number of neighbours for the UMAP graph; defaults to the dataset's
        configured ``'n_neighbors'``. Not used for
        ``get_avg_neighbor_distance``.

    Returns
    -------
    tuple
        ``(labels, ari, ami, targets)`` where ``ari`` and ``ami`` score the
        returned ``labels`` (the partial distance clustering), not the
        UMAP+HDBSCAN baseline.
    """
    raw_data, targets, dataset_name = get_dataset(dataset_id=dataset_id)
    display(Markdown(f'### {dataset_name}'))

    if distance_function.__name__ == "get_avg_neighbor_distance":
        D = distance_function(raw_data)
        # The baseline below needs the UMAP vectors in this branch as well;
        # without this line `umap_rep` is undefined here (or silently picks
        # up a stale notebook global belonging to another dataset).
        umap_rep = get_umap_vectors(dataset_id=dataset_id, raw_data=raw_data)
    else:
        if k is None:
            k = get_dataset_params(dataset_id)['n_neighbors']

        A_umap, _sigmas, _rhos, _dists = umap.umap_.fuzzy_simplicial_set(
            X=raw_data,
            n_neighbors=k,
            random_state=0,
            metric='euclidean',
            return_dists=True,
            set_op_mix_ratio=0.5,
        )
        umap_rep = get_umap_vectors(dataset_id=dataset_id, raw_data=raw_data)
        D = distance_function(A_umap, umap_rep)

    labels = clustering_from_sparse(D)

    ari = adjusted_rand_score(targets, labels)
    ami = adjusted_mutual_info_score(targets, labels)
    print(f'PARTIAL DISTANCE:\nARI = {ari} and AMI = {ami}\n')

    hd_umap_labels = h_dbscan(umap_rep, which_algo='hdbscan', dataset_id=dataset_id)

    # Separate names for the baseline scores, so the values returned below
    # still describe `labels`.
    baseline_ari = adjusted_rand_score(targets, hd_umap_labels)
    baseline_ami = adjusted_mutual_info_score(targets, hd_umap_labels)
    print(f'UMAP+HDBSCAN:\nARI = {baseline_ari} and AMI = {baseline_ami}\n\n')

    return labels, ari, ami, targets

In [46]:
def low_dim_distance(A_umap, umap_rep):
    """Return a copy of ``A_umap`` whose values are low dimensional distances.

    Every stored entry (i, j) of ``A_umap`` is replaced by the Euclidean
    distance between ``umap_rep[i]`` and ``umap_rep[j]``. The sparsity
    pattern is kept and ``A_umap`` itself is not modified.

    Parameters
    ----------
    A_umap : scipy sparse matrix exposing ``indptr`` and ``indices``
        The row of each stored entry is derived from ``indptr``, i.e. the
        matrix is read in row-compressed (CSR) layout.
    umap_rep : array-like, one row of coordinates per point.

    Returns
    -------
    Sparse matrix of the same type and shape as ``A_umap``.
    """
    D = A_umap.copy()
    points = np.asarray(umap_rep)
    # Row index of every stored entry: row i is repeated once per value
    # stored in row i.
    rows = np.repeat(np.arange(A_umap.shape[0]), np.diff(A_umap.indptr))
    # All distances at once instead of one Python-level call per entry.
    D.data = np.linalg.norm(points[rows] - points[A_umap.indices], axis=1)
    return D

In [48]:
# Run on every dataset listed in data_set_list rather than a hard-coded count.
# Assumes dataset ids are the positions in data_set_list (id 0 loads
# 'pendigits', the first entry) -- confirm against get_dataset.
# `results` keeps every run; `res` alone would only hold the last dataset.
results = []
for i, _dataset_label in enumerate(data_set_list):
    res = get_single_linkage_clustering(dataset_id=i, distance_function=low_dim_distance)
    results.append(res)

### pendigits

PARTIAL DISTANCE:
ARI = 0.5937845136423342 and AMI = 0.7459900925803741

UMAP+HDBSCAN:
ARI = 0.9185149200427103 and AMI = 0.9320899303214291




### coil

  "Graph is not fully connected, spectral embedding may not work as expected."


Disconnected distance matrix: 
 -Connecting distance matrix
PARTIAL DISTANCE:
ARI = 0.7366539128552851 and AMI = 0.8614787111461784

UMAP+HDBSCAN:
ARI = 0.7910813366998825 and AMI = 0.9424028955749346




### mnist

PARTIAL DISTANCE:
ARI = 0.6536270135409726 and AMI = 0.6850632024244153

UMAP+HDBSCAN:
ARI = 0.8987078065717921 and AMI = 0.8868226537248037




### usps

PARTIAL DISTANCE:
ARI = 0.7406192640111551 and AMI = 0.7621950198047123

UMAP+HDBSCAN:
ARI = 0.8824676944734986 and AMI = 0.9004545663772475




### buildings

Disconnected distance matrix: 
 -Connecting distance matrix
PARTIAL DISTANCE:
ARI = 0.162062153104807 and AMI = 0.5937277651507081

UMAP+HDBSCAN:
ARI = 0.2458851298996959 and AMI = 0.6307848503576966




### clusterable

PARTIAL DISTANCE:
ARI = 0.09585077605431681 and AMI = 0.4880254512377565

UMAP+HDBSCAN:
ARI = 0.1478619673703955 and AMI = 0.5132842384271498


