# Attempt to use hierarchical clustering on the weighted UMAP graph directly


In [None]:
# `execfile` is a Python 2 builtin and does not exist in plain Python 3, so the
# helper scripts are read and exec'd explicitly instead. Executing them at cell
# top level places their definitions in the notebook namespace; later cells
# presumably rely on that for names such as get_dataset and h_dbscan.
for _helper_path in ('functions/data_specifics.py', 'functions/graph_functions.py'):
    with open(_helper_path, encoding='utf-8') as _helper_file:
        # compile() with the real path keeps tracebacks pointing at the helper file.
        exec(compile(_helper_file.read(), _helper_path, 'exec'))
print(data_set_list)

In [None]:
from operator import itemgetter

from IPython.display import display, Markdown, Latex
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, silhouette_score
from sklearn import cluster

import numpy as np
import pandas as pd
import umap

import matplotlib.pyplot as plt
import seaborn as sns

import networkx as nx
from networkx.algorithms import community

sns.set()

In [None]:
import leidenalg as la
import igraph as ig
from tqdm import tqdm
from sklearn.manifold import TSNE
from matplotlib.collections import LineCollection
from sklearn.preprocessing import normalize
import scipy.sparse.csgraph

## Get data

In [None]:
# Select the dataset; the same id is reused for the parameter lookup and the baseline run.
dataset_id=1
raw_data, targets, dataset_name = get_dataset(dataset_id=dataset_id)

# Per-dataset neighbour count.
# NOTE(review): the graph construction later passes n_neighbors=15 literally
# rather than `k` — confirm which value is intended.
k = get_dataset_params(dataset_id)['n_neighbors']

## UMAP Graph

Build and visualize the UMAP graph

In [None]:
%%time
# t-SNE embedding of the raw data; the cells below use it only as the
# x/y layout for the scatter and edge plots.
tsne_map = TSNE().fit_transform(raw_data)

In [None]:
# Build UMAP's fuzzy simplicial set; only the first of the three returned values is kept.
# NOTE(review): n_neighbors is hard-coded to 15 here instead of using `k`
# from the dataset parameters — confirm.
symmetric_graph, _, _ = umap.umap_.fuzzy_simplicial_set(raw_data, n_neighbors=15, metric="euclidean", random_state=42)
# Work on a copy so `symmetric_graph` itself is left untouched.
umap_graph = symmetric_graph.copy()

In [None]:
# Draw the UMAP graph over the t-SNE layout, colouring points by their true label.
fig, ax = plt.subplots(figsize=(12, 12))
coo_umap_graph = umap_graph.tocoo()
edge_weights = coo_umap_graph.data

# One segment per stored edge: (start point, end point) in t-SNE coordinates.
edge_segments = np.stack(
    [tsne_map[coo_umap_graph.row], tsne_map[coo_umap_graph.col]],
    axis=1,
)
# Black RGBA rows whose alpha channel is the edge weight.
edge_colors = np.column_stack([np.zeros((edge_weights.shape[0], 3)), edge_weights])

edges = LineCollection(
    edge_segments,
    linewidths=0.5 * edge_weights,
    colors=edge_colors,
    zorder=3,
)
ax.add_collection(edges)
ax.scatter(*tsne_map.T, s=5, c=targets, cmap="Spectral")

# Modified Girvan-Newman to find when connected components break

This is effectively a divisive hierarchical clustering algorithm. We remove the edges of the UMAP graph from lowest weight to highest weight, and each time removing an edge splits a connected component into more parts, we get a new set of communities. `nx.community.girvan_newman(umap_G, most_valuable_edge=lowest_weight)` yields these flat cuts one at a time, one for each level of the hierarchical clustering at which a new component is introduced.

In [None]:
# networkx 3.0 removed `from_scipy_sparse_matrix`; `from_scipy_sparse_array`
# (available since 2.7) is its replacement. Use whichever this installation
# provides so the cell runs on both old and new networkx versions.
# The resulting edges carry the sparse entries as their 'weight' attribute,
# which is what `lowest_weight` reads.
_graph_from_sparse = getattr(nx, 'from_scipy_sparse_array', None) or nx.from_scipy_sparse_matrix
umap_G = _graph_from_sparse(umap_graph)

In [None]:
def lowest_weight(G):
    """Return the endpoints of the edge in ``G`` with the smallest weight.

    ``G.edges(data='weight')`` is expected to yield ``(u, v, weight)``
    triples. When several edges share the minimum weight, the first one
    encountered is chosen.

    Returns:
        A ``(u, v)`` tuple identifying the selected edge.
    """
    weighted_edges = G.edges(data='weight')
    source, target, _weight = min(weighted_edges, key=itemgetter(2))
    return source, target

In [None]:
%%time
# Run Girvan-Newman with `lowest_weight` choosing the edge to remove at each
# step, and materialise every partition it yields into a list.
# communities[i] is the i-th partition produced; later cells index it by cut level.
communities = list(nx.community.girvan_newman(umap_G, most_valuable_edge=lowest_weight))

In [None]:
# Keep a shallow copy of the list of partitions under a second name.
original_communities = communities.copy()

Let's look at the modularity of the various cut levels (from the example here: https://networkx.org/documentation/stable/auto_examples/algorithms/plot_girvan_newman.html)

In [None]:
%%time
# Modularity -> measures the strength of division of a network into modules
modularity_df = pd.DataFrame(
    [
        [k + 1, len(communities[k]), nx.community.modularity(umap_G, communities[k])]
        for k in range(len(communities))
    ],
    columns=["cut level", "components", "modularity"],
)

In [None]:
# Show rows 10 through 19 (by position) of the modularity table.
modularity_df.iloc[10:20]

In [None]:
# Position of the first row that attains the maximum modularity.
best_modularity = modularity_df["modularity"].max()
is_best = (modularity_df["modularity"] == best_modularity).to_numpy()
max_mod = np.flatnonzero(is_best)[0]
modularity_df.iloc[max_mod]

This seems to be consistent with how we tend to get around 16 clusters using modularity based clustering algorithms.

In [None]:
# Plot how modularity changes with cut level, i.e. as more of the
# lowest-weight edges are removed and further components split off.
modularity_df.plot(x='cut level', y='modularity')

## Do flat cuts and evaluate the resulting clusters

Let's look at the clustering results for flat cuts from 10 to 30. 

Turn a list of communities into a cluster label list

In [None]:
# Convert each partition (a collection of node sets) into a flat label list
# following umap_G's node order, then score it against the true targets.
max_cut_levels = 150  # upper bound on how many cut levels are evaluated
# Never index past the partitions that actually exist: a graph that yields
# fewer than `max_cut_levels` partitions would otherwise raise IndexError.
n_cut_levels = min(max_cut_levels, len(communities))
node_order = list(umap_G.nodes)  # loop-invariant, so computed once

results = {}
cluster_label_dict = {}
for cut_level in range(n_cut_levels):
    # The loop variable is `members`, not `cluster`: the latter would rebind
    # the `cluster` module imported from sklearn at the top of the notebook.
    node_to_label = {
        node: label
        for label, members in enumerate(communities[cut_level])
        for node in members
    }
    cluster_labels = [node_to_label[node] for node in node_order]
    cluster_label_dict[cut_level] = cluster_labels

    results[cut_level] = {
        'ARI': adjusted_rand_score(targets, cluster_labels),
        'AMI': adjusted_mutual_info_score(targets, cluster_labels),
        'Clusters': len(set(cluster_labels)),
    }

In [None]:
# One row per cut level, one column per recorded metric.
results_df = pd.DataFrame(results).transpose()
# Show rows 10 through 24 (by position).
results_df.iloc[10:25]

## A look at the various cut levels

At cut level 12, we have the highest modularity, and too few clusters. We have 2 overly large clusters (size 286 and 144) and a single tiny cluster of size 2 (which probably should be considered noise eventually).

In [None]:
# Metrics recorded for the cut level under inspection.
cut_level = 12
results_df.loc[cut_level]

In [None]:
# Cluster sizes at the current cut level (the label values themselves are not shown).
label_values, cluster_sizes = np.unique(cluster_label_dict[cut_level], return_counts=True)
cluster_sizes

In [None]:
# Draw the UMAP graph over the t-SNE layout, colouring points by their
# cluster label at the current cut level.
fig, ax = plt.subplots(figsize=(12, 12))
coo_umap_graph = umap_graph.tocoo()
edge_weights = coo_umap_graph.data

# One segment per stored edge: (start point, end point) in t-SNE coordinates.
edge_segments = np.stack(
    [tsne_map[coo_umap_graph.row], tsne_map[coo_umap_graph.col]],
    axis=1,
)
# Black RGBA rows whose alpha channel is the edge weight.
edge_colors = np.column_stack([np.zeros((edge_weights.shape[0], 3)), edge_weights])

edges = LineCollection(
    edge_segments,
    linewidths=0.2 * edge_weights,
    colors=edge_colors,
    zorder=3,
)
ax.add_collection(edges)
ax.scatter(*tsne_map.T, s=5, c=cluster_label_dict[cut_level], cmap="Spectral")

For cut level 15, we have the correct number of clusters, but it has not split the big clusters, just overly split some of the correct clusters. 

In [None]:
# Metrics recorded for the cut level under inspection.
# NOTE(review): the accompanying text discusses cut level 15, but this cell
# inspects 16 — confirm which one is meant.
cut_level = 16
results_df.loc[cut_level]

In [None]:
# Cluster sizes at the current cut level (the label values themselves are not shown).
label_values, cluster_sizes = np.unique(cluster_label_dict[cut_level], return_counts=True)
cluster_sizes

It is not until cut levels 16, 17, 18 and 19 that the big clusters start to get broken apart. By 18, we have 3 tiny clusters of size < 5, which we should probably refuse to cluster. By 19, we have our highest AMI and ARI, which is comparable to UMAP+HDBSCAN results (below). 

In [None]:
# Metrics recorded for the cut level under inspection.
cut_level = 19
results_df.loc[cut_level]

In [None]:
# Cluster sizes at the current cut level (the label values themselves are not shown).
label_values, cluster_sizes = np.unique(cluster_label_dict[cut_level], return_counts=True)
cluster_sizes

In [None]:
# Draw the UMAP graph over the t-SNE layout, colouring points by their
# cluster label at the current cut level.
fig, ax = plt.subplots(figsize=(12, 12))
coo_umap_graph = umap_graph.tocoo()
edge_weights = coo_umap_graph.data

# One segment per stored edge: (start point, end point) in t-SNE coordinates.
edge_segments = np.stack(
    [tsne_map[coo_umap_graph.row], tsne_map[coo_umap_graph.col]],
    axis=1,
)
# Black RGBA rows whose alpha channel is the edge weight.
edge_colors = np.column_stack([np.zeros((edge_weights.shape[0], 3)), edge_weights])

edges = LineCollection(
    edge_segments,
    linewidths=0.2 * edge_weights,
    colors=edge_colors,
    zorder=3,
)
ax.add_collection(edges)
ax.scatter(*tsne_map.T, s=5, c=cluster_label_dict[cut_level], cmap="Spectral")

It's not until cut level 55 that the clusters have broken apart enough that none is larger than two merged together (i.e. at most 2 × 72 = 144 points).

In [None]:
# Find the first cut level whose largest cluster holds at most 2 * 72 points.
# The stored labellings are iterated directly (insertion order, i.e. increasing
# cut level) instead of repeating the hard-coded number of evaluated levels,
# so this cannot raise KeyError if fewer levels were stored.
for cut_level, level_labels in cluster_label_dict.items():
    _, counts = np.unique(level_labels, return_counts=True)
    if max(counts) <= 72*2:
        print(f'cut_level: {cut_level} \nlargest cluster size: {max(counts)}\nclusters: {len(counts)}\ncounts:{counts}')
        break

And it's not until cut level 118 that all the clusters are size 72 or less. 

In [None]:
# Find the first cut level at which every cluster holds at most 72 points.
# The stored labellings are iterated directly (insertion order, i.e. increasing
# cut level) instead of repeating the hard-coded number of evaluated levels,
# so this cannot raise KeyError if fewer levels were stored.
for cut_level, level_labels in cluster_label_dict.items():
    _, counts = np.unique(level_labels, return_counts=True)
    if max(counts) <= 72:
        print(f'cut_level: {cut_level} \nlargest cluster size: {max(counts)}\nclusters: {len(counts)}\ncounts:{counts}')
        break

This suggests that some of these oversized components are very persistent, and are connected by lots of higher weight edges making them hard to break apart. 

# Compare against UMAP+HDBSCAN

In [None]:
%%time
# Baseline: obtain a UMAP representation of the same data, cluster it via
# h_dbscan with which_algo='hdbscan', and score the labels against the targets.
# NOTE(review): get_umap_vectors and h_dbscan are not defined in the visible
# cells; presumably they come from the helper scripts and pick their
# parameters from dataset_id — confirm.
umap_rep = get_umap_vectors(dataset_id=dataset_id, raw_data=raw_data)
hd_umap_labels = h_dbscan(umap_rep, which_algo='hdbscan', dataset_id=dataset_id)
ari_baseline = adjusted_rand_score(targets, hd_umap_labels)
ami_baseline = adjusted_mutual_info_score(targets, hd_umap_labels)

In [None]:
# Report the baseline's cluster sizes, then its ARI/AMI scores.
counts = np.unique(hd_umap_labels, return_counts=True)[1]
print(f'largest cluster size: {max(counts)}\nclusters: {len(counts)}\ncounts:{counts}')
print(f'ARI = {ari_baseline} and AMI = {ami_baseline}')

Admittedly, UMAP+HDBSCAN also fails to separate these large, hard-to-separate clusters.

The first cut level whose largest cluster is no bigger than the largest UMAP+HDBSCAN cluster (216 points) is 17, and it has a similar ARI and AMI.

In [None]:
# Find the first cut level whose largest cluster holds at most 216 points,
# then show the metrics recorded for that level.
# The stored labellings are iterated directly (insertion order, i.e. increasing
# cut level) instead of repeating the hard-coded number of evaluated levels,
# so this cannot raise KeyError if fewer levels were stored.
for cut_level, level_labels in cluster_label_dict.items():
    _, counts = np.unique(level_labels, return_counts=True)
    if max(counts) <= 216:
        print(f'cut_level: {cut_level} \nlargest cluster size: {max(counts)}\nclusters: {len(counts)}\ncounts:{counts}')
        break
results_df.T[cut_level]

In [None]:
# Draw the UMAP graph over the t-SNE layout, colouring points by the
# UMAP+HDBSCAN baseline labels.
fig, ax = plt.subplots(figsize=(12, 12))
coo_umap_graph = umap_graph.tocoo()
edge_weights = coo_umap_graph.data

# One segment per stored edge: (start point, end point) in t-SNE coordinates.
edge_segments = np.stack(
    [tsne_map[coo_umap_graph.row], tsne_map[coo_umap_graph.col]],
    axis=1,
)
# Black RGBA rows whose alpha channel is the edge weight.
edge_colors = np.column_stack([np.zeros((edge_weights.shape[0], 3)), edge_weights])

edges = LineCollection(
    edge_segments,
    linewidths=0.5 * edge_weights,
    colors=edge_colors,
    zorder=3,
)
ax.add_collection(edges)
ax.scatter(*tsne_map.T, s=5, c=hd_umap_labels, cmap="Spectral")