# Clustering evaluation on high-dimensional data

Setup: check the current git branch, load the helper scripts, and import the dependencies.

In [1]:
# IPython shell escape: list the repository's local branches (git marks the checked-out one with `*`).
!git branch

* main


In [2]:
# Path of the data directory, relative to the notebook's working directory.
# NOTE(review): not referenced by any visible cell -- presumably read by the
# helper scripts loaded in the next cell; confirm.
data_folder = '../data'

In [3]:
# Run the project helper scripts inside the notebook's global namespace so the
# names they define become available to the cells below (`data_set_list` is
# printed right away; presumably `get_dataset`, `get_umap_vectors` and
# `dataset_clustering_params` come from these scripts too -- confirm).
# `execfile` is not a Python 3 builtin, so the portable open/compile/exec
# equivalent is used; compiling with the file name keeps tracebacks readable.
for helper_script in ('functions/data_specifics.py', 'functions/graph_functions.py'):
    with open(helper_script, encoding='utf-8') as script_file:
        exec(compile(script_file.read(), helper_script, 'exec'), globals())
print(data_set_list)

['pendigits', 'coil', 'mnist', 'usps', 'buildings', 'clusterable']


In [4]:
from IPython.display import display, Markdown, Latex
from sklearn.datasets import fetch_openml
from sklearn.datasets import load_digits
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn import cluster

import numpy as np
import pandas as pd
import requests
import zipfile
import imageio
import os
from PIL import Image
from glob import glob

import matplotlib.pyplot as plt
import seaborn as sns

import hdbscan
import umap
from sklearn.neighbors import KNeighborsTransformer
import pynndescent

# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set()

# Clustering metric evaluation

To make things easier later, we will write some short functions to evaluate clusterings and to plot the results for easy comparison. The evaluation includes special handling of singleton clusters and noise points, for clustering algorithms that produce them.

In [5]:
def eval_clusters(cluster_labels, true_labels, raw_data, cluster_method="None", min_cluster_size=5):
    """Score one clustering against ground-truth labels.

    If at least one label is carried by exactly one point, every label with at
    most ``min_cluster_size`` members is relabelled as noise (``-1``). Points
    with a negative label are treated as noise and excluded from all three
    scores; the fraction of points that remain is reported separately.

    Parameters
    ----------
    cluster_labels : array-like of int
        Predicted label per sample; negative values mark noise points.
        The input is copied, never modified.
    true_labels : array-like
        Ground-truth label per sample.
    raw_data : array
        Samples the clustering was computed on; used for the silhouette score
        and indexed with a boolean mask when noise points are present.
    cluster_method : str
        Name stored under the ``"Method"`` key of the result.
    min_cluster_size : int
        Size threshold (inclusive) below which clusters are turned into noise
        once a singleton cluster has been detected.

    Returns
    -------
    dict
        Keys ``"Method"``, ``"ARI"``, ``"AMI"``, ``"Silhouette"`` and
        ``"Pct Clustered"`` (a fraction between 0 and 1).
    """
    # Copy so that relabelling small clusters never alters the caller's array.
    cluster_labels = np.array(cluster_labels, copy=True)
    true_labels = np.asarray(true_labels)

    # Exact per-label counts. np.histogram with the labels as bin edges yields
    # one bin fewer than there are labels and merges the two largest labels.
    unique_labels, cluster_sizes = np.unique(cluster_labels, return_counts=True)
    if np.any(cluster_sizes == 1):  # Has singleton clusters -- call them noise
        small_clusters = unique_labels[cluster_sizes <= min_cluster_size]
        cluster_labels[np.isin(cluster_labels, small_clusters)] = -1

    if np.any(cluster_labels < 0):  # Has noise points: score the rest only
        clustered_points = cluster_labels >= 0
        ari = adjusted_rand_score(true_labels[clustered_points], cluster_labels[clustered_points])
        ami = adjusted_mutual_info_score(true_labels[clustered_points], cluster_labels[clustered_points])
        sil = silhouette_score(raw_data[clustered_points], cluster_labels[clustered_points])
        pct_clustered = np.sum(clustered_points) / cluster_labels.shape[0]
    else:
        ari = adjusted_rand_score(true_labels, cluster_labels)
        ami = adjusted_mutual_info_score(true_labels, cluster_labels)
        sil = silhouette_score(raw_data, cluster_labels)
        pct_clustered = 1.0

    return {"Method": cluster_method, "ARI": ari, "AMI": ami, "Silhouette": sil, "Pct Clustered": pct_clustered}

In [6]:
def plot_scores(results_dataframe, score_types=("ARI", "AMI"), colors=list(sns.color_palette()), width=0.75):
    """Draw grouped bar charts comparing clustering scores.

    One subplot is drawn per entry of ``score_types``. Within a subplot there
    is one group of bars per clustering method and, inside each group, one bar
    per dimension-reduction type. The opacity of a bar is the fraction of
    points that were clustered.

    Parameters
    ----------
    results_dataframe : pandas.DataFrame
        Long-format results with the columns ``"Method"``, ``"Dim Reduction"``,
        ``"Score Type"``, ``"Score"`` and ``"Pct Clustered"``.
    score_types : sequence of str
        Values of ``"Score Type"`` to plot, one subplot each.
    colors : sequence of RGB tuples
        One colour per dimension-reduction type; reused cyclically when there
        are more types than colours.
    width : float
        Total width of one group of bars, in x-axis units.
    """
    # squeeze=False keeps the axes in a 2-D array, so a single score type
    # (which would otherwise return a bare Axes) can be indexed as well.
    fig, axs = plt.subplots(
        1, len(score_types), figsize=(8 * len(score_types), 8), squeeze=False
    )
    axs = axs[0]

    x_ticklabels = results_dataframe["Method"].unique()
    group_left = np.arange(len(x_ticklabels), dtype=np.float32) - width / 2
    dim_red_types = results_dataframe["Dim Reduction"].unique()
    bar_width = width / len(dim_red_types)

    for offset_idx, dim_red in enumerate(dim_red_types):
        color = colors[offset_idx % len(colors)]
        uses_dim_red = results_dataframe["Dim Reduction"] == dim_red
        for i, score_type in enumerate(score_types):
            sub_dataframe = results_dataframe[
                (results_dataframe["Score Type"] == score_type) & uses_dim_red
            ]
            axs[i].bar(
                x=group_left + offset_idx * bar_width,
                height=sub_dataframe["Score"],
                width=bar_width,
                align="edge",
                # Alpha channel encodes the fraction of points clustered.
                color=[(*color, v) for v in sub_dataframe["Pct Clustered"]],
                # Label only the first subplot so the legend has no duplicates.
                label=dim_red if i == 0 else None,
            )

    # Axis decoration does not depend on the dim-reduction type: do it once.
    for ax, score_type in zip(axs, score_types):
        ax.set_xlabel("Cluster Method")
        ax.set_xticks(np.arange(len(x_ticklabels)))
        ax.set_xticklabels(x_ticklabels)
        ax.set_ylabel(f"{score_type} Score")
        ax.set_title(score_type, fontsize=20)
        ax.grid(visible=False, axis="x")
        ax.set_ylim([0, 1.05])

    if len(dim_red_types) > 1:
        fig.legend(loc="center right", bbox_to_anchor=(1.125, 0.5), borderaxespad=0.0, fontsize=20)

    fig.tight_layout()

In [7]:
def run_clustering_algorithms(raw_data, targets, list_clustering_algo, params, name=None):
    """Run the requested clustering algorithms on one data set and score them.

    Parameters
    ----------
    raw_data : array
        Samples to cluster.
    targets : array-like
        Ground-truth label per sample; the number of distinct values is used
        as the cluster count for K-Means and complete linkage.
    list_clustering_algo : container of str
        Algorithms to run. Recognised names are ``"K-Means"``,
        ``"Complete\\nLinkage"``, ``"Single\\nLinkage"``, ``"DBSCAN"`` and
        ``"HDBSCAN"``; any other entry is ignored.
    params : mapping
        Hyper-parameters, read only for the algorithms that are requested:
        ``'sl_k'``, ``'dbscan_eps'``, ``'hdbscan_min_samples'`` and
        ``'hdbscan_min_cluster_size'``.
    name : object, optional
        Written, after ``str()``, into the ``"Dim Reduction"`` column
        (so the default shows up as the string ``"None"``).

    Returns
    -------
    pandas.DataFrame
        Long-format scores with the columns ``"Method"``, ``"Pct Clustered"``,
        ``"Score Type"``, ``"Score"`` and ``"Dim Reduction"``.
    """
    # Count distinct classes rather than using max(targets) + 1, which is only
    # right for contiguous integer labels that start at 0.
    n_classes = len(np.unique(targets))

    # Factories keep construction lazy, so `params` keys are only looked up
    # for algorithms that were requested. Dict order fixes the result order.
    estimator_factories = {
        "K-Means": lambda: cluster.KMeans(n_clusters=n_classes),
        "Complete\nLinkage": lambda: cluster.AgglomerativeClustering(
            n_clusters=n_classes, linkage="complete"
        ),
        "Single\nLinkage": lambda: cluster.AgglomerativeClustering(
            n_clusters=params['sl_k'], linkage="single"
        ),
        "DBSCAN": lambda: cluster.DBSCAN(eps=params['dbscan_eps']),
        "HDBSCAN": lambda: hdbscan.HDBSCAN(
            min_samples=params['hdbscan_min_samples'],
            min_cluster_size=params['hdbscan_min_cluster_size'],
        ),
    }
    results = {
        algo_name: make_estimator().fit_predict(raw_data)
        for algo_name, make_estimator in estimator_factories.items()
        if algo_name in list_clustering_algo
    }

    raw_results = pd.DataFrame(
        [
            eval_clusters(algo_labels, targets, raw_data, cluster_method=algo_name)
            for algo_name, algo_labels in results.items()
        ]
    )
    raw_results_long = raw_results.melt(["Method", "Pct Clustered"], var_name="Score Type", value_name="Score")
    raw_results_long["Dim Reduction"] = str(name)
    return raw_results_long

In [8]:
def run_analysis(i, list_clustering_algo = ["K-Means", "Single\nLinkage", "HDBSCAN"]):
    """Cluster data set ``i`` on raw, PCA-reduced and UMAP-reduced features.

    The data set is fetched with ``get_dataset`` and its name is displayed as
    a markdown heading. The selected algorithms are then run three times, each
    time with the hyper-parameters stored in ``dataset_clustering_params``
    under ``'None'``, ``'PCA'`` and ``'UMAP'`` for index ``i``.

    Parameters
    ----------
    i : int
        Data set index, passed to ``get_dataset`` and ``get_umap_vectors``.
    list_clustering_algo : list of str
        Algorithm names forwarded to ``run_clustering_algorithms``.

    Returns
    -------
    pandas.DataFrame
        The three long-format result frames concatenated in the order
        raw, PCA, UMAP.
    """
    raw_data, targets, dataset_name = get_dataset(dataset_id=i)
    display(Markdown(f'## {dataset_name}'))

    frames = []

    # 1) No dimension reduction (no name is passed for this run).
    raw_params = dataset_clustering_params['None'][i]
    frames.append(
        run_clustering_algorithms(raw_data, targets, list_clustering_algo, raw_params)
    )

    # 2) PCA projection with the per-data-set number of components.
    pca_params = dataset_clustering_params['PCA'][i]
    projector = PCA(n_components=pca_params['pca_n_components'])
    pca_data = projector.fit_transform(raw_data)
    frames.append(
        run_clustering_algorithms(pca_data, targets, list_clustering_algo, pca_params, name='PCA')
    )

    # 3) UMAP embedding supplied by the helper (presumably precomputed -- confirm).
    umap_params = dataset_clustering_params['UMAP'][i]
    umap_data = get_umap_vectors(dataset_id=i)
    frames.append(
        run_clustering_algorithms(umap_data, targets, list_clustering_algo, umap_params, name='UMAP')
    )

    return pd.concat(frames)

In [None]:
# Run the full analysis for the first five entries of `data_set_list` and keep
# each long-format result frame under its data set name.
results = {}
for i in range(5):
    analysis_frame = run_analysis(i)
    results[data_set_list[i]] = analysis_frame

## pendigits

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


## coil

  warn(


## mnist

  warn(
  warn(


## usps

  warn(


## buildings



In [None]:
# One markdown heading plus one score figure per analysed data set.
for i in range(5):
    heading = Markdown(f'## {data_set_list[i]}')
    display(heading)
    scores_for_dataset = results[data_set_list[i]]
    plot_scores(scores_for_dataset)