## **Clustering**

In [None]:
!pip install umap-learn

!pip install tslearn
!pip install hdbscan

Collecting umap-learn
  Downloading umap-learn-0.5.2.tar.gz (86 kB)
[K     |████████████████████████████████| 86 kB 3.4 MB/s 
Collecting pynndescent>=0.5
  Downloading pynndescent-0.5.6.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 33.2 MB/s 
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.2-py3-none-any.whl size=82708 sha256=6604b5d5f6d66b7b0ec5bbb735790071a68059e4a9d2c3b9d89279c35368cb9c
  Stored in directory: /root/.cache/pip/wheels/84/1b/c6/aaf68a748122632967cef4dffef68224eb16798b6793257d82
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for pynndescent: filename=pynndescent-0.5.6-py3-none-any.whl size=53943 sha256=5601a4d49c81ad6e216d278bd00771c01be7d08fb44b1216d67260749313f847
  Stored in directory: /root/.cache/pip/wheels/03/f1/56/f80d72741e400345b5a5b50ec3d929aca581bf45e0225d5c50
Successfull

### 1- Test data clustering (from scratch)









In [None]:
import time
import copy
import pandas as pd
import numpy as np
import sklearn.metrics
import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan
import umap.umap_ as umap

# Shared styling forwarded to plt.scatter: translucent, large, borderless markers.
plot_kwds = dict(alpha=0.15, s=80, linewidths=0)

def plot_clusters(data, algorithm, args, kwds):
    """Cluster ``data``, print the size of every cluster and draw a scatter plot.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_dims)
        Points to cluster. Only the first two columns are drawn.
    algorithm : class
        Clusterer class. It is instantiated as ``algorithm(*args, **kwds)`` and
        must provide ``fit_predict``.
    args : tuple
        Positional arguments for the clusterer constructor.
    kwds : dict
        Keyword arguments for the clusterer constructor.

    Returns
    -------
    labels : numpy.ndarray
        The label assigned to every sample by ``fit_predict``. Negative labels
        are drawn in black (they are treated as noise).
    """
    start_time = time.time()
    labels = np.asarray(algorithm(*args, **kwds).fit_predict(data))
    # Stop the clock here so the reported time covers clustering only,
    # not the console output below.
    end_time = time.time()

    print("Labels:", labels)
    max_label = int(labels.max())
    print("Max label:", max_label)

    # Count every label once, then report each class from -1 up to the largest
    # label (classes without members are reported with a count of 0).
    unique_labels, label_counts = np.unique(labels, return_counts=True)
    sizes = dict(zip(unique_labels.tolist(), label_counts.tolist()))
    for i in range(-1, max_label + 1):
        print(f"Number of inputs in class {i} is {sizes.get(i, 0)}")

    palette = sns.color_palette('deep', max_label + 1)
    colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]

    plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds)
    ax = plt.gca()
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    plt.title(f'Clusters found by {algorithm.__name__}', fontsize=24)
    plt.text(-0.5, 0.7, f'Clustering took {end_time - start_time:.2f} s', fontsize=14)

    return labels

def scale_one(X):
    """Min-max scale ``X`` to the range [0, 1].

    A constant input has no spread, so it is mapped to all zeros instead of
    dividing by zero.
    """
    lowest = X.min()
    spread = X.max() - lowest
    if spread == 0:
        return np.zeros_like(X, dtype=float)
    return (X - lowest) / spread

# Example usage. `model` is the DNN under test and `x_test` is the CIFAR-10 test set
# (10000, 32, 32, 3). Apply the normalization the model expects (usually scaling to
# 0-1, as in our paper and models) before feeding the data to the model.

Y_pred = model(x_test)
Y_pred = np.argmax(Y_pred, axis=1)
print(Y_pred)
YP_Scaled = scale_one(Y_pred)
YT_Scaled = scale_one(y_test)

# Features and scaled labels. `features` are VGG16 outputs (we use VGG16 as the
# feature extractor): load VGG16 or its saved features from our files first.
X_features = features
TY_scaled = YT_Scaled
PY_scaled = YP_Scaled

# Add the true and predicted labels to the VGG16 features
X_features = np.c_[X_features, TY_scaled, PY_scaled]

bb, trace, hdbscan_in_umap, clustering_results = [], [], [], []
Sumn = 0
# The following values are hyperparameters that you can adjust to find the best clustering results.
# UMAP and HDBSCAN incorporate randomness, so save the final settings and outputs for
# reproducibility (umap.UMAP also accepts a `random_state` argument). We have saved our
# clustering results in this repository.

for i, j in zip([500, 400, 300, 250], [450, 350, 250, 200]):
    for k, o in zip([5, 10, 15, 20, 25], [3, 5, 10, 15, 20]):
        for n_n in [0.03, 0.1, 0.25, 0.5]:
            # Two-stage UMAP reduction: n_components i, then j.
            fit = umap.UMAP(min_dist=n_n, n_components=i, n_neighbors=k)
            u1 = fit.fit_transform(X_features)
            fit = umap.UMAP(min_dist=0.1, n_components=j, n_neighbors=o)
            u = fit.fit_transform(u1)
            u = np.c_[u, TY_scaled, PY_scaled]
            print("UMAP output shape:", u.shape)

            labels = plot_clusters(u, hdbscan.HDBSCAN, (), {'min_cluster_size': 5})
            # Render this configuration's figure now; otherwise every scatter
            # plot of the grid search is drawn on top of the previous one.
            plt.show()

            # silhouette_score is undefined for fewer than two distinct labels.
            if np.unique(labels).size < 2:
                print("Skipping configuration: fewer than two distinct labels")
                continue

            silhouette_umap = sklearn.metrics.silhouette_score(u, labels)
            silhouette_features = sklearn.metrics.silhouette_score(X_features, labels)

            print("Silhouette Score UMAP:", silhouette_umap)
            print("Silhouette Score Features:", silhouette_features)

            noise_count = list(labels).count(-1)

            # labels.max() + 2 counts the clusters plus the noise class (-1).
            if (silhouette_umap >= 0.1 or silhouette_features >= 0.1) and labels.max() + 2 >= 200:
                bb.append(labels)
                config = [i, j, k, o]
                trace.append([i, j, k, o, silhouette_umap, labels.max() + 2, noise_count])
                hdbscan_in_umap.append(u)
                Sumn += 1

                clustering_results.append({
                    "Number of Clusters": labels.max() + 1,
                    "Silhouette Score": silhouette_umap,
                    "Number of Noisy Inputs": noise_count,
                    "Config": config,
                    # First-stage min_dist, needed to tell apart runs sharing a Config.
                    "UMAP min_dist": n_n
                })

                print(f"Iteration {Sumn}: Noisy labels count: {noise_count}")

# Save the results example:
# np.save("/content/drive/MyDrive/RQ_Con_factor/clustering/Cifar10_12Conv/Test_cluster_4068.npy", bb)
# np.save("/content/drive/MyDrive/RQ_Con_factor/clustering/Cifar10_12Conv/all_trace_4068.npy", trace)
# np.save("/content/drive/MyDrive/RQ_Con_factor/clustering/Cifar10_12Conv/umap_output_config7_4068.npy", np.array(hdbscan_in_umap[7]))

# Display the clustering results in a table and select the configuration with the best Silhouette score
clustering_df = pd.DataFrame(clustering_results)
print(clustering_df)


### 2- Test data clustering (loading the best clustering results)

In [None]:
# Cl_label=np.load("/content/drive/MyDrive/RQ_Con_factor/clustering/Cifar10_12Conv/Test_cluster_4068.npy")
# trace=np.load("/content/drive/MyDrive/RQ_Con_factor/clustering/Cifar10_12Conv/all_trace_4068.npy", allow_pickle=True)
# umap_output=np.load("/content/drive/MyDrive/RQ_Con_factor/clustering/Cifar10_12Conv/umap_output_4068.npy")
