In [52]:
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import scale
import numpy as np
from sklearn import metrics
import umap
from MulticoreTSNE import MulticoreTSNE as TSNE

In [93]:
digits = datasets.load_digits()
data = scale(digits.data)
n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))
labels = digits.target
sample_size = 300

In [54]:
data_pca_5 = PCA(n_components=5).fit_transform(data)
data_pca_5

array([[ 1.91421394, -0.95450501, -3.94603592,  2.02872682, -0.26695791],
       [ 0.58898069,  0.92463325,  3.92475311, -1.77983979, -0.99306889],
       [ 1.30203763, -0.31720032,  3.02331869, -2.04343436, -2.08019365],
       ...,
       [ 1.02259281, -0.14791934,  2.46994978, -0.62049629, -0.97224351],
       [ 1.07605044, -0.38094809, -2.45552796, -1.31229163,  0.25478432],
       [-1.25770559, -2.22760297,  0.28361983, -0.12720962, -1.57067828]])

In [55]:
data_pca_8 = PCA(n_components=8).fit_transform(data)
data_pca_8

array([[ 1.91416796, -0.95432538, -3.94639617, ...,  0.52822212,
        -1.42174081,  1.52051214],
       [ 0.58900946,  0.92494144,  3.92465259, ..., -0.67689081,
         1.88183994,  0.56412244],
       [ 1.30223124, -0.31726979,  3.02433939, ...,  0.93072469,
        -1.27812038,  1.11807833],
       ...,
       [ 1.02251109, -0.14804928,  2.47020496, ...,  0.00714471,
         0.35637193,  0.51616751],
       [ 1.07587417, -0.38060892, -2.45570334, ..., -0.62285083,
         0.99089316,  0.7888836 ],
       [-1.25771835, -2.22842911,  0.28496313, ...,  2.36773084,
        -0.40684442, -1.25249908]])

In [56]:
embedding_umap_8 = umap.UMAP(n_neighbors=8).fit_transform(data)
embedding_umap_5 = umap.UMAP(n_neighbors=5).fit_transform(data)

In [57]:
def get_errors(estimator, sample_size, data):
    siholitte = metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)
    adj_rand = metrics.adjusted_rand_score(labels, estimator.labels_)
    mutual = metrics.adjusted_mutual_info_score(labels,  estimator.labels_)
    rand = metrics.rand_score(labels, estimator.labels_)
    return print(f"Rand {rand}",
          f"Adjusted Rand Index {adj_rand}",
          f"Adjusted Mutual Information: {mutual}", 
          f"Silhouette {siholitte}", sep='\n')

## K-means

In [58]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

In [59]:
estimator = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
estimator.fit(data)

KMeans(n_clusters=10)

In [60]:
get_errors(estimator, sample_size, data)

Rand 0.9141076503402726
Adjusted Rand Index 0.5687406548046969
Adjusted Mutual Information: 0.6945510574894563
Silhouette 0.11491996215318409


In [61]:
pca_estimator = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
pca_estimator.fit(data_pca_8)

KMeans(n_clusters=10)

In [62]:
get_errors(pca_estimator, sample_size, data)

Rand 0.9015855428436159
Adjusted Rand Index 0.5083735750277368
Adjusted Mutual Information: 0.6337553778218533
Silhouette 0.13688152926617506


In [63]:
pca_estimator = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
pca_estimator.fit(data_pca_5)
get_errors(pca_estimator, sample_size, data)

Rand 0.8905252877537791
Adjusted Rand Index 0.4539320821438111
Adjusted Mutual Information: 0.5789780164016596
Silhouette 0.13388341792678082


In [81]:
umap_8_estimator =  KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
umap_8_estimator.fit(embedding_umap_8)
get_errors(umap_8_estimator, sample_size, data)

Rand 0.9636854544755984
Adjusted Rand Index 0.8109556477304581
Adjusted Mutual Information: 0.8831118160643505
Silhouette 0.12145315035746189


In [82]:
umap_5_estimator =  KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
umap_5_estimator.fit(embedding_umap_5)
get_errors(umap_5_estimator, sample_size, data)

Rand 0.9627280310044085
Adjusted Rand Index 0.8068792197364256
Adjusted Mutual Information: 0.8822945808301651
Silhouette 0.08281916914446907


## Affinity Propagation

In [65]:
from sklearn.cluster import AffinityPropagation

In [66]:
model_aff = AffinityPropagation()
model_aff.fit(data)

AffinityPropagation()

In [69]:
get_errors(model_aff, sample_size, data)

Rand 0.9090063493597966
Adjusted Rand Index 0.1543890171236833
Adjusted Mutual Information: 0.5728829386106491
Silhouette 0.05409447363969096


In [84]:
aff_pca_estimator = AffinityPropagation()
aff_pca_estimator.fit(data_pca_8)
get_errors(aff_pca_estimator, sample_size, data)

Rand 0.9113958800425852
Adjusted Rand Index 0.21764060274052408
Adjusted Mutual Information: 0.5874223418791499
Silhouette 0.05921617350232909


In [85]:
aff_pca_estimator = AffinityPropagation()
aff_pca_estimator.fit(data_pca_5)
get_errors(aff_pca_estimator, sample_size, data)

Rand 0.9086648993063172
Adjusted Rand Index 0.19535675094947286
Adjusted Mutual Information: 0.5367948299256343
Silhouette 0.015723390170051824


In [97]:
umap_8_estimator =  AffinityPropagation(max_iter=1000)
umap_8_estimator.fit(embedding_umap_8)
get_errors(umap_10_estimator, sample_size, data)

Rand 0.93250071574376
Adjusted Rand Index 0.4760848717336931
Adjusted Mutual Information: 0.7559585918378904
Silhouette 0.058323666535463774


In [98]:
umap_5_estimator =  AffinityPropagation(max_iter=1000)
umap_5_estimator.fit(embedding_umap_5)
get_errors(umap_5_estimator, sample_size, data)

Rand 0.9315693193183888
Adjusted Rand Index 0.4707660195004991
Adjusted Mutual Information: 0.7468834635679453
Silhouette 0.07714768561186161


## Agglomerative Hierarchical clustering

In [74]:
from sklearn.cluster import AgglomerativeClustering

In [75]:
model_agglom = AgglomerativeClustering(n_clusters=n_digits)
model_agglom.fit(data)

AgglomerativeClustering(n_clusters=10)

In [76]:
model_agglom.labels_

array([5, 1, 1, ..., 1, 1, 1])

In [77]:
get_errors(model_agglom, 300, data)

Rand 0.9302549535045417
Adjusted Rand Index 0.6643458356002894
Adjusted Mutual Information: 0.7934927361004462
Silhouette 0.12251888091350975


In [78]:
agg_pca_estimator = AgglomerativeClustering(n_clusters=n_digits)
agg_pca_estimator.fit(data_pca_8)

AgglomerativeClustering(n_clusters=10)

In [79]:
get_errors(agg_pca_estimator, sample_size, data)

Rand 0.8866131748905934
Adjusted Rand Index 0.4596582510127902
Adjusted Mutual Information: 0.6373011620932433
Silhouette 0.11229765924749661


In [80]:
agg_pca_estimator = AgglomerativeClustering(n_clusters=n_digits)
agg_pca_estimator.fit(data_pca_5)
get_errors(agg_pca_estimator, sample_size, data)

Rand 0.8730512249443207
Adjusted Rand Index 0.4141991315977404
Adjusted Mutual Information: 0.5988112321038814
Silhouette 0.11004095518626646


In [100]:
umap_8_estimator =  AgglomerativeClustering(n_clusters=n_digits)
umap_8_estimator.fit(embedding_umap_8)
get_errors(umap_10_estimator, sample_size, data)

Rand 0.93250071574376
Adjusted Rand Index 0.4760848717336931
Adjusted Mutual Information: 0.7559585918378904
Silhouette 0.07075844710071579


In [99]:
umap_5_estimator =  AgglomerativeClustering(n_clusters=n_digits)
umap_5_estimator.fit(embedding_umap_5)
get_errors(umap_5_estimator, sample_size, data)

Rand 0.9618381539140339
Adjusted Rand Index 0.7931938357029507
Adjusted Mutual Information: 0.8559436929343653
Silhouette 0.10914717093916264
