In [1]:
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import scale
import numpy as np
from sklearn import metrics
import umap
from MulticoreTSNE import MulticoreTSNE as TSNE

In [2]:
digits = datasets.load_digits()
data = scale(digits.data)
n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))
labels = digits.target
sample_size = 300

In [3]:
data_pca_5 = PCA(n_components=5).fit_transform(data)
data_pca_5

array([[ 1.91421136, -0.954494  , -3.94598893,  2.02883547, -0.2662893 ],
       [ 0.58898338,  0.92462501,  3.92469301, -1.78005426, -0.99468853],
       [ 1.302044  , -0.31720754,  3.02325187, -2.04355634, -2.08282233],
       ...,
       [ 1.02259317, -0.1479016 ,  2.47003706, -0.62006647, -0.9711708 ],
       [ 1.0760562 , -0.38090983, -2.45551041, -1.31202529,  0.25266869],
       [-1.25770889, -2.22756667,  0.28376911, -0.12658822, -1.56747671]])

In [4]:
data_pca_8 = PCA(n_components=8).fit_transform(data)
data_pca_8

array([[ 1.91430892e+00, -9.54085986e-01, -3.94591549e+00, ...,
         5.30587008e-01, -1.40257505e+00,  1.46663330e+00],
       [ 5.89077123e-01,  9.24837562e-01,  3.92505304e+00, ...,
        -6.76759584e-01,  1.89206983e+00,  5.40455126e-01],
       [ 1.30188281e+00, -3.17683462e-01,  3.02331018e+00, ...,
         9.42145566e-01, -1.31065103e+00,  1.17143974e+00],
       ...,
       [ 1.02241607e+00, -1.48989341e-01,  2.46965827e+00, ...,
        -1.52793819e-03,  3.61431072e-01,  5.05776679e-01],
       [ 1.07592152e+00, -3.82057851e-01, -2.45552721e+00, ...,
        -6.47122970e-01,  1.00690964e+00,  7.70529961e-01],
       [-1.25812441e+00, -2.22926151e+00,  2.83593603e-01, ...,
         2.34290694e+00, -4.26583262e-01, -1.17477216e+00]])

In [5]:
embedding_umap_8 = umap.UMAP(n_neighbors=8).fit_transform(data)
embedding_umap_5 = umap.UMAP(n_neighbors=5).fit_transform(data)

In [6]:
embedding_tsne_8 =TSNE(n_components = 8, n_jobs=4).fit_transform(data)
embedding_tsne_5 =TSNE(n_components = 5, n_jobs=4).fit_transform(data)

In [7]:
def get_errors(estimator, sample_size, data):
    siholitte = metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)
    adj_rand = metrics.adjusted_rand_score(labels, estimator.labels_)
    mutual = metrics.adjusted_mutual_info_score(labels,  estimator.labels_)
    rand = metrics.rand_score(labels, estimator.labels_)
    return print(f"Rand {rand}",
          f"Adjusted Rand Index {adj_rand}",
          f"Adjusted Mutual Information: {mutual}", 
          f"Silhouette {siholitte}", sep='\n')

## K-means

In [8]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

In [59]:
estimator = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
estimator.fit(data)

KMeans(n_clusters=10)

In [60]:
get_errors(estimator, sample_size, data)

Rand 0.9141076503402726
Adjusted Rand Index 0.5687406548046969
Adjusted Mutual Information: 0.6945510574894563
Silhouette 0.11491996215318409


In [61]:
pca_estimator = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
pca_estimator.fit(data_pca_8)

KMeans(n_clusters=10)

In [62]:
get_errors(pca_estimator, sample_size, data)

Rand 0.9015855428436159
Adjusted Rand Index 0.5083735750277368
Adjusted Mutual Information: 0.6337553778218533
Silhouette 0.13688152926617506


In [63]:
pca_estimator = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
pca_estimator.fit(data_pca_5)
get_errors(pca_estimator, sample_size, data)

Rand 0.8905252877537791
Adjusted Rand Index 0.4539320821438111
Adjusted Mutual Information: 0.5789780164016596
Silhouette 0.13388341792678082


In [81]:
umap_8_estimator =  KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
umap_8_estimator.fit(embedding_umap_8)
get_errors(umap_8_estimator, sample_size, data)

Rand 0.9636854544755984
Adjusted Rand Index 0.8109556477304581
Adjusted Mutual Information: 0.8831118160643505
Silhouette 0.12145315035746189


In [82]:
umap_5_estimator =  KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
umap_5_estimator.fit(embedding_umap_5)
get_errors(umap_5_estimator, sample_size, data)

Rand 0.9627280310044085
Adjusted Rand Index 0.8068792197364256
Adjusted Mutual Information: 0.8822945808301651
Silhouette 0.08281916914446907


In [102]:
tsne_8_estimator =  KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
tsne_8_estimator.fit(embedding_tsne_8)
get_errors(tsne_8_estimator, sample_size, data)

Rand 0.9671067716176305
Adjusted Rand Index 0.8206998704995867
Adjusted Mutual Information: 0.8691969594267749
Silhouette 0.08612554954914495


In [9]:
tsne_5_estimator =  KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
tsne_5_estimator.fit(embedding_tsne_5)
get_errors(tsne_5_estimator, sample_size, data)

Rand 0.9661989234718096
Adjusted Rand Index 0.8163450016376737
Adjusted Mutual Information: 0.8657428008677109
Silhouette 0.10278263023583528


## Affinity Propagation

In [65]:
from sklearn.cluster import AffinityPropagation

In [66]:
model_aff = AffinityPropagation()
model_aff.fit(data)

AffinityPropagation()

In [69]:
get_errors(model_aff, sample_size, data)

Rand 0.9090063493597966
Adjusted Rand Index 0.1543890171236833
Adjusted Mutual Information: 0.5728829386106491
Silhouette 0.05409447363969096


In [84]:
aff_pca_estimator = AffinityPropagation()
aff_pca_estimator.fit(data_pca_8)
get_errors(aff_pca_estimator, sample_size, data)

Rand 0.9113958800425852
Adjusted Rand Index 0.21764060274052408
Adjusted Mutual Information: 0.5874223418791499
Silhouette 0.05921617350232909


In [85]:
aff_pca_estimator = AffinityPropagation()
aff_pca_estimator.fit(data_pca_5)
get_errors(aff_pca_estimator, sample_size, data)

Rand 0.9086648993063172
Adjusted Rand Index 0.19535675094947286
Adjusted Mutual Information: 0.5367948299256343
Silhouette 0.015723390170051824


In [97]:
umap_8_estimator =  AffinityPropagation(max_iter=1000)
umap_8_estimator.fit(embedding_umap_8)
get_errors(umap_8_estimator, sample_size, data)

Rand 0.93250071574376
Adjusted Rand Index 0.4760848717336931
Adjusted Mutual Information: 0.7559585918378904
Silhouette 0.058323666535463774


In [98]:
umap_5_estimator =  AffinityPropagation(max_iter=1000)
umap_5_estimator.fit(embedding_umap_5)
get_errors(umap_5_estimator, sample_size, data)

Rand 0.9315693193183888
Adjusted Rand Index 0.4707660195004991
Adjusted Mutual Information: 0.7468834635679453
Silhouette 0.07714768561186161


In [105]:
tsne_8_estimator =  AffinityPropagation(max_iter=2000)
tsne_8_estimator.fit(embedding_tsne_8)
get_errors(tsne_8_estimator, sample_size, data)

Rand 0.9511354608584216
Adjusted Rand Index 0.6616059090691245
Adjusted Mutual Information: 0.80938813756693
Silhouette 0.11136858610319429


In [106]:
tsne_5_estimator =  AffinityPropagation(max_iter=2000)
tsne_5_estimator.fit(embedding_tsne_5)
get_errors(tsne_5_estimator, sample_size, data)

Rand 0.9471384502505413
Adjusted Rand Index 0.6261122629079497
Adjusted Mutual Information: 0.8018712630488759
Silhouette 0.08897748994219001


## Agglomerative Hierarchical clustering

In [11]:
from sklearn.cluster import AgglomerativeClustering

In [75]:
model_agglom = AgglomerativeClustering(n_clusters=n_digits)
model_agglom.fit(data)

AgglomerativeClustering(n_clusters=10)

In [76]:
model_agglom.labels_

array([5, 1, 1, ..., 1, 1, 1])

In [77]:
get_errors(model_agglom, 300, data)

Rand 0.9302549535045417
Adjusted Rand Index 0.6643458356002894
Adjusted Mutual Information: 0.7934927361004462
Silhouette 0.12251888091350975


In [78]:
agg_pca_estimator = AgglomerativeClustering(n_clusters=n_digits)
agg_pca_estimator.fit(data_pca_8)

AgglomerativeClustering(n_clusters=10)

In [79]:
get_errors(agg_pca_estimator, sample_size, data)

Rand 0.8866131748905934
Adjusted Rand Index 0.4596582510127902
Adjusted Mutual Information: 0.6373011620932433
Silhouette 0.11229765924749661


In [80]:
agg_pca_estimator = AgglomerativeClustering(n_clusters=n_digits)
agg_pca_estimator.fit(data_pca_5)
get_errors(agg_pca_estimator, sample_size, data)

Rand 0.8730512249443207
Adjusted Rand Index 0.4141991315977404
Adjusted Mutual Information: 0.5988112321038814
Silhouette 0.11004095518626646


In [12]:
umap_8_estimator =  AgglomerativeClustering(n_clusters=n_digits)
umap_8_estimator.fit(embedding_umap_8)
get_errors(umap_8_estimator, sample_size, data)

Rand 0.9634412959981558
Adjusted Rand Index 0.8090392315692063
Adjusted Mutual Information: 0.8801409167069021
Silhouette 0.09554924580430214


In [99]:
umap_5_estimator =  AgglomerativeClustering(n_clusters=n_digits)
umap_5_estimator.fit(embedding_umap_5)
get_errors(umap_5_estimator, sample_size, data)

Rand 0.9618381539140339
Adjusted Rand Index 0.7931938357029507
Adjusted Mutual Information: 0.8559436929343653
Silhouette 0.10914717093916264


In [107]:
tsne_8_estimator =  AgglomerativeClustering(n_clusters=n_digits)
tsne_8_estimator.fit(embedding_tsne_8)
get_errors(tsne_8_estimator, sample_size, data)

Rand 0.9718436939566439
Adjusted Rand Index 0.8442074674738934
Adjusted Mutual Information: 0.8842720857049966
Silhouette 0.10467246334819201


In [108]:
tsne_5_estimator =  AgglomerativeClustering(n_clusters=n_digits)
tsne_5_estimator.fit(embedding_tsne_5)
get_errors(tsne_5_estimator, sample_size, data)

Rand 0.9574172742742483
Adjusted Rand Index 0.7673993556863412
Adjusted Mutual Information: 0.8381398452654937
Silhouette 0.08145335725657371
