In [1]:
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import scale
import numpy as np
from sklearn import metrics
import umap
from MulticoreTSNE import MulticoreTSNE as TSNE
import itertools
import warnings
warnings.filterwarnings('ignore')

In [2]:
digits = datasets.load_digits()
data = scale(digits.data)
n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))
labels = digits.target
sample_size = 300

In [3]:
data_pca_5 = PCA(n_components=5).fit_transform(data)
data_pca_5

array([[ 1.91421359, -0.95449622, -3.94604391,  2.02883652, -0.26673329],
       [ 0.5889804 ,  0.92464032,  3.92474476, -1.77975917, -0.99319685],
       [ 1.30203956, -0.31718825,  3.02330473, -2.04306354, -2.07906134],
       ...,
       [ 1.02259645, -0.14791941,  2.46998499, -0.62051816, -0.97207496],
       [ 1.07605565, -0.38089398, -2.45551507, -1.311739  ,  0.25546111],
       [-1.25770213, -2.22761693,  0.28366556, -0.12755157, -1.57059051]])

In [4]:
data_pca_8 = PCA(n_components=8).fit_transform(data)
data_pca_8

array([[ 1.91420718, -0.95447953, -3.94697306, ...,  0.52842636,
        -1.40541202,  1.49544973],
       [ 0.58889222,  0.92516625,  3.92329924, ..., -0.66469951,
         1.88305585,  0.55170731],
       [ 1.30198568, -0.31833771,  3.02113306, ...,  0.92925939,
        -1.31435026,  1.17849775],
       ...,
       [ 1.02288926, -0.14944315,  2.47131186, ..., -0.01079962,
         0.36145641,  0.54872627],
       [ 1.07626231, -0.38174773, -2.45346355, ..., -0.63699604,
         1.02426494,  0.77144759],
       [-1.25754134, -2.22956672,  0.28789893, ...,  2.33169467,
        -0.42765266, -1.15846763]])

In [5]:
embedding_umap_8 = umap.UMAP(n_neighbors=8).fit_transform(data)
embedding_umap_5 = umap.UMAP(n_neighbors=5).fit_transform(data)

In [6]:
embedding_tsne_8 =TSNE(n_components = 8, n_jobs=4).fit_transform(data)
embedding_tsne_5 =TSNE(n_components = 5, n_jobs=4).fit_transform(data)

In [7]:
def get_errors(estimator, sample_size, data, print_s=True):
    try:
        siholitte = metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)
    except Exception:
        siholitte = 0
    adj_rand = metrics.adjusted_rand_score(labels, estimator.labels_)
    mutual = metrics.adjusted_mutual_info_score(labels,  estimator.labels_)
    rand = metrics.rand_score(labels, estimator.labels_)
    if print_s:
        return print(f"Rand {rand}",
              f"Adjusted Rand Index {adj_rand}",
              f"Adjusted Mutual Information: {mutual}", 
              f"Silhouette {siholitte}", sep='\n')
    else:
        return rand, mutual, adj_rand, siholitte

## K-means

In [8]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

In [59]:
estimator = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
estimator.fit(data)

KMeans(n_clusters=10)

In [60]:
get_errors(estimator, sample_size, data)

Rand 0.9141076503402726
Adjusted Rand Index 0.5687406548046969
Adjusted Mutual Information: 0.6945510574894563
Silhouette 0.11491996215318409


In [61]:
pca_estimator = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
pca_estimator.fit(data_pca_8)

KMeans(n_clusters=10)

In [62]:
get_errors(pca_estimator, sample_size, data)

Rand 0.9015855428436159
Adjusted Rand Index 0.5083735750277368
Adjusted Mutual Information: 0.6337553778218533
Silhouette 0.13688152926617506


### K-means PCA

In [63]:
pca_estimator = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
pca_estimator.fit(data_pca_5)
get_errors(pca_estimator, sample_size, data)

Rand 0.8905252877537791
Adjusted Rand Index 0.4539320821438111
Adjusted Mutual Information: 0.5789780164016596
Silhouette 0.13388341792678082


In [61]:
scores = []
for components in range(2, min(data.shape)):
    data_pca = PCA(n_components=components, random_state=42).fit_transform(data)
    pca_estimator = KMeans(n_clusters=n_digits, random_state=42).fit(data_pca)
    rand, mutual, adj_rand, siholitte  = get_errors(pca_estimator, sample_size, data_pca, print_s=False)
    scores.append({"comp": components, "rand": rand, "mutual": mutual, "adj_rand": adj_rand,"siholitte": siholitte})

In [63]:
max(scores, key=lambda x:x['mutual'])

{'comp': 37,
 'rand': 0.9150229347848988,
 'mutual': 0.6986544384959287,
 'adj_rand': 0.5732841244333222,
 'siholitte': 0.17430850922653643}

In [64]:
max(scores, key=lambda x:x['adj_rand'])

{'comp': 57,
 'rand': 0.9151803364429456,
 'mutual': 0.6984185299612811,
 'adj_rand': 0.573917350014406,
 'siholitte': 0.12463345396040446}

### K-means UMAP

In [81]:
umap_8_estimator =  KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
umap_8_estimator.fit(embedding_umap_8)
get_errors(umap_8_estimator, sample_size, data)

Rand 0.9636854544755984
Adjusted Rand Index 0.8109556477304581
Adjusted Mutual Information: 0.8831118160643505
Silhouette 0.12145315035746189


In [82]:
umap_5_estimator =  KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
umap_5_estimator.fit(embedding_umap_5)
get_errors(umap_5_estimator, sample_size, data)

Rand 0.9627280310044085
Adjusted Rand Index 0.8068792197364256
Adjusted Mutual Information: 0.8822945808301651
Silhouette 0.08281916914446907


In [56]:
n_neighbors = (5, 10, 15, 20)
min_dist = (0.1, 0.2, 0.3, 0.4, 0.5)
scores = []
params = list(itertools.product(n_neighbors, min_dist))
for param_set in params:
    embedding = umap.UMAP(random_state=42, n_neighbors=param_set[0], min_dist=param_set[1]).fit_transform(data)
    umap_estimator = KMeans(n_clusters=n_digits, random_state=42).fit(embedding)
    rand, mutual, adj_rand, siholitte  = get_errors(umap_estimator, sample_size, embedding, print_s=False)
    scores.append({"n_neighbors":param_set[0],
                   "min_dist": param_set[1],
                   "rand": rand,
                   "mutual": mutual,
                   "adj_rand": adj_rand,
                   "siholitte": siholitte})


In [59]:
max(scores, key=lambda x:x['mutual'])

{'n_neighbors': 20,
 'min_dist': 0.1,
 'rand': 0.9766072630330432,
 'mutual': 0.8949484884693509,
 'adj_rand': 0.8700936270137439,
 'siholitte': 0.69965786}

In [60]:
max(scores, key=lambda x:x['adj_rand'])

{'n_neighbors': 10,
 'min_dist': 0.4,
 'rand': 0.9771916321808309,
 'mutual': 0.8928537943721203,
 'adj_rand': 0.8730641349002842,
 'siholitte': 0.61702585}

### K-means TSNE

In [102]:
tsne_8_estimator =  KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
tsne_8_estimator.fit(embedding_tsne_8)
get_errors(tsne_8_estimator, sample_size, data)

Rand 0.9671067716176305
Adjusted Rand Index 0.8206998704995867
Adjusted Mutual Information: 0.8691969594267749
Silhouette 0.08612554954914495


In [9]:
tsne_5_estimator =  KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
tsne_5_estimator.fit(embedding_tsne_5)
get_errors(tsne_5_estimator, sample_size, data)

Rand 0.9661989234718096
Adjusted Rand Index 0.8163450016376737
Adjusted Mutual Information: 0.8657428008677109
Silhouette 0.10278263023583528


In [65]:
tsne_2_estimator =  KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
embedding_tsne_2 = TSNE(n_jobs=4).fit_transform(data)
tsne_2_estimator.fit(embedding_tsne_2)
get_errors(tsne_2_estimator, sample_size, embedding_tsne_2)

Rand 0.9750660901056326
Adjusted Rand Index 0.8611581075212501
Adjusted Mutual Information: 0.880679250179012
Silhouette 0.5759131326894458


## Affinity Propagation

In [8]:
from sklearn.cluster import AffinityPropagation

In [9]:
model_aff = AffinityPropagation()
model_aff.fit(data)

AffinityPropagation()

In [69]:
get_errors(model_aff, sample_size, data)

Rand 0.9090063493597966
Adjusted Rand Index 0.1543890171236833
Adjusted Mutual Information: 0.5728829386106491
Silhouette 0.05409447363969096


In [84]:
aff_pca_estimator = AffinityPropagation()
aff_pca_estimator.fit(data_pca_8)
get_errors(aff_pca_estimator, sample_size, data)

Rand 0.9113958800425852
Adjusted Rand Index 0.21764060274052408
Adjusted Mutual Information: 0.5874223418791499
Silhouette 0.05921617350232909


In [85]:
aff_pca_estimator = AffinityPropagation()
aff_pca_estimator.fit(data_pca_5)
get_errors(aff_pca_estimator, sample_size, data)

Rand 0.9086648993063172
Adjusted Rand Index 0.19535675094947286
Adjusted Mutual Information: 0.5367948299256343
Silhouette 0.015723390170051824


### UMAP

In [97]:
umap_8_estimator =  AffinityPropagation(max_iter=1000)
umap_8_estimator.fit(embedding_umap_8)
get_errors(umap_8_estimator, sample_size, data)

Rand 0.93250071574376
Adjusted Rand Index 0.4760848717336931
Adjusted Mutual Information: 0.7559585918378904
Silhouette 0.058323666535463774


In [98]:
umap_5_estimator =  AffinityPropagation(max_iter=1000)
umap_5_estimator.fit(embedding_umap_5)
get_errors(umap_5_estimator, sample_size, data)

Rand 0.9315693193183888
Adjusted Rand Index 0.4707660195004991
Adjusted Mutual Information: 0.7468834635679453
Silhouette 0.07714768561186161


In [10]:
n_neighbors = (5, 10, 15, 20)
min_dist = (0.1, 0.2, 0.3, 0.4, 0.5)
scores = []
params = list(itertools.product(n_neighbors, min_dist))
for param_set in params:
    embedding = umap.UMAP(random_state=42, n_neighbors=param_set[0], min_dist=param_set[1]).fit_transform(data)
    umap_estimator = AffinityPropagation().fit(embedding)
    rand, mutual, adj_rand, siholitte  = get_errors(umap_estimator, sample_size, embedding, print_s=False)
    scores.append({"n_neighbors":param_set[0],
                   "min_dist": param_set[1],
                   "rand": rand,
                   "mutual": mutual,
                   "adj_rand": adj_rand,
                   "siholitte": siholitte})


In [11]:
max(scores, key=lambda x:x['mutual'])

{'n_neighbors': 5,
 'min_dist': 0.1,
 'rand': 0.0995199869121141,
 'mutual': 2.3140630140104056e-16,
 'adj_rand': 0.0,
 'siholitte': 0}

In [12]:
max(scores, key=lambda x:x['adj_rand'])

{'n_neighbors': 5,
 'min_dist': 0.1,
 'rand': 0.0995199869121141,
 'mutual': 2.3140630140104056e-16,
 'adj_rand': 0.0,
 'siholitte': 0}

### TSNE

In [105]:
tsne_8_estimator =  AffinityPropagation(max_iter=2000)
tsne_8_estimator.fit(embedding_tsne_8)
get_errors(tsne_8_estimator, sample_size, data)

Rand 0.9511354608584216
Adjusted Rand Index 0.6616059090691245
Adjusted Mutual Information: 0.80938813756693
Silhouette 0.11136858610319429


In [106]:
tsne_5_estimator =  AffinityPropagation(max_iter=2000)
tsne_5_estimator.fit(embedding_tsne_5)
get_errors(tsne_5_estimator, sample_size, data)

Rand 0.9471384502505413
Adjusted Rand Index 0.6261122629079497
Adjusted Mutual Information: 0.8018712630488759
Silhouette 0.08897748994219001


## Agglomerative Hierarchical clustering

In [14]:
from sklearn.cluster import AgglomerativeClustering

In [75]:
model_agglom = AgglomerativeClustering(n_clusters=n_digits)
model_agglom.fit(data)

AgglomerativeClustering(n_clusters=10)

In [76]:
model_agglom.labels_

array([5, 1, 1, ..., 1, 1, 1])

In [77]:
get_errors(model_agglom, 300, data)

Rand 0.9302549535045417
Adjusted Rand Index 0.6643458356002894
Adjusted Mutual Information: 0.7934927361004462
Silhouette 0.12251888091350975


In [78]:
agg_pca_estimator = AgglomerativeClustering(n_clusters=n_digits)
agg_pca_estimator.fit(data_pca_8)

AgglomerativeClustering(n_clusters=10)

In [79]:
get_errors(agg_pca_estimator, sample_size, data)

Rand 0.8866131748905934
Adjusted Rand Index 0.4596582510127902
Adjusted Mutual Information: 0.6373011620932433
Silhouette 0.11229765924749661


In [80]:
agg_pca_estimator = AgglomerativeClustering(n_clusters=n_digits)
agg_pca_estimator.fit(data_pca_5)
get_errors(agg_pca_estimator, sample_size, data)

Rand 0.8730512249443207
Adjusted Rand Index 0.4141991315977404
Adjusted Mutual Information: 0.5988112321038814
Silhouette 0.11004095518626646


### UMAP

In [12]:
umap_8_estimator =  AgglomerativeClustering(n_clusters=n_digits)
umap_8_estimator.fit(embedding_umap_8)
get_errors(umap_8_estimator, sample_size, data)

Rand 0.9634412959981558
Adjusted Rand Index 0.8090392315692063
Adjusted Mutual Information: 0.8801409167069021
Silhouette 0.09554924580430214


In [99]:
umap_5_estimator =  AgglomerativeClustering(n_clusters=n_digits)
umap_5_estimator.fit(embedding_umap_5)
get_errors(umap_5_estimator, sample_size, data)

Rand 0.9618381539140339
Adjusted Rand Index 0.7931938357029507
Adjusted Mutual Information: 0.8559436929343653
Silhouette 0.10914717093916264


In [15]:
n_neighbors = (5, 10, 15, 20)
min_dist = (0.1, 0.2, 0.3, 0.4, 0.5)
scores = []
params = list(itertools.product(n_neighbors, min_dist))
for param_set in params:
    embedding = umap.UMAP(random_state=42, n_neighbors=param_set[0], min_dist=param_set[1]).fit_transform(data)
    umap_estimator = AgglomerativeClustering(n_clusters=n_digits).fit(embedding)
    rand, mutual, adj_rand, siholitte  = get_errors(umap_estimator, sample_size, embedding, print_s=False)
    scores.append({"n_neighbors":param_set[0],
                   "min_dist": param_set[1],
                   "rand": rand,
                   "mutual": mutual,
                   "adj_rand": adj_rand,
                   "siholitte": siholitte})

In [16]:
max(scores, key=lambda x:x['mutual'])

{'n_neighbors': 15,
 'min_dist': 0.1,
 'rand': 0.9758623937693731,
 'mutual': 0.8924318031767376,
 'adj_rand': 0.866066434910594,
 'siholitte': 0.66685003}

In [17]:
max(scores, key=lambda x:x['adj_rand'])

{'n_neighbors': 15,
 'min_dist': 0.1,
 'rand': 0.9758623937693731,
 'mutual': 0.8924318031767376,
 'adj_rand': 0.866066434910594,
 'siholitte': 0.66685003}

### TSNE

In [107]:
tsne_8_estimator =  AgglomerativeClustering(n_clusters=n_digits)
tsne_8_estimator.fit(embedding_tsne_8)
get_errors(tsne_8_estimator, sample_size, data)

Rand 0.9718436939566439
Adjusted Rand Index 0.8442074674738934
Adjusted Mutual Information: 0.8842720857049966
Silhouette 0.10467246334819201


In [108]:
tsne_5_estimator =  AgglomerativeClustering(n_clusters=n_digits)
tsne_5_estimator.fit(embedding_tsne_5)
get_errors(tsne_5_estimator, sample_size, data)

Rand 0.9574172742742483
Adjusted Rand Index 0.7673993556863412
Adjusted Mutual Information: 0.8381398452654937
Silhouette 0.08145335725657371
