In [35]:
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import scale
import numpy as np
from sklearn import metrics

In [36]:
# Load scikit-learn's bundled handwritten-digits dataset and standardise
# every feature column before clustering.
digits = datasets.load_digits()
labels = digits.target
data = scale(digits.data)

# Dataset dimensions and the number of distinct ground-truth classes.
n_samples, n_features = data.shape
n_digits = len(np.unique(labels))

# Number of points subsampled when estimating the silhouette score.
sample_size = 300

In [37]:
# Project the standardised digits onto their first 5 principal components.
# The seed is pinned because, depending on the scikit-learn version, the
# default svd_solver='auto' may select the randomized solver for data of this
# size, which makes the projection vary slightly from run to run.
data_pca_5 = PCA(n_components=5, random_state=0).fit_transform(data)
data_pca_5

array([[ 1.91421555, -0.95450159, -3.94603799,  2.02837568, -0.26559953],
       [ 0.58898221,  0.92464386,  3.9247618 , -1.7798229 , -0.99527166],
       [ 1.30203719, -0.31718658,  3.02333216, -2.04293568, -2.08369243],
       ...,
       [ 1.02259851, -0.14791292,  2.46995299, -0.62074828, -0.9694444 ],
       [ 1.07606   , -0.38089879, -2.45550638, -1.31230627,  0.25336338],
       [-1.25771078, -2.22762088,  0.28360834, -0.12691401, -1.56442005]])

In [38]:
# Project the standardised digits onto their first 8 principal components.
# The seed is pinned because, depending on the scikit-learn version, the
# default svd_solver='auto' may select the randomized solver for data of this
# size, which makes the projection vary slightly from run to run.
data_pca_8 = PCA(n_components=8, random_state=0).fit_transform(data)
data_pca_8

array([[ 1.914317  , -0.95421419, -3.94652967, ...,  0.52255669,
        -1.41125136,  1.51099137],
       [ 0.58906001,  0.92521281,  3.92448315, ..., -0.68616736,
         1.88210594,  0.56736236],
       [ 1.30256054, -0.31571534,  3.02345605, ...,  0.92062185,
        -1.30662746,  1.17016534],
       ...,
       [ 1.02196519, -0.14967333,  2.47015831, ...,  0.02996551,
         0.3843445 ,  0.48486982],
       [ 1.07528232, -0.38329416, -2.45523416, ..., -0.59703669,
         1.03941814,  0.71700158],
       [-1.25874272, -2.23079842,  0.28499778, ...,  2.4102428 ,
        -0.39209902, -1.29215186]])

## K-means

In [39]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

In [40]:
# Baseline: k-means on the full standardised feature set, one cluster per
# digit class. k-means++ seeding and the 10 restarts are random, so the seed
# is fixed to make the clustering (and every score below) reproducible.
estimator = KMeans(init='k-means++', n_clusters=n_digits, n_init=10,
                   random_state=0)
estimator.fit(data)

KMeans(n_clusters=10)

In [41]:
# External scores compare the k-means partition with the true digit labels.
mutual = metrics.adjusted_mutual_info_score(labels, estimator.labels_)
adj_rand = metrics.adjusted_rand_score(labels, estimator.labels_)
rand = metrics.rand_score(labels, estimator.labels_)

# The silhouette is an internal score computed from the features alone.
# It is estimated on a random subsample of `sample_size` points, so the seed
# is fixed to keep the reported value stable across runs.
siholitte = metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size,
                                      random_state=0)

In [42]:
# Report the four scores for k-means on the full feature set, one per line.
print(
    f"Rand {rand}",
    f"Adjusted Rand Index {adj_rand}",
    f"Adjusted Mutual Information: {mutual}",
    f"Silhouette {siholitte}",
    sep='\n',
)

Rand 0.8885794562330437
Adjusted Rand Index 0.46034991023967187
Adjusted Mutual Information: 0.6095850583787574
Silhouette 0.13041403588494707


In [43]:
# k-means on the 8-component PCA projection, one cluster per digit class.
# k-means++ seeding and the 10 restarts are random, so the seed is fixed to
# make the clustering reproducible.
pca_estimator = KMeans(init='k-means++', n_clusters=n_digits, n_init=10,
                       random_state=0)
pca_estimator.fit(data_pca_8)

KMeans(n_clusters=10)

In [44]:
# External scores compare the PCA-based k-means partition with the true labels.
mutual = metrics.adjusted_mutual_info_score(labels, pca_estimator.labels_)
adj_rand = metrics.adjusted_rand_score(labels, pca_estimator.labels_)
rand = metrics.rand_score(labels, pca_estimator.labels_)

# NOTE(review): the silhouette is measured in the original scaled feature
# space (`data`), not in the PCA space the model was fitted on — confirm this
# is the intended yardstick.
# The score uses a random subsample of `sample_size` points, so the seed is
# fixed to keep the reported value stable across runs.
siholitte = metrics.silhouette_score(data, pca_estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size,
                                      random_state=0)

In [45]:
# Report the four scores for k-means on the PCA projection, one per line.
print(
    f"Rand {rand}",
    f"Adjusted Rand Index {adj_rand}",
    f"Adjusted Mutual Information: {mutual}",
    f"Silhouette {siholitte}",
    sep='\n',
)

Rand 0.9019331898127664
Adjusted Rand Index 0.5098878479260415
Adjusted Mutual Information: 0.6329601091061987
Silhouette 0.15043391171445772


In [46]:
# k-means on the 5-component PCA projection, one cluster per digit class.
# The seed is fixed because k-means++ seeding and the restarts are random.
pca_estimator = KMeans(init='k-means++', n_clusters=n_digits, n_init=10,
                       random_state=0)
pca_estimator.fit(data_pca_5)

# External scores compare the partition with the true digit labels.
mutual = metrics.adjusted_mutual_info_score(labels, pca_estimator.labels_)
adj_rand = metrics.adjusted_rand_score(labels, pca_estimator.labels_)
rand = metrics.rand_score(labels, pca_estimator.labels_)

# NOTE(review): the silhouette is measured in the original scaled feature
# space (`data`), not in the PCA space the model was fitted on — confirm this
# is the intended yardstick.
# The score uses a random subsample of `sample_size` points, so it is seeded.
siholitte = metrics.silhouette_score(data, pca_estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size,
                                      random_state=0)

print(
    f"Rand {rand}",
    f"Adjusted Rand Index {adj_rand}",
    f"Adjusted Mutual Information: {mutual}",
    f"Silhouette {siholitte}",
    sep='\n',
)

Rand 0.8846840750421701
Adjusted Rand Index 0.42587880657243815
Adjusted Mutual Information: 0.5586586977115364
Silhouette 0.11929342797265913


## Affinity Propagation

In [47]:
from sklearn.cluster import AffinityPropagation

In [48]:
# Affinity propagation on the full standardised feature set. No cluster count
# is passed: the algorithm selects its own exemplars. The seed fixes the
# random starting state so the resulting clustering is repeatable.
model_aff = AffinityPropagation(random_state=0)
model_aff.fit(data)

AffinityPropagation()

In [49]:
# Number of exemplars (i.e. clusters) that affinity propagation settled on.
model_aff.cluster_centers_indices_.shape[0]

131

In [50]:
# Highest cluster label assigned to any sample.
model_aff.labels_.max()

130

In [51]:
# External scores compare the affinity-propagation partition with the true
# digit labels.
mutual = metrics.adjusted_mutual_info_score(labels, model_aff.labels_)
adj_rand = metrics.adjusted_rand_score(labels, model_aff.labels_)
rand = metrics.rand_score(labels, model_aff.labels_)

# The silhouette is estimated on a random subsample of `sample_size` points,
# so the seed is fixed to keep the reported value stable across runs.
siholitte = metrics.silhouette_score(data, model_aff.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size,
                                      random_state=0)

In [52]:
# Report the four scores for affinity propagation, one per line.
print(
    f"Rand {rand}",
    f"Adjusted Rand Index {adj_rand}",
    f"Adjusted Mutual Information: {mutual}",
    f"Silhouette {siholitte}",
    sep='\n',
)

Rand 0.9090075887429309
Adjusted Rand Index 0.15440833556141098
Adjusted Mutual Information: 0.5731048862115512
Silhouette 0.0610423081240067


In [53]:
# Affinity propagation on the 8-component PCA projection. The seed fixes the
# algorithm's random starting state so the clustering is repeatable.
aff_pca_estimator = AffinityPropagation(random_state=0)
aff_pca_estimator.fit(data_pca_8)

AffinityPropagation()

In [54]:
# Number of exemplars (i.e. clusters) found on the PCA projection.
aff_pca_estimator.cluster_centers_indices_.shape[0]

74

In [55]:
# External scores compare the PCA-based affinity-propagation partition with
# the true digit labels.
mutual = metrics.adjusted_mutual_info_score(labels, aff_pca_estimator.labels_)
adj_rand = metrics.adjusted_rand_score(labels, aff_pca_estimator.labels_)
rand = metrics.rand_score(labels, aff_pca_estimator.labels_)

# NOTE(review): the silhouette is measured in the original scaled feature
# space (`data`), not in the PCA space the model was fitted on — confirm this
# is the intended yardstick.
# The score uses a random subsample of `sample_size` points, so the seed is
# fixed to keep the reported value stable across runs.
siholitte = metrics.silhouette_score(data, aff_pca_estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size,
                                      random_state=0)

In [56]:
# Report the four scores for affinity propagation on the PCA projection.
print(
    f"Rand {rand}",
    f"Adjusted Rand Index {adj_rand}",
    f"Adjusted Mutual Information: {mutual}",
    f"Silhouette {siholitte}",
    sep='\n',
)

Rand 0.9112434359170754
Adjusted Rand Index 0.21522144604658655
Adjusted Mutual Information: 0.5859428607590447
Silhouette 0.044492217087618094


In [57]:
# Affinity propagation on the 5-component PCA projection. The seed fixes the
# algorithm's random starting state so the clustering is repeatable.
aff_pca_estimator = AffinityPropagation(random_state=0)
aff_pca_estimator.fit(data_pca_5)

# External scores compare the partition with the true digit labels.
mutual = metrics.adjusted_mutual_info_score(labels, aff_pca_estimator.labels_)
adj_rand = metrics.adjusted_rand_score(labels, aff_pca_estimator.labels_)
rand = metrics.rand_score(labels, aff_pca_estimator.labels_)

# NOTE(review): the silhouette is measured in the original scaled feature
# space (`data`), not in the PCA space the model was fitted on — confirm this
# is the intended yardstick.
# The score uses a random subsample of `sample_size` points, so it is seeded.
siholitte = metrics.silhouette_score(data, aff_pca_estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size,
                                      random_state=0)

print(
    f"Rand {rand}",
    f"Adjusted Rand Index {adj_rand}",
    f"Adjusted Mutual Information: {mutual}",
    f"Silhouette {siholitte}",
    sep='\n',
)

Rand 0.9093670098518566
Adjusted Rand Index 0.20732718590875004
Adjusted Mutual Information: 0.5426420427412191
Silhouette 0.040191527942498334


## Agglomerative Hierarchical Clustering

In [58]:
from sklearn.cluster import AgglomerativeClustering

In [59]:
# Agglomerative clustering on the full standardised feature set, cut at one
# cluster per digit class. fit() returns the estimator itself, so the fitted
# model can be bound in a single statement and then displayed.
model_agglom = AgglomerativeClustering(n_clusters=n_digits).fit(data)
model_agglom

AgglomerativeClustering(n_clusters=10)

In [60]:
# Show the cluster label assigned to each sample by the agglomerative model.
model_agglom.labels_

array([5, 1, 1, ..., 1, 1, 1])

In [61]:
# External scores compare the agglomerative partition with the true labels.
mutual = metrics.adjusted_mutual_info_score(labels, model_agglom.labels_)
adj_rand = metrics.adjusted_rand_score(labels, model_agglom.labels_)
rand = metrics.rand_score(labels, model_agglom.labels_)

# Use the shared `sample_size` constant (rather than a literal) so every
# silhouette in the notebook is estimated on the same number of points, and
# fix the seed of the random subsample to keep the value stable across runs.
siholitte = metrics.silhouette_score(data, model_agglom.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size,
                                      random_state=0)

In [62]:
# Report the four scores for agglomerative clustering, one per line.
print(
    f"Rand {rand}",
    f"Adjusted Rand Index {adj_rand}",
    f"Adjusted Mutual Information: {mutual}",
    f"Silhouette {siholitte}",
    sep='\n',
)

Rand 0.9302549535045417
Adjusted Rand Index 0.6643458356002894
Adjusted Mutual Information: 0.7934927361004462
Silhouette 0.14204655220593906


In [63]:
# Agglomerative clustering on the 8-component PCA projection, cut at one
# cluster per digit class. fit() returns the estimator itself, so the fitted
# model can be bound in a single statement and then displayed.
agg_pca_estimator = AgglomerativeClustering(n_clusters=n_digits).fit(data_pca_8)
agg_pca_estimator

AgglomerativeClustering(n_clusters=10)

In [64]:
# External scores compare the PCA-based agglomerative partition with the true
# digit labels.
mutual = metrics.adjusted_mutual_info_score(labels, agg_pca_estimator.labels_)
rand = metrics.rand_score(labels, agg_pca_estimator.labels_)
adj_rand = metrics.adjusted_rand_score(labels, agg_pca_estimator.labels_)

# NOTE(review): the silhouette is measured in the original scaled feature
# space (`data`), not in the PCA space the model was fitted on — confirm this
# is the intended yardstick.
# The score uses a random subsample of `sample_size` points, so the seed is
# fixed to keep the reported value stable across runs.
siholitte = metrics.silhouette_score(data, agg_pca_estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size,
                                      random_state=0)

In [65]:
# Report the four scores for agglomerative clustering on the PCA projection.
print(
    f"Rand {rand}",
    f"Adjusted Rand Index {adj_rand}",
    f"Adjusted Mutual Information: {mutual}",
    f"Silhouette {siholitte}",
    sep='\n',
)

Rand 0.9141033124993029
Adjusted Rand Index 0.5556872427395279
Adjusted Mutual Information: 0.6862775529879439
Silhouette 0.11673411007943015


In [66]:
# Agglomerative clustering on the 5-component PCA projection, cut at one
# cluster per digit class.
agg_pca_estimator = AgglomerativeClustering(n_clusters=n_digits)
agg_pca_estimator.fit(data_pca_5)

# External scores compare the partition with the true digit labels.
mutual = metrics.adjusted_mutual_info_score(labels, agg_pca_estimator.labels_)
adj_rand = metrics.adjusted_rand_score(labels, agg_pca_estimator.labels_)
rand = metrics.rand_score(labels, agg_pca_estimator.labels_)

# NOTE(review): the silhouette is measured in the original scaled feature
# space (`data`), not in the PCA space the model was fitted on — confirm this
# is the intended yardstick.
# The score uses a random subsample of `sample_size` points, so it is seeded.
siholitte = metrics.silhouette_score(data, agg_pca_estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size,
                                      random_state=0)

print(
    f"Rand {rand}",
    f"Adjusted Rand Index {adj_rand}",
    f"Adjusted Mutual Information: {mutual}",
    f"Silhouette {siholitte}",
    sep='\n',
)

Rand 0.8753155779305524
Adjusted Rand Index 0.4191347209316558
Adjusted Mutual Information: 0.5971611132494623
Silhouette 0.10324761703841295
