In [38]:
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import scale
import numpy as np
from sklearn import metrics

In [39]:
digits = datasets.load_digits()
data = scale(digits.data)
n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))
labels = digits.target
sample_size = 300

In [40]:
pca = PCA(n_components=n_digits).fit(data)

In [71]:
data_pca =  pca.fit_transform(data)
data_pca

array([[ 1.91423647, -0.95453733, -3.94609453, ...,  1.48688184,
         0.10830879, -0.81418189],
       [ 0.58892003,  0.92472431,  3.92464773, ...,  0.56929456,
         1.08630913,  0.0768659 ],
       [ 1.30207782, -0.31729262,  3.02279292, ...,  1.14149862,
         0.76335208, -1.10666762],
       ...,
       [ 1.02254591, -0.14780846,  2.4692302 , ...,  0.54251764,
         2.06498769, -2.03110768],
       [ 1.07625992, -0.38082791, -2.45527164, ...,  0.74636942,
         1.1028517 , -0.31603437],
       [-1.2575744 , -2.22758872,  0.28304905, ..., -1.21779362,
         0.8003097 , -1.82684755]])

## K-means

In [72]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

In [73]:
estimator = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
estimator.fit(data)

KMeans(n_clusters=10)

In [74]:
mutual = metrics.adjusted_mutual_info_score(labels,  estimator.labels_)
siholitte = metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)
rand = metrics.adjusted_rand_score(labels, estimator.labels_)

In [75]:
print(f"Adjusted Rand Index {rand}", f"Adjusted Mutual Information: {mutual}", f"Silhouette {siholitte}", sep='\n')

Adjusted Rand Index 0.4792905956825979
Adjusted Mutual Information: 0.6297756012869173
Silhouette 0.14055768487796422


In [77]:
pca_estimator = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)
pca_estimator.fit(data)

KMeans(init=array([[-2.54682015e-17, -1.82233904e-01, -2.85867963e-01,
        -2.20370020e-01,  2.51688666e-02,  9.49553392e-03,
         5.24765125e-02,  6.26959166e-02, -3.47102289e-02,
        -2.45533431e-01, -2.29150110e-01,  1.07945478e-01,
        -3.62007386e-02, -3.87116043e-02,  8.37811938e-02,
         9.27757038e-02, -1.67012570e-02, -1.36715088e-01,
         6.30533216e-02,  1.22878700e-01, -1.48193153e-01,
         2.3...
        -1.31012231e-01,  8.12188224e-02, -1.39453559e-02,
        -1.48727740e-01,  4.58752165e-02,  4.36652278e-02,
         1.14268375e-01, -1.00827372e-01, -2.41509177e-01,
        -6.21881185e-02, -1.14129753e-01, -1.67783739e-01,
         3.95040742e-02,  2.65240810e-02, -2.35284390e-02,
         9.06368518e-03,  1.31571538e-02, -1.06814041e-01,
        -1.01085399e-01, -3.68546064e-02,  5.67490854e-02,
         2.92571515e-02]]),
       n_clusters=10, n_init=1)

In [78]:
mutual = metrics.adjusted_mutual_info_score(labels,  pca_estimator.labels_)
siholitte = metrics.silhouette_score(data, pca_estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)
rand = metrics.adjusted_rand_score(labels, pca_estimator.labels_)

In [79]:
print(f"Adjusted Rand Index {rand}", f"Adjusted Mutual Information: {mutual}", f"Silhouette {siholitte}", sep='\n')

Adjusted Rand Index 0.5542653654111069
Adjusted Mutual Information: 0.6771577864975292
Silhouette 0.1395210630788652


## Affinity Propagation

In [80]:
from sklearn.cluster import AffinityPropagation

In [81]:
model_aff = AffinityPropagation()
model_aff.fit(data)

AffinityPropagation()

In [112]:
max(model_aff.labels_)

130

In [83]:
mutual = metrics.adjusted_mutual_info_score(labels,  model_aff.labels_)
siholitte = metrics.silhouette_score(data, model_aff.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)
rand = metrics.adjusted_rand_score(labels, model_aff.labels_)

In [84]:
print(f"Adjusted Rand Index {rand}", f"Adjusted Mutual Information: {mutual}", f"Silhouette {siholitte}", sep='\n')

Adjusted Rand Index 0.15438938454814916
Adjusted Mutual Information: 0.5729007446527502
Silhouette 0.060616256541993985


In [85]:
pca_estimator = AffinityPropagation()
pca_estimator.fit(data_pca)

AffinityPropagation()

In [86]:
pca_estimator

AffinityPropagation()

In [87]:
pca_estimator.labels_

array([42, 51, 27, ...,  7,  1, 54])

In [89]:
mutual = metrics.adjusted_mutual_info_score(labels,  pca_estimator.labels_)
siholitte = metrics.silhouette_score(data, pca_estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)
rand = metrics.adjusted_rand_score(labels, pca_estimator.labels_)

In [90]:
print(f"Adjusted Rand Index {rand}", f"Adjusted Mutual Information: {mutual}", f"Silhouette {siholitte}", sep='\n')

Adjusted Rand Index 0.20142978488473018
Adjusted Mutual Information: 0.5831488706759776
Silhouette 0.06269891184630552


## Agglomerative Hierarchical clustering

In [105]:
from sklearn.cluster import AgglomerativeClustering

In [106]:
model_agglom = AgglomerativeClustering(n_clusters=10)
model_agglom.fit(data)

AgglomerativeClustering(n_clusters=10)

In [107]:
model_agglom.labels_

array([5, 1, 1, ..., 1, 1, 1])

In [109]:
mutual = metrics.adjusted_mutual_info_score(labels,  model_agglom.labels_)
rand = metrics.adjusted_rand_score(labels, model_agglom.labels_)
siholitte = metrics.silhouette_score(data, model_agglom.labels_,
                                      metric='euclidean',
                                      sample_size=300)

In [110]:
print(f"Adjusted Rand Index {rand}", f"Adjusted Mutual Information: {mutual}", f"Silhouette {siholitte}", sep='\n')

Adjusted Rand Index 0.6643458356002894
Adjusted Mutual Information: 0.7934927361004462
Silhouette 0.11875838372999634


10

In [117]:
pca_estimator = AgglomerativeClustering(n_clusters=10)
pca_estimator.fit(data_pca)

AgglomerativeClustering(n_clusters=10)

In [118]:
mutual = metrics.adjusted_mutual_info_score(labels,  pca_estimator.labels_)
siholitte = metrics.silhouette_score(data, pca_estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)
rand = metrics.adjusted_rand_score(labels, pca_estimator.labels_)

In [119]:
print(f"Adjusted Rand Index {rand}", f"Adjusted Mutual Information: {mutual}", f"Silhouette {siholitte}", sep='\n')

Adjusted Rand Index 0.5906613319256733
Adjusted Mutual Information: 0.7124491223783881
Silhouette 0.13357964076454537
