In [176]:
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import scale
import numpy as np
from sklearn import metrics

In [177]:
# Load the digits dataset and standardise its feature matrix.
digits = datasets.load_digits()
labels = digits.target
data = scale(digits.data)

# Dataset dimensions and the number of distinct target classes.
n_samples, n_features = data.shape
n_digits = np.unique(labels).size

# Number of points passed to silhouette_score as its subsample size.
sample_size = 300

In [178]:
# Project the scaled digits onto their first 8 principal components.
# random_state is pinned because PCA's automatic solver selection may use a
# randomized SVD, in which case the projection would differ between runs.
data_pca = PCA(n_components=8, random_state=0).fit_transform(data)
data_pca

array([[ 1.91413373, -0.95458743, -3.94624086, ...,  0.53098916,
        -1.41731914,  1.49851984],
       [ 0.58872589,  0.92360495,  3.92452141, ..., -0.6827531 ,
         1.8805979 ,  0.60039835],
       [ 1.30205587, -0.31696114,  3.02345393, ...,  0.94524771,
        -1.28480786,  1.09471128],
       ...,
       [ 1.02311127, -0.14716993,  2.47047289, ..., -0.01268978,
         0.38453697,  0.46288635],
       [ 1.07673132, -0.37989279, -2.45499662, ..., -0.65739515,
         1.03797733,  0.72391092],
       [-1.25680065, -2.22548811,  0.28511595, ...,  2.35109249,
        -0.3882886 , -1.29881808]])

## K-means

In [179]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

In [180]:
# K-means on the full scaled feature set, one cluster per digit class.
# k-means++ seeding and the n_init restarts are random, so random_state is
# fixed to make the fitted clustering (and every metric below) repeatable.
estimator = KMeans(init='k-means++', n_clusters=n_digits, n_init=10,
                   random_state=0)
estimator.fit(data)

KMeans(n_clusters=10)

In [181]:
# Score the K-means clustering of the full scaled data.
# AMI and ARI compare the cluster assignment with the true digit labels.
mutual = metrics.adjusted_mutual_info_score(labels, estimator.labels_)
rand = metrics.adjusted_rand_score(labels, estimator.labels_)
# silhouette_score evaluates a random subsample of `sample_size` points;
# fixing random_state makes the reported value repeatable across runs.
siholitte = metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size,
                                      random_state=0)

In [182]:
# Report the three scores for K-means on the full data, one per line.
print(
    "\n".join(
        (
            f"Adjusted Rand Index {rand}",
            f"Adjusted Mutual Information: {mutual}",
            f"Silhouette {siholitte}",
        )
    )
)

Adjusted Rand Index 0.41890829630057824
Adjusted Mutual Information: 0.5846629360314028
Silhouette 0.1380183400966704


In [183]:
# K-means on the 8-component PCA projection, one cluster per digit class.
# random_state is fixed so the random initialisation does not change the
# clustering from run to run.
pca_estimator = KMeans(init='k-means++', n_clusters=n_digits, n_init=10,
                       random_state=0)
pca_estimator.fit(data_pca)

KMeans(n_clusters=10)

In [184]:
# Score the K-means clustering that was fitted on the PCA projection.
mutual = metrics.adjusted_mutual_info_score(labels, pca_estimator.labels_)
rand = metrics.adjusted_rand_score(labels, pca_estimator.labels_)
# The silhouette is measured on `data` (the full scaled features) although
# the labels come from clustering `data_pca`.
# NOTE(review): presumably intentional so the score is comparable with the
# non-PCA run - confirm; pass data_pca to score in the clustering space.
# random_state pins the random subsample so the value is repeatable.
siholitte = metrics.silhouette_score(data, pca_estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size,
                                      random_state=0)

In [185]:
# Report the three scores for K-means on the PCA projection, one per line.
print(
    "\n".join(
        (
            f"Adjusted Rand Index {rand}",
            f"Adjusted Mutual Information: {mutual}",
            f"Silhouette {siholitte}",
        )
    )
)

Adjusted Rand Index 0.4561101658905323
Adjusted Mutual Information: 0.5980106233251878
Silhouette 0.13315970703390764


## Affinity Propagation

In [186]:
from sklearn.cluster import AffinityPropagation

In [187]:
# Affinity propagation on the full scaled data; the number of clusters is
# chosen by the algorithm rather than supplied.
# random_state is set explicitly so the fit is repeatable and does not rely
# on the library's default seeding behaviour.
model_aff = AffinityPropagation(random_state=0)
model_aff.fit(data)

AffinityPropagation()

In [188]:
# Number of exemplars (cluster centres) found by affinity propagation.
model_aff.cluster_centers_indices_.shape[0]

132

In [189]:
# Largest cluster label assigned by affinity propagation.
model_aff.labels_.max()

131

In [190]:
# Score the affinity-propagation clustering of the full scaled data.
# AMI and ARI compare the cluster assignment with the true digit labels.
mutual = metrics.adjusted_mutual_info_score(labels, model_aff.labels_)
rand = metrics.adjusted_rand_score(labels, model_aff.labels_)
# silhouette_score evaluates a random subsample of `sample_size` points;
# fixing random_state makes the reported value repeatable across runs.
siholitte = metrics.silhouette_score(data, model_aff.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size,
                                      random_state=0)

In [191]:
# Report the three scores for affinity propagation on the full data.
print(
    "\n".join(
        (
            f"Adjusted Rand Index {rand}",
            f"Adjusted Mutual Information: {mutual}",
            f"Silhouette {siholitte}",
        )
    )
)

Adjusted Rand Index 0.1543890171236833
Adjusted Mutual Information: 0.5728829386106491
Silhouette 0.08563738209551562


In [192]:
# Affinity propagation on the 8-component PCA projection.
# Note: this rebinds `pca_estimator`, replacing the K-means model that was
# previously fitted on the PCA data under the same name.
# random_state is set explicitly so the fit is repeatable.
pca_estimator = AffinityPropagation(random_state=0)
pca_estimator.fit(data_pca)

AffinityPropagation()

In [193]:
# Number of exemplars (cluster centres) found on the PCA projection.
pca_estimator.cluster_centers_indices_.shape[0]

73

In [194]:
# Score the affinity-propagation clustering fitted on the PCA projection.
mutual = metrics.adjusted_mutual_info_score(labels, pca_estimator.labels_)
rand = metrics.adjusted_rand_score(labels, pca_estimator.labels_)
# The silhouette is measured on `data` (the full scaled features) although
# the labels come from clustering `data_pca`.
# NOTE(review): presumably intentional so the score is comparable with the
# non-PCA run - confirm; pass data_pca to score in the clustering space.
# random_state pins the random subsample so the value is repeatable.
siholitte = metrics.silhouette_score(data, pca_estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size,
                                      random_state=0)

In [195]:
# Report the three scores for affinity propagation on the PCA projection.
print(
    "\n".join(
        (
            f"Adjusted Rand Index {rand}",
            f"Adjusted Mutual Information: {mutual}",
            f"Silhouette {siholitte}",
        )
    )
)

Adjusted Rand Index 0.2193039408963392
Adjusted Mutual Information: 0.5864655423464729
Silhouette 0.05081533772598464


## Agglomerative Hierarchical clustering

In [196]:
from sklearn.cluster import AgglomerativeClustering

In [197]:
# Agglomerative clustering of the full scaled data into one cluster per
# digit class, using the estimator's default settings otherwise.
# fit() returns the estimator itself, so it can be chained onto construction.
model_agglom = AgglomerativeClustering(n_clusters=n_digits).fit(data)
model_agglom

AgglomerativeClustering(n_clusters=10)

In [198]:
# Show the cluster label assigned to each sample by the agglomerative model.
model_agglom.labels_

array([5, 1, 1, ..., 1, 1, 1])

In [199]:
# Score the agglomerative clustering of the full scaled data.
# AMI and ARI compare the cluster assignment with the true digit labels.
mutual = metrics.adjusted_mutual_info_score(labels, model_agglom.labels_)
rand = metrics.adjusted_rand_score(labels, model_agglom.labels_)
# Use the shared `sample_size` setting (instead of a repeated literal) so all
# silhouette scores in the notebook are computed on equally sized subsamples;
# random_state pins the random subsample so the value is repeatable.
siholitte = metrics.silhouette_score(data, model_agglom.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size,
                                      random_state=0)

In [200]:
# Report the three scores for agglomerative clustering on the full data.
print(
    "\n".join(
        (
            f"Adjusted Rand Index {rand}",
            f"Adjusted Mutual Information: {mutual}",
            f"Silhouette {siholitte}",
        )
    )
)

Adjusted Rand Index 0.6643458356002894
Adjusted Mutual Information: 0.7934927361004462
Silhouette 0.125761620215887


In [201]:
# Agglomerative clustering of the 8-component PCA projection into one
# cluster per digit class. This rebinds `pca_estimator` to the new model.
# fit() returns the estimator itself, so it can be chained onto construction.
pca_estimator = AgglomerativeClustering(n_clusters=n_digits).fit(data_pca)
pca_estimator

AgglomerativeClustering(n_clusters=10)

In [202]:
# Score the agglomerative clustering fitted on the PCA projection.
mutual = metrics.adjusted_mutual_info_score(labels, pca_estimator.labels_)
rand = metrics.adjusted_rand_score(labels, pca_estimator.labels_)
# The silhouette is measured on `data` (the full scaled features) although
# the labels come from clustering `data_pca`.
# NOTE(review): presumably intentional so the score is comparable with the
# non-PCA run - confirm; pass data_pca to score in the clustering space.
# random_state pins the random subsample so the value is repeatable.
siholitte = metrics.silhouette_score(data, pca_estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size,
                                      random_state=0)

In [203]:
# Report the three scores for agglomerative clustering on the PCA projection.
print(
    "\n".join(
        (
            f"Adjusted Rand Index {rand}",
            f"Adjusted Mutual Information: {mutual}",
            f"Silhouette {siholitte}",
        )
    )
)

Adjusted Rand Index 0.5363813063840955
Adjusted Mutual Information: 0.6920767402095714
Silhouette 0.10897093934434961
