In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import Birch
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MiniBatchKMeans

In [2]:
# Load the Iris CSV. skiprows=1 drops the first line of the file (presumably
# its own header row -- TODO confirm) so that the explicit column names given
# in `names` are used instead.
iris = pd.read_csv('datasets/iris.csv',
                   skiprows= 1,
                   names= ['sepal-length',
                           'sepal-width',
                           'petal-length',
                           'petal-width',
                           'class'])
# Sanity check: (rows, columns) of the loaded frame.
iris.shape

(150, 5)

In [3]:
# Preview the first rows of the freshly loaded frame.
iris.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Shuffle the rows (frac=1 draws every row exactly once) and renumber the
# index from 0. The fixed random_state makes the shuffled order -- and any
# order-sensitive result computed from it -- repeatable across
# "Restart Kernel -> Run All"; change the seed to get a different permutation.
iris = iris.sample(frac=1, random_state=42).reset_index(drop=True)
iris.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,6.7,3.0,5.2,2.3,Iris-virginica
1,6.5,3.0,5.2,2.0,Iris-virginica
2,4.6,3.2,1.4,0.2,Iris-setosa
3,5.0,3.5,1.3,0.3,Iris-setosa
4,4.9,2.4,3.3,1.0,Iris-versicolor


In [5]:
from sklearn import preprocessing

# Replace the string class names with integer codes so the column can be used
# as ground-truth labels by the scoring functions further down.
label_encoding = preprocessing.LabelEncoder()
# astype(str) normalises the column to strings before it is encoded.
iris['class']= label_encoding.fit_transform(iris['class'].astype(str))

iris.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,6.7,3.0,5.2,2.3,2
1,6.5,3.0,5.2,2.0,2
2,4.6,3.2,1.4,0.2,0
3,5.0,3.5,1.3,0.3,0
4,4.9,2.4,3.3,1.0,1


In [6]:
# Feature matrix: every column except the encoded 'class' target.
iris_fe = iris.drop('class', axis= 1)
iris_fe.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
0,6.7,3.0,5.2,2.3
1,6.5,3.0,5.2,2.0
2,4.6,3.2,1.4,0.2
3,5.0,3.5,1.3,0.3
4,4.9,2.4,3.3,1.0


In [7]:
# Ground-truth labels, kept separately so each clustering can be scored
# against them.
iris_labels = iris['class']

In [8]:
def build_model(clustering_model, data, labels):
    """Fit a clustering model and print a one-row table of evaluation scores.

    Parameters
    ----------
    clustering_model : callable
        Called as ``clustering_model(data)``. It must return a fitted object
        that exposes the predicted cluster ids as ``labels_``.
    data : array-like
        Feature matrix passed to ``clustering_model`` and to the silhouette
        score.
    labels : array-like
        Ground-truth class labels used by the label-based scores
        (homogeneity, completeness, V-measure, ARI, AMI).

    Returns
    -------
    object
        The fitted model returned by ``clustering_model``, so it can be
        inspected after the scores have been printed.

    Notes
    -----
    The silhouette column prints ``nan`` when the model produced a single
    cluster or one cluster per sample: ``silhouette_score`` raises
    ``ValueError`` for those label counts, which would otherwise abort the
    whole report.
    """
    model = clustering_model(data)
    predicted = model.labels_

    print('homo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    print(50 * '-')

    # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1.
    # Every distinct value in labels_ counts as a cluster here, including any
    # special value an estimator may use to mark unassigned points.
    n_found = len(set(predicted))
    if 1 < n_found < len(predicted):
        silhouette = metrics.silhouette_score(data, predicted)
    else:
        silhouette = float('nan')

    print('%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (metrics.homogeneity_score(labels, predicted),
             metrics.completeness_score(labels, predicted),
             metrics.v_measure_score(labels, predicted),
             metrics.adjusted_rand_score(labels, predicted),
             metrics.adjusted_mutual_info_score(labels, predicted),
             silhouette))

    return model

### kmeans

In [9]:
def k_means(data, n_clusters=3, max_iter=1000, random_state=None):
    """Fit a KMeans model on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Samples to cluster; passed straight to ``KMeans.fit``.
    n_clusters : int, default 3
        Number of clusters requested from KMeans.
    max_iter : int, default 1000
        Iteration cap forwarded to KMeans.
    random_state : int, RandomState instance or None, default None
        Forwarded to KMeans. ``None`` keeps the previous unseeded behaviour;
        pass an int to get the same clustering on every run.

    Returns
    -------
    The value returned by ``KMeans(...).fit(data)``.
    """
    estimator = KMeans(n_clusters=n_clusters,
                       max_iter=max_iter,
                       random_state=random_state)
    return estimator.fit(data)

In [10]:
# Cluster the feature columns with k-means and print the scores against the
# true class labels.
build_model(k_means, iris_fe, iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.751	0.765	0.758	0.730	0.755	0.553


### agglomerative

In [11]:
def agglomerative(data, n_clusters=3):
    """Fit AgglomerativeClustering on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Samples to cluster; passed straight to ``fit``.
    n_clusters : int, default 3
        Number of clusters requested from the estimator.

    Returns
    -------
    The value returned by ``AgglomerativeClustering(...).fit(data)``.
    """
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    return clusterer.fit(data)

In [12]:
# Same evaluation, this time with agglomerative clustering.
build_model(agglomerative, iris_fe, iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.761	0.780	0.770	0.731	0.767	0.554


### dbscan

In [13]:
 def dbscan(data, eps= 0.55, min_samples= 5):
        
        model= DBSCAN(eps= eps, min_samples= min_samples).fit(data)
        
        return model

In [14]:
# Same evaluation with DBSCAN; the number of clusters is not passed in here,
# only the wrapper's default eps / min_samples are used.
build_model(dbscan, iris_fe, iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.559	0.711	0.626	0.532	0.621	0.500


### mean-shift 

In [15]:
def meanshift(data, bandwidth=0.83):
    """Fit MeanShift on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Samples to cluster; passed straight to ``MeanShift.fit``.
    bandwidth : float, default 0.83
        Bandwidth forwarded to MeanShift.

    Returns
    -------
    The value returned by ``MeanShift(...).fit(data)``.
    """
    clusterer = MeanShift(bandwidth=bandwidth)
    return clusterer.fit(data)

In [16]:
# Same evaluation with mean-shift, using the wrapper's default bandwidth.
build_model(meanshift, iris_fe, iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.735	0.658	0.695	0.655	0.689	0.492


### birch

In [17]:
def birch(data, n_clusters=3):
    """Fit Birch on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Samples to cluster; passed straight to ``Birch.fit``.
    n_clusters : int, default 3
        Number of clusters requested from Birch.

    Returns
    -------
    The value returned by ``Birch(...).fit(data)``.
    """
    clusterer = Birch(n_clusters=n_clusters)
    return clusterer.fit(data)

In [18]:
# Same evaluation with Birch.
build_model(birch, iris_fe, iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.635	0.792	0.705	0.566	0.700	0.534


### affinity propagation

In [19]:
def affinity_propagation(data, damping=0.51, max_iter=1000):
    """Fit AffinityPropagation on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Samples to cluster; passed straight to ``AffinityPropagation.fit``.
    damping : float, default 0.51
        Damping factor forwarded to AffinityPropagation.
    max_iter : int, default 1000
        Iteration cap forwarded to AffinityPropagation.

    Returns
    -------
    The value returned by ``AffinityPropagation(...).fit(data)``.
    """
    clusterer = AffinityPropagation(damping=damping, max_iter=max_iter)
    return clusterer.fit(data)

In [20]:
# Same evaluation with affinity propagation; no cluster count is passed to
# this algorithm.
build_model(affinity_propagation, iris_fe, iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.928	0.503	0.653	0.444	0.641	0.340


### mini-batch kmeans

In [21]:
def mini_batch_kmeans(data, n_clusters=3, max_iter=1000, batch_size=15,
                      random_state=None):
    """Fit a MiniBatchKMeans model on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Samples to cluster; passed straight to ``MiniBatchKMeans.fit``.
    n_clusters : int, default 3
        Number of clusters requested from the estimator.
    max_iter : int, default 1000
        Iteration cap forwarded to the estimator.
    batch_size : int, default 15
        Mini-batch size forwarded to the estimator. The default is the value
        that used to be hard-coded in this function.
    random_state : int, RandomState instance or None, default None
        Forwarded to the estimator. ``None`` keeps the previous unseeded
        behaviour; pass an int to get the same clustering on every run.

    Returns
    -------
    The value returned by ``MiniBatchKMeans(...).fit(data)``.
    """
    estimator = MiniBatchKMeans(n_clusters=n_clusters,
                                max_iter=max_iter,
                                batch_size=batch_size,
                                random_state=random_state)
    return estimator.fit(data)

In [22]:
# Same evaluation with mini-batch k-means.
build_model(mini_batch_kmeans, iris_fe, iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.751	0.765	0.758	0.730	0.755	0.553


### spectral 

In [23]:
from sklearn.cluster import SpectralClustering

In [24]:
# Building blocks for the hand-made similarity matrix defined below.
SS = 1000 # self similarity: an item compared with itself (matrix diagonal)
IS = 10 # intra-cluster similarity: two different items of the same group
LS = 0.01 # low similarity: two items that belong to different groups

In [25]:
# Pre-computed 9x9 similarity matrix describing three groups of three items
# (rows/columns 0-2, 3-5 and 6-8): SS on the diagonal, IS between items of the
# same group, LS everywhere else. The matrix is symmetric.

similarity_mat = [[SS, IS, IS, LS, LS, LS, LS, LS, LS],
                  [IS, SS, IS, LS, LS, LS, LS, LS, LS],
                  [IS, IS, SS, LS, LS, LS, LS, LS, LS],
                  [LS, LS, LS, SS, IS, IS, LS, LS, LS],
                  [LS, LS, LS, IS, SS, IS, LS, LS, LS],
                  [LS, LS, LS, IS, IS, SS, LS, LS, LS],
                  [LS, LS, LS, LS, LS, LS, SS, IS, IS],
                  [LS, LS, LS, LS, LS, LS, IS, SS, IS],
                  [LS, LS, LS, LS, LS, LS, IS, IS, SS]] 

In [27]:
# affinity='precomputed' tells SpectralClustering to use similarity_mat as the
# affinity matrix itself rather than as raw feature vectors.
# NOTE(review): no random_state is passed, so the numbering of the resulting
# cluster ids may differ between runs -- confirm whether that matters here.
spectral_model = SpectralClustering(n_clusters= 3, affinity= 'precomputed').fit(similarity_mat)

In [28]:
# Cluster id assigned to each of the nine rows of similarity_mat.
spectral_model.labels_

array([1, 1, 1, 0, 0, 0, 2, 2, 2], dtype=int32)