In [1]:
import pandas as pd
import random


In [9]:
# Load the iris data set from "iris.csv" (path is relative to the working directory).
iris = pd.read_csv("iris.csv")
# Bare last expression: the notebook renders the frame as this cell's output.
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [10]:
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [11]:
# Feature matrix: the first four columns of the frame (positions 0-3) as a NumPy array.
# NOTE(review): the rendered frame lists "Unnamed: 0" as its first header. If that is a
# real CSV column rather than the row index, this slice includes it and leaves out
# petal_width -- confirm against iris.csv.
data_train = np.array(iris.iloc[:, 0:4])
# Ground-truth species names, taken from the last column of the frame.
iris_label = iris.iloc[:,-1]
# Encode the species names as integer codes so they can be compared with cluster ids.
species_encoder = LabelEncoder().fit(iris_label)
iris_label_encoded = species_encoder.transform(iris_label)

In [37]:
class KMedoids:
    """K-medoids clustering using Manhattan (L1) distance and random medoid swaps.

    Parameters
    ----------
    init_method : str
        "random" draws k distinct data points as the initial medoids; any other
        value spreads the initial medoids evenly over the row order of the data.
    k : int
        Number of clusters.
    max_iter : int
        Maximum number of swap attempts performed by ``fit``.

    Attributes
    ----------
    centroids : list
        Current medoid vectors, one per cluster.
    idx_centroids : list of int
        Row index in ``data`` of each medoid.
    classifications : dict
        Maps cluster id (0..k-1) to the list of row indices assigned to it.
    data : sequence or None
        The data passed to the last ``fit`` call.
    labels : list of int or None
        Cluster id of every row, in row order; set by ``fit``.
    """

    def __init__(self, init_method="random", k=2, max_iter=100):
        self.init_method = init_method
        self.k = k
        self.max_iter = max_iter
        self.centroids = []
        self.idx_centroids = []
        self.classifications = {}
        self.data = None
        self.labels = None

    def fit(self, data):
        """Cluster ``data`` and return ``self``.

        Starting from the initial medoids, one randomly chosen medoid is swapped
        with a random non-medoid point per iteration. A swap is kept when it does
        not increase the total distance; the search stops at the first swap that
        does, or after ``max_iter`` attempts. The candidate is scored against the
        current assignment (points are not re-assigned before scoring).

        Raises
        ------
        ValueError
            If ``data`` is empty or ``k`` is not in ``1..len(data)``.
        """
        n_points = len(data)
        if n_points == 0:
            raise ValueError("data must contain at least one point")
        if not 1 <= self.k <= n_points:
            raise ValueError("k must be between 1 and the number of data points")

        self.data = data
        self.initCentroid()
        self.initClassClassification()
        self.assignClassClassification()

        iteration = 0
        while iteration < self.max_iter:
            prev_error = self.calculateError(self.centroids)
            # changeCentroids returns copies, so a rejected candidate leaves the
            # accepted medoids untouched.
            candidate_centroids, candidate_idx = self.changeCentroids()
            curr_error = self.calculateError(candidate_centroids)
            if curr_error > prev_error:
                break
            self.centroids = candidate_centroids
            self.idx_centroids = candidate_idx
            # Re-assign right away so classifications (and the final labels)
            # always correspond to the accepted medoids.
            self.initClassClassification()
            self.assignClassClassification()
            iteration += 1

        cluster_of = self._clusterOfPoint()
        self.labels = [cluster_of[i] for i in range(n_points)]
        return self

    def initCentroid(self):
        """Choose the k initial medoids according to ``init_method``.

        Any medoids left over from a previous fit are discarded first.
        """
        n_points = len(self.data)
        if self.init_method == "random":
            # sample() yields k distinct, in-range row indices.
            self.idx_centroids = random.sample(range(n_points), self.k)
        else:
            section = n_points / self.k
            self.idx_centroids = [int(i * section) for i in range(self.k)]
        self.centroids = [self.data[idx] for idx in self.idx_centroids]

    def initClassClassification(self):
        """Reset ``classifications`` to one empty member list per cluster."""
        self.classifications = {cluster: [] for cluster in range(self.k)}

    def assignClassClassification(self):
        """Append every row index to the cluster of its nearest medoid.

        Ties go to the lowest cluster id.
        """
        for i in range(len(self.data)):
            distances = [self.calculateDistance(self.data[i], centroid)
                         for centroid in self.centroids]
            self.classifications[distances.index(min(distances))].append(i)

    def calculateDistance(self, vec1, vec2):
        """Return the Manhattan (L1) distance between two equal-length vectors."""
        return np.abs(np.asarray(vec1) - np.asarray(vec2)).sum()

    def changeCentroids(self):
        """Propose a new medoid set with one random medoid swapped out.

        Returns
        -------
        (centroids, idx_centroids)
            New lists; ``self.centroids`` and ``self.idx_centroids`` are not
            modified. If every data point is already a medoid the copies are
            returned unchanged.
        """
        centroids = list(self.centroids)
        idx_centroids = list(self.idx_centroids)
        n_points = len(self.data)
        taken = set(idx_centroids)
        if len(taken) >= n_points:
            # No non-medoid point exists to swap in.
            return centroids, idx_centroids

        # randrange excludes the upper bound, so both draws are valid indices.
        slot = random.randrange(self.k)
        idx_change = random.randrange(n_points)
        while idx_change in taken:
            idx_change = random.randrange(n_points)
        centroids[slot] = np.array(self.data[idx_change])
        idx_centroids[slot] = idx_change
        return centroids, idx_centroids

    def calculateError(self, centroids):
        """Return the summed distance of every point to the given centroid of its
        currently assigned cluster (as recorded in ``classifications``)."""
        # Build the point -> cluster lookup once instead of scanning the member
        # lists for every point.
        cluster_of = self._clusterOfPoint()
        error = 0
        for i in range(len(self.data)):
            error += self.calculateDistance(self.data[i], centroids[cluster_of[i]])
        return error

    def getClusterIndex(self, idx):
        """Return the id of the first cluster containing row ``idx``, or None."""
        for k in range(self.k):
            if idx in self.classifications[k]:
                return k
        return None

    def _clusterOfPoint(self):
        """Return a dict mapping each assigned row index to its cluster id."""
        cluster_of = {}
        for cluster in range(self.k):
            for point in self.classifications[cluster]:
                # First cluster wins, matching getClusterIndex.
                cluster_of.setdefault(point, cluster)
        return cluster_of


In [38]:
from sklearn.metrics import confusion_matrix

In [39]:
# Cluster into k=3 groups. Any init_method other than "random" spreads the starting
# medoids evenly over the row order of the data.
clf = KMedoids(init_method="distribute", k=3)
# fit returns the estimator itself, so km and clf refer to the same object.
km = clf.fit(data_train)
# Cluster ids are passed as the first argument, so they index the rows of cm and the
# encoded species index the columns.
# NOTE(review): the medoid swaps use the unseeded `random` module, so the result can
# differ between runs -- consider seeding before fit.
cm = confusion_matrix(km.labels, iris_label_encoded)

In [40]:
# Cross-tabulate true species (rows) against assigned cluster id (columns).
pd.crosstab(iris_label, np.array(km.labels))

col_0,0,1,2
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,50,0,0
versicolor,4,46,0
virginica,0,17,33


In [41]:
# Purity: for every row of cm (one row per cluster id, since the cluster labels were
# passed first to confusion_matrix) take the largest single-species count, sum those
# maxima, and divide by the total number of samples. Reducing along axis 1 works for
# any number of clusters rather than exactly three.
purity = float(cm.max(axis=1).sum()) / float(cm.sum())

print("Purity: ", purity)

Purity:  0.86
