In [3]:
import pandas as pd 
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances


# Load the homework data set; the path is resolved relative to the current
# working directory.
DATA_FILE = '숙제5_데이터.csv'
df = pd.read_csv(DATA_FILE)
# Keep every row from index label 0 onward (that is all rows for the default
# integer index produced by read_csv) and convert the frame to a NumPy array
# so it can be fed to the distance computation below.
x = df.loc[0:].to_numpy()

import numpy as np
import random

def _assign_clusters(D, M, k):
    """Map each label 0..k-1 to the indices of the points closest to medoid M[label]."""
    nearest = np.argmin(D[:, M], axis=1)
    return {kappa: np.where(nearest == kappa)[0] for kappa in range(k)}


def kMedoids(D, k, tmax=100, random_state=None):
    """Cluster points with the k-medoids algorithm on a precomputed distance matrix.

    Parameters
    ----------
    D : numpy.ndarray
        2-D matrix of pairwise distances; ``D[i, j]`` is the distance between
        points ``i`` and ``j``. It is expected to be square.
    k : int
        Number of clusters (medoids) to find; must be at least 1.
    tmax : int, optional
        Maximum number of assignment/update iterations.
    random_state : None, int, or RNG object, optional
        ``None`` (default) draws from NumPy's global random state, exactly as
        before this parameter existed. An int seeds a private
        ``numpy.random.RandomState``. Any object exposing ``shuffle`` is used
        directly as the random source.

    Returns
    -------
    M : numpy.ndarray
        Indices (into ``D``) of the ``k`` medoids.
    C : dict
        Maps each label ``0..k-1`` to an array of the indices of the points
        assigned to medoid ``M[label]``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, larger than the number of points, or larger
        than the number of points left after discarding duplicates (points at
        distance 0 from an earlier point).
    """
    # determine dimensions of distance matrix D (also rejects non 2-D input)
    _, n = D.shape

    if k < 1:
        raise ValueError('k must be at least 1')
    if k > n:
        raise ValueError('too many medoids')

    # pick the random source; None keeps using the global NumPy RNG
    if random_state is None:
        rng = np.random
    elif hasattr(random_state, 'shuffle'):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # find a set of valid initial cluster medoid indices since we
    # can't seed different clusters with two points at the same location
    invalid_medoid_inds = set()
    rs, cs = np.where(D == 0)
    # the rows, cols must be shuffled because we will keep the first duplicate below
    index_shuf = list(range(len(rs)))
    rng.shuffle(index_shuf)
    rs = rs[index_shuf]
    cs = cs[index_shuf]
    for r, c in zip(rs, cs):
        # if there are two points with a distance of 0...
        # keep the first one for cluster init
        if r < c and r not in invalid_medoid_inds:
            invalid_medoid_inds.add(c)
    valid_medoid_inds = list(set(range(n)) - invalid_medoid_inds)

    if k > len(valid_medoid_inds):
        raise ValueError('too many medoids (after removing {} duplicate points)'.format(
            len(invalid_medoid_inds)))

    # randomly initialize an array of k medoid indices
    M = np.array(valid_medoid_inds)
    rng.shuffle(M)
    M = np.sort(M[:k])

    # working copy that receives the updated medoids of each iteration
    Mnew = np.copy(M)

    C = {}
    for _ in range(tmax):
        # determine clusters, i.e. arrays of data indices
        C = _assign_clusters(D, M, k)
        # update cluster medoids: the member with the smallest mean distance
        # to the other members of its cluster
        for kappa in range(k):
            members = C[kappa]
            if members.size == 0:
                # nothing was assigned to this medoid (possible when D has a
                # non-zero diagonal or is asymmetric): keep the old medoid
                # instead of failing on argmin of an empty array
                continue
            mean_dist = np.mean(D[np.ix_(members, members)], axis=1)
            Mnew[kappa] = members[np.argmin(mean_dist)]
        # Mnew is deliberately not re-sorted: position kappa must keep
        # referring to the medoid of cluster kappa.
        # check for convergence
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # iteration budget exhausted (or tmax == 0): make the returned
        # clusters consistent with the final medoids
        C = _assign_clusters(D, M, k)

    return M, C

# Pairwise Euclidean distance matrix between all rows of x.
D = pairwise_distances(x, metric='euclidean')

# kMedoids picks its initial medoids with NumPy's global random generator.
# Seed it first so the clustering (and the output printed below) is the same
# on every Restart & Run All.
SEED = 0
np.random.seed(SEED)

# Cluster the samples into 4 groups.
M, C = kMedoids(D, 4)

# Show the data row that serves as the medoid of each cluster.
print('medoids centers:')
for point_idx in M:
    print(x[point_idx])
print()

medoids centers:
[5.  3.4 1.5 0.2]
[6.  2.9 4.5 1.5]
[6.8 3.  5.5 2.1]
[6.7 3.3 5.7 2.5]



In [2]:
# List every sample together with the label of the cluster it was assigned to,
# one line per sample, grouped by label.
print('clustering result:')
line_template = 'label {0}:　{1}'
for label, member_indices in C.items():
    for point_idx in member_indices:
        print(line_template.format(label, x[point_idx]))

clustering result:
label 0:　[4.9 3.  1.4 0.2]
label 0:　[4.7 3.2 1.3 0.2]
label 0:　[4.6 3.1 1.5 0.2]
label 0:　[4.6 3.4 1.4 0.3]
label 0:　[4.4 2.9 1.4 0.2]
label 0:　[4.9 3.1 1.5 0.1]
label 0:　[4.8 3.4 1.6 0.2]
label 0:　[4.8 3.  1.4 0.1]
label 0:　[4.3 3.  1.1 0.1]
label 0:　[4.8 3.4 1.9 0.2]
label 0:　[5.  3.  1.6 0.2]
label 0:　[4.7 3.2 1.6 0.2]
label 0:　[4.8 3.1 1.6 0.2]
label 0:　[4.9 3.1 1.5 0.1]
label 0:　[4.9 3.1 1.5 0.1]
label 0:　[4.4 3.  1.3 0.2]
label 0:　[4.5 2.3 1.3 0.3]
label 0:　[4.4 3.2 1.3 0.2]
label 0:　[4.8 3.  1.4 0.3]
label 0:　[4.6 3.2 1.4 0.2]
label 0:　[4.9 2.4 3.3 1. ]
label 0:　[5.1 2.5 3.  1.1]
label 1:　[5.4 3.9 1.7 0.4]
label 1:　[5.8 4.  1.2 0.2]
label 1:　[5.7 4.4 1.5 0.4]
label 1:　[5.4 3.9 1.3 0.4]
label 1:　[5.7 3.8 1.7 0.3]
label 1:　[5.2 4.1 1.5 0.1]
label 1:　[5.5 4.2 1.4 0.2]
label 2:　[5.1 3.5 1.4 0.2]
label 2:　[5.  3.6 1.4 0.2]
label 2:　[5.  3.4 1.5 0.2]
label 2:　[5.4 3.7 1.5 0.2]
label 2:　[5.1 3.5 1.4 0.3]
label 2:　[5.1 3.8 1.5 0.3]
label 2:　[5.4 3.4 1.7 0.2]
label 2:　