# kMedoids

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA

# Data: ten two-dimensional sample points, labelled A1..A10 in row order
X = np.array([
    [2, 10],  # A1
    [2, 8],   # A2
    [2, 5],   # A3
    [1, 2],   # A4
    [2, 3],   # A5
    [4, 8],   # A6
    [7, 4],   # A7
    [6, 2],   # A8
    [8, 4],   # A9
    [8, 2],   # A10
])

# `datapoints` is another name for the same array, not a copy
datapoints = X
m, f = datapoints.shape
print(f"number of points: {m}")

# Clustering setup: number of clusters, row indices of the hand-picked
# starting medoids (A1, A4, A9), the medoid rows themselves, and a norm order
k = 3
samples = [0, 3, 8]
starting_medioids = X[samples, :]
p = 1

def init_medoids(X, k):
    """Pick ``k`` distinct rows of ``X`` to serve as the initial medoids.

    The global NumPy random generator is re-seeded with 1 on every call, so
    the same row indices are drawn each time for a given ``len(X)`` and ``k``.

    Args:
        X: 2-D array whose rows are the data points.
        k: Number of medoids to draw (sampled without replacement).

    Returns:
        The ``k`` selected rows of ``X``.
    """
    import numpy.random as npr

    # Fixed seed: makes the "random" initialisation repeatable
    npr.seed(1)
    samples = npr.choice(len(X), size=k, replace=False)
    print(f"initial medioids: {samples}")
    print("=============================================")
    return X[samples, :]

def compute_d_p(X, medoids, p):
    """Return the p-th power of the p-norm distance from each point to each medoid.

    Args:
        X: 2-D array of data points, one per row.
        medoids: Array of medoids, one per row; a 1-D array is treated as a
            single medoid.
        p: Norm order, passed to ``np.linalg.norm`` as ``ord`` and also used
            as the exponent applied to the resulting distances.

    Returns:
        Array ``S`` of shape ``(len(X), number of medoids)`` in which
        ``S[i, j]`` is ``norm(X[i] - medoids[j], ord=p) ** p``.
    """
    n_points = len(X)
    # A lone 1-D medoid becomes a single-row 2-D array so that the
    # subtraction below always yields one row per medoid
    if medoids.ndim == 1:
        medoids = medoids[np.newaxis, :]
    n_medoids = len(medoids)

    S = np.empty((n_points, n_medoids))
    print("=============================================")
    print("Compute distances from points to medoid")
    for row in range(n_points):
        offsets = X[row, :] - medoids
        S[row, :] = np.linalg.norm(offsets, ord=p, axis=1) ** p
    print(f"distances: {S}")

    return S


def assign_labels(S):
    """Assign every point to the medoid with the smallest dissimilarity.

    Args:
        S: 2-D array with one row per point and one column per medoid.

    Returns:
        1-D array holding, for each row of ``S``, the column index of its
        minimum.
    """
    nearest = np.argmin(S, axis=1)
    print(f"Assign points to medoids: {nearest}")
    return nearest


def update_medoids(X, medoids, p):
    """Run one assignment/update step of the k-medoids iteration.

    Points are first assigned to their nearest medoid. Then, for every
    cluster that received at least one point, each member of the cluster is
    tried as a replacement medoid and the member with the lowest total
    dissimilarity wins (the current medoid is kept unless a member is
    strictly better).

    Args:
        X: 2-D array of data points, one per row.
        medoids: 2-D array of the current medoids, one per row. It is not
            modified; the updated medoids are returned as a new array.
        p: Norm order forwarded to ``compute_d_p``.

    Returns:
        A tuple ``(out_medoids, S, labels)`` where ``out_medoids`` is the
        updated copy of ``medoids``, ``S`` is the dissimilarity matrix
        computed against the *input* medoids, and ``labels`` is the cluster
        index assigned to each point from ``S``.
    """
    print("=============================================")
    S = compute_d_p(X, medoids, p)
    print("=============================================")
    labels = assign_labels(S)

    # Work on a copy so the caller's medoid array is left untouched
    out_medoids = medoids.copy()
    print("=============================================")
    print("Update medoids:")
    # np.unique yields the non-empty cluster indices in ascending order
    for i in np.unique(labels):
        print(f"For cluster {i}:")
        # NOTE(review): the dissimilarity is summed over every row of X, not
        # only over the members of cluster i -- confirm this is the intended
        # update criterion.
        avg_dissimilarity = np.sum(compute_d_p(X, medoids[i], p))
        print(f"avg dissimilarity of all points to medioid {medoids[i]} = {avg_dissimilarity}")

        cluster_points = X[labels == i]
        print("=============================================")
        print(f"Compute disimilarity every cluster points of C{i}:")
        for datap in cluster_points:
            new_dissimilarity = np.sum(compute_d_p(X, datap, p))
            print(f"new dissimilarity of all points to cluster point {datap} = {new_dissimilarity}")

            # Strict comparison: ties keep the earlier (or current) medoid
            if new_dissimilarity < avg_dissimilarity:
                print(f"new_dissimilarity < avg_dissimilarity => avg_dissimilarity = {new_dissimilarity}")
                avg_dissimilarity = new_dissimilarity

                out_medoids[i] = datap
    return out_medoids, S, labels


def has_converged(old_medoids, medoids):
    """Tell whether two medoid collections hold the same points.

    Rows are compared as sets of tuples, so neither the order of the rows
    nor repeated rows affect the result.

    Args:
        old_medoids: Iterable of medoid rows from the previous step.
        medoids: Iterable of medoid rows from the current step.

    Returns:
        ``True`` when both collections contain exactly the same distinct rows.
    """
    previous = {tuple(row) for row in old_medoids}
    current = {tuple(row) for row in medoids}
    return previous == current
  
  
#Full algorithm
def kmedoids(X, k, p, starting_medoids=None, max_steps=np.inf):
    """Cluster ``X`` by repeating the medoid update step until it stabilises.

    Args:
        X: 2-D array of data points, one per row.
        k: Number of medoids to draw with ``init_medoids``; only used when
            ``starting_medoids`` is not given.
        p: Norm order forwarded to ``update_medoids``.
        starting_medoids: Optional initial medoids, one per row. The input
            is copied, so the caller's object is never modified.
        max_steps: Upper bound on the number of update steps (unbounded by
            default).

    Returns:
        A tuple ``(medoids, labels)``: the final medoids and the labels
        produced by the last update step. If no step is executed
        (``max_steps`` below 1) the labels are all zeros.
    """
    if starting_medoids is None:
        medoids = init_medoids(X, k)
    else:
        # Copy: the update step may write into the array it is given, and
        # that must not leak back into the caller's starting medoids.
        medoids = np.array(starting_medoids)

    converged = False
    labels = np.zeros(len(X))
    step = 1
    while (not converged) and (step <= max_steps):
        # Snapshot taken before the update so the comparison below is valid
        # even if the update step modifies `medoids` in place
        old_medoids = medoids.copy()
        print("=============================================")
        medoids, _, labels = update_medoids(X, medoids, p)
        print("=============================================")
        converged = has_converged(old_medoids, medoids)
        step += 1
    return (medoids, labels)

# Run the clustering with 3 clusters, norm order 2 and the hand-picked
# starting medoids, then report the outcome
results = kmedoids(datapoints, 3, 2, starting_medoids=starting_medioids)
print("Converged. Medoids doesn't change anymore, or maxsteps.")
print("=============================================")
# `results` is the (medoids, labels) pair returned by kmedoids
final_medoids, labels = results
print(f"final_medoids: {final_medoids}")
print(f"labels: {labels}")

number of points: 10
Compute distances from points to medoid
distances: [[  0.  65.  72.]
 [  4.  37.  52.]
 [ 25.  10.  37.]
 [ 65.   0.  53.]
 [ 49.   2.  37.]
 [  8.  45.  32.]
 [ 61.  40.   1.]
 [ 80.  25.   8.]
 [ 72.  53.   0.]
 [100.  49.   4.]]
Assign points to medoids: [0 0 1 1 1 0 2 2 2 2]
Update medoids:
For cluster 0:
Compute distances from points to medoid
distances: [[  0.]
 [  4.]
 [ 25.]
 [ 65.]
 [ 49.]
 [  8.]
 [ 61.]
 [ 80.]
 [ 72.]
 [100.]]
avg dissimilarity of all points to medioid [ 2 10] = 464.0
Compute disimilarity every cluster points of C0:
Compute distances from points to medoid
distances: [[  0.]
 [  4.]
 [ 25.]
 [ 65.]
 [ 49.]
 [  8.]
 [ 61.]
 [ 80.]
 [ 72.]
 [100.]]
new dissimilarity of all points to cluster point [ 2 10] = 464.0
Compute distances from points to medoid
distances: [[ 4.]
 [ 0.]
 [ 9.]
 [37.]
 [25.]
 [ 4.]
 [41.]
 [52.]
 [52.]
 [72.]]
new dissimilarity of all points to cluster point [2 8] = 296.0
new_dissimilarity < avg_dissimilarity => avg_d