In [3]:
import pandas as pd
import numpy as np
import math
import random

In [49]:
def get_distance(x, y):
    return np.sqrt(np.sum((x-y)**2))   

In [None]:
# select K random points directly from the sample
def initialize_centroids(data, k):
    k_loc = random.sample(range(0, len(data)), k)
    return data[k_loc, :]

In [120]:
# select K random points from the range(min, max)
def initialize_centroids(data, k):
    centroids = np.zeros((data.shape[1], k))
    
    for i in range(k):
        for c in range(data.shape[1]):
            low = min(data[:, c])
            high = max(data[:, c])
            centroids[c][i] = low + (high - low) * random.random()
        
    return centroids    

In [81]:
def update_centroids(data, k, centroids, label):
    
    centroids_new = np.zeros(centroids.shape)
    count = np.zeros(k)
    
    for j in range(k):
        
        for d, l in zip(data, label):
            if l == j:
                centroids_new[j] += d
                count[j] += 1
        
        if count[j] != 0:
            centroids_new[j] = centroids_new[j] / count[j]
        
    return centroids_new
            

In [96]:
def assign_label(data, k, centroids):
    
    label = np.zeros(len(data))
    
    for i, x in enumerate(data):

        d = np.zeros(k)
        d_min = float('inf')
        label[i] = 0

        for j in range(0, k):
            d[j] = get_distance(centroids[j, :], x)
            if d[j] < d_min:
                d_min = d[j]
                label[i] = j
                
    return label            

In [66]:
def main(data, k):
    
    centroids = initialize_centroids(data, k)
    label = np.zeros(len(data))
    
    for i in range(1000):
        
        centroids_old = centroids
        label = assign_label(data, k, centroids)
        centroids = update_centroids(data, k, centroids, label)
         
        if get_distance(centroids_old, centroids) < 0.00001:
            break
        
    return centroids, label

# Kmean implementation from Medium
https://towardsdatascience.com/k-means-clustering-algorithm-applications-evaluation-methods-and-drawbacks-aa03e644b48a

In [None]:
import numpy as np
from numpy.linalg import norm


class Kmeans:
    '''Implementing Kmeans algorithm.'''

    def __init__(self, n_clusters, max_iter=100, random_state=123):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    def initializ_centroids(self, X):
        np.random.RandomState(self.random_state)
        random_idx = np.random.permutation(X.shape[0])
        centroids = X[random_idx[:self.n_clusters]]
        return centroids

    def compute_centroids(self, X, labels):
        centroids = np.zeros((self.n_clusters, X.shape[1]))
        for k in range(self.n_clusters):
            centroids[k, :] = np.mean(X[labels == k, :], axis=0)
        return centroids

    def compute_distance(self, X, centroids):
        distance = np.zeros((X.shape[0], self.n_clusters))
        for k in range(self.n_clusters):
            row_norm = norm(X - centroids[k, :], axis=1)
            distance[:, k] = np.square(row_norm)
        return distance

    def find_closest_cluster(self, distance):
        return np.argmin(distance, axis=1)

    def compute_sse(self, X, labels, centroids):
        distance = np.zeros(X.shape[0])
        for k in range(self.n_clusters):
            distance[labels == k] = norm(X[labels == k] - centroids[k], axis=1)
        return np.sum(np.square(distance))
    
    def fit(self, X):
        self.centroids = self.initializ_centroids(X)
        for i in range(self.max_iter):
            old_centroids = self.centroids
            distance = self.compute_distance(X, old_centroids)
            self.labels = self.find_closest_cluster(distance)
            self.centroids = self.compute_centroids(X, self.labels)
            if np.all(old_centroids == self.centroids):
                break
        self.error = self.compute_sse(X, self.labels, self.centroids)
    
    def predict(self, X):
        distance = self.compute_distance(X, old_centroids)
        return self.find_closest_cluster(distance)

In [77]:
data = np.array([[1.,1.],[1.5,2.0],[3.,4.],[5.,7.],[3.5,5.],[4.5,5.],[3.5,4.5]])
data_1 = data + 1

In [48]:
np.sum((data_1 - data) ** 2)

14.0

In [110]:
data[:, 1]

array([1. , 2. , 4. , 7. , 5. , 5. , 4.5])

In [127]:
centroid = initialize_centroids(data, 2)
print(centroid)
label = assign_label(data, 2, centroid)
print(label)

[[1.09157871 3.31702219]
 [6.63981173 5.79337463]]
[0. 0. 0. 1. 0. 1. 0.]


In [128]:
centroid_1 = update_centroids(data, 2, centroid, label)
print(centroid_1)
centroid_1[1][0]

[[2.5  3.3 ]
 [4.75 6.  ]]


4.75

In [136]:
centroid,label = main(data, 2)
print(centroid)
print(label)

[[1.25 1.5 ]
 [3.9  5.1 ]]
[0. 0. 1. 1. 1. 1. 1.]


In [125]:
data.shape

(7, 2)

In [21]:
data = np.array([[1., 2.], [2., 3.], [5., 6.], [7., 8.]])
label = [1,1,2,2]
[d for d, l in zip(data, label) if l==1]


[array([1., 2.]), array([2., 3.])]

In [18]:
data[0][1]

2.0

In [26]:
np.zeros(data.shape)

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])