In [1]:
import numpy as np

In [3]:
# Draw 1000 samples from a 3-D Gaussian centred at (0, 2, 4) with identity covariance.
X = np.random.multivariate_normal(
    mean=[0, 2, 4],
    cov=[
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ],
    size=1000,
)

In [4]:
X

array([[ 1.34209573,  2.00078566,  2.14207923],
       [ 0.35886515,  1.17656357,  4.89466672],
       [ 0.07511223,  2.82791101,  3.45436655],
       ...,
       [ 0.75040294,  0.9963062 ,  4.30795926],
       [-0.88523404,  1.21518208,  4.26316108],
       [ 0.33454464,  3.42218396,  3.06584946]])

In [5]:
K = 5

In [17]:
# Choose K distinct row indices of X; these rows seed the initial centroids.
leng = len(X)
r = np.random.choice(range(leng), size=K, replace=False)

In [19]:
# Initial centroids are the rows of X selected by the sampled indices r.
centroids = X[r]

In [20]:
# Insert a middle axis on the centroids so the subtraction broadcasts every
# centroid against every row of X, then sum the squared offsets over the last
# (feature) axis: one squared Euclidean distance per (centroid, point) pair.
pairwise_d = centroids[:, np.newaxis, :] - X
dists = np.power(pairwise_d, 2).sum(axis=2)

In [24]:
centroids

array([[-1.34254146,  0.9716763 ,  2.85797915],
       [-0.38658135,  1.43059068,  5.64556727],
       [ 1.00511152,  3.0641479 ,  3.02473226],
       [-0.82669018,  2.27644056,  4.81189767],
       [ 0.47312616,  1.12349384,  3.35749456]])

In [33]:
X

array([[ 1.34209573,  2.00078566,  2.14207923],
       [ 0.35886515,  1.17656357,  4.89466672],
       [ 0.07511223,  2.82791101,  3.45436655],
       ...,
       [ 0.75040294,  0.9963062 ,  4.30795926],
       [-0.88523404,  1.21518208,  4.26316108],
       [ 0.33454464,  3.42218396,  3.06584946]])

In [27]:
# For every point (column of dists) take the index of the closest centroid.
clusters = dists.argmin(axis=0)

In [22]:
dists.shape

(5, 1000)

In [39]:
# Recompute each centroid as the mean of the points currently assigned to it.
centroids = []
for i in np.unique(clusters):  # np.unique already yields the labels in sorted order
    c_i = X[clusters == i]
    centroids.append(c_i.mean(axis=0))
np.array(centroids)

array([[-1.22135605,  1.4448244 ,  3.068088  ],
       [ 0.06884458,  1.39449792,  5.36543043],
       [ 0.84426507,  2.93168219,  3.45581045],
       [-0.59874749,  2.54799256,  4.54309298],
       [ 0.55864929,  1.19961475,  3.63478331]])

In [88]:
from sklearn.datasets import make_blobs

# Synthetic benchmark data: 1500 samples drawn by sklearn's make_blobs with a
# fixed random_state so the dataset is the same on every run.
# NOTE: this rebinds X, replacing the Gaussian sample created earlier.
n_samples = 1500
random_state = 170
X, y = make_blobs(n_samples=n_samples, random_state=random_state)

In [80]:
import numpy as np

class KMeans:
    """Minimal NumPy implementation of k-means / k-medians clustering.

    Initial centroids are K distinct rows of the data drawn with
    ``np.random.choice``; call ``np.random.seed`` beforehand for
    reproducible results.

    Parameters
    ----------
    K : int
        Number of clusters.
    """

    def __init__(self, K):
        self.K = K

    def reset_K(self, K):
        """Change the number of clusters used by subsequent fits."""
        self.K = K

    def initiate_centroids(self, X: np.ndarray):
        """Return K distinct rows of ``X`` (2-D array) as initial centroids.

        Raises ``ValueError`` (from ``np.random.choice``) when ``K`` exceeds
        the number of rows in ``X``.
        """
        n_rows = X.shape[0]
        picked = np.random.choice(n_rows, self.K, replace=False)
        return X[picked, :]

    def compute_distance(self, centroids, X, d_type='e'):
        """Distance from every centroid to every row of ``X``.

        Parameters
        ----------
        centroids : array of shape (K, d)
        X : array of shape (n, d)
        d_type : {'e', 'm'}
            ``'e'`` gives the *squared* Euclidean distance (no square root is
            taken), ``'m'`` gives the Manhattan (L1) distance.

        Returns
        -------
        Array of shape (K, n).

        Raises
        ------
        ValueError
            If ``d_type`` is not ``'e'`` or ``'m'``.
        """
        if d_type not in ('e', 'm'):
            raise ValueError(
                "d_type must be 'e' (squared Euclidean) or 'm' (Manhattan), "
                "got {!r}".format(d_type)
            )
        # Broadcast (K, 1, d) against (n, d) -> (K, n, d) offsets.
        pairwise_d = centroids[:, None] - X
        if d_type == 'e':
            return np.sum(np.power(pairwise_d, 2), axis=2)
        return np.sum(np.abs(pairwise_d), axis=2)

    def get_clusters(self, dists):
        """Index of the closest centroid for each column (point) of ``dists``."""
        return np.argmin(dists, axis=0)

    def new_centroids(self, X, clusters, old_centroids=None, center='mean'):
        """Recompute one centroid per cluster from its member points.

        Parameters
        ----------
        X : array of shape (n, d)
        clusters : integer array of shape (n,)
            Cluster label of each row of ``X``.
        old_centroids : array of shape (K, d), optional
            When given, the result always has the same number of rows as
            ``old_centroids`` and any cluster with no members keeps its old
            centroid. When omitted, only the labels present in ``clusters``
            produce a row (in sorted label order).
        center : {'mean', 'median'}
            Statistic used to summarise the members of a cluster.

        Raises
        ------
        ValueError
            If ``center`` is not ``'mean'`` or ``'median'``.
        """
        if center == 'mean':
            reduce = np.mean
        elif center == 'median':
            reduce = np.median
        else:
            raise ValueError(
                "center must be 'mean' or 'median', got {!r}".format(center)
            )

        if old_centroids is None:
            return np.array(
                [reduce(X[clusters == i], axis=0) for i in np.unique(clusters)]
            )

        # Work on a float copy so the caller's centroids are never mutated and
        # empty clusters simply retain their previous position.
        updated = np.array(old_centroids, dtype=float)
        for i in range(len(updated)):
            members = X[clusters == i]
            if len(members):
                updated[i] = reduce(members, axis=0)
        return updated

    def convergence(self, new, old, tol = 1e-4):
        """True when the summed squared centroid shift is below ``tol``."""
        diff = np.sum(np.power(new - old, 2))
        return bool(diff < tol)

    def pip(self, cent, X, dtype, center='mean'):
        """Run one assign-then-update step.

        Returns ``(new_centroids, clusters)`` where ``clusters`` are the
        assignments made against ``cent`` using distance type ``dtype`` and
        ``new_centroids`` has the same number of rows as ``cent``.
        """
        dists = self.compute_distance(cent, X, dtype)
        clusters = self.get_clusters(dists)
        new_cent = self.new_centroids(
            X, clusters, old_centroids=cent, center=center
        )
        return new_cent, clusters

    def _iterate(self, X, d_type, center):
        """Repeat ``pip`` from random initial centroids until convergence."""
        cent = self.initiate_centroids(X)
        while True:
            new_cent, clusters = self.pip(cent, X, d_type, center)
            # Checking after every step (including the first) guarantees that
            # ``clusters`` is always defined when we return.
            if self.convergence(new_cent, cent):
                return new_cent, clusters
            cent = new_cent

    def kmeans(self, X: np.ndarray):
        """Cluster ``X`` with squared Euclidean distance and mean centroids.

        Returns ``(centroids, clusters)``.
        """
        return self._iterate(X, 'e', 'mean')

    def kmedian(self, X: np.ndarray):
        """Cluster ``X`` with Manhattan distance and median centroids.

        Returns ``(centroids, clusters)``.
        """
        return self._iterate(X, 'm', 'median')

In [81]:
# Instantiate the KMeans class defined in this notebook with K = 3 clusters.
km = KMeans(3)

In [82]:
# Fit on X; returns the final centroids and the per-point cluster labels.
cent, clusters = km.kmeans(X)

In [83]:
from sklearn.cluster import KMeans

In [84]:
# Reference model: KMeans here is the scikit-learn class imported in the
# previous cell, which shadows the notebook's own KMeans class from this point on.
km2 = KMeans(n_clusters=3)

In [85]:
# Fit the scikit-learn reference model on the same data for comparison.
km2.fit(X)

KMeans(n_clusters=3)

In [86]:
km2.cluster_centers_

array([[-4.55490993,  0.02920864],
       [-8.94137566, -5.48137132],
       [ 1.91176144,  0.40634045]])

In [87]:
cent

array([[-4.55490993,  0.02920864],
       [ 1.91176144,  0.40634045],
       [-8.94137566, -5.48137132]])

In [57]:
clusters

array([2, 3, 1, 3, 4, 3, 2, 0, 4, 0, 0, 2, 1, 4, 4, 3, 1, 0, 4, 2, 3, 4,
       3, 2, 2, 4, 1, 3, 2, 0, 0, 3, 3, 4, 2, 1, 1, 4, 4, 4, 3, 0, 0, 3,
       2, 2, 3, 4, 1, 0, 2, 2, 3, 0, 1, 0, 1, 2, 3, 0, 4, 1, 3, 0, 1, 2,
       1, 4, 3, 2, 4, 3, 1, 3, 4, 0, 2, 3, 1, 2, 1, 3, 0, 4, 1, 3, 3, 1,
       3, 3, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 0, 0, 4, 1, 3, 0, 3, 2, 1, 4,
       4, 4, 2, 4, 4, 4, 2, 1, 1, 1, 1, 4, 1, 0, 1, 4, 4, 3, 1, 2, 3, 1,
       2, 1, 2, 3, 3, 3, 2, 3, 3, 3, 1, 1, 4, 0, 0, 3, 0, 2, 1, 2, 1, 3,
       4, 4, 3, 3, 3, 2, 2, 0, 2, 4, 2, 2, 4, 3, 3, 1, 2, 3, 4, 0, 0, 3,
       1, 1, 0, 0, 4, 3, 3, 4, 0, 1, 3, 3, 2, 3, 0, 1, 0, 2, 2, 3, 4, 0,
       2, 4, 1, 0, 0, 3, 3, 1, 4, 2, 0, 4, 3, 2, 2, 3, 1, 2, 0, 1, 1, 2,
       3, 4, 1, 0, 0, 3, 3, 1, 0, 3, 1, 0, 4, 4, 1, 4, 4, 2, 2, 3, 3, 4,
       4, 1, 1, 2, 2, 0, 2, 2, 3, 1, 0, 4, 2, 4, 2, 2, 1, 4, 0, 3, 3, 1,
       2, 1, 3, 1, 3, 3, 2, 4, 3, 1, 2, 2, 3, 1, 2, 4, 1, 1, 0, 1, 4, 3,
       4, 3, 1, 1, 1, 1, 3, 1, 3, 3, 1, 0, 4, 3, 2,