In [None]:
import numpy as np
import pandas as pd
import scipy.spatial.distance as dist
import collections 

# Kmeans

In [9]:
def kmeans(data, k, centroids, max_iter=10000):
    """Refine centroids by alternating nearest-centroid assignment and mean updates.

    Parameters
    ----------
    data : array with one point per row.
    k : number of centroid rows updated on every pass.
    centroids : starting centroid positions, one per row.
    max_iter : upper bound on the number of passes.

    Returns
    -------
    The centroid array in effect once a pass leaves it unchanged (within
    ``np.allclose`` tolerance), or the most recent centroids when ``max_iter``
    passes ran without reaching that point. A status line is printed either way.
    """
    for iteration in range(max_iter):
        # Squared Euclidean distance from every point to every centroid.
        sq_dist = dist.cdist(data, centroids) ** 2
        nearest = np.argmin(sq_dist, axis=1)

        moved = np.zeros(centroids.shape)
        for idx in range(k):
            members = nearest == idx
            if members.any():
                moved[idx] = np.mean(data[members], axis=0)
            else:
                # A centroid that owns no points is left where it is.
                moved[idx] = centroids[idx]

        if not np.allclose(moved, centroids):
            centroids = moved
            continue

        print("Algorithm converged after", iteration, "iterations.")
        return centroids

    print("Warning: maximum number of iterations reached. Failed to converge.")
    return centroids

In [10]:
def main():
    """Cluster synthetic standard-normal data with k-means from random starting centers."""
    k, n, d = 20, 1000, 15

    zero_mean = np.zeros(d)
    identity_cov = np.eye(d)
    # Draw the data set first, then the random initial centers, from the same
    # d-dimensional standard normal distribution.
    points = np.random.multivariate_normal(zero_mean, identity_cov, n)
    starting_centers = np.random.multivariate_normal(zero_mean, identity_cov, k)

    # kmeans prints its own convergence report; the centroids are not used further.
    kmeans(points, k, starting_centers)

if __name__ == '__main__':
    main()

Algorithm converged after 25 iterations.


# Kmeans++

In [11]:
def kmeans_pp(weights, data, k):
    """Choose rows of ``data`` as initial centers by weighted squared-distance sampling.

    The first center is a uniformly random row. Each further center is drawn with
    probability proportional to ``weights`` times the squared distance from the row
    to its closest already-chosen center.

    Parameters
    ----------
    weights : scalar or array that multiplies the per-row squared distances
        (a scalar such as ``1`` gives every row equal weight).
    data : array with one candidate point per row.
    k : number of centers to return.

    Returns
    -------
    Array of the chosen rows stacked in selection order.
    """
    n_points = data.shape[0]
    first_random = np.random.choice(n_points, 1)
    C = data[first_random, :]

    for _ in range(k - 1):
        sq_dist = dist.cdist(data, C) ** 2
        scores = np.min(sq_dist, axis=1) * weights
        total = np.sum(scores)
        if np.isfinite(total) and total > 0:
            prob = scores / total
        else:
            # Every remaining row coincides with a chosen center (or carries zero
            # weight), so normalising would give 0/0 = NaN and np.random.choice
            # would raise. Fall back to a uniform draw instead.
            prob = None
        new_center = np.random.choice(n_points, 1, p=prob)
        C = np.vstack([C, data[new_center, :]])

    return C

In [12]:
def main():
    """Cluster synthetic standard-normal data with k-means seeded by kmeans_pp."""
    k, n, d = 20, 1000, 15

    mean, cov = np.zeros(d), np.eye(d)
    data = np.random.multivariate_normal(mean, cov, n)
    # Equal weights for k-means++: a scalar weight of 1 leaves the squared
    # distances unchanged. Ask for k centers (not a repeated literal) so the
    # seeding and the clustering always agree on the number of clusters.
    initial_centers = kmeans_pp(1, data, k)

    # kmeans prints its own convergence report; the centroids are not used further.
    kmeans(data, k, initial_centers)

if __name__ == '__main__':
    main()

Algorithm converged after 16 iterations.


# Kmeans_II (k-means||, scalable k-means++ initialization)

In [13]:
def get_weight(C, data):
    """Return, for each row of ``C``, the fraction of ``data`` rows closest to it.

    Parameters
    ----------
    C : array of candidate centers, one per row.
    data : array of points, one per row.

    Returns
    -------
    Float array of length ``C.shape[0]``. Entry ``i`` is the share of data rows
    whose nearest center is ``C[i]``, and is 0 for a center that wins no rows.
    The entries sum to 1 unless ``data`` has no rows, in which case all are 0.
    """
    sq_dist = dist.cdist(data, C) ** 2
    nearest = np.argmin(sq_dist, axis=1)
    # bincount with minlength keeps a slot for every center, including those
    # that are nearest to no point, so weights[i] always refers to C[i].
    counts = np.bincount(nearest, minlength=C.shape[0])
    total = counts.sum()
    if total == 0:
        # No data rows: avoid dividing by zero.
        return np.zeros(C.shape[0])
    return counts / total

In [None]:
def kmeans_II(data, k, l, max_iter=10000):
    """Build initial centers by repeated oversampling followed by weighted reclustering.

    Starting from one uniformly random row, each round adds every row of ``data``
    independently with probability ``l * d2 / sum(d2)``, where ``d2`` is the row's
    squared distance to the nearest current candidate. The number of rounds is the
    rounded natural log of the initial cost. The candidates are then weighted with
    ``get_weight`` and reduced to ``k`` centers with ``kmeans_pp``.

    Parameters
    ----------
    data : array with one point per row.
    k : number of centers requested from the final ``kmeans_pp`` step.
    l : multiplier on the per-round sampling probabilities.
    max_iter : accepted for interface compatibility; not used by this function.

    Returns
    -------
    Whatever ``kmeans_pp(weights, C, k)`` returns for the collected candidates ``C``.
    """
    n_points = data.shape[0]
    first_random = np.random.choice(n_points, 1)
    C = data[first_random, :]

    sq_dist = dist.cdist(data, C) ** 2
    cost_phi = np.sum(np.min(sq_dist, axis=1))

    # log(0) is -inf and int(-inf) raises OverflowError, so a zero (or
    # non-finite) initial cost simply means no oversampling rounds.
    if np.isfinite(cost_phi) and cost_phi > 0:
        n_rounds = int(round(np.log(cost_phi)))
    else:
        n_rounds = 0

    for _ in range(n_rounds):
        sq_dist = dist.cdist(data, C) ** 2
        nearest_sq = np.min(sq_dist, axis=1)
        total = np.sum(nearest_sq)
        if total <= 0:
            # Every point already coincides with a candidate; nothing left to add
            # and normalising would divide by zero.
            break
        prob = nearest_sq * l / total

        draws = np.random.uniform(size=n_points)
        for j in np.flatnonzero(draws <= prob):
            row = data[j, :]
            # Row-wise membership test. `row not in C` would be True/False based
            # on ANY single matching element, wrongly rejecting points that share
            # just one coordinate value with an existing candidate.
            already_present = np.any(np.all(C == row, axis=1))
            if not already_present:
                C = np.vstack([C, row])

    weights = get_weight(C, data)

    # NOTE(review): if fewer than k candidates were collected, the final step is
    # asked for more centers than there are distinct rows in C — confirm that
    # kmeans_pp behaves as intended in that case.
    return kmeans_pp(weights, C, k)
    

In [15]:
def main():
    """Cluster synthetic standard-normal data with k-means seeded by kmeans_II."""
    k, n, d = 20, 1000, 15
    # Multiplier passed as kmeans_II's `l` argument (scales its per-round
    # sampling probabilities).
    oversampling = 10

    mean, cov = np.zeros(d), np.eye(d)
    data = np.random.multivariate_normal(mean, cov, n)
    # Weighted initialisation. Ask for k centers (not a repeated literal) so the
    # seeding and the clustering always agree on the number of clusters.
    initial_centers = kmeans_II(data, k, oversampling)

    # kmeans prints its own convergence report; the centroids are not used further.
    kmeans(data, k, initial_centers)

if __name__ == '__main__':
    main()

Algorithm converged after 20 iterations.
