# In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from utility import random_seed, generate_data, show_clusters
from utility import compute_centroids, assign_clusters, inertia

def kmeans(data, K, init=None, max_iter=20, seed=None):
    r"""Run the K-means algorithm on ``data`` and keep the full history.

    Parameters
    ----------
    data : numpy array of shape (n, p)
        The n data points to cluster.
    K : int
        Number of clusters; must be greater than 1.
    init : array-like of shape (K, p), optional
        Initial centroids. If None (default), K data points are drawn at
        random from ``data``.
    max_iter : int
        Maximum number of update iterations. If the assignments have not
        stabilised by then, the loop stops anyway and a message is printed.
    seed : optional
        Passed to ``random_seed`` for reproducibility of the random
        initialisation (None means no fixed seed).

    Returns
    -------
    clusters : numpy array
        The cluster assignments stacked along a new leading axis: first the
        assignment to the initial centroids, then one per iteration.
        Use ``clusters[-1]`` for the final assignment.
    centroids : numpy array
        The centroids stacked along a new leading axis: first the initial
        centroids, then one set per iteration. Use ``centroids[-1]`` for the
        final centroids.

    Raises
    ------
    AssertionError
        If ``K`` is not greater than 1.
    """
    assert K > 1, 'did you reallly mean to do a kmean with {} cluster ?'.format(K)
    # Unpacking into two names also rejects data that is not 2-D.
    n, _ = data.shape

    # `init is None` rather than `not init`: the truth value of a
    # multi-element numpy array is ambiguous and raises ValueError.
    if init is None:
        with random_seed(seed):
            init = data[np.random.permutation(n)[:K]]
    else:
        # Accept any array-like (e.g. a list of points) as initial centroids.
        init = np.asarray(init)

    # x[None] adds a leading axis so successive results can be concatenated.
    centroids = init[None]
    clusters = assign_clusters(data, centroids[0])[None]

    for _ in range(max_iter):
        # Update step: recompute centroids from the latest assignment ...
        new_centroids = compute_centroids(data, clusters[-1])
        # ... then reassign every point using the updated centroids.
        new_clusters = assign_clusters(data, new_centroids)
        centroids = np.concatenate([centroids, new_centroids[None]])
        clusters = np.concatenate([clusters, new_clusters[None]])

        # Converged once an iteration leaves every assignment unchanged.
        if np.array_equal(clusters[-1], clusters[-2]):
            break

    else:
        # for/else: only reached when the loop was never interrupted by `break`.
        print('Stopped before convergence')

    return clusters, centroids

# Parameters for the synthetic data set.
# NOTE(review): the original script passed n, p and K to generate_data without
# defining them anywhere in this file (NameError at run time). The values
# below are illustrative placeholders -- adjust them to the intended setup.
n = 300  # presumably the number of samples -- TODO confirm
p = 2    # presumably the number of features (2 assumed for plotting) -- TODO confirm
K = 3    # presumably the number of generated clusters -- TODO confirm

data, centroids, clusters = generate_data(n, p, K, seed=1)

# Figures are kept in a dict: key -1 holds the ground truth, and keys
# 0, 1, ... hold one figure per K-means iteration.
fig = {}
fig[-1], ax = plt.subplots()
ax.set_title('ground truth')

show_clusters(data, clusters, centroids, ax)

K = 3 # we are not supposed to know the value of K, this value can be changed.

# change the seed to see what happens
clusters_, centroids_ = kmeans(data, K, seed=1)

# Walk through the history returned by kmeans: one assignment and one set of
# centroids per step. Note that this rebinds `clusters` and `centroids`.
for iteration, (clusters, centroids) in enumerate(zip(clusters_, centroids_)):

    # inertia returns three values; only the first (intra) is reported here.
    intra, inter, total = inertia(data, clusters)
    print('K=={} iter {}: {:.3e}'.format(K, iteration, intra))

    fig[iteration], ax = plt.subplots()
    ax.set_title('iteration {}'.format(iteration))

    show_clusters(data, clusters, centroids, ax)

# Display every figure (ground truth first, then the iterations).
for figure in fig.values():
    figure.show()

# Elbow study: run K-means for every K in [2, K_max] and record the
# intra-cluster inertia of the final assignment.
K_max = 30
K_ = list(range(2, K_max + 1))
Jw = []
for K in K_:
    # No seed is passed here, so kmeans uses its default (seed=None).
    clusters_, centroids_ = kmeans(data, K)
    # Only the last assignment in the history is scored.
    intra, inter, total = inertia(data, clusters_[-1])
    Jw.append(intra)

    # len(clusters_) is the number of assignments stored in the history.
    print('K={:2d} ({} iterations) : {:.3e}'.format(K, len(clusters_), intra))

# Plot the recorded inertia against K.
f, ax = plt.subplots()
ax.plot(K_, Jw)
ax.set_title('Intra cluster inertia w.r.t K')
ax.set_xlabel('K')
ax.set_ylabel('Jw(K)')
f.show()
