In [None]:
# Execute main() from the local `kmeans` module. What it computes/plots is
# defined in that module, not in this notebook.
import kmeans
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline 
kmeans.main()

In [None]:
# Execute main() from the local `kmeans_visualize` module (presumably a
# visual walk-through of k-means -- confirm in that module).
import kmeans_visualize
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline 
kmeans_visualize.main()

In [None]:
# Execute main() from the local `kmeans_fail` module (presumably examples
# where k-means gives poor results -- confirm in that module).
import kmeans_fail
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline 
kmeans_fail.main()

The cells below are a sandbox: a from-scratch soft k-means implementation, run on synthetic 2-D data.

In [None]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline  

In [None]:
# Dimensionality of the synthetic data; two dimensions keep it easy to plot.
D = 2
# Separation: larger values push the cluster centres further apart.
s = 4

# Three cluster centres: the origin, the point (s, s) and the point (0, s).
mu1, mu2, mu3 = (np.array(center) for center in ([0, 0], [s, s], [0, s]))

print(mu1, mu2, mu3)

In [None]:
N = 900 # number of samples
# Seed the global NumPy RNG so the synthetic data (and the random choices made
# by later cells that use np.random) are the same on every Restart & Run All.
np.random.seed(0)

cluster_means = (mu1, mu2, mu3)
X = np.zeros((N, D))

# Give each cluster one contiguous slice of rows. The slice bounds are derived
# from N (instead of hard-coded 300/600), so changing N keeps X fully
# populated; any remainder rows go to the later clusters.
bounds = [N * j // len(cluster_means) for j in range(len(cluster_means) + 1)]
for start, stop, mu in zip(bounds[:-1], bounds[1:], cluster_means):
    # unit-variance Gaussian noise around the cluster centre
    X[start:stop, :] = np.random.randn(stop - start, D) + mu

In [None]:
# Quick look at the raw samples: first coordinate against second coordinate.
xs, ys = X[:, 0], X[:, 1]
plt.scatter(xs, ys)
plt.show()

In [None]:
def d(u, v):
    """Return the squared Euclidean distance between vectors u and v.

    Note that this is the *squared* distance: no square root is taken.
    """
    offset = u - v
    return offset.dot(offset)

def cost(X, R, M):
    """Return the soft k-means objective for data X, responsibilities R, means M.

    For every cluster k, the squared distance of each row of X to M[k] is
    weighted by that row's responsibility R[n, k]; the weighted distances are
    summed over all rows and all clusters.
    """
    total = 0
    for k, center in enumerate(M):
        # squared distance of every sample to this cluster centre, all at once
        offsets = X - center
        point_sq_dist = (offsets ** 2).sum(axis=1)
        weighted = R[:, k] * point_sq_dist
        total += weighted.sum()
    return total

def plot_k_means(X, K, max_iter=20, beta=1.0, show_plots=True):
    """Fit soft k-means to X and optionally plot the cost curve and clustering.

    Each mean is initialised to a randomly chosen row of X. Every iteration
    then (1) recomputes the responsibilities as a softmax over clusters of
    ``-beta * squared_distance`` and (2) moves each mean to the
    responsibility-weighted average of the data. Iteration stops after
    ``max_iter`` rounds or as soon as the cost changes by less than 1e-5.

    Parameters
    ----------
    X : array of shape (N, D)
        Data, one sample per row. The scatter plot uses the first two columns.
    K : int
        Number of clusters; must be at least 1.
    max_iter : int
        Maximum number of iterations.
    beta : float
        Scale applied to the squared distances before the softmax; larger
        values give harder assignments.
    show_plots : bool
        If true, plot the per-iteration cost and a scatter plot of X in which
        each point is coloured by mixing random per-cluster colours according
        to its responsibilities.

    Returns
    -------
    M : array of shape (K, D)
        The cluster means.
    R : array of shape (N, K)
        The responsibilities; each row sums to 1 once an iteration has run.

    Raises
    ------
    ValueError
        If K is smaller than 1.
    """
    if K < 1:
        raise ValueError("K must be at least 1")

    N, D = X.shape
    M = np.zeros((K, D)) # means
    R = np.zeros((N, K)) # responsibility matrix

    # initialize each mean to a randomly chosen data point
    for k in range(K):
        M[k] = X[np.random.choice(N)]

    costs = np.zeros(max_iter)
    n_iters = 0 # iterations actually run; only costs[:n_iters] is meaningful

    for i in range(max_iter):
        # step 1: determine assignments / responsibilities
        # (N, K) matrix of squared distances from every sample to every mean
        sq_dists = ((X[:, np.newaxis, :] - M[np.newaxis, :, :]) ** 2).sum(axis=2)
        logits = -beta * sq_dists
        # Subtracting the row-wise max leaves the softmax unchanged but
        # guarantees at least one exp() per row equals 1, so a sample far from
        # every mean no longer produces 0/0 = NaN.
        logits -= logits.max(axis=1, keepdims=True)
        R = np.exp(logits)
        R /= R.sum(axis=1, keepdims=True)

        # step 2: recalculate means as responsibility-weighted averages
        weights = R.sum(axis=0)
        # A cluster whose total responsibility is exactly 0 keeps its old mean
        # instead of being divided by zero.
        occupied = weights > 0
        M[occupied] = R.T.dot(X)[occupied] / weights[occupied, np.newaxis]

        costs[i] = cost(X, R, M)
        n_iters = i + 1
        if i > 0 and np.abs(costs[i] - costs[i-1]) < 1e-5:
            break

    if show_plots:
        # Plot only the iterations that ran; after an early stop the rest of
        # `costs` is still zero and would show up as a spurious drop to 0.
        plt.plot(costs[:n_iters])
        plt.title("Costs")
        plt.show()

        random_colors = np.random.random((K, 3))
        # Blend the cluster colours per point; clip guards against rounding
        # pushing a channel marginally outside the valid [0, 1] range.
        colors = np.clip(R.dot(random_colors), 0.0, 1.0)
        plt.scatter(X[:,0], X[:,1], c=colors)
        plt.show()

    return M, R

In [None]:
K = 3 # luckily, we already know this: the data was generated from 3 means
# The trailing ';' stops the notebook from echoing the returned (M, R) arrays
# underneath the plots.
plot_k_means(X, K);

In [None]:
K = 5 # what happens if we choose a "bad" K?
# The trailing ';' stops the notebook from echoing the returned (M, R) arrays
# underneath the plots.
plot_k_means(X, K, max_iter=30);

In [None]:
K = 5 # what happens if we change beta?
# The trailing ';' stops the notebook from echoing the returned (M, R) arrays
# underneath the plots.
plot_k_means(X, K, max_iter=30, beta=0.3);