In [None]:
# import required modules
import numpy as np
from scipy.cluster.vq import kmeans2
import pylab

# generate clouds of 2D normally distributed points
N = 600  # number of points in each cluster

# Seed NumPy's global random state so that the np.random draws made later in
# this notebook are repeatable after "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)

In [None]:
# Draw two clouds of N points each from 2D normal distributions. Both use an
# identity covariance matrix; only the means differ.

# cloud 1: mean (0, 0)
mean1 = [0, 0]
cov1  = [[1, 0], [0, 1]]
# the sample has shape (N, 2); transposing lets it unpack into x and y rows
x1, y1 = np.random.multivariate_normal(mean1, cov1, N).T

# cloud 2: mean (5, 5)
mean2 = [5, 5]
cov2  = [[1, 0], [0, 1]]
x2, y2 = np.random.multivariate_normal(mean2, cov2, N).T

# join the coordinates: the first N entries belong to cloud 1, the last N to cloud 2
xs, ys = np.concatenate((x1, x2)), np.concatenate((y1, y2))

# scatter plot of the raw, unlabeled points on a fresh figure
fig, ax = pylab.subplots()
ax.scatter(xs, ys)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Generated point clouds')
# Use pylab.show() rather than Figure.show(): the latter does not manage a GUI
# event loop and only emits a warning on non-GUI backends.
pylab.show()

In [None]:
# Arrange the coordinates as one row per point: column 0 holds x, column 1 holds y.
points = np.vstack((xs, ys)).T

# Run k-means with two clusters; the call yields the centroid coordinates and,
# for every point, the index of the cluster it was assigned to.
centroids, clusters = kmeans2(points, k=2)
print(centroids, clusters)

In [None]:
# Boolean masks selecting the points that k-means assigned to each cluster;
# computed once and reused for both coordinates.
in_cluster0 = clusters == 0
in_cluster1 = clusters == 1

pred_x1, pred_y1 = xs[in_cluster0], ys[in_cluster0]
pred_x2, pred_y2 = xs[in_cluster1], ys[in_cluster1]

# Draw each predicted cluster in its own color on a fresh figure, then mark
# the two centroids on top.
fig, ax = pylab.subplots()
ax.scatter(pred_x1, pred_y1, label='cluster 0')
ax.scatter(pred_x2, pred_y2, label='cluster 1')
ax.scatter(centroids[0][0], centroids[0][1], c='yellow', label='centroid 0')
ax.scatter(centroids[1][0], centroids[1][1], c='green', label='centroid 1')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('k-means clustering result')
ax.legend()
# Use pylab.show() rather than Figure.show(): the latter does not manage a GUI
# event loop and only emits a warning on non-GUI backends.
pylab.show()

In [None]:
# Map k-means labels onto the true clouds: the centroid with the smaller
# Euclidean distance to mean1 (ties go to centroid 0) stands for cloud 1.
dist1, dist2 = (np.linalg.norm(c - mean1) for c in (centroids[0], centroids[1]))
FIRST, SECOND = (0, 1) if dist1 <= dist2 else (1, 0)

# The first N points come from cloud 1 and the rest from cloud 2, so the
# expected label of point i is FIRST when i < N and SECOND otherwise.
expected = np.where(np.arange(len(clusters)) < N, FIRST, SECOND)
correct = int(np.count_nonzero(clusters == expected))

# report the share of correctly labelled points, in percent
print('Accuracy: %.2f' % (correct*100./len(clusters)))