# K-Means Tutorial

Load the modules

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import sklearn.metrics as metrics

### Load the data
The samples are read from the tab-separated file `kmeans.csv`.

In [None]:
# Parse the tab-delimited numeric table in kmeans.csv into a NumPy array.
# The context manager closes the file handle as soon as parsing finishes,
# instead of leaving it open for the lifetime of the kernel.
with open("kmeans.csv") as f:
    data = np.loadtxt(f, delimiter='\t')
print(data)

In [None]:
# Display the dimensions of the loaded array as the cell output.
data.shape

In [None]:
# Scatter the raw samples (column 0 against column 1) before any clustering.
plt.scatter(x=data[:, 0], y=data[:, 1])
# Fix both axes to the 0-100 window: (xmin, xmax, ymin, ymax).
plt.axis((0, 100, 0, 100))
plt.show()

In [None]:
# Pairwise Euclidean distances between all rows of data (one row/column per sample).
euclidean_dists = metrics.euclidean_distances(data)

def plot_distance_matrix(D):
    """Draw a distance matrix as a colour-mapped image on a new figure.

    The matrix ``D`` is rendered with ``matshow``, a colorbar is attached,
    the tick marks on both axes are removed and the figure is titled
    'Distance Matrix'. Nothing is returned and ``plt.show()`` is not called.
    """
    figure, axes = plt.subplots()
    heatmap = axes.matshow(D)
    figure.colorbar(heatmap)
    # Hide the ticks on both axes in one call.
    axes.set(xticks=[], yticks=[])
    figure.suptitle('Distance Matrix')
    
# Distance matrix with the samples in their original file order.
plot_distance_matrix(euclidean_dists)

Create the K-Means Model

In [None]:
# K-Means with three clusters.
# n_init is set explicitly because its default differs between scikit-learn
# releases (10 historically, 'auto' from 1.4, with a FutureWarning in between).
# random_state fixes the centroid initialisation so cluster labels, and the
# colours derived from them in the plots, are the same on every run.
model = KMeans(n_clusters=3, n_init=10, random_state=0)

Fit the model

In [None]:
# Learn the cluster centres from the samples; the fitted estimator is the cell output.
model.fit(data)

Predict the cluster of each element

In [None]:
# Cluster index assigned to each row of data (one label per sample).
clusters = model.predict(data)

Display the clusters

In [None]:
# Display the array of predicted cluster labels.
clusters

In [None]:
# Same scatter as the raw data, with each point coloured by its cluster label.
plt.scatter(x=data[:, 0], y=data[:, 1], c=clusters)
# Keep the 0-100 window on both axes so the plots are directly comparable.
plt.xlim(0, 100)
plt.ylim(0, 100)
plt.show()

In [None]:
# Split the fitted cluster centres into their x (column 0) and y (column 1) parts.
centroids = model.cluster_centers_
centroids_x, centroids_y = centroids[:, 0], centroids[:, 1]

# Samples coloured by cluster label first, so the centres are drawn on top.
plt.scatter(data[:, 0], data[:, 1], c=clusters)
# Each centre is marked with a large red 'X'.
plt.scatter(centroids_x, centroids_y, c='r', s=200, marker='X')

plt.axis((0, 100, 0, 100))
plt.show()

In [None]:
# Draw every cluster with its own marker shape and colour.
# Position i in each string belongs to cluster label i.
markers = '^s+'
colors = 'rgb'
for i, (m, color) in enumerate(zip(markers, colors)):
    # Boolean mask selecting the samples assigned to cluster i.
    points = (clusters == i)
    # All selected points share one label, so one colour is enough;
    # no per-point colour list is needed.
    plt.scatter(data[points,0], data[points,1], c=color, marker=m)
plt.axis([0, 100, 0, 100])

plt.show()

In [None]:
# Permutation that groups the samples by cluster label.
idx = np.argsort(clusters)

# Apply the same permutation to rows and columns of the distance matrix,
# so samples of the same cluster become adjacent on both axes.
rearranged_dists = euclidean_dists[np.ix_(idx, idx)]

plot_distance_matrix(rearranged_dists)

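With rows and columns reordered by cluster label, this matrix shows that
- intra-cluster distances (the square blocks along the diagonal) are low
- inter-cluster distances (the off-diagonal blocks) are high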

In [None]:
# Reorder the samples with the permutation idx, then display that permutation.
data2 = data[idx]
idx

In [None]:
# Predict again on the reordered samples. data2 is data permuted by idx,
# so the labels should match clusters[idx].
clusters2 = model.predict(data2)
clusters2

In [None]:
# Coordinates of the cluster centres: all x values, then all y values.
print(centroids_x)
print(centroids_y)

In [None]:
# Classify a single new point (10, 10); predict takes a 2-D array-like, hence the nested list.
model.predict([[10,10]])