# K-means Clustering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score

## Generate Data

In [None]:
# Synthetic data set: 300 two-feature samples generated around 4 blob
# centres. The result is indexed below as data[0] (samples) and
# data[1] (the blob label of each sample).
data = make_blobs(
    n_samples=300,
    n_features=2,
    centers=4,
    cluster_std=1,
    random_state=0,
)

In [None]:
# Scatter the generated samples (feature 0 against feature 1), coloured by
# the blob label stored in data[1]. Transposing the sample matrix yields
# one row per feature, which unpacks into the x and y arguments.
plt.scatter(*data[0].T, c=data[1])

In [None]:
# Feature matrix only (the first element of `data`); the labels in data[1]
# are left out because the clustering below is fitted on X alone.
X = data[0]

## Elbow plot

In [None]:
# Elbow method: fit K-means for k = 1..11 and record the inertia
# (sum of squared distances of samples to their closest centre) of each fit.
# A fixed random_state makes the curve reproducible between runs and uses
# the same seed as the silhouette analysis further down.
cluster_counts = range(1, 12)
Error = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, random_state=0)
    kmeans.fit(X)
    Error.append(kmeans.inertia_)

# Reuse the same range for the x-axis so it always matches the fitted k values.
plt.plot(cluster_counts, Error, marker='o')
plt.xlabel('N of clusters')
plt.ylabel('Error')
plt.grid()
plt.show()

## Silhouette score
For more information, see the scikit-learn silhouette analysis example: https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

In [None]:
# Print the mean silhouette coefficient of a seeded K-means fit for each
# candidate cluster count from 2 to 6 (the score needs at least two clusters,
# hence the range starts at 2).
for i in range(2, 7):
    kmeans = KMeans(random_state=0, n_clusters=i)
    kmeans_labels = kmeans.fit(X).labels_
    silhouette_avg = silhouette_score(X, kmeans_labels)
    print("For n_clusters =", i, " silhouette average:", silhouette_avg)

### Clustering for different values of k

In [None]:
k = 2

# Fit K-means with a fixed seed so the cluster assignment (and therefore the
# colouring of the plot) is identical on every run and uses the same seed as
# the silhouette analysis.
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(X)
centers = kmeans.cluster_centers_
labels = kmeans.labels_

# Left: samples coloured by predicted cluster, fitted centres drawn on top.
# Right: the same samples coloured by the blob they were generated from.
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.set_title('K Means')
ax1.scatter(X[:, 0], X[:, 1], c=labels)
ax1.scatter(centers[:, 0], centers[:, 1], marker='o', color='red', s=100, edgecolor='black')
ax2.set_title("Original")
ax2.scatter(X[:, 0], X[:, 1], c=data[1])
# Render explicitly, as the elbow-plot cell does, rather than relying on the
# notebook echoing the last expression.
plt.show()

In [None]:
# Number of clusters used by the K-means fit in the following cell.
k = 3

In [None]:
# Fit K-means with the current value of `k` and a fixed seed, so the cluster
# assignment (and the colouring of the plot) is identical on every run and
# uses the same seed as the silhouette analysis.
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(X)
centers = kmeans.cluster_centers_
labels = kmeans.labels_

# Left: samples coloured by predicted cluster, fitted centres drawn on top.
# Right: the same samples coloured by the blob they were generated from.
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.set_title('K Means')
ax1.scatter(X[:, 0], X[:, 1], c=labels)
ax1.scatter(centers[:, 0], centers[:, 1], marker='o', color='red', s=100, edgecolor='black')
ax2.set_title("Original")
ax2.scatter(X[:, 0], X[:, 1], c=data[1])
# Render explicitly, as the elbow-plot cell does, rather than relying on the
# notebook echoing the last expression.
plt.show()

k = 4

In [None]:
# Fit K-means with 4 clusters and a fixed seed, so the cluster assignment
# (and the colouring of the plot) is identical on every run and uses the
# same seed as the silhouette analysis.
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(X)
centers = kmeans.cluster_centers_
labels = kmeans.labels_

# Left: samples coloured by predicted cluster, fitted centres drawn on top.
# Right: the same samples coloured by the blob they were generated from.
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.set_title('K Means')
ax1.scatter(X[:, 0], X[:, 1], c=labels)
ax1.scatter(centers[:, 0], centers[:, 1], marker='o', color='red', s=100, edgecolor='black')
ax2.set_title("Original")
ax2.scatter(X[:, 0], X[:, 1], c=data[1])
# Render explicitly, as the elbow-plot cell does, rather than relying on the
# notebook echoing the last expression.
plt.show()

k = 5

In [None]:
# Fit K-means with 5 clusters and a fixed seed, so the cluster assignment
# (and the colouring of the plot) is identical on every run and uses the
# same seed as the silhouette analysis.
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(X)
centers = kmeans.cluster_centers_
labels = kmeans.labels_

# Left: samples coloured by predicted cluster, fitted centres drawn on top.
# Right: the same samples coloured by the blob they were generated from.
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.set_title('K Means')
ax1.scatter(X[:, 0], X[:, 1], c=labels)
ax1.scatter(centers[:, 0], centers[:, 1], marker='o', color='red', s=100, edgecolor='black')
ax2.set_title("Original")
ax2.scatter(X[:, 0], X[:, 1], c=data[1])
# Render explicitly, as the elbow-plot cell does, rather than relying on the
# notebook echoing the last expression.
plt.show()