In [None]:
#! conda install -c conda-forge kneed -y

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
#from kneed import KneeLocator
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, printing its full path.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id='clusteringtypes'></a>

# Types of Clustering

1. Non-overlapping clustering (e.g. k-means) - each data point belongs to exactly one cluster.

2. Overlapping clustering (e.g. fuzzy c-means) - a data point can belong to more than one cluster.

3. Hierarchical clustering


In this notebook we will talk about k-means, where k is the number of clusters we want. We need to define two things for this algorithm:

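1. Number of clusters - It might come from domain knowledge, or we choose it from the data by looking at the sum of squared distances between the data points and their cluster centroid. Because this sum always shrinks as k grows, we pick the k beyond which it stops decreasing sharply (the "elbow").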

2. Centroids - The initial centroid of each cluster can be chosen at random.


In [None]:
# Read the heart-disease CSV and discard exact duplicate rows in one chained step.
heart = pd.read_csv('../input/heart-disease-uci/heart.csv').drop_duplicates()
# Keep the label column separately, then remove it so `heart` holds features only.
y = heart['target']
heart = heart.drop(columns=['target'])
# Display (rows, columns) of the feature frame.
heart.shape

In [None]:
# Suppose we start with k = 3 clusters, each given a random 2-D starting centroid.
k = 3
# A locally seeded generator makes the printed centroids identical on every
# Restart & Run All, without touching NumPy's global random state.
rng = np.random.default_rng(42)
# Keys are 1-based cluster ids; each value is a pair of integers drawn from [0, 302).
centroids = {cluster_id: rng.integers(0, 302, size=2).tolist() for cluster_id in range(1, k + 1)}
print(centroids)

In [None]:
# Raw values of the whole feature frame and of the two columns used for clustering.
heart_val = heart.values
X = heart[['chol','thalach']].values
# Standardised copy of the two clustering features; the fitted scaler is kept so
# results can be mapped back to the original units later.
scaler = StandardScaler()
x_std = scaler.fit_transform(X=heart[['chol','thalach']])
# fit_transform is an instance method: MinMaxScaler must be instantiated first.
# Calling it on the class itself raises a TypeError (missing `self`).
x_min_max = MinMaxScaler().fit_transform(X=heart[['chol','thalach']])

In [None]:
# Cluster the standardised (chol, thalach) features into 5 groups.
kmeans = KMeans(init="random", n_clusters=5, random_state=42)
# fit_predict fits the model and returns each sample's cluster index in one call,
# so a separate kmeans.fit(x_std) beforehand would only repeat the same seeded fit.
# NOTE: this rebinds y from the 'target' column to the cluster labels; the cells
# below index X and colour the scatter plot with these labels.
y = kmeans.fit_predict(x_std)

In [None]:
# Store each row's k-means label (y) in a 'cluster' column on the feature frame (in place).
heart['cluster'] = y

In [None]:
# First five rows assigned to cluster label 1.
heart.loc[heart['cluster'].eq(1)].head()

In [None]:
# First five rows assigned to cluster label 0.
heart.loc[heart['cluster'].eq(0)].head()

In [None]:
# First five rows assigned to cluster label 2.
heart.loc[heart['cluster'].eq(2)].head()

In [None]:
# First five rows assigned to cluster label 3. This cell previously repeated the
# cluster-1 preview already shown in an earlier cell, so it displayed nothing new.
heart[heart['cluster'] == 3].head()

In [None]:
# Display the cluster labels returned by kmeans.fit_predict (the full array is shown).
y

In [None]:
# First feature column of X, restricted to the samples labelled as cluster 0.
X[:, 0][y == 0]

In [None]:
# Visualising the clusters in the original (unscaled) chol / thalach units.
# One scatter layer per cluster label: label i is drawn as 'Cluster i+1' in its own
# colour with point size s = 50.
cluster_colors = ['red', 'blue', 'green', 'orange', 'pink']
for label, color in enumerate(cluster_colors):
    members = y == label
    plt.scatter(X[members, 0], X[members, 1], s = 50, c = color, label = f'Cluster {label + 1}')

# kmeans was fitted on the standardised features (x_std), so cluster_centers_ are in
# standardised space. Map them back through the fitted scaler so the centroids are
# drawn in the same units as the raw points instead of bunching up near the origin.
centers = scaler.inverse_transform(kmeans.cluster_centers_)
# Scatter plotting the centroids with label = 'Centroids' in color c = cyan and points in size s = 100
plt.scatter(centers[:, 0], centers[:, 1], s = 100, c = 'cyan', label = 'Centroids')

plt.title('Heart Patient Clusters')
plt.xlabel('Cholesterol')
plt.ylabel('Thalach')
plt.legend()
plt.show()

# Silhouette Score

If the ground truth labels are not known, evaluation must be performed using the model itself. The Silhouette Coefficient (sklearn.metrics.silhouette_score) is an example of such an evaluation, where a higher Silhouette Coefficient score relates to a model with better defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores:

a: The mean distance between a sample and all other points in the same class.

b: The mean distance between a sample and all other points in the next nearest cluster.

The Silhouette Coefficient s for a single sample is then given as:

$$ s = \frac{b - a}{\max(a, b)} $$

In [None]:
# Sweep the number of clusters from 2 to 10 and record, for each value, the
# within-cluster sum of squared distances (inertia) and the silhouette score.
sse = []
silhouette_scores = []
# Loop-local names are used so that `k` and the fitted 5-cluster `kmeans` from the
# earlier cells are not overwritten (the scatter-plot cell depends on that model).
for n_clusters in range(2, 11):
    candidate = KMeans(init="random", n_clusters=n_clusters, random_state=42)
    # Evaluate on the standardised features, i.e. the same space the clustering
    # model above was fitted in; on raw X the larger-valued column would dominate
    # the distances and the chosen k would not apply to that model.
    candidate.fit(x_std)
    sse.append(candidate.inertia_)
    silhouette_scores.append(silhouette_score(x_std, candidate.labels_))

In [None]:
# Line plot of the silhouette coefficient for each candidate cluster count (2..10).
fig, ax = plt.subplots()
ax.plot(range(2, 11), silhouette_scores)
ax.set_xticks(range(2, 11))
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Silhouette Coefficient")
plt.show()

In [None]:
# Elbow plot: sum of squared errors (inertia) for each candidate cluster count (2..10).
fig, ax = plt.subplots()
ax.plot(range(2, 11), sse)
ax.set_xticks(range(2, 11))
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("SSE")
plt.show()

References

1. [Updating centroids](https://towardsdatascience.com/k-means-clustering-for-beginners-2dc7b2994a4)