In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.datasets import load_iris

In [None]:
#load the iris dataset; as_frame = True returns the features and the target as pandas objects
data = load_iris(as_frame = True)
target_names = data['target_names']

#work on a copy so the frame stored inside `data` is left untouched and
#adding a column below cannot trigger a SettingWithCopyWarning
df = data['data'].copy()

#give the four measurement columns short snake_case names (assigned by position)
df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

#translate the integer target codes into species names with a single vectorized lookup
df['species'] = np.asarray(target_names)[data['target'].to_numpy()]

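Normally it's important to standardize your variables before running k-means, because the algorithm is distance-based and a variable on a larger scale would dominate the clustering. Since the petal and sepal lengths and widths are all measured in centimeters, we should be able to get away with not doing it this time.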

First up, let's plot all the variables we are going to use to cluster on colored by species. This will show us the clusters we hope to find with kmeans.

In [None]:
#pairwise scatter plots of the measurements, one color per species
sns.pairplot(data = df, hue = 'species')
plt.show()

Using the elbow method or silhouette scores, we could estimate the optimal number of clusters K, but we can also use some intuition here: there are three different species, so we'll start with K = 3.

In [None]:
#initialize the kmeans model. Fixing random_state makes the centroid initialization, and therefore
#the cluster labels and plot colors, reproducible (the K-selection cells below use the same seed)
km = KMeans(n_clusters = 3, random_state = 42)

#We want to cluster on the petal lengths and widths and the sepal lengths and widths
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].values

#fit the model and get the cluster label for each point in one step.
#The labels come back as integers; casting them to strings makes seaborn treat them as
#discrete categories for the hue, which replicates the earlier plot that was colored by species.
df['cluster_label'] = km.fit_predict(X).astype(str)

#now we can see how well our KMeans clustering did by plotting the same pairplot as before,
#but colored by our cluster label instead of the species
sns.pairplot(df, hue = 'cluster_label')
plt.show()

The colors may not exactly match up because it is random which cluster gets defined first, second, and third, but the plots in the previous cell should show similar groupings as the earlier plots when we colored the points by species. KMeans was able to group the different iris species together!

Now let's practice using the elbow method and silhouette score to choose an optimal K. Obviously K = 3 is a great choice for this problem, so let's see if the elbow method and the silhouette scores also find that K = 3 is a good choice.

In [None]:
#elbow method: compare the within cluster sum of squares (WCSS) across several values of K

#candidate numbers of clusters to test, K = 2 through 7
k_values = list(range(2, 8))

#one WCSS value is collected per candidate K
wcss = []

for k in k_values:
    #fit() returns the fitted estimator, so construction and fitting can be chained
    km = KMeans(n_clusters = k, random_state = 42).fit(X)

    #sklearn exposes the within cluster sum of squares of a fitted KMeans as the inertia_ attribute
    wcss.append(km.inertia_)

#plot the WCSS against K; the elbow is where the curve stops dropping sharply
plt.plot(k_values, wcss, 'b-x')
plt.xlabel('K')
plt.ylabel('Sum of Squared Distances')
plt.show()

Looks to me like K = 3 or 4 would be a good choice according to the elbow test.

In [None]:
#silhouette score: a second way to compare candidate values of K

#candidate numbers of clusters to test, K = 2 through 7
k_values = list(range(2, 8))

#one silhouette score is collected per candidate K
sil_scores = []

for k in k_values:
    #fit a kmeans clusterer with k centroids, then predict the cluster label of every point
    km = KMeans(n_clusters = k, random_state = 42)
    labels = km.fit(X).predict(X)

    #score this clustering and keep the result
    sil_scores.append(silhouette_score(X, labels))

#plot the silhouette score against K. The optimal K is the one with the max silhouette score
plt.plot(k_values, sil_scores, 'b-x')
plt.xlabel('K')
plt.ylabel('Silhouette Score')
plt.show()

The silhouette score in this case suggests that we only want two clusters, which is why sometimes it's important to have some data science feel. If we know there are three species of irises in the dataset, we should probably go with three clusters despite what the silhouette score says.