In [73]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Default size (width, height in inches) for any matplotlib figure drawn in this notebook.
plt.rcParams['figure.figsize'] = (12, 7)

### Importing Data

In [2]:
# Load the iris measurements from a CSV file in the current working directory.
data = pd.read_csv(filepath_or_buffer='iris.csv')

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# Number of rows (observations) in the frame.
data.shape[0]

150

In [7]:
# Show the column labels of the loaded frame.
data.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [8]:
# The four measurement columns that are plotted and fed to the clustering step.
cols = [
    'SepalLengthCm',
    'SepalWidthCm',
    'PetalLengthCm',
    'PetalWidthCm',
]

### Doing some visualisation

In [6]:
# Bar chart of the rows per species, one colour per species.
px.bar(data_frame=data, x='Species', color='Species')

In [15]:
# Pairwise scatter plots of the feature columns, coloured by the true species.
# `dimensions` takes the column names to plot; passing the list of names
# directly (rather than a sliced DataFrame positionally) states the intent and
# does not rely on DataFrame iteration happening to yield column labels.
fig = px.scatter_matrix(data, dimensions=cols, color='Species')
fig.update_layout(width=1000, height=1000)

### Finding the optimum number of clusters

In [38]:
# Feature matrix used for clustering: one row per observation, one column per entry in `cols`.
dat = data[cols].to_numpy()

# Elbow method: fit KMeans for k = 1..10 and record the within-cluster sum of
# squares (`inertia_`) of each fit.
# `random_state` is pinned so the curve is reproducible on a fresh kernel, and
# `n_init` is set explicitly because its default differs between scikit-learn
# versions.
sse = []
for k in range(1, 11):
    means = KMeans(n_clusters=k, n_init=10, random_state=42).fit(dat)
    sse.append(means.inertia_)

In [39]:
# Show the recorded within-cluster sum of squares, one entry per k tried.
sse

[680.8244,
 152.36870647733906,
 78.94084142614602,
 57.31787321428571,
 46.53558205128205,
 38.964787851037855,
 34.1967910993998,
 30.338748947107646,
 27.994118195529964,
 26.35045835305046]

In [52]:
# Elbow plot: within-cluster sum of squares against the number of clusters.
# The x positions are derived from `sse` (k starts at 1) instead of a separately
# hard-coded range, so the plot cannot drift out of sync with the fitted values.
cluster_counts = list(range(1, len(sse) + 1))
fig = px.line(x=cluster_counts, y=sse, title='WSSE vs Number of clusters')
# Overlay the same points as markers so each k is visible on the line.
fig.add_traces(px.scatter(x=cluster_counts, y=sse).data[0])
fig.update_traces(
    marker=dict(size=12, line=dict(width=2, color='darkred')),
    selector=dict(mode='markers'),
)
fig.update_xaxes(tickvals=list(range(len(sse) + 1)), title='Number of clusters')
fig.update_yaxes(title='Within Sum of Squares Error')

In [44]:
# State the chosen cluster count; the value is written by hand, not computed from `sse`.
print('The optimum number of clusters as identified from the elbow method is 3')

The optimum number of clusters as identified from the elbow method is 3


### Fitting the data and making predictions

In [53]:
# Fit the final 3-cluster model and assign every observation to a cluster.
# `random_state` is pinned so the same clusters (and the same cluster ids) come
# back on every run; `n_init` is explicit because its default differs between
# scikit-learn versions.
# NOTE: the integer ids KMeans assigns carry no meaning of their own, so any
# translation of ids into species names has to be checked against the data.
means = KMeans(n_clusters=3, n_init=10, random_state=42).fit(dat)
preds = means.predict(dat)

In [61]:
# Build the {cluster id: species name} lookup from the data itself.
# KMeans numbers its clusters arbitrarily, so a hand-written table is only right
# for the one run it was copied from and silently mislabels every row whenever
# the numbering changes. Instead, name each cluster after the species that is
# most common among its members.
# NOTE: this is a majority vote, so two clusters can receive the same species name.
mapper = pd.crosstab(preds, data['Species']).idxmax(axis=1).to_dict()

In [62]:
# Attach the cluster labels held in `preds` to the frame as a new column.
data['Mapped_pred']=preds

In [63]:
# Translate cluster ids into species names.
# The mapping is applied to `preds` rather than to the column's current
# contents so this cell is idempotent: mapping a column that already holds
# species names through `mapper` would turn every value into NaN on a re-run.
data['Mapped_pred'] = pd.Series(preds, index=data.index).map(mapper)

In [64]:
# Show the species name assigned to each row's cluster.
data['Mapped_pred']

0          Iris-setosa
1          Iris-setosa
2          Iris-setosa
3          Iris-setosa
4          Iris-setosa
            ...       
145     Iris-virginica
146    Iris-versicolor
147     Iris-virginica
148     Iris-virginica
149    Iris-versicolor
Name: Mapped_pred, Length: 150, dtype: object

### Plotting the predictions

In [65]:
# Pairwise scatter plots of the feature columns, coloured by the predicted
# (cluster-derived) species.
# `dimensions` takes the column names to plot; passing the list of names
# directly (rather than a sliced DataFrame positionally) states the intent and
# does not rely on DataFrame iteration happening to yield column labels.
fig = px.scatter_matrix(data, dimensions=cols, color='Mapped_pred')
fig.update_layout(width=1000, height=1000)

In [67]:
from sklearn.metrics import classification_report,confusion_matrix

In [68]:
# Per-species precision / recall / F1 of the cluster-derived labels against the true species.
species_report = classification_report(
    y_true=data['Species'],
    y_pred=data['Mapped_pred'],
)
print(species_report)

precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        50
Iris-versicolor       0.77      0.96      0.86        50
 Iris-virginica       0.95      0.72      0.82        50

       accuracy                           0.89       150
      macro avg       0.91      0.89      0.89       150
   weighted avg       0.91      0.89      0.89       150



In [74]:
# Confusion matrix: rows are the true species, columns the cluster-derived prediction.
cm = confusion_matrix(
    y_true=data['Species'],
    y_pred=data['Mapped_pred'],
)

In [85]:
# Show the raw confusion-matrix counts.
cm

array([[50,  0,  0],
       [ 0, 48,  2],
       [ 0, 14, 36]], dtype=int64)

### There seem to be a few misclassifications between versicolor and virginica

In [84]:
# Annotated heatmap of the confusion matrix.
# One shared label list is used for both axes; its order is the sorted label
# order that `confusion_matrix` uses when no explicit `labels` are given.
species_labels = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
fig = ff.create_annotated_heatmap(cm, x=species_labels, y=species_labels)
fig.update_xaxes(title='Predicted')
# Heatmap y axes run bottom-to-top by default, which would draw row 0 of `cm`
# at the bottom; reversing the axis makes the figure read like the printed matrix.
fig.update_yaxes(title='True', autorange='reversed')