In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import plotly.graph_objs as go
import plotly.graph_objs as pgo

In [2]:
# Load the raw CSV, then keep only the rows whose 'Name' is one of the three
# selected values, restricted to the feature columns 'I', 'E', 'D' plus 'Name'.
df = pd.read_csv('dataset.csv')
selected_names = ['Eugene', 'Albukerke', 'Choluteco']
kept_columns = ['I', 'E', 'D', 'Name']
is_selected = df['Name'].isin(selected_names)
# reset_index(drop=True) renumbers the surviving rows 0..n-1 so later
# positional concatenation lines up with this frame.
df = df.loc[is_selected, kept_columns].reset_index(drop=True)
# Only the last expression of a cell is displayed, so preview the filtered frame once here.
df.head()

Unnamed: 0,I,E,D,Name
0,26.646903,12.469125,62.18773,Albukerke
1,18.941216,22.988814,48.877788,Eugene
2,5.547961,19.685162,44.169089,Eugene
3,7.979146,22.039042,21.711693,Eugene
4,7.113827,15.680129,59.93816,Eugene


In [3]:
# Display the (rows, columns) dimensions of the filtered frame.
df.shape

(2247, 4)

In [4]:
# Encode the 'Name' label two ways: as an integer code ('Name_Cat') and as
# one-hot indicator columns (one column per distinct name).

# Cast to pandas 'category' so the .cat accessor (codes / categories) is available.
df['Name'] = df['Name'].astype('category')
# Integer code per row; each code is the position of the row's value in
# df['Name'].cat.categories.
df['Name_Cat'] = df['Name'].cat.codes

# One-hot encode the integer codes.
encoder = OneHotEncoder(handle_unknown='ignore')
one_hot_values = encoder.fit_transform(df[['Name_Cat']]).toarray()

# Take the indicator column names from the categories themselves instead of a
# hard-coded list, so names and codes cannot drift apart. Reuse df's index so
# the column-wise concat aligns row-for-row without relying on an earlier reset_index.
encoder_df = pd.DataFrame(
    one_hot_values,
    columns=list(df['Name'].cat.categories),
    index=df.index,
)
df = pd.concat([df, encoder_df], axis=1)
# The original label column is now redundant with 'Name_Cat' and the indicator columns.
df = df.drop(columns=['Name'])
df.head()

Unnamed: 0,I,E,D,Name_Cat,Albukerke,Choluteco,Eugene
0,26.646903,12.469125,62.18773,0,1.0,0.0,0.0
1,18.941216,22.988814,48.877788,2,0.0,0.0,1.0
2,5.547961,19.685162,44.169089,2,0.0,0.0,1.0
3,7.979146,22.039042,21.711693,2,0.0,0.0,1.0
4,7.113827,15.680129,59.93816,2,0.0,0.0,1.0


In [5]:
# Cluster the three numeric features into 3 groups with k-means and report
# the silhouette score of the resulting assignment.
X = df.loc[:, ['I', 'E', 'D']]

# random_state is fixed so the clustering is repeatable between runs.
kmeans = KMeans(n_clusters=3, random_state=1)
kmeans.fit(X)

# Cluster index assigned to each row of X during fitting.
labels = kmeans.labels_
y_clusters = labels

# Last expression of the cell: silhouette score using Euclidean distance.
metrics.silhouette_score(X, labels, metric='euclidean')

0.42336854422693293

In [6]:
# Attach the k-means assignment to the frame as a human-readable 'Cluster' column.
# NOTE(review): the cluster-index -> name mapping below is hard-coded; k-means
# cluster indices carry no inherent meaning, so presumably it was chosen by
# inspecting this particular run — confirm it still holds if the fit changes.
cluster_names = {0: 'Eugene', 1: 'Albukerke', 2: 'Choluteco'}
df['Cluster'] = pd.Series(labels, index=df.index).replace(cluster_names)
df.head(10)

Unnamed: 0,I,E,D,Name_Cat,Albukerke,Choluteco,Eugene,Cluster
0,26.646903,12.469125,62.18773,0,1.0,0.0,0.0,Albukerke
1,18.941216,22.988814,48.877788,2,0.0,0.0,1.0,Eugene
2,5.547961,19.685162,44.169089,2,0.0,0.0,1.0,Eugene
3,7.979146,22.039042,21.711693,2,0.0,0.0,1.0,Choluteco
4,7.113827,15.680129,59.93816,2,0.0,0.0,1.0,Albukerke
5,18.258538,19.23864,27.059426,2,0.0,0.0,1.0,Eugene
6,12.292205,0.600446,73.48117,1,0.0,1.0,0.0,Albukerke
7,13.504227,22.683483,3.999019,2,0.0,0.0,1.0,Choluteco
8,19.221195,15.103099,28.773817,2,0.0,0.0,1.0,Eugene
9,11.347162,21.737494,43.03995,2,0.0,0.0,1.0,Eugene


In [13]:
# 3D scatter plot of the k-means result using plotly.
# Axis mapping: x = 'D', y = 'E', z = 'I' (columns 2, 1 and 0 of X / cluster_centers_).
Scene = dict(xaxis=dict(title='D -->'),
             yaxis=dict(title='E -->'),
             zaxis=dict(title='I -->'))

# One marker per sample, coloured by its cluster index (labels is kmeans.labels_).
# Low opacity keeps dense regions readable.
trace = go.Scatter3d(x=X.iloc[:, 2],
                     y=X.iloc[:, 1],
                     z=X.iloc[:, 0],
                     mode='markers',
                     marker=dict(color=labels,
                                 size=5,
                                 colorscale='rainbow',
                                 opacity=0.2))

# Cluster centres, drawn as 'x' markers with one fixed colour per centre.
centroids = go.Scatter3d(x=kmeans.cluster_centers_[:, 2],
                         y=kmeans.cluster_centers_[:, 1],
                         z=kmeans.cluster_centers_[:, 0],
                         mode='markers',
                         marker=dict(symbol='x',
                                     size=5,
                                     color=['#96005a', '#45ed99', '#ff0000']))

# go.Data is a deprecated plotly class; go.Figure accepts a plain list of traces.
data = [trace, centroids]
layout = go.Layout(margin=dict(l=0, r=0),
                   scene=Scene,
                   height=900,
                   width=900,
                   showlegend=False)
fig = go.Figure(data=data, layout=layout)

fig.show()