# Clusterização usando o algoritmo kMeans.

## Importação de Bibliotecas

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import plotly.express  as px
import plotly.graph_objects as po

## Verificando as versões das bibliotecas

In [3]:
# Report the installed library versions so the run can be reproduced.
# NOTE: `plt` in this cell is the top-level plotly package, not
# matplotlib.pyplot; the alias is kept as the original cell defined it.
import plotly as plt
import sklearn as sk

# Build the report text first, then emit it with a single print call.
relatorioVersoes = f''' Versões das bibliotecas: \n
   pandas: {pd.__version__}
   sklearn: {sk.__version__}
   plotly: {plt.__version__}
'''
print(relatorioVersoes)

 Versões das bibliotecas: 

   pandas: 1.5.3
   sklearn: 1.2.2
   plotly: 5.15.0



## Importando os Dados

In [4]:
# Load the iris CSV; a comma is already read_csv's default delimiter, so it
# is not passed explicitly. The first five rows are shown as a preview.
# NOTE(review): absolute Colab-style path — the file must exist at this
# location before the cell is run.
iris = pd.read_csv('/content/iris.csv')
iris.head(5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


## Analisando o Dataset

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
iris.describe()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [6]:
# Distinct values found in the 'variety' column.
iris['variety'].unique()

array(['Setosa', 'Versicolor', 'Virginica'], dtype=object)

In [7]:
# Number of distinct values in the 'variety' column.
len(iris['variety'].unique())

3

In [8]:
# (rows, columns) of the loaded frame.
iris.shape

(150, 5)

In [9]:
# Count of missing values per column.
iris.isna().sum()

sepal.length    0
sepal.width     0
petal.length    0
petal.width     0
variety         0
dtype: int64

## Criando os Dados das Pétalas

In [10]:
# Build the petal feature matrix (length, width) as a NumPy array.
# Columns are selected by name instead of by position so the features cannot
# silently change if the CSV gains, loses, or reorders columns; a missing
# column now fails loudly with a KeyError.
colunasPetalas = ['petal.length', 'petal.width']
xPetalas = iris[colunasPetalas].to_numpy()
xPetalas[0:10]

array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.7, 0.4],
       [1.4, 0.3],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.5, 0.1]])

## Normalizando os Dados das Pétalas

In [11]:
# Standardize the petal features: learn the per-column statistics first,
# then apply the transformation to the same matrix.
normalizarDados = StandardScaler()
normalizarDados.fit(xPetalas)
xPetalasNormalizada = normalizarDados.transform(xPetalas)
xPetalasNormalizada[:10]

array([[-1.34022653, -1.3154443 ],
       [-1.34022653, -1.3154443 ],
       [-1.39706395, -1.3154443 ],
       [-1.2833891 , -1.3154443 ],
       [-1.34022653, -1.3154443 ],
       [-1.16971425, -1.05217993],
       [-1.34022653, -1.18381211],
       [-1.2833891 , -1.3154443 ],
       [-1.34022653, -1.3154443 ],
       [-1.2833891 , -1.44707648]])

## Calculando o Número de Clusters

In [12]:
# Elbow method for the petal features: fit K-Means for k = 1..10 on the
# standardized data and record each model's inertia (within-cluster sum of
# squares, WCSS).
# n_init is passed explicitly: scikit-learn 1.2 emits a FutureWarning when it
# is omitted because the default changes from 10 to 'auto' in 1.4. Pinning 10
# keeps the results equal to the current default on any version.
wcssPetalas = []

for i in range(1, 11):
    kmeansPetala = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeansPetala.fit(xPetalasNormalizada)
    wcssPetalas.append(kmeansPetala.inertia_)



In [13]:
# Print the WCSS recorded for each candidate number of clusters (1-based).
for k, wcss in enumerate(wcssPetalas, start=1):
    print(f'Cluster {k}: Valor do WCSS {wcss}')

Cluster 1: Valor do WCSS 300.0
Cluster 2: Valor do WCSS 54.16878133149558
Cluster 3: Valor do WCSS 18.02696261254407
Cluster 4: Valor do WCSS 12.283372197379123
Cluster 5: Valor do WCSS 9.152134379409805
Cluster 6: Valor do WCSS 7.187603589633753
Cluster 7: Valor do WCSS 5.994729609966573
Cluster 8: Valor do WCSS 5.144963679454987
Cluster 9: Valor do WCSS 4.402436506349908
Cluster 10: Valor do WCSS 3.8964398329646883


## Definindo o melhor valor do WCSS



In [14]:
# Elbow chart for the petal features: WCSS against the number of clusters.
# The update_* calls each return the figure, so they are chained.
graficoCotoveloPetalas = (
    px.line(x=range(1, 11), y=wcssPetalas)
    .update_xaxes(title_text="Numero de Clusters")
    .update_yaxes(title_text="Valor de WCSS")
    .update_layout(title_text="Gráfico de Cotovelo das Pétalas", title_x=0.5)
)
graficoCotoveloPetalas.show()

## Executando Algoritmo do Kmeans

In [15]:
# Final petal model with k = 3.
# n_init is pinned to 10 (the scikit-learn 1.2 default) so the cell raises no
# FutureWarning and yields the same clustering once the library default
# switches to 'auto' in 1.4.
kmeansPetalaFinal = KMeans(n_clusters=3, n_init=10, random_state=0)
# fit_predict fits the model and returns the cluster index of every sample.
labelClusterPetala = kmeansPetalaFinal.fit_predict(xPetalasNormalizada)

display(labelClusterPetala)





array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

## Calculando centroides

In [16]:
# Cluster centres of the fitted petal model, one row per cluster. The model
# was fit on xPetalasNormalizada, so the coordinates are in the standardized
# feature space, not in the original measurement units.
centroidesPetalas = kmeansPetalaFinal.cluster_centers_
print(centroidesPetalas)

[[-1.30498732 -1.25489349]
 [ 1.02799959  1.12797813]
 [ 0.3058728   0.16541778]]


## Gráfico de Dispersão dos Grupos

In [17]:
# Scatter of the standardized petal features, coloured by K-Means cluster.
# The integer labels are cast to strings so Plotly treats them as categories
# (one discrete colour per cluster) rather than mapping them onto a continuous
# colour scale, which would wrongly suggest an ordering between clusters.
graficoPetala = px.scatter(
    x=xPetalasNormalizada[:, 0],
    y=xPetalasNormalizada[:, 1],
    color=labelClusterPetala.astype(str),
)
# One marker per centroid. The size list follows the number of centroids
# instead of assuming exactly three, and the markers are drawn in black so
# they do not blend with a cluster colour.
graficoCentroidePetala = px.scatter(
    x=centroidesPetalas[:, 0],
    y=centroidesPetalas[:, 1],
    size=[7] * len(centroidesPetalas),
    color_discrete_sequence=['black'],
)
# Merge the sample traces and the centroid trace into a single figure.
graficoFinal = po.Figure(data=graficoPetala.data + graficoCentroidePetala.data)

# The plotted values are the standardized features, so the axes say so.
graficoFinal.update_xaxes(title_text="Comprimento da Petala (padronizado)")
graficoFinal.update_yaxes(title_text="Largura da Petala (padronizado)")
graficoFinal.update_layout(title_text="Agrupamento das Flores", title_x=0.5)

graficoFinal.show()

## Criando os Dados das Sépalas

In [18]:
# Build the sepal feature matrix (length, width) as a NumPy array.
# Columns are selected by name instead of by position so the features cannot
# silently change if the CSV gains, loses, or reorders columns; a missing
# column now fails loudly with a KeyError.
colunasSepalas = ['sepal.length', 'sepal.width']
xSepalas = iris[colunasSepalas].to_numpy()
xSepalas[0:10]

array([[5.1, 3.5],
       [4.9, 3. ],
       [4.7, 3.2],
       [4.6, 3.1],
       [5. , 3.6],
       [5.4, 3.9],
       [4.6, 3.4],
       [5. , 3.4],
       [4.4, 2.9],
       [4.9, 3.1]])

## Normalizando os Dados das Sépalas

In [19]:
# Standardize the sepal features with a fresh scaler: learn the per-column
# statistics first, then apply the transformation to the same matrix.
# Note that this rebinds `normalizarDados`, replacing the scaler object that
# the name referred to before this cell.
normalizarDados = StandardScaler()
normalizarDados.fit(xSepalas)
xSepalasNormalizada = normalizarDados.transform(xSepalas)
xSepalasNormalizada[:10]

array([[-0.90068117,  1.01900435],
       [-1.14301691, -0.13197948],
       [-1.38535265,  0.32841405],
       [-1.50652052,  0.09821729],
       [-1.02184904,  1.24920112],
       [-0.53717756,  1.93979142],
       [-1.50652052,  0.78880759],
       [-1.02184904,  0.78880759],
       [-1.74885626, -0.36217625],
       [-1.14301691,  0.09821729]])

## Calculando o Número de Clusters

In [20]:
# Elbow method for the sepal features: fit K-Means for k = 1..10 on the
# standardized data and record each model's inertia (within-cluster sum of
# squares, WCSS).
# n_init is passed explicitly: scikit-learn 1.2 emits a FutureWarning when it
# is omitted because the default changes from 10 to 'auto' in 1.4. Pinning 10
# keeps the results equal to the current default on any version.
wcssSepalas = []

for i in range(1, 11):
    kmeansSepala = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeansSepala.fit(xSepalasNormalizada)
    wcssSepalas.append(kmeansSepala.inertia_)























In [21]:
# Print the WCSS recorded for each candidate number of clusters (1-based).
for k, wcss in enumerate(wcssSepalas, start=1):
    print(f'Cluster {k}: Valor do WCSS {wcss}')

Cluster 1: Valor do WCSS 300.0
Cluster 2: Valor do WCSS 166.9517093393448
Cluster 3: Valor do WCSS 102.61686896314461
Cluster 4: Valor do WCSS 79.76108196298645
Cluster 5: Valor do WCSS 61.80924909521247
Cluster 6: Valor do WCSS 52.53666909138647
Cluster 7: Valor do WCSS 44.416621911377995
Cluster 8: Valor do WCSS 36.18434119518747
Cluster 9: Valor do WCSS 30.51194838055993
Cluster 10: Valor do WCSS 26.667787013897346


## Definindo o melhor valor do WCSS

In [22]:
# Elbow chart for the sepal features: WCSS against the number of clusters.
# The update_* calls each return the figure, so they are chained.
graficoCotoveloSepalas = (
    px.line(x=range(1, 11), y=wcssSepalas)
    .update_xaxes(title_text="Numero de Clusters")
    .update_yaxes(title_text="Valor de WCSS")
    .update_layout(title_text="Gráfico de Cotovelo das Sepalas", title_x=0.5)
)
graficoCotoveloSepalas.show()

## Executando o algoritmo do Kmeans

In [23]:
# Final sepal model with k = 3.
# n_init is pinned to 10 (the scikit-learn 1.2 default) so the cell raises no
# FutureWarning and yields the same clustering once the library default
# switches to 'auto' in 1.4.
kmeansSepalaFinal = KMeans(n_clusters=3, n_init=10, random_state=0)
# fit_predict fits the model and returns the cluster index of every sample.
labelClusterSepala = kmeansSepalaFinal.fit_predict(xSepalasNormalizada)

display(labelClusterSepala)





array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2,
       2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1], dtype=int32)

## Calculando os Centroides

In [24]:
# Cluster centres of the fitted sepal model, one row per cluster. The model
# was fit on xSepalasNormalizada, so the coordinates are in the standardized
# feature space, not in the original measurement units.
centroidesSepalas = kmeansSepalaFinal.cluster_centers_
print(centroidesSepalas)

[[-1.00206653  0.90625492]
 [-0.10143926 -0.94652188]
 [ 1.10971635  0.09821729]]


## Gráfico de Dispersão de Grupos

In [25]:
# Scatter of the standardized sepal features, coloured by K-Means cluster.
# The integer labels are cast to strings so Plotly treats them as categories
# (one discrete colour per cluster) rather than mapping them onto a continuous
# colour scale, which would wrongly suggest an ordering between clusters.
graficoSepala = px.scatter(
    x=xSepalasNormalizada[:, 0],
    y=xSepalasNormalizada[:, 1],
    color=labelClusterSepala.astype(str),
)
# One marker per centroid. The size list follows the number of centroids
# instead of assuming exactly three, and the markers are drawn in black so
# they do not blend with a cluster colour.
graficoCentroideSepala = px.scatter(
    x=centroidesSepalas[:, 0],
    y=centroidesSepalas[:, 1],
    size=[7] * len(centroidesSepalas),
    color_discrete_sequence=['black'],
)
# Merge the sample traces and the centroid trace into a single figure.
graficoFinal = po.Figure(data=graficoSepala.data + graficoCentroideSepala.data)

# The plotted values are the standardized features, so the axes say so.
graficoFinal.update_xaxes(title_text="Comprimento da Sepala (padronizado)")
graficoFinal.update_yaxes(title_text="Largura da Sepala (padronizado)")
graficoFinal.update_layout(title_text="Agrupamento das Flores", title_x=0.5)

graficoFinal.show()