<a href="https://colab.research.google.com/github/valerio-unifei/UNIFEI-IA-Aulas/blob/main/UNIFEI_IA_Agrupamentos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Agrupamentos (*Clustering*)

## Dataset

In [None]:
from sklearn import datasets

# We use the Iris flowers dataset, which ships with supervision labels
# (targets); those labels are not used when fitting the clusterings.
iris = datasets.load_iris()

# Report how many samples (rows) and attributes (columns) the data matrix has.
n_casos, n_atributos = iris.data.shape
print('casos =', n_casos, ', atributos =', n_atributos)

## Metodologias de Agrupamento

In [None]:
import time
from sklearn import cluster, mixture

import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle, islice

# Visual comparison of clustering methods: every method gets one row of
# scatter plots, one plot per pair of consecutive attributes of the dataset.
plt.figure(figsize=(12, 12))

# Estimators to compare, each paired with the title displayed on its row.
# Reference: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster
agrupamentos = (
  (cluster.MiniBatchKMeans(n_clusters=3, random_state=42), 'K-Means'),
  (cluster.AgglomerativeClustering(n_clusters=3, linkage='ward'), 'Ward'),
  (cluster.SpectralClustering(n_clusters=3, eigen_solver='arpack', affinity='nearest_neighbors', random_state=42), 'Spectral'),
  (cluster.DBSCAN(eps=1.0), 'DBSCAN'),
  (cluster.OPTICS(), 'OPTICS'),
  (cluster.AffinityPropagation(), 'Affinity'),
  (cluster.Birch(n_clusters=3), 'Birch'),
  (mixture.GaussianMixture(n_components=3, covariance_type='full', random_state=42), 'GaussianMix'),
)

# Base palette; it is cycled when a method finds more groups than colours.
paleta = ['#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628',
          '#984ea3', '#999999', '#e41a1c', '#dede00']

# Number of attributes; it does not change between methods, so compute it once.
colunas = iris.data.shape[1]

# Running 1-based index of the subplot being drawn (row-major over the grid).
plot_num = 1

for metodo, nome in agrupamentos:

  # perf_counter is a monotonic high-resolution clock meant for measuring
  # short durations; unlike time.time() it is not affected by clock updates.
  t0 = time.perf_counter()
  metodo.fit(iris.data)
  tempo = time.perf_counter() - t0

  # Estimators that store the training assignment expose `labels_`;
  # the others (e.g. the Gaussian mixture) must predict it explicitly.
  if hasattr(metodo, 'labels_'):
    y_pred = metodo.labels_.astype(int)
  else:
    y_pred = metodo.predict(iris.data)

  # One colour per group id 0..max(y_pred). Black is appended last so that a
  # label of -1 (scikit-learn's marker for noise points in DBSCAN/OPTICS)
  # indexes it. Built once per method instead of once per subplot.
  colors = np.array(list(islice(cycle(paleta), int(max(y_pred) + 1))))
  colors = np.append(colors, ['#000000'])
  cores_pontos = colors[y_pred]

  # Plot the groups over each pair of consecutive columns [c-1, c]
  for c in range(1, colunas):
    plt.subplot(len(agrupamentos), colunas - 1, plot_num)

    # Only the first plot of the row carries the method name.
    if c == 1:
      plt.title(nome, size=18)

    plt.scatter(iris.data[:, c-1], iris.data[:, c], s=10, color=cores_pontos)

    plt.xticks(())
    plt.yticks(())

    # The last plot of the row shows the fit time in its bottom-right corner.
    if c == (colunas - 1):
      plt.text(.99, .01, ('%.3fs' % tempo).lstrip('0'), transform=plt.gca().transAxes, size=15, horizontalalignment='right')

    plot_num += 1

plt.show()

## Métricas de Avaliação

In [None]:
from sklearn import metrics
import pandas as pd

# Build one table row per clustering method: the set of group ids found,
# followed by external metrics (which compare the grouping against the known
# iris.target labels) and internal metrics (which use only the data geometry).
# Relies on `agrupamentos` holding estimators that were already fitted.
# Reference: https://scikit-learn.org/stable/modules/classes.html#clustering-metrics
metricas = ['Grupos','Adjusted Rand','Ad.Mutual','No.Mutual','Fowlkes Mallows','Silhouette','Calinski Harabasz','Davies Bouldin']

tabela = []
for metodo, nome in agrupamentos:

  # Same label extraction used when fitting: prefer the stored assignment,
  # fall back to predict() for estimators without `labels_`.
  if hasattr(metodo, 'labels_'):
    y_pred = metodo.labels_.astype(int)
  else:
    y_pred = metodo.predict(iris.data)

  # Distinct group ids. NOTE(review): a noise label of -1 (DBSCAN/OPTICS) is
  # counted here, and by the metrics below, as if it were a group of its own.
  grupos = set(y_pred)

  # External metrics, called in the documented (labels_true, labels_pred)
  # order. These four scores are symmetric, so the values are unaffected.
  externas = [
    metrics.adjusted_rand_score(iris.target, y_pred),
    metrics.adjusted_mutual_info_score(iris.target, y_pred),
    metrics.normalized_mutual_info_score(iris.target, y_pred),
    metrics.fowlkes_mallows_score(iris.target, y_pred),
  ]

  # Internal metrics are only defined for 2 <= n_groups <= n_samples - 1 and
  # scikit-learn raises ValueError otherwise. Report NaN for such a method
  # instead of aborting the whole comparison table.
  if 2 <= len(grupos) <= len(y_pred) - 1:
    internas = [
      metrics.silhouette_score(iris.data, y_pred, metric='euclidean'),
      metrics.calinski_harabasz_score(iris.data, y_pred),
      metrics.davies_bouldin_score(iris.data, y_pred),
    ]
  else:
    internas = [float('nan')] * 3

  valores = [grupos] + externas + internas
  tabela.append(valores)

# Row labels: the display name of each method, in the same order as `tabela`.
metodos = [nome for metodo, nome in agrupamentos]

pd.DataFrame(data=tabela, index=metodos, columns=metricas)