![logo](images/untumbes.PNG)

<center><b>Prof. Dr. Jorge Zavaleta - zavaleta.jorge@gmail.com</b></center>

>## Algoritmos de Agrupamiento (Clustering)


In [None]:
# librarys
import numpy as np
import pandas as pd
from pandas.plotting import parallel_coordinates
# sklearn
from sklearn import preprocessing
from sklearn.metrics import pairwise
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import KMeans
# graphics
import matplotlib.pylab as plt
import seaborn as sns
from pandas.plotting import parallel_coordinates
# warnings
import warnings
warnings.filterwarnings("ignore")

>## Lectura de datos

In [None]:
# Directory of the local dataset and the name of the CSV file to load.
path_dataset = 'data/'
file_name = 'curados_obitos_final.csv'
# Read the semicolon-separated, UTF-8 encoded CSV and preview the first rows.
covid19 = pd.read_csv(
    f'{path_dataset}{file_name}',
    sep=';',
    encoding='utf-8',
    low_memory=False,
)
covid19.head()

In [None]:
# Give the column axis the name 'SINTO', then preview the first rows.
covid19.columns.name='SINTO'
covid19.head()

In [None]:
# dX: a copy of the loaded frame; preview its first rows.
dX = covid19.copy()
dX.head()

In [None]:
# Dimensions of dX as (rows, columns).
dX.shape

In [None]:
# dY: transpose of the loaded frame, so the original columns become its rows.
dY = covid19.T
dY.head()

In [None]:
# Dimensions of dY as (rows, columns).
dY.shape

In [None]:
# Build an all-zero square frame with one row and one column per column of dX.
# It is the container that the distance computation fills in afterwards.
nc = dX.keys()
ncols = dX.shape[1]
data_dist = pd.DataFrame(np.zeros((ncols, ncols)), index=list(nc), columns=list(nc))
data_dist.head()

In [None]:
# calcular distancia euclideana
def dist_euclidean(X, Y, dXY):
    """Fill ``dXY`` with Euclidean distances between rows of ``Y`` and columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric data. Each column is one vector to compare.
    Y : pandas.DataFrame
        Frame whose rows are labelled with the column names of ``X`` and whose
        columns are labelled with the index of ``X`` (for example ``X.T``).
    dXY : pandas.DataFrame
        Square frame labelled with the column names of ``X`` on both axes.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        ``dXY`` itself. ``dXY.loc[a, b]`` holds the distance between row ``a``
        of ``Y`` and column ``b`` of ``X``, rounded to two decimals.
    """
    name_cols = list(X.columns)
    # Work on the arguments (not on notebook globals) and align Y to X by
    # label so that every row label and every observation takes part.
    x_vals = np.asarray(X, dtype=float)                           # (n_obs, n_cols)
    y_vals = np.asarray(Y.loc[name_cols, X.index], dtype=float)   # (n_cols, n_obs)
    for pos, coluna in enumerate(name_cols):
        # Broadcasting subtracts column `coluna` of X from every row of Y,
        # which yields one full column of the result per iteration.
        diff = y_vals - x_vals[:, pos]
        dXY.loc[name_cols, coluna] = np.round(np.sqrt((diff * diff).sum(axis=1)), 2)
    return dXY

In [None]:
# Compute the pairwise distances into `data_dist`; the function returns that
# same frame, which is bound here to `diste`. Preview the first ten rows.
diste = dist_euclidean(dX,dY,data_dist)
diste.head(10)

In [None]:
# Show the first ten rows of the distance matrix.
diste.head(10)

In [None]:
# Name both axes of the distance matrix 'SINTO'.
diste.columns.name='SINTO'
diste.index.name = 'SINTO'
diste.head()

In [None]:
# Normalize each column of the distance matrix with scikit-learn's `scale`.
# scikit-learn uses population standard deviation
data_norm = diste.apply(preprocessing.scale, axis=0)
data_norm.head()

In [None]:
# Normalize by hand: subtract the column mean and divide by the column
# standard deviation. This reassigns `data_norm`.
# pandas uses sample standard deviation
data_norm = (diste - diste.mean())/diste.std()
data_norm.head()

In [None]:
# Remove the 'EVOLUCAO' label from the columns and then from the rows.
d_norm = data_norm.drop('EVOLUCAO', axis=1).drop('EVOLUCAO', axis=0)
d_norm.head()

In [None]:
# Dimensions of the normalized matrix after 'EVOLUCAO' was dropped.
d_norm.shape

In [None]:
# Euclidean distance between every pair of rows of d_norm, using only the four
# selected columns as coordinates. The result is wrapped in a frame labelled
# with d_norm's index on both axes.
d1_norm = pairwise.pairwise_distances(
    d_norm[['FEBRE', 'TOSSE', 'DISPNEIA', 'DESC_RESP']],
    metric='euclidean',
)
dx_norm = pd.DataFrame(d1_norm, index=d_norm.index, columns=d_norm.index)
dx_norm

In [None]:
# Single-linkage dendrogram of the rows of dx_norm.
# dx_norm is a square matrix of pairwise distances. SciPy's `linkage` reads a
# 2-D array as raw observation vectors, so the matrix is converted to the
# condensed 1-D form that `linkage` accepts as precomputed distances.
from scipy.spatial.distance import squareform

# checks=False skips the symmetry / zero-diagonal validation, so floating-point
# noise in the computed distances cannot raise.
Z = linkage(squareform(dx_norm.values, checks=False), method='single')
# color_threshold is measured on the same scale as the merge heights in Z;
# adjust it if the distances change.
dendrogram(Z, labels=dx_norm.index, color_threshold=2.75)

In [None]:
# Average-linkage dendrogram of the rows of dx_norm.
# dx_norm is a square matrix of pairwise distances. SciPy's `linkage` reads a
# 2-D array as raw observation vectors, so the matrix is converted to the
# condensed 1-D form that `linkage` accepts as precomputed distances.
from scipy.spatial.distance import squareform

# checks=False skips the symmetry / zero-diagonal validation, so floating-point
# noise in the computed distances cannot raise.
Z = linkage(squareform(dx_norm.values, checks=False), method='average')
# color_threshold is measured on the same scale as the merge heights in Z;
# adjust it if the distances change.
dendrogram(Z, labels=dx_norm.index, color_threshold=3.6)

In [None]:
# Cut the single-linkage tree into at most 6 flat clusters and print the
# members of each cluster.
# dx_norm is a square matrix of pairwise distances, so it is handed to
# `linkage` in condensed form; a 2-D array would be read as observation vectors.
from scipy.spatial.distance import squareform

memb = fcluster(
    linkage(squareform(dx_norm.values, checks=False), method='single'),
    6,
    criterion='maxclust',
)
memb = pd.Series(memb, index=dx_norm.index)
for key, item in memb.groupby(memb):
    print(key, ': ', ', '.join(item.index))

In [None]:
# Cut the average-linkage tree into at most 6 flat clusters and print the
# members of each cluster.
# dx_norm is a square matrix of pairwise distances, so it is handed to
# `linkage` in condensed form; a 2-D array would be read as observation vectors.
from scipy.spatial.distance import squareform

memb = fcluster(
    linkage(squareform(dx_norm.values, checks=False), method='average'),
    6,
    criterion='maxclust',
)
memb = pd.Series(memb, index=dx_norm.index)
for key, item in memb.groupby(memb):
    print(key, ': ', ', '.join(item.index))

In [None]:
# Prefix every row label with its cluster number from `memb`, giving
# "<cluster>: <label>". Only the index changes; the column labels are untouched.
# NOTE(review): this overwrites dx_norm.index, so running the cell again stacks
# another prefix on labels that are already prefixed.
dx_norm.index = ['{}: {}'.format(cluster, state) for cluster, state in zip(memb, dx_norm.index)]

In [None]:
# Heatmap of dx_norm with the rows clustered by average linkage; column
# clustering is switched off.
# the '_r' suffix reverses the color mapping to large = dark
sns.clustermap(dx_norm, method='average', col_cluster=False, cmap='mako_r');

In [None]:
# K-means with 6 clusters and a fixed seed, fitted on the rows of dx_norm.
kmedias = KMeans(n_clusters=6, random_state=0)
kmedias.fit(dx_norm)
# Cluster label of every row, then the members of each cluster on one line.
memb = pd.Series(kmedias.labels_, index=dx_norm.index)
for key, item in memb.groupby(memb):
    print(key, ': ', ', '.join(item.index))

In [None]:
# Cluster centers as a frame: one row per cluster, columns named like dx_norm's.
centroides = pd.DataFrame(kmedias.cluster_centers_, columns=dx_norm.columns)
# Preview the first five centroids.
centroides.head(5)

In [None]:
# First step towards the within-cluster sum of squared distances:
# calculate the distances of each data point to the cluster centers
distances = kmedias.transform(dx_norm)
distances

In [None]:
# For each data point, take the smallest distance across the cluster centers
# (row-wise minimum) and square it.
minSquaredDistances = distances.min(axis=1) ** 2
minSquaredDistances

In [None]:
# Combine the squared distances with the K-means cluster labels in one frame,
# indexed like dx_norm.
df = pd.DataFrame({'squaredDistance': minSquaredDistances, 'cluster': kmedias.labels_},index=dx_norm.index)
df.head()

In [None]:
# For every cluster, print its member count and the sum of the squared
# distances of its members (within-cluster sum of squares).
for cluster, data in df.groupby('cluster'):
    count = len(data)
    withinClustSS = data.squaredDistance.sum()
    print(f'Cluster {cluster} ({count} members): {withinClustSS:.2f} dentro del cluster ')

In [None]:
# Parallel-coordinates plot of the centroids, one line per cluster.
# A 'cluster' label column is added to `centroides` to act as the class column.
centroides['cluster'] = ['Cluster {}'.format(i) for i in centroides.index]
plt.figure(figsize=(10,6))
parallel_coordinates(centroides, class_column='cluster', colormap='Dark2', linewidth=5)
# Anchor the legend's centre-left corner at the right edge of the axes.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5));

In [None]:
# Small hand-written table: one row per user, one rating column per title.
# `data` is the same table without the 'user' column.
# NOTE(review): `movies` and `data` are not referenced by any later cell
# visible in this notebook.
ratings = [['john',5,5,2,1],['mary',4,5,3,2],['bob',4,4,4,3],['lisa',2,2,4,5],['lee',1,2,3,4],['harry',2,1,5,5]]
titles = ['user','Jaws','Star Wars','Exorcist','Omen']
movies = pd.DataFrame(ratings,columns=titles)
data = movies.drop('user',axis=1)
data.head()

In [None]:
from sklearn import cluster

# Elbow curve: fit K-means on dx_norm for k = 1..6 and record the
# within-cluster sum of squares (`inertia_`) of every fit.
numClusters = [1,2,3,4,5,6]
SSE = []
for k in numClusters:
    # Fixed seed (the same one as the 6-cluster fit earlier in the notebook)
    # so the curve is identical on every run.
    k_means = cluster.KMeans(n_clusters=k, random_state=0)
    k_means.fit(dx_norm)
    SSE.append(k_means.inertia_)

plt.plot(numClusters, SSE)
plt.xlabel('Number of Clusters')
plt.ylabel('SSE');

---
<center><b>© Jorge Zavaleta, 2024</b></center>