In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
import time
import os
from sklearn.metrics import silhouette_score

In [11]:

folder_path = r"C:\Users\tere1\OneDrive\Escritorio\TFM\datos_img"
# Cargar las características normalizadas por Z-score
df_shape_z = pd.read_csv(os.path.join(folder_path, "features_shape_all_zscore.csv"))
# Cargar las características normalizadas por Min-Max
df_shape_mm = pd.read_csv(os.path.join(folder_path, "features_shape_all_minmax.csv"))


In [12]:
#Se eliminan los datos no numéricos: 'label' y 'filename'
df_shape_z_cl= df_shape_z.drop(columns=['filename', 'label'])

In [13]:
def _get_init_centers(n_clusters, n_samples):
    '''Return random unique indices as initial medoid centers'''
    return np.random.choice(n_samples, size=n_clusters, replace=False).tolist()

def _get_distance(data1, data2):
    '''Euclidean distance function'''
    return np.sqrt(np.sum((data1 - data2) ** 2))

def _get_cost(X, centers_id, dist_func):
    '''Return members, per-cluster costs, total cost, and distance matrix'''
    dist_mat = np.zeros((len(X), len(centers_id)))
    for j, center_id in enumerate(centers_id):
        center = X[center_id, :]
        for i in range(len(X)):
            if i == center_id:
                dist_mat[i, j] = 0.
            else:
                dist_mat[i, j] = dist_func(X[i, :], center)

    mask = np.argmin(dist_mat, axis=1)
    members = mask.copy()
    costs = np.array([np.sum(dist_mat[mask == i, i]) for i in range(len(centers_id))])
    return members, costs, np.sum(costs), dist_mat

def _kmedoids_run(X, n_clusters, dist_func, max_iter=1000, tol=0.001, verbose=True):
    n_samples = X.shape[0]
    centers = _get_init_centers(n_clusters, n_samples)
    if verbose:
        print("Initial centers are", centers)

    members, costs, tot_cost, dist_mat = _get_cost(X, centers, dist_func)
    cc, swapped = 0, True

    while swapped and cc < max_iter:
        swapped = False
        for i in range(n_samples):
            if i not in centers:
                for j in range(len(centers)):
                    new_centers = deepcopy(centers)
                    new_centers[j] = i
                    members_, costs_, tot_cost_, dist_mat_ = _get_cost(X, new_centers, dist_func)
                    if tot_cost - tot_cost_ > tol:
                        members, costs, tot_cost, dist_mat = members_, costs_, tot_cost_, dist_mat_
                        centers = new_centers
                        swapped = True
                        if verbose:
                            print("Change centers to", centers)
        cc += 1

    if verbose:
        if not swapped:
            print("End Searching by no swaps")
        else:
            print("End Searching by reaching maximum iteration", max_iter)

    return centers, members, costs, tot_cost, dist_mat, cc

class KMedoids:
    '''
    K-Medoids Clustering (PAM) Algorithm

    Parameters
    ----------
    n_clusters : int
        Number of clusters
    dist_func : callable
        Distance function (default: Euclidean)
    max_iter : int
        Maximum number of iterations
    tol : float
        Tolerance to stop swapping
    '''
    def __init__(self, n_clusters, dist_func=_get_distance, max_iter=1000, tol=0.001):
        self.n_clusters = n_clusters
        self.dist_func = dist_func
        self.max_iter = max_iter
        self.tol = tol

    def fit(self, X, plotit=True, verbose=True):
        self.X_train = X
        result = _kmedoids_run(
            X, self.n_clusters, self.dist_func, self.max_iter, self.tol, verbose
        )
        self.centers_, self.labels_, self.costs_, self.total_cost_, self.dist_mat_, self.n_iter_ = result

        if plotit and X.shape[1] == 2:
            self._plot_clusters(X)
        elif plotit:
            print("Plotting only supported for 2D data.")

    def predict(self, X):
        if not hasattr(self, "centers_"):
            raise RuntimeError("Model has not been fitted.")
        labels = np.zeros(len(X), dtype=int)
        for i, x in enumerate(X):
            dists = [self.dist_func(x, self.X_train[c]) for c in self.centers_]
            labels[i] = np.argmin(dists)
        return labels

    def _plot_clusters(self, X):
        plt.figure(figsize=(8, 6))
        cmap = plt.get_cmap('tab10')
        for i in range(self.n_clusters):
            cluster_points = X[self.labels_ == i]
            plt.scatter(cluster_points[:, 0], cluster_points[:, 1], 
                        color=cmap(i), alpha=0.5, label=f'Cluster {i}')
            plt.scatter(X[self.centers_[i], 0], X[self.centers_[i], 1],
                        color=cmap(i), marker='*', s=300, edgecolor='k')
        plt.title("K-Medoids Clustering")
        plt.legend()
        plt.grid(True)
        plt.show()

In [14]:
# Convertir el DataFrame a un arreglo NumPy (asegúrate de que solo contiene variables numéricas)
X = df_shape_z_cl.values  

# Crear modelo con el número de clusters que estimes apropiado
model = KMedoids(n_clusters=3)  

# Ajustar el modelo
model.fit(X, plotit=False)  # Desactiva plotit si los datos no son 2D

# Ver resultados
print("Medoides:", model.centers_)
print("Etiquetas:", model.labels_)

Initial centers are [362, 127, 287]
Change centers to [0, 127, 287]
Change centers to [0, 6, 287]
Change centers to [0, 6, 46]
Change centers to [0, 89, 46]
Change centers to [172, 89, 46]
Change centers to [172, 196, 46]
Change centers to [172, 196, 417]
Change centers to [172, 196, 456]
Change centers to [172, 196, 516]
Change centers to [172, 196, 534]
End Searching by no swaps
Medoides: [172, 196, 534]
Etiquetas: [0 1 0 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 0 1 1 1 0 1 1 1 1 0 0 1 0 1
 1 0 0 1 1 0 0 0 1 2 0 0 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 0 1 0
 0 1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 0 0 1 1 1 0 1 1 0 1 0 0 1 1 1 0 1 1 1 1
 1 1 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 0 1 0 0 1 1 1 1 1 1 1
 1 0 1 0 0 1 1 1 0 1 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
 0 0 0 0 1 0 0 0 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 0 1 0 1 0 1 1 1
 0 0 1 0 1 1 1 1 1 1 0 0 1 1 1 1 0 0 0 0 0 1 1 1 1 1 0 0 1 0 0 0 1 1 0 0 0
 0 1 1 1 1 0 1 0 0 1 0 2 1 1 0 1 1 0 2 0 0 0 2 0 0 0 1

In [15]:
score = silhouette_score(X, model.labels_)
print(f"Coeficiente de silueta: {score:.4f}")


Coeficiente de silueta: 0.2296


In [16]:
#Se eliminan los datos no numéricos: 'label' y 'filename'
df_shape_mm_cl= df_shape_mm.drop(columns=['filename', 'label'])

In [19]:
# Convertir el DataFrame a un arreglo NumPy (asegúrate de que solo contiene variables numéricas)
X = df_shape_mm_cl.values  

# Crear modelo con el número de clusters que estimes apropiado
model = KMedoids(n_clusters=4)  

# Ajustar el modelo
model.fit(X, plotit=False)  # Desactiva plotit si los datos no son 2D

# Ver resultados
print("Medoides:", model.centers_)
print("Etiquetas:", model.labels_)

Initial centers are [116, 355, 352, 471]
Change centers to [0, 355, 352, 471]
Change centers to [0, 1, 352, 471]
Change centers to [2, 1, 352, 471]
Change centers to [5, 1, 352, 471]
Change centers to [6, 1, 352, 471]
Change centers to [6, 7, 352, 471]
Change centers to [6, 13, 352, 471]
Change centers to [6, 16, 352, 471]
Change centers to [6, 24, 352, 471]
Change centers to [28, 24, 352, 471]
Change centers to [28, 24, 352, 42]
Change centers to [28, 24, 46, 42]
Change centers to [28, 24, 46, 49]
Change centers to [28, 64, 46, 49]
Change centers to [28, 75, 46, 49]
Change centers to [28, 87, 46, 49]
Change centers to [90, 87, 46, 49]
Change centers to [98, 87, 46, 49]
Change centers to [98, 118, 46, 49]
Change centers to [98, 196, 46, 49]
Change centers to [98, 196, 46, 218]
Change centers to [98, 196, 46, 476]
Change centers to [98, 496, 46, 476]
End Searching by no swaps
Medoides: [98, 496, 46, 476]
Etiquetas: [3 1 0 3 0 0 0 3 0 0 0 1 0 3 0 1 1 1 0 1 0 1 0 0 1 0 0 1 0 1 0 1 0 0 1 3

In [20]:
score = silhouette_score(X, model.labels_)
print(f"Coeficiente de silueta: {score:.4f}")

Coeficiente de silueta: 0.2360
