In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
import time
import os
from sklearn.metrics import silhouette_score

In [2]:

# Data folder. Set the TFM_DATA_DIR environment variable to run the notebook on
# another machine; when it is not set, the original local path is used.
folder_path = os.environ.get(
    "TFM_DATA_DIR", r"C:\Users\tere1\OneDrive\Escritorio\TFM\datos_img"
)
# Load the colour features normalised with Z-score
df_color_z = pd.read_csv(os.path.join(folder_path, "features_color_all_zscore.csv"))
# Load the colour features normalised with Min-Max
df_color_mm = pd.read_csv(os.path.join(folder_path, "features_color_all_minmax.csv"))


In [3]:
# Remove the non-numeric columns ('label' and 'filename') before clustering
df_color_z_cl = df_color_z.drop(['filename', 'label'], axis="columns")

In [4]:
def _get_init_centers(n_clusters, n_samples):
    '''Return random unique indices as initial medoid centers'''
    return np.random.choice(n_samples, size=n_clusters, replace=False).tolist()

def _get_distance(data1, data2):
    '''Euclidean distance function'''
    return np.sqrt(np.sum((data1 - data2) ** 2))

def _get_cost(X, centers_id, dist_func):
    '''Return members, per-cluster costs, total cost, and distance matrix'''
    dist_mat = np.zeros((len(X), len(centers_id)))
    for j, center_id in enumerate(centers_id):
        center = X[center_id, :]
        for i in range(len(X)):
            if i == center_id:
                dist_mat[i, j] = 0.
            else:
                dist_mat[i, j] = dist_func(X[i, :], center)

    mask = np.argmin(dist_mat, axis=1)
    members = mask.copy()
    costs = np.array([np.sum(dist_mat[mask == i, i]) for i in range(len(centers_id))])
    return members, costs, np.sum(costs), dist_mat

def _kmedoids_run(X, n_clusters, dist_func, max_iter=1000, tol=0.001, verbose=True):
    """Run the swap-based k-medoids search on ``X``.

    Starting from randomly chosen medoids, every non-medoid sample is tried in
    place of every current medoid; a swap is accepted as soon as it lowers the
    total cost by more than ``tol``. Passes over the data are repeated until a
    full pass accepts no swap or ``max_iter`` passes have been made.

    Parameters
    ----------
    X : 2-D array with one sample per row (``X.shape[0]`` samples)
    n_clusters : number of medoids
    dist_func : callable taking two rows and returning their distance
    max_iter : maximum number of full passes over the samples
    tol : minimum cost decrease required to accept a swap
    verbose : print the initial medoids, every accepted swap and the stop reason

    Returns
    -------
    centers : list of row indices of the final medoids
    members : nearest-medoid position for every sample
    costs : per-cluster cost
    tot_cost : total cost
    dist_mat : sample-to-medoid distance matrix
    cc : number of full passes performed
    """
    n_samples = X.shape[0]
    centers = _get_init_centers(n_clusters, n_samples)
    if verbose:
        print("Initial centers are", centers)

    members, costs, tot_cost, dist_mat = _get_cost(X, centers, dist_func)
    cc, swapped = 0, True

    while swapped and cc < max_iter:
        swapped = False
        for i in range(n_samples):
            if i in centers:
                continue
            for j in range(len(centers)):
                # centers is a flat list of ints, so a shallow copy is enough
                new_centers = list(centers)
                new_centers[j] = i
                members_, costs_, tot_cost_, dist_mat_ = _get_cost(X, new_centers, dist_func)
                if tot_cost - tot_cost_ > tol:
                    members, costs, tot_cost, dist_mat = members_, costs_, tot_cost_, dist_mat_
                    centers = new_centers
                    swapped = True
                    if verbose:
                        print("Change centers to", centers)
                    # i is now a medoid. Trying it in the remaining slots would
                    # only build configurations with a duplicated medoid (an
                    # empty cluster), which cannot lower the cost, so stop here.
                    break
        cc += 1

    if verbose:
        if not swapped:
            print("End Searching by no swaps")
        else:
            print("End Searching by reaching maximum iteration", max_iter)

    return centers, members, costs, tot_cost, dist_mat, cc

class KMedoids:
    '''
    K-Medoids Clustering (PAM) Algorithm

    Parameters
    ----------
    n_clusters : int
        Number of clusters
    dist_func : callable
        Distance function (default: Euclidean)
    max_iter : int
        Maximum number of iterations
    tol : float
        Tolerance to stop swapping

    Attributes (set by ``fit``)
    ---------------------------
    centers_ : list of int
        Row indices (into the training data) of the medoids
    labels_ : ndarray
        For each training sample, the position of its medoid in ``centers_``
    costs_ : ndarray
        Per-cluster sum of distances to the medoid
    total_cost_ : float
        Sum of ``costs_``
    dist_mat_ : ndarray
        Sample-to-medoid distance matrix
    n_iter_ : int
        Number of full passes performed by the search
    '''
    def __init__(self, n_clusters, dist_func=_get_distance, max_iter=1000, tol=0.001):
        self.n_clusters = n_clusters
        self.dist_func = dist_func
        self.max_iter = max_iter
        self.tol = tol

    def fit(self, X, plotit=True, verbose=True):
        '''Cluster ``X`` and store the result on the instance.

        ``X`` may be anything ``np.asarray`` turns into a 2-D array (ndarray,
        DataFrame, nested list). When ``plotit`` is true the clusters are
        plotted, which is only possible for two-column data. Returns ``self``
        so the call can be chained.
        '''
        # The search indexes rows as X[i, :], which needs a real ndarray.
        X = np.asarray(X)
        self.X_train = X
        result = _kmedoids_run(
            X, self.n_clusters, self.dist_func, self.max_iter, self.tol, verbose
        )
        (self.centers_, self.labels_, self.costs_,
         self.total_cost_, self.dist_mat_, self.n_iter_) = result

        if plotit and X.shape[1] == 2:
            self._plot_clusters(X)
        elif plotit:
            print("Plotting only supported for 2D data.")
        return self

    def predict(self, X):
        '''Return, for each row of ``X``, the position of its nearest medoid.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called yet.
        '''
        if not hasattr(self, "centers_"):
            raise RuntimeError("Model has not been fitted.")
        X = np.asarray(X)
        # Look the medoid rows up once instead of once per sample.
        medoids = [self.X_train[c] for c in self.centers_]
        labels = np.zeros(len(X), dtype=int)
        for i, x in enumerate(X):
            dists = [self.dist_func(x, medoid) for medoid in medoids]
            labels[i] = np.argmin(dists)
        return labels

    def _plot_clusters(self, X):
        '''Scatter-plot two-column data coloured by cluster; medoids drawn as stars.'''
        fig, ax = plt.subplots(figsize=(8, 6))
        cmap = plt.get_cmap('tab10')
        for i in range(self.n_clusters):
            cluster_points = X[self.labels_ == i]
            ax.scatter(cluster_points[:, 0], cluster_points[:, 1],
                       color=cmap(i), alpha=0.5, label=f'Cluster {i}')
            ax.scatter(X[self.centers_[i], 0], X[self.centers_[i], 1],
                       color=cmap(i), marker='*', s=300, edgecolor='k')
        ax.set_title("K-Medoids Clustering")
        ax.legend()
        ax.grid(True)
        plt.show()

In [5]:
# Convert the DataFrame to a NumPy array (it must contain numeric columns only)
X = df_color_z_cl.to_numpy()

# Seed NumPy's global RNG: the initial medoids are drawn with np.random.choice,
# so without a fixed seed every run of the notebook gives a different result.
np.random.seed(42)

# Create the model with the chosen number of clusters
model = KMedoids(n_clusters=3)

# Fit the model
model.fit(X, plotit=False)  # plotting is only supported for 2-D data

# Show the results
print("Medoides:", model.centers_)
print("Etiquetas:", model.labels_)

Initial centers are [366, 121, 175]
Change centers to [366, 121, 1]
Change centers to [366, 121, 3]
Change centers to [6, 121, 3]
Change centers to [11, 121, 3]
Change centers to [11, 121, 13]
Change centers to [11, 121, 37]
Change centers to [11, 121, 64]
Change centers to [77, 121, 64]
Change centers to [77, 86, 64]
Change centers to [77, 86, 87]
Change centers to [77, 86, 136]
Change centers to [77, 304, 136]
Change centers to [77, 304, 405]
End Searching by no swaps
Medoides: [77, 304, 405]
Etiquetas: [0 0 1 2 2 0 0 0 2 1 0 0 1 2 2 0 0 0 1 1 0 1 2 1 1 2 1 1 1 2 0 2 1 1 0 2 2
 2 1 2 1 0 1 2 1 0 0 1 0 0 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 1 2 0 2 1 0 1
 1 0 1 0 0 0 1 1 1 2 1 2 1 2 2 0 2 2 2 2 2 2 1 2 2 2 1 1 1 2 1 2 2 1 2 2 1
 1 1 1 1 1 1 1 1 1 1 1 0 0 2 1 2 2 2 0 2 1 2 2 1 2 2 1 2 2 1 2 2 0 2 1 2 2
 2 2 1 2 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 2 0 1 0 2 2 1 0 0 0 0 0
 0 0 0 0 0 1 2 1 2 2 2 2 0 0 0 0 2 0 1 2 2 0 0 2 0 2 1 2 2 0 2 2 2 2 2 0 0
 0 1 0 0 0 0 0 0 2 2 0 2 2 1 2 2 2 0 1 

In [6]:
# Mean silhouette coefficient of the clustering currently held in `model`
score = silhouette_score(X, labels=model.labels_)
print(f"Coeficiente de silueta: {score:.4f}")


Coeficiente de silueta: 0.3363


In [7]:
# Remove the non-numeric columns ('label' and 'filename') before clustering
df_color_mm_cl = df_color_mm.drop(['filename', 'label'], axis="columns")

In [8]:
# Convert the DataFrame to a NumPy array (it must contain numeric columns only)
X = df_color_mm_cl.to_numpy()

# Seed NumPy's global RNG: the initial medoids are drawn with np.random.choice,
# so without a fixed seed every run of the notebook gives a different result.
np.random.seed(42)

# Create the model with the chosen number of clusters
model = KMedoids(n_clusters=4)

# Fit the model
model.fit(X, plotit=False)  # plotting is only supported for 2-D data

# Show the results
print("Medoides:", model.centers_)
print("Etiquetas:", model.labels_)

Initial centers are [422, 278, 41, 370]
Change centers to [0, 278, 41, 370]
Change centers to [1, 278, 41, 370]
Change centers to [1, 278, 41, 3]
Change centers to [1, 278, 6, 3]
Change centers to [13, 278, 6, 3]
Change centers to [13, 278, 6, 14]
Change centers to [13, 18, 6, 14]
Change centers to [22, 18, 6, 14]
Change centers to [22, 33, 6, 14]
Change centers to [22, 33, 6, 37]
Change centers to [22, 33, 41, 37]
Change centers to [22, 33, 41, 62]
Change centers to [83, 33, 41, 62]
Change centers to [83, 86, 41, 62]
Change centers to [83, 86, 41, 142]
Change centers to [83, 86, 172, 142]
Change centers to [83, 287, 172, 142]
Change centers to [83, 295, 172, 142]
End Searching by no swaps
Medoides: [83, 295, 172, 142]
Etiquetas: [2 0 3 3 3 2 2 2 3 1 2 2 2 3 3 2 2 2 1 2 2 1 0 1 2 2 3 1 3 0 2 3 1 1 2 3 3
 3 3 3 1 2 1 3 1 2 2 1 2 2 3 3 3 3 3 0 1 2 3 0 3 3 3 3 3 0 3 1 0 0 0 1 2 1
 1 2 1 2 2 2 1 1 1 0 1 3 1 3 3 2 0 3 3 0 3 3 3 3 3 3 2 3 1 3 3 3 3 1 3 3 1
 1 2 1 1 1 2 1 1 1 1 1 0 2 3 1 0 0 

In [9]:
# Mean silhouette coefficient of the clustering currently held in `model`
score = silhouette_score(X, labels=model.labels_)
print(f"Coeficiente de silueta: {score:.4f}")

Coeficiente de silueta: 0.3460
