# Multivariate Fuzzy C-means method: Implementation

## Equations

### $J= \sum_{i=1}^{c} \sum_{k=1}^{n} \sum_{j=1}^{p} \left(u_{ijk} \right)^{m} d_{ijk}$ - Objective function to minimize.

### $d_{ijk} = \left(x_{jk} - y_{ij} \right)^{2}$ - euclidian distance squared.

### $ y_{ij} = \frac{\sum_{k=1}^{n} \left(u_{ijk} \right)^{m} x_{jk}} {\sum_{k=1}^{n} \left(u_{ijk}\right)^{m}}$ - prototype coordinate of a given cluster in feature j.

### $ u_{ijk} =  \left[\sum_{h=1}^{c}\sum_{l=1}^{p} \left(\frac{d_{ijk}}{d_{hlk}}\right)^{(1/(m-1))}  \right]^{-1} $ - membership degree of pattern k in cluster $C_{i}$ on the feature j.

### $\delta_{ik} = \sum_{j=1}^{p} u_{ijk}$ - represents an aggregation measure for all the p features.

## Constraints:

### - $u_{ijk} \in [0, 1]$ for all i, j and k;
### - $0 < \sum_{j=1}^{p} \sum_{k=1}^{n} u_{ijk} < n$ for all i and
### - $\sum_{i=1}^{c}\sum_{j=1}^{p}u_{ijk} = 1$ for all k.

## Importando bibliotecas

In [89]:
import numpy as np
import pandas as pd
import seaborn as sns

## Tratamento de dados

In [90]:
df = sns.load_dataset('iris')
df["species"].replace({"setosa": 0, "versicolor": 1, "virginica": 2}, inplace=True)
df.columns = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm", "Class"]
labels = df["Class"].values
df.drop("Class", axis=1, inplace=True)
dados = df.to_numpy()

## Agrupamento

In [91]:
class MFCM():
    def __init__(self, c, X, m):
        self.c = c
        self.n = X.shape[0]
        self.p = X.shape[1]
        self.m = m
    
    def initialize_u(self):
        return np.random.dirichlet(alpha=np.ones(self.c * self.p),
                                   size=self.n).reshape(self.n, self.c, self.p)
    
    def find_centroides(self, X, U):
        return np.sum((U ** self.m) * X[:, np.newaxis, :], axis=0) / np.sum(U ** self.m, axis=0)
    
    def get_distances(self, X, V): # as vezes tem umas distâncias muito pequenas
        return (X[:, np.newaxis, :] - V[np.newaxis, :, :]) ** 2
    
    def update_u(self, D):
        ratio = (D[:, np.newaxis, np.newaxis, :, :] / D[:, :, :, np.newaxis, np.newaxis]) ** (1/(self.m-1))
        return 1 / (np.sum(ratio, axis=(3, 4)))
    
    def get_objective_function(self, U, D):
        return np.sum((U ** self.m)*D)

## Clustering

In [92]:
def mfcm_run(dados, num_clusters, m=2, max_iter=10**3, epsilon=1e-5):
    mfcm = MFCM(c=num_clusters, X=dados, m=m) # create the MFCM object
    
    U = mfcm.initialize_u() # initialize the membership matrix
    
    for _ in range(max_iter):
        centroids = mfcm.find_centroides(dados, U)
        D = mfcm.get_distances(dados, centroids)
        new_U = mfcm.update_u(D)
        if np.linalg.norm(U - new_U) < epsilon:
            break
        U = new_U
    
    Delta = np.sum(U, axis=2)  # summing over the second axis (variables `j`)
    
    return centroids, U, Delta

## Fuzzyficação dos labels

In [93]:
def crisp_to_fuzzy(dados, y, n_clusters, m=2):
    mfcm = MFCM(c=n_clusters, X=dados, m=m)  # Criar objeto MFCM
    
    # Calcular os protótipos reais (médias das amostras de cada classe)
    prototipos = np.array([dados[y == c].mean(axis=0) for c in range(n_clusters)])
    
    # Calcular distâncias entre os pontos e os protótipos
    distancias = mfcm.get_distances(dados, prototipos)
    distancias = np.fmax(distancias, np.finfo(np.float64).eps)  # Evitar divisão por zero
    
    # Atualizar matriz de pertinência com a fórmula do MFCM
    matriz_pertinencia = mfcm.update_u(distancias)
    
    # Somar as pertinências sobre as variáveis (eixo 2) para obter pertinências por classe
    matriz_pertinencia_final = np.sum(matriz_pertinencia, axis=2)  
    
    return matriz_pertinencia_final



labels = crisp_to_fuzzy(dados, labels, 3)
labels

array([[5.91764136e-04, 3.16296805e-01, 6.83111431e-01],
       [5.85950088e-03, 3.09194732e-01, 6.84945767e-01],
       [4.70910466e-03, 3.17821853e-01, 6.77469043e-01],
       [7.93587532e-03, 3.09960717e-01, 6.82103408e-01],
       [1.03025761e-03, 3.18878107e-01, 6.80091636e-01],
       [1.57064159e-02, 3.07538189e-01, 6.76755395e-01],
       [4.79656050e-03, 3.17602314e-01, 6.77601125e-01],
       [1.34277354e-04, 3.11427567e-01, 6.88438156e-01],
       [1.72290722e-02, 3.12449613e-01, 6.70321315e-01],
       [4.23412365e-03, 3.08120031e-01, 6.87645845e-01],
       [7.28305731e-03, 3.14051639e-01, 6.78665304e-01],
       [1.98542835e-03, 3.09344170e-01, 6.88670402e-01],
       [7.07957067e-03, 3.11268267e-01, 6.81652163e-01],
       [1.90330556e-02, 3.24484226e-01, 6.56482718e-01],
       [2.76899906e-02, 3.27495489e-01, 6.44814520e-01],
       [4.20457138e-02, 3.24056308e-01, 6.33897979e-01],
       [1.23681302e-02, 3.23183710e-01, 6.64448160e-01],
       [6.26019541e-04, 3.15241

## Índice de Rand Difuso

In [94]:
def pertinence_distance(delta_k, delta_k_linha, c):
    # calcula a distância entre δ_k e δ_k' (matrizes de pertinência)
    return (1/c) * np.sum((delta_k - delta_k_linha) ** 2)

def fuzzy_rand_index(particao1, particao2, c):
    n = particao1.shape[0]
    total_sum = 0

    for k in range(n):
        for k_linha in range(k+1, n):
            if k != k_linha:
                # calcula a métrica para P
                delta_k = particao1[k]
                delta_k_prime = particao1[k_linha]
                EP = 1 - pertinence_distance(delta_k, delta_k_prime, c)

                # calcula a métrica para Q
                delta_k_Q = particao2[k]
                delta_k_prime_Q = particao2[k_linha]
                EQ = 1 - pertinence_distance(delta_k_Q, delta_k_prime_Q, c)

                total_sum += np.abs(EP - EQ) # soma a diferença absoluta entre EP e EQ

    denominador = n * (n - 1) / 2
    if denominador == 0:
        raise ValueError

    return 1- (total_sum / denominador)

## Simulação de Monte Carlo

In [95]:
def monte_carlo_simulation(dados, labels, num_clusters, num_trials):
    results = []
    for _ in range(num_trials):
        centroids, U, Delta = mfcm_run(dados, num_clusters)
        fr = fuzzy_rand_index(labels, Delta, num_clusters)
        results.append(fr)
    mean_fr = np.mean(results)
    std_fr = np.std(results)
    return mean_fr, std_fr

In [96]:
num_clusters = 3
num_trials = 100
mean_fr, std_fr = monte_carlo_simulation(dados, labels, num_clusters, num_trials)

print(f"Mean ARI: {mean_fr}")
print(f"Std ARI: {std_fr}")

Mean ARI: 0.9056141665900617
Std ARI: 0.028310422618390257
