# Multivariate Fuzzy C-means method: Implementation

## Equations

### $J= \sum_{i=1}^{c} \sum_{k=1}^{n} \sum_{j=1}^{p} \left(u_{ijk} \right)^{m} d_{ijk}$ - Objective function to minimize.

### $d_{ijk} = \left(x_{jk} - y_{ij} \right)^{2}$ - euclidian distance squared.

### $q = \argmin_{1 \le i \le c} \sum_{j=1}^p \sum_{k=1}^n (u_{ijk})^m \cdot d_{ijk}$ - prototype coordinate of a given cluster in feature j.

### $ u_{ijk} =  \left[\sum_{h=1}^{c}\sum_{l=1}^{p} \left(\frac{d_{ijk}}{d_{hlk}}\right)^{(1/(m-1))}  \right]^{-1} $ - membership degree of pattern k in cluster $C_{i}$ on the feature j.

### $\delta_{ik} = \sum_{j=1}^{p} u_{ijk}$ - represents an aggregation measure for all the p features.

## Constraints:

### - $u_{ijk} \in [0, 1]$ for all i, j and k;
### - $0 < \sum_{j=1}^{p} \sum_{k=1}^{n} u_{ijk} < n$ for all i and
### - $\sum_{i=1}^{c}\sum_{j=1}^{p}u_{ijk} = 1$ for all k.

## Importando bibliotecas

In [19]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_rand_score
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns

## Tratamento dos dados

In [20]:
df = sns.load_dataset('iris')
df["species"].replace({"setosa": 0, "versicolor": 1, "virginica": 2}, inplace=True)
df.columns = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm", "Class"]
labels = df["Class"].values
df.drop("Class", axis=1, inplace=True)
dados = df.to_numpy()

## Método de agrupamento

In [21]:
class MFCMdd():
    def __init__(self, c, X, m):
        self.c = c
        self.n = X.shape[0]
        self.p = X.shape[1]
        self.m = m
    
    def initialize_u(self):
        return np.random.dirichlet(alpha=np.ones(self.c * self.p),
                                   size=self.n).reshape(self.n, self.c, self.p)
    
    def find_medoids(self, X, U, D):
        medoids = np.zeros((self.c, self.p))  # matriz para armazenar os medoids
        for i in range(self.c):  # para cada cluster
            weighted_distances = np.sum(np.sum((U[:, i, :] ** self.m) * D[:, i, :], axis=2), axis=1) # soma ponderada das distâncias para cada ponto k, somando sobre todas as variáveis j
            medoid_index = np.argmin(weighted_distances) # índice do ponto que minimiza a soma ponderada das distâncias
            medoids[i, :] = X[medoid_index, :] # atribuir o ponto correspondente como medoid (para todas as variáveis simultaneamente)
        return medoids

    
    def get_distances(self, X, V): # as vezes tem umas distâncias muito pequenas
        return (X[:, np.newaxis, :] - V[np.newaxis, :, :]) ** 2
    
    def update_u(self, D):
        ratio = (D[:, np.newaxis, np.newaxis, :, :] / D[:, :, :, np.newaxis, np.newaxis]) ** (1/(self.m-1))
        return 1 / (np.sum(ratio, axis=(3, 4)))
    
    def get_objective_function(self, U, D):
        return np.sum((U ** self.m)*D)

# Clustering

In [33]:
def mfcmdd_run(dados, num_clusters, m=2, max_iter=10**3, epsilon=1e-5): # MUDAR
    mfcmdd = MFCMdd(c=num_clusters, X=dados, m=m) # create the MFCM object
    
    U = mfcmdd.initialize_u() # initialize the membership matrix
    
    for _ in range(max_iter):
        medoids = mfcmdd.find_medoids(dados, U, D) # quem vem primeiro? medoids ou distância? como ficaria o outro?
        D = mfcmdd.get_distances(dados, medoids)
        new_U = mfcmdd.update_u(D)
        if np.linalg.norm(U - new_U) < epsilon:
            break
        U = new_U
    
    Delta = np.sum(U, axis=2)  # summing over the second axis (variables `j`)
    
    return medoids, U, Delta

## Simulação de Monte Carlo

In [23]:
def monte_carlo_simulation(dados, labels, num_clusters, num_trials):
    results = []
    for _ in range(num_trials):
        print(_)
        centroids, U, Delta = mfcmdd_run(dados, num_clusters)
        predicted_labels = np.argmax(Delta, axis=1)
        ari = adjusted_rand_score(labels, predicted_labels)
        results.append(ari)
    mean_rand_index = np.mean(results)
    std_rand_index = np.std(results)
    return mean_rand_index, std_rand_index

In [32]:
num_clusters = 3
num_trials = 100
mean_rand_index, std_rand_index = monte_carlo_simulation(dados, labels, num_clusters, num_trials)

print(f"Mean ARI: {mean_rand_index}")
print(f"Std ARI: {std_rand_index}")

0


UnboundLocalError: cannot access local variable 'D' where it is not associated with a value