# Weighted Multivariate Fuzzy C-means method: Implementation

## Equations

### $d_{ijk} = \left(x_{jk} - y_{ij} \right)^{2}$ - euclidian distance squared.

### $ y_{ij} = \frac{\sum_{k=1}^n\left(\frac{u_{ijk}}{\gamma_{ijk}}\right)^mx_{jk}}{\sum_{k=1}^n\left(\frac{u_{ijk}}{\gamma_{ijk}}\right)^m}$ - prototype coordinate of a given cluster $i$ in feature $j$.

### $ u_{ijk} =  \left[\sum_{a=1}^c\sum_{b=1}^p\left(\frac{d_{ijk}}{d_{abk}}\right)^\frac{1}{m-1}\left(\frac{\gamma_{abk}}{\gamma_{ijk}}\right)^\frac{m}{m-1}\right]^{-1} $ - membership degree of pattern $k$ in cluster $C_{i}$ on the feature $j$.

### $\gamma_{ijk} = \frac{\left[(u_{ijk})^md_{ijk}\right]^\frac{1}{m+1}}{\sum_{a=1}^c\sum_{b=1}^p\left[(u_{abk})^md_{abk}\right]^\frac{1}{m+1}u_{abk}}$ - weight of the membership $u_{ijk}$ regarding the cluster $C_i$ and variable $j$

### $\delta_{ik} = \sum_{j=1}^{p} \gamma_{ijk} u_{ijk}$ - represents an aggregation measure for all the $p$ features.

## Constraints:

### - $u_{ijk} \in [0, 1]$ for all i, j and k;
### - $0 < \sum_{j=1}^{p} \sum_{k=1}^{n} u_{ijk} < n$ for all i and
### - $\sum_{i=1}^{c}\sum_{j=1}^{p}u_{ijk} = 1$ for all k.

## Importando bibliotecas

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
import seaborn as sns

## Tratamento dos dados

In [39]:
df = sns.load_dataset('iris')
df["species"].replace({"setosa": 0, "versicolor": 1, "virginica": 2}, inplace=True)
df.columns = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm", "Class"]
labels = df["Class"].values
df.drop("Class", axis=1, inplace=True)
dados = df.to_numpy()

## Método de agrupamento

In [None]:
class MFCM():
    def __init__(self, c, X, m):
        self.c = c
        self.n = X.shape[0]
        self.p = X.shape[1]
        self.m = m
    
    def initialize_u(self):
        return np.random.dirichlet(alpha=np.ones(self.c * self.p),
                                   size=self.n).reshape(self.n, self.c, self.p)
    
    def initialize_gamma(self):
        return np.ones((self.n, self.c, self.p))
    
    def find_centroides(self, X, U, Gamma):
        u_gamma = U / np.fmax(Gamma, np.finfo(np.float64).eps) # U / Gamma
        u_gamma_m = u_gamma ** self.m # (U / Gamma)^m
        
        numerador = u_gamma_m * X[:, np.newaxis, :] # (U / Gamma)^m * x_jk
        numerador_sum = np.sum(numerador, axis=0) # somatório em k
        
        denominador_sum = np.sum(u_gamma_m, axis=0) # somatório em k
        denominador_sum = np.fmax(denominador_sum, np.finfo(np.float64).eps) # impede que seja por zero
        
        centroides = numerador_sum / denominador_sum  
        return centroides
    
    def get_distances(self, X, V):
        return (X[:, np.newaxis, :] - V[np.newaxis, :, :]) ** 2
    
    ## TODO: implementar update_u corretamente
    
    def update_u(self, D):
        ratio = (D[:, np.newaxis, np.newaxis, :, :] / D[:, :, :, np.newaxis, np.newaxis]) ** (1/(self.m-1))
        return 1 / (np.sum(ratio, axis=(3, 4)))
    
    ## TODO: implementar update_gamma

## Clustering

In [None]:
def mfcm_run(dados, num_clusters, m=2, max_iter=10**3, epsilon=1e-5):
    mfcm = MFCM(c=num_clusters, X=dados, m=m) # create the MFCM object
    
    U = mfcm.initialize_u() # initialize the membership matrix
    Gamma = mfcm.initialize_gamma() # initialize the weight matrix
    
    for _ in range(max_iter):
        centroids = mfcm.find_centroides(dados, U, Gamma)
        D = mfcm.get_distances(dados, centroids)
        new_U = mfcm.update_u(D)
        Gamma = mfcm.update_gamma(U, D)
        if np.linalg.norm(U - new_U) < epsilon:
            break
        U = new_U

    ## TODO: calcular Delta corretamente
    
    Delta = np.sum(U, axis=2) 
    
    return centroids, U, Delta

## Simulação de Monte Carlo

In [None]:
def monte_carlo_simulation(dados, labels, num_clusters, num_trials):
    results_ari = []
    results_ami = []
    for _ in range(num_trials):
        centroids, U, Delta = mfcm_run(dados, num_clusters)
        predicted_labels = np.argmax(Delta, axis=1)
        ari = adjusted_rand_score(labels, predicted_labels)
        ami = adjusted_mutual_info_score(labels, predicted_labels)
        results_ari.append(ari)
        results_ami.append(ami)
    mean_ari = np.mean(results_ari)
    std_ari = np.std(results_ari)
    mean_ami = np.mean(results_ami)
    std_ami = np.std(results_ami)
    return mean_ari, std_ari, mean_ami, std_ami

In [None]:
num_clusters = 3
num_trials = 100
mean_ari, std_ari, mean_ami, std_ami = monte_carlo_simulation(dados, labels, num_clusters, num_trials)

print(f"Mean ARI: {mean_ari}")
print(f"Std ARI: {std_ari}")
print("\n")
print(f"Mean AMI: {mean_ami}")
print(f"Std AMI: {std_ami}")

Mean ARI: 0.5469360324949011
Std ARI: 0.03158768950024982
