# Weighted Multivariate Fuzzy C-means method: Implementation

## Equations

### $d_{ijk} = \left(x_{jk} - y_{ij} \right)^{2}$ - euclidian distance squared.

### $ y_{ij} = \frac{\sum_{k=1}^n\left(\frac{u_{ijk}}{\gamma_{ijk}}\right)^mx_{jk}}{\sum_{k=1}^n\left(\frac{u_{ijk}}{\gamma_{ijk}}\right)^m}$ - prototype coordinate of a given cluster $i$ in feature $j$.

### $ u_{ijk} =  \left[\sum_{a=1}^c\sum_{b=1}^p\left(\frac{d_{ijk}}{d_{abk}}\right)^\frac{1}{m-1}\left(\frac{\gamma_{abk}}{\gamma_{ijk}}\right)^\frac{m}{m-1}\right]^{-1} $ - membership degree of pattern $k$ in cluster $C_{i}$ on the feature $j$.

### $\gamma_{ijk} = \frac{\left[(u_{ijk})^md_{ijk}\right]^\frac{1}{m+1}}{\sum_{a=1}^c\sum_{b=1}^p\left[(u_{abk})^md_{abk}\right]^\frac{1}{m+1}u_{abk}}$ - weight of the membership $u_{ijk}$ regarding the cluster $C_i$ and variable $j$

### $\delta_{ik} = \sum_{j=1}^{p} \gamma_{ijk} u_{ijk}$ - represents an aggregation measure for all the $p$ features.

## Constraints:

### - $u_{ijk} \in [0, 1]$ for all i, j and k;
### - $0 < \sum_{j=1}^{p} \sum_{k=1}^{n} u_{ijk} < n$ for all i and
### - $\sum_{i=1}^{c}\sum_{j=1}^{p}u_{ijk} = 1$ for all k.

## Importando bibliotecas

In [7]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
import seaborn as sns

## Tratamento dos dados

In [8]:
df = pd.read_csv('/Users/thomazaraujo/Documents/CIn-UFPE/PIBIC/Fuzzy_Clustering/datasets/seeds.csv')
df = df.rename(columns={'V8': 'Class'})
df["Class"].replace({1: 0, 2: 1, 3: 2}, inplace=True)
labels = df["Class"].values
df.drop("Class", axis=1, inplace=True)
dados = df.to_numpy()
dados 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Class"].replace({1: 0, 2: 1, 3: 2}, inplace=True)


array([[15.26  , 14.84  ,  0.871 , ...,  3.312 ,  2.221 ,  5.22  ],
       [14.88  , 14.57  ,  0.8811, ...,  3.333 ,  1.018 ,  4.956 ],
       [14.29  , 14.09  ,  0.905 , ...,  3.337 ,  2.699 ,  4.825 ],
       ...,
       [13.2   , 13.66  ,  0.8883, ...,  3.232 ,  8.315 ,  5.056 ],
       [11.84  , 13.21  ,  0.8521, ...,  2.836 ,  3.598 ,  5.044 ],
       [12.3   , 13.34  ,  0.8684, ...,  2.974 ,  5.637 ,  5.063 ]],
      shape=(210, 7))

## Método de agrupamento

In [9]:
class MFCM():
    def __init__(self, c, X, m):
        self.c = c
        self.n = X.shape[0]
        self.p = X.shape[1]
        self.m = m
    
    def initialize_u(self):
        return np.random.dirichlet(alpha=np.ones(self.c * self.p),
                                   size=self.n).reshape(self.n, self.c, self.p)
    
    def initialize_gamma(self):
        return np.ones((self.n, self.c, self.p))
    
    def find_centroides(self, X, U, Gamma):
        u_gamma = U / np.fmax(Gamma, np.finfo(np.float64).eps) # U / Gamma
        u_gamma_m = u_gamma ** self.m # (U / Gamma)^m
        
        numerador = u_gamma_m * X[:, np.newaxis, :] # (U / Gamma)^m * x_jk
        numerador_sum = np.sum(numerador, axis=0) # somatório em k
        
        denominador_sum = np.sum(u_gamma_m, axis=0) # somatório em k
        denominador_sum = np.fmax(denominador_sum, np.finfo(np.float64).eps) # impede que seja por zero
        
        centroides = numerador_sum / denominador_sum  
        return centroides
    
    def get_distances(self, X, V):
        return (X[:, np.newaxis, :] - V[np.newaxis, :, :]) ** 2

    def update_u(self, D, Gamma):
        P = 1.0 / (self.m - 1)  # expoente de d: 1 / (m-1)
        P_gamma = self.m * P    # expoente do gamma: m / (m-1)
        
        d_inverso = 1.0 / np.fmax(D, np.finfo(np.float64).eps) # d_ijk^(-1)
        d_elevado = d_inverso ** P # ((d_ijk)^(-1)) ^ P
        Gamma_potencia = Gamma ** P_gamma # (gamma_ijk)^(m / (m-1))
        numerador = d_elevado * Gamma_potencia # (N, C, P)
        
        # somatório feito sobre clusters (axis=1) e variáveis (axis=2)
        denominador = np.sum(numerador, axis=(1, 2), keepdims=True)
        
        U = numerador / denominador
        U = U / np.sum(U, axis=(1, 2), keepdims=True) # normalização final
        
        return U
    
    def update_gamma(self, D, U):
        expoente = 1.0 / (self.m + 1)
        
        d = np.fmax(D, np.finfo(np.float64).eps) # d_ijk
        mult = (U ** self.m) * d # [(u_ijk)^m * d_ijk]
        numerador = mult ** expoente # [(u_ijk)^m * d_ijk] ^ (1/(m+1))
        
        den = numerador * U # [(u_ijk)^m * d_ijk]^(1/(m+1)) * u_ijk
        # somatório feito sobre clusters (axis=1) e variáveis (axis=2)
        denominador = np.sum(den, axis=(1, 2), keepdims=True)
        denominador = np.fmax(denominador, np.finfo(np.float64).eps)
        
        gamma = numerador / denominador
        
        return gamma

## Clustering

In [10]:
def mfcm_run(dados, num_clusters, m=2, max_iter=10**3, epsilon=1e-5):
    mfcm = MFCM(c=num_clusters, X=dados, m=m) # create the MFCM object
    
    U = mfcm.initialize_u() # initialize the membership matrix
    Gamma = mfcm.initialize_gamma() # initialize the weight matrix
    
    for _ in range(max_iter):
        centroids = mfcm.find_centroides(dados, U, Gamma)
        D = mfcm.get_distances(dados, centroids)
        new_U = mfcm.update_u(D, Gamma)
        Gamma = mfcm.update_gamma(U, D)
        if np.linalg.norm(U - new_U) < epsilon:
            break
        U = new_U

    Delta = np.sum(Gamma * U, axis=2) 
    
    return centroids, U, Delta

## Simulação de Monte Carlo

In [11]:
def monte_carlo_simulation(dados, labels, num_clusters, num_trials):
    results_ari = []
    results_ami = []
    for _ in range(num_trials):
        centroids, U, Delta = mfcm_run(dados, num_clusters)
        predicted_labels = np.argmax(Delta, axis=1)
        ari = adjusted_rand_score(labels, predicted_labels)
        ami = adjusted_mutual_info_score(labels, predicted_labels)
        results_ari.append(ari)
        results_ami.append(ami)
    mean_ari = np.mean(results_ari)
    std_ari = np.std(results_ari)
    mean_ami = np.mean(results_ami)
    std_ami = np.std(results_ami)
    return mean_ari, std_ari, mean_ami, std_ami

In [12]:
num_clusters = 3
num_trials = 100
mean_ari, std_ari, mean_ami, std_ami = monte_carlo_simulation(dados, labels, num_clusters, num_trials)

print(f"Mean ARI: {mean_ari}")
print(f"Std ARI: {std_ari}")
print("")
print(f"Mean AMI: {mean_ami}")
print(f"Std AMI: {std_ami}")

Mean ARI: 0.42983959406498895
Std ARI: 0.08163920225586059

Mean AMI: 0.484620184010389
Std AMI: 0.08431128190853257
