# Multivariate Fuzzy C-medoids method: Implementation

## Equations

### $J= \sum_{i=1}^{c} \sum_{k=1}^{n} \sum_{j=1}^{p} \left(u_{ijk} \right)^{m} d_{ijk}$ - Objective function to minimize.

### $d_{ijk} = \left(x_{jk} - y_{ij} \right)^{2}$ - euclidian distance squared.

### $q = \argmin_{1 \le i \le c} \sum_{j=1}^p \sum_{k=1}^n (u_{ijk})^m \cdot d_{ijk}$ - prototype coordinate of a given cluster in feature j.

### $ u_{ijk} =  \left[\sum_{h=1}^{c}\sum_{l=1}^{p} \left(\frac{d_{ijk}}{d_{hlk}}\right)^{(1/(m-1))}  \right]^{-1} $ - membership degree of pattern k in cluster $C_{i}$ on the feature j.

### $\delta_{ik} = \sum_{j=1}^{p} u_{ijk}$ - represents an aggregation measure for all the p features.

## Constraints:

### - $u_{ijk} \in [0, 1]$ for all i, j and k;
### - $0 < \sum_{j=1}^{p} \sum_{k=1}^{n} u_{ijk} < n$ for all i and
### - $\sum_{i=1}^{c}\sum_{j=1}^{p}u_{ijk} = 1$ for all k.

## Importando bibliotecas

In [99]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.preprocessing import StandardScaler

In [100]:
np.random.seed(42)

## Tratamento dos dados

In [101]:
df = pd.read_csv('/workspaces/Fuzzy_Clustering/datasets/wine.csv')
df = df.rename(columns={'Wine': 'Class'})
df["Class"].replace({1: 0, 2: 1, 3: 2}, inplace=True)
labels = df["Class"].values
#df = df.drop("Class", axis=1)
df = df[["Alcohol", "Malic.acid", "Proline"]]
dados = df.to_numpy()
scaler = StandardScaler()
dados = scaler.fit_transform(dados)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Class"].replace({1: 0, 2: 1, 3: 2}, inplace=True)


In [102]:
df.head()

Unnamed: 0,Alcohol,Malic.acid,Proline
0,14.23,1.71,1065
1,13.2,1.78,1050
2,13.16,2.36,1185
3,14.37,1.95,1480
4,13.24,2.59,735


In [103]:
dados

array([[ 1.51861254, -0.5622498 ,  1.01300893],
       [ 0.24628963, -0.49941338,  0.96524152],
       [ 0.19687903,  0.02123125,  1.39514818],
       [ 1.69154964, -0.34681064,  2.33457383],
       [ 0.29570023,  0.22769377, -0.03787401],
       [ 1.48155459, -0.51736664,  2.23903902],
       [ 1.71625494, -0.4186237 ,  1.72952002],
       [ 1.3086175 , -0.16727801,  1.74544249],
       [ 2.25977152, -0.62508622,  0.94931905],
       [ 1.0615645 , -0.88540853,  0.94931905],
       [ 1.3580281 , -0.15830138,  2.43010864],
       [ 1.38273339, -0.76871232,  1.69767508],
       [ 0.92568536, -0.54429654,  1.82505483],
       [ 2.16095032, -0.54429654,  1.28369089],
       [ 1.70390229, -0.4186237 ,  2.54793491],
       [ 0.77745356, -0.47248348,  1.79320989],
       [ 1.60508109, -0.37374054,  1.69767508],
       [ 1.02450655, -0.68792264,  1.22000102],
       [ 1.46920194, -0.66996938,  2.97147258],
       [ 0.78980621,  0.68550197,  0.3124203 ],
       [ 1.3086175 , -0.63406285,  0.105

## Método de agrupamento

In [104]:
class MFCMedoids:
    def __init__(self, c, X, m):
        self.c = c
        self.n = X.shape[0]
        self.p = X.shape[1]
        self.m = m
        self.epsilon = 1e-10  # To prevent division by zero

    def initialize_u(self):
        return np.random.dirichlet(alpha=np.ones(self.c * self.p),
                                   size=self.n).reshape(self.n, self.c, self.p)

    def find_medoids(self, X, U):
        medoids = np.zeros((self.c, self.p))
        U_m = U ** self.m  # (n, c, p)

        # Para cada possível q (0 <= q < n), criamos um tensor de distâncias quadradas para todos os outros k e p
        # (n, n, p) -> distances_squared[k, q, j] = (X[k, j] - X[q, j]) ** 2
        distances_squared = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2  # shape (n, n, p)

        for i in range(self.c):
            # Para o cluster i, obtemos U_m[:, i, :] -> shape (n, p)
            # Queremos calcular o custo de cada q ser o medoide: somatório sobre j e k de u_m[k, i, j] * d(k, q, j)
            
            # Expand u_m para fazer broadcast: (n, 1, p) para multiplicar com (n, n, p)
            u_m_expanded = U_m[:, i, :][:, np.newaxis, :]  # shape (n, 1, p)

            # Custo total para cada q: soma sobre k e j
            cost_per_q = np.sum(u_m_expanded * distances_squared, axis=(0, 2))  # shape (n,)

            best_q = np.argmin(cost_per_q)
            medoids[i] = X[best_q]

        return medoids


    def get_distances(self, X, medoids):
        return (X[:, np.newaxis, :] - medoids[np.newaxis, :, :]) ** 2

    def update_u(self, D):
        D = np.maximum(D, self.epsilon)  # Avoid division by zero
        ratio = (D[:, np.newaxis, np.newaxis, :, :] / D[:, :, :, np.newaxis, np.newaxis]) ** (1 / (self.m - 1))
        return 1 / np.sum(ratio, axis=(3, 4))

    def get_objective_function(self, U, D):
        return np.sum((U ** self.m) * D)

# Clustering

In [105]:
def mfcm_run(dados, num_clusters, m=2, max_iter=1000, epsilon=1e-5):
    mfcm = MFCMedoids(c=num_clusters, X=dados, m=m)  # Create the MFCMedoids object

    U = mfcm.initialize_u()  # Initialize the membership matrix

    for _ in range(max_iter):
        medoids = mfcm.find_medoids(dados, U)
        D = mfcm.get_distances(dados, medoids)
        new_U = mfcm.update_u(D)
        
        # Check for convergence
        if np.linalg.norm(U - new_U) < epsilon:
            break
        
        U = new_U

    Delta = np.sum(U, axis=2)  # Summing over the second axis (variables j)

    return medoids, U, Delta

## Simulação de Monte Carlo

In [None]:
def monte_carlo_simulation(dados, labels, num_clusters, num_trials):
    results = []
    for _ in range(num_trials):
        print(_)
        medoids, U, Delta = mfcm_run(dados, num_clusters)
        predicted_labels = np.argmax(Delta, axis=1)
        ami = adjusted_mutual_info_score(labels, predicted_labels)
        if ami > 0.1:
            results.append(ami)
    mean_ami = np.mean(results)
    std_ami = np.std(results)
    return mean_ami, std_ami

In [107]:
num_clusters = 3
num_trials = 100
mean_ami, std_ami = monte_carlo_simulation(dados, labels, num_clusters, num_trials)

print(f"Mean AMI: {mean_ami}")
print(f"Std AMI: {std_ami}")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
Mean AMI: 0.48321075651732215
Std AMI: 0.005547885143830607
