In [1]:
pip install numpy -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pandas -q

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install scikit-learn -q

Note: you may need to restart the kernel to use updated packages.


## Tratamento dos dados

### Importando bibliotecas

In [7]:
import numpy as np
import pandas as pd

### Importando o dataset

In [9]:
df = pd.read_csv('/workspaces/Fuzzy_Clustering/datasets/abalone.csv')
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


### Verificando as classes

In [10]:
df["Sex"].unique()

array(['M', 'F', 'I'], dtype=object)

### Verificando o nome exato das colunas

In [11]:
df.columns

Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
       'Viscera weight', 'Shell weight', 'Rings'],
      dtype='object')

### Retirando variáveis inúteis e transformando classes em números

In [12]:
df = df.rename(columns={'Sex': 'Class'})
df["Class"].replace({"M": 0, "F": 1, "I": 2}, inplace=True)
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Class"].replace({"M": 0, "F": 1, "I": 2}, inplace=True)
  df["Class"].replace({"M": 0, "F": 1, "I": 2}, inplace=True)


Unnamed: 0,Class,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,2,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


### Verificando a corretude das classes

In [13]:
df["Class"].unique()

array([0, 1, 2])

### Verificando o tamanho do conjunto de dados

In [14]:
df.shape

(4177, 9)

### Armazenando as classes em uma variável separada

In [15]:
labels = df["Class"].values
labels

array([0, 0, 1, ..., 0, 1, 0], shape=(4177,))

In [16]:
df.drop("Class", axis=1, inplace=True)
df.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


### Retirando a classe para isolar as variáveis

In [17]:
dados = df.to_numpy()
dados

array([[ 0.455 ,  0.365 ,  0.095 , ...,  0.101 ,  0.15  , 15.    ],
       [ 0.35  ,  0.265 ,  0.09  , ...,  0.0485,  0.07  ,  7.    ],
       [ 0.53  ,  0.42  ,  0.135 , ...,  0.1415,  0.21  ,  9.    ],
       ...,
       [ 0.6   ,  0.475 ,  0.205 , ...,  0.2875,  0.308 ,  9.    ],
       [ 0.625 ,  0.485 ,  0.15  , ...,  0.261 ,  0.296 , 10.    ],
       [ 0.71  ,  0.555 ,  0.195 , ...,  0.3765,  0.495 , 12.    ]],
      shape=(4177, 8))

## Clustering

### Inicialização da matriz de pertinência

A matriz de pertinência é inicializada aleatoriamente $u_{ik}(i=1,...c$ e $k=1,...,n)$ do objeto $k$ pertencente ao grupo $C_i$ tal que:
- $u_{ik} \in [0,1]$;
- $0 < \sum_{k=1}^nu_{ik} < n$;
- $\sum_{i=1}^cu_{ik} = 1$ para todo $k \in \Omega$.

In [18]:
def inicializao_matriz_pertinencia(num_amostras, num_clusters):
    matriz_pertinencia = np.random.rand(num_amostras, num_clusters) # gera uma matriz inicial aleatória com valores entre 0 e 1
    matriz_pertinencia = matriz_pertinencia / matriz_pertinencia.sum(axis=1, keepdims=True) # normalização da matriz pra garantir que a soma dos graus dê um
    return matriz_pertinencia

### Atualização dos centroides

Fixo os graus de pertinência, os centroides são atualizados com base nessa equação:

### $y_i = \frac{\sum_{k=1}^n(u_{ik})^mx_k}{\sum_{k=1}^n(u_{ik})^m}$

In [19]:
def atualizacao_centroides(dados, matriz_pertinencia, m):
    matriz_pertinencia_m = matriz_pertinencia ** m # preparação dos graus de pertinência
    centroides = np.dot(matriz_pertinencia_m.T, dados) / np.sum(matriz_pertinencia_m.T, axis=1, keepdims=True) # fórmula para o cálculo dos centroides
    return centroides

### Atualização da matriz de pertinência

Fixo o protótipo, os graus de pertinência são atualizados com base nessa equação:

### $u_{ik}[\sum_{h=1}^c\{\frac{d(x_k,y_i)}{d(x_k,y_h)}\}^\frac{1}{m-1}]^{-1}$

onde

$d(x_k,y_i) = \sum_{j=1}^p(x_k^j-y_i^j)^2$

In [20]:
def atualizacao_matriz_pertinencia(dados, centroides, m): # talvez a fórmula para calcular a matriz de pertinência esteja errada
    # dados[:, np.newaxis] - centroides cria uma matriz de diferenças entre os pontos de dados e os centroides
    # np.linalg.norm(..., axis=2) calcula a norma (distância euclidiana) das diferenças
    # ** 2 para a distância ser a quadrada
    matriz_distancias = np.linalg.norm(dados[:, np.newaxis] - centroides, axis=2) ** 2
    matriz_distancias = np.fmax(matriz_distancias, np.finfo(np.float64).eps) # evita que matriz_distancias seja 0, np.finfo... é o menor número maior que zero aaqui
    matriz_distancias_inversa = 1 / matriz_distancias
    potencia = 1 / (m-1)
    matriz_pertinencia_atualizada = matriz_distancias_inversa ** potencia/ np.sum(matriz_distancias_inversa ** potencia, axis=1, keepdims=True) # fórmula para atualizar os graus de pertinência
    return matriz_pertinencia_atualizada

### Fuzzy C-Means

Ações:
1. Inicialização da matriz de pertinência
2. Atualização dos centroides
3. Atualização da matriz de pertinência

Critérios de parada:
1. Número máximo de iterações atingido
2. Pouca diferença (erro) entre as matrizes de pertinência de iterações consecutivas

In [21]:
def fcm(dados, num_clusters, m=2, max_iter=1000, erro=1e-5):
    num_amostras = dados.shape[0]
    matriz_pertinencia = inicializao_matriz_pertinencia(num_amostras, num_clusters)
    for _ in range(max_iter): # primeiro critério de parada
        centroides = atualizacao_centroides(dados, matriz_pertinencia, m)
        nova_matriz_pertinencia = atualizacao_matriz_pertinencia(dados, centroides, m)
        if np.linalg.norm(nova_matriz_pertinencia - matriz_pertinencia) < erro: # segundo critério de parada
            break
        matriz_pertinencia = nova_matriz_pertinencia
    return centroides, matriz_pertinencia

### AMI

In [22]:
from sklearn.metrics import adjusted_mutual_info_score

def ami(labels, predicted_labels):
    return adjusted_mutual_info_score(labels, predicted_labels)

### Simulação de Monte Carlo

In [23]:
def simulacao_monte_carlo(dados, labels, num_clusters, num_trials):
    indices_rand = []
    for _ in range(num_trials):
        centroides, matriz_pertinencia = fcm(dados, num_clusters)
        predicted_labels = np.argmax(matriz_pertinencia, axis=1)
        ami_idx = ami(labels, predicted_labels)
        indices_rand.append(ami_idx)
    mean_ami = np.mean(indices_rand)
    std_ami = np.std(indices_rand)
    return mean_ami, std_ami

### Definição de parâmetros e execução do método

In [24]:
num_clusters = 3
num_trials = 100
media_ami, dp_ami = simulacao_monte_carlo(dados, labels, num_clusters, num_trials)

print(f"Monte Carlo FCM Clustering Results ({num_trials} trials)")
print(f"Mean AMI: {media_ami:.4f}")
print(f"Standard Deviation of AMI: {dp_ami:.4f}")

Monte Carlo FCM Clustering Results (100 trials)
Mean AMI: 0.1289
Standard Deviation of AMI: 0.0000
