## Tratamento dos dados

### Importando bibliotecas

In [1]:
import numpy as np
from sklearn.metrics import adjusted_rand_score
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import pairwise_distances

### Carregando o dataset

In [2]:
def gerar_configuracao(mu_list, sigma_list, tamanhos, config_id):
    """Sample one synthetic 2-D data set made of several Gaussian classes.

    The three per-class sequences are walked in lockstep. For each class the
    points are drawn with ``np.random.multivariate_normal`` using the given
    mean and a diagonal covariance built from the given variances. Sampling
    goes through NumPy's global random state, so seed it first if the result
    must be reproducible.

    Args:
        mu_list: Per-class mean vectors (two coordinates each).
        sigma_list: Per-class variances, placed on the covariance diagonal.
        tamanhos: Number of points to draw for each class.
        config_id: Value written to the ``config`` column of every row.

    Returns:
        DataFrame with columns ``x1``, ``x2``, ``class`` (1-based, in input
        order) and ``config``, indexed 0..N-1.
    """
    class_frames = []
    per_class = zip(mu_list, sigma_list, tamanhos)
    for class_label, (mean, variances, size) in enumerate(per_class, start=1):
        covariance = np.diag(variances)
        points = np.random.multivariate_normal(mean, covariance, size)
        frame = pd.DataFrame(points, columns=["x1", "x2"])
        frame["class"] = class_label
        class_frames.append(frame)

    combined = pd.concat(class_frames, ignore_index=True)
    combined["config"] = config_id
    return combined

np.random.seed(42)  # fixed seed so the seven data sets are reproducible

# Each configuration below defines, per class: the mean (mu_*), the variances
# placed on the covariance diagonal (sigma2_*) and the class size (n*).
# The l* strings are human-readable descriptions of each configuration.

# -------------------------------
# Configuration 1
mu_1 = [[5, 0], [15, 5], [18, 14]]
sigma2_1 = [[81, 9], [9, 100], [25, 36]]
n1 = [200, 100, 50]
df1 = gerar_configuracao(mu_1, sigma2_1, n1, config_id=1)
l1 = "Classes elípticas de tamanhos diferentes"

# -------------------------------
# Configuration 2
mu_2 = [[0, 0], [30, 0], [12, 25]]
sigma2_2 = [[100, 100], [49, 49], [16, 16]]
n2 = [200, 100, 50]
df2 = gerar_configuracao(mu_2, sigma2_2, n2, config_id=2)
l2 = "Classes esféricas de tamanhos diferentes"

# -------------------------------
# Configuration 3
mu_3 = [[0, 0], [15, 5], [15, -5]]
sigma2_3 = [[100, 4], [100, 4], [100, 4]]
n3 = [100, 100, 100]
df3 = gerar_configuracao(mu_3, sigma2_3, n3, config_id=3)
l3 = "Classes elípticas de tamanhos iguais"

# -------------------------------
# Configuration 4
mu_4 = [[0, 0], [15, 0], [-15, 0]]
sigma2_4 = [[16, 16], [16, 16], [16, 16]]
n4 = [100, 100, 100]
df4 = gerar_configuracao(mu_4, sigma2_4, n4, config_id=4)
# Both axes share the same variance in every class, so the classes are
# spherical (unlike configuration 3, whose classes are elliptical).
l4 = "Classes esféricas de tamanhos iguais"

# -------------------------------
# Configuration 5
mu_5 = [[5, 0], [15, 5], [10, -7], [3, 15]]
sigma2_5 = [[81, 9], [9, 100], [49, 16], [25, 25]]
n5 = [50, 50, 50, 50]
df5 = gerar_configuracao(mu_5, sigma2_5, n5, config_id=5)
l5 = "3 classes elípticas e 1 esférica"

# -------------------------------
# Configuration 6
mu_6 = [[5, 0], [15, 5], [12, -12], [7, 17]]
sigma2_6 = [[81, 9], [9, 100], [16, 16], [25, 25]]
n6 = [50, 50, 50, 50]
df6 = gerar_configuracao(mu_6, sigma2_6, n6, config_id=6)
l6 = "2 classes elípticas e 2 esféricas"

# -------------------------------
# Configuration 7
mu_7 = [[0, 0], [18, 0], [-18, 0], [0, -12]]
sigma2_7 = [[12, 12], [20, 20], [16, 16], [81, 20]]
n7 = [50, 50, 50, 50]
df7 = gerar_configuracao(mu_7, sigma2_7, n7, config_id=7)
l7 = "1 classe elíptica e 3 esféricas"

### Método

In [3]:
def init_membership_matrix(n, k):
    """Return a random ``n`` x ``k`` fuzzy membership matrix.

    Entries are drawn with ``np.random.rand`` (NumPy's global random state)
    and every row is then divided by its own total, so the membership degrees
    of each point sum to 1.

    Args:
        n: Number of rows (points).
        k: Number of columns (clusters).

    Returns:
        ``(n, k)`` float array whose rows each sum to 1.
    """
    raw_weights = np.random.rand(n, k)
    row_totals = raw_weights.sum(axis=1, keepdims=True)
    return raw_weights / row_totals

def init_medoids(X, c, metric='euclidean'):
    """Pick ``c`` initial medoids deterministically (farthest-first).

    The first medoid is the point with the smallest total distance to all
    other points. Each further medoid is the not-yet-chosen point whose
    distance to its nearest already-chosen medoid is largest; ties go to the
    lowest index.

    Args:
        X: Array of points, one row per point. It is indexed with a list of
            row indices, so it must support NumPy-style fancy indexing.
        c: Number of medoids to select. Values below 1 still yield the single
            first medoid.
        metric: Metric forwarded to ``pairwise_distances``. Defaults to
            ``'euclidean'``, which is what ``pairwise_distances`` uses when no
            metric is given.
            NOTE(review): the membership and medoid updates in this file use
            ``'manhattan'``; confirm whether the initialisation should too.

    Returns:
        The selected rows of ``X``, in selection order.

    Raises:
        ValueError: If ``c`` exceeds the number of points, because there would
            be no unchosen point left to select.
    """
    distances = pairwise_distances(X, metric=metric)  # computed once, reused below
    n_points = distances.shape[0]
    if c > n_points:
        raise ValueError(
            f"cannot select {c} medoids from only {n_points} points"
        )

    # First medoid: the most central point (smallest sum of distances).
    medoid_indices = [int(np.argmin(distances.sum(axis=1)))]

    # nearest[i] = distance from point i to its closest chosen medoid so far.
    nearest = distances[:, medoid_indices[0]].copy()

    for _ in range(1, c):
        # Exclude already-chosen medoids, then take the farthest remaining
        # point; argmax returns the first maximum, i.e. the lowest index.
        candidates = nearest.copy()
        candidates[medoid_indices] = -np.inf
        next_medoid_idx = int(np.argmax(candidates))

        medoid_indices.append(next_medoid_idx)
        nearest = np.minimum(nearest, distances[:, next_medoid_idx])

    return X[medoid_indices]

def update_membership_matrix(data, medoids, m):
    """Compute fuzzy membership degrees of every point to every medoid.

    Each weight is ``(1 / d) ** (1 / (m - 1))`` where ``d`` is the squared
    Manhattan distance between point and medoid; weights are then normalised
    per point so each row sums to 1.

    NOTE(review): the distance is squared here, while ``update_medoids`` in
    this file weights unsquared Manhattan distances - confirm that the two
    steps are meant to use different dissimilarities.

    Args:
        data: Points, one row per point.
        medoids: Current medoids, one row per medoid.
        m: Fuzzifier. ``m == 1`` makes the exponent ``1 / (m - 1)`` undefined
            (division by zero).

    Returns:
        Array of shape ``(len(data), len(medoids))`` with rows summing to 1.
    """
    squared_dist = pairwise_distances(data, medoids, metric='manhattan') ** 2

    # A point that coincides with a medoid has distance 0; clamp to machine
    # epsilon so the reciprocal below stays finite.
    floor = np.finfo(np.float64).eps
    squared_dist = np.fmax(squared_dist, floor)

    exponent = 1 / (m - 1)
    weights = (1 / squared_dist) ** exponent
    row_totals = np.sum(weights, axis=1, keepdims=True)

    return weights / row_totals

def update_medoids(X, membership_matrix, m=2, distances=None):
    """Choose, for every cluster, the point minimising the weighted distance cost.

    For cluster ``i`` the cost of candidate ``j`` is
    ``sum_k membership_matrix[k, i] ** m * distances[j, k]``; the candidate
    with the lowest cost becomes the medoid (lowest index on ties). All costs
    are obtained at once as the matrix product ``distances @ membership ** m``.

    Args:
        X: Array of points, one row per point.
        membership_matrix: ``(n, c)`` membership degrees (points x clusters).
        m: Fuzzifier applied as an exponent to the memberships.
        distances: Optional precomputed ``(n, n)`` dissimilarity matrix between
            the rows of ``X``. When omitted, Manhattan distances are computed
            with ``pairwise_distances``. Passing it lets an iterative caller
            avoid rebuilding the same matrix on every call.

    Returns:
        Array with one row of ``X`` per cluster (the new medoids).

    Raises:
        ValueError: If ``distances`` is given but is not ``(n, n)``.
    """
    n = X.shape[0]
    if distances is None:
        distances = pairwise_distances(X, X, metric='manhattan')
    else:
        distances = np.asarray(distances)
        if distances.shape != (n, n):
            raise ValueError(
                f"distances must have shape {(n, n)}, got {distances.shape}"
            )

    weights = np.asarray(membership_matrix) ** m  # (n, c)
    costs = distances @ weights                    # costs[j, i]: candidate j, cluster i

    best_medoid_indices = np.argmin(costs, axis=0)  # one winner per cluster
    return X[best_medoid_indices]

def fcmdd(data, k, m=2, max_iter=1000000):
    """Run fuzzy c-medoids clustering until the medoids stop changing.

    Starting from the medoids returned by ``init_medoids``, the procedure
    alternates between recomputing the memberships for the current medoids
    and recomputing the medoids for those memberships. It stops as soon as an
    update leaves the medoids unchanged, or after ``max_iter`` medoid updates.

    No random initial membership matrix is drawn: the memberships are always
    derived from the medoids, so a random start would be overwritten before
    it could influence anything. As long as the helper functions are
    deterministic, so is the result for a given ``data``/``k``/``m``.

    Args:
        data: Array of points, one row per point.
        k: Number of clusters.
        m: Fuzzifier passed to both update steps.
        max_iter: Upper bound on the number of medoid updates.

    Returns:
        ``(medoids, membership_matrix)``. The memberships always correspond
        to the returned medoids, including when ``max_iter`` is reached
        before convergence or is 0.
    """
    medoids = init_medoids(data, k)
    membership_matrix = update_membership_matrix(data, medoids, m)

    for _ in range(max_iter):
        new_medoids = update_medoids(data, membership_matrix, m)
        if np.array_equal(medoids, new_medoids):  # converged: medoids are stable
            break
        medoids = new_medoids
        # Refresh memberships right away so the pair returned after the loop
        # is consistent even if this was the last allowed iteration.
        membership_matrix = update_membership_matrix(data, medoids, m)

    return medoids, membership_matrix

def indice_rand(labels, predicted_labels):
    """Return the adjusted Rand index between two labelings.

    Thin wrapper over ``adjusted_rand_score``; note that despite the name the
    value is the chance-adjusted index, not the plain Rand index.

    Args:
        labels: Reference (ground-truth) labels.
        predicted_labels: Labels produced by the clustering.
    """
    score = adjusted_rand_score(labels_true=labels, labels_pred=predicted_labels)
    return score

def monte_carlo_fuzzy_simulation(X, true_labels, k, m=2, num_trials=100):
    """Repeat ``fcmdd`` and summarise the agreement with the true labels.

    Every trial clusters ``X`` with ``fcmdd``, hardens the fuzzy result by
    assigning each point to its highest-membership cluster, and scores that
    labeling against ``true_labels`` with ``indice_rand``.

    NOTE(review): the recorded runs all report a standard deviation of
    0.0000, which suggests every trial yields the same partition - confirm
    whether repeated trials are actually needed.

    Args:
        X: Array of points, one row per point.
        true_labels: Ground-truth label of each point.
        k: Number of clusters requested from ``fcmdd``.
        m: Fuzzifier forwarded to ``fcmdd``.
        num_trials: Number of repetitions.

    Returns:
        ``(mean, std)`` of the per-trial scores, computed with ``np.mean``
        and ``np.std``.
    """
    def run_single_trial():
        _, memberships = fcmdd(X, k, m)
        hard_labels = np.argmax(memberships, axis=1)
        return indice_rand(true_labels, hard_labels)

    scores = [run_single_trial() for _ in range(num_trials)]
    return np.mean(scores), np.std(scores)

In [4]:
# Evaluate FCMdd on configurations 1-7.
# The source frames are only read here, never modified, so this cell can be
# re-run and df1..df7 keep their "class"/"config" columns for later use.
num_trials = 100
m = 2
for i, df in enumerate([df1, df2, df3, df4, df5, df6, df7], start=1):
    # One cluster per generated class (3 for configs 1-4, 4 for configs 5-7).
    num_clusters = df["class"].nunique()
    k = num_clusters

    # Classes are stored 1-based; shift them to 0-based labels.
    labels = df["class"].to_numpy() - 1

    # Feature matrix: everything except the bookkeeping columns.
    data = df.drop(columns=["config", "class"]).to_numpy()

    mean_rand_index, std_rand_index = monte_carlo_fuzzy_simulation(data, labels, k, m, num_trials)

    print(f"Monte Carlo FCMdd Clustering Results for Config {i}")
    print(f"Mean Rand Index: {mean_rand_index:.4f}")
    print(f"Standard Deviation of Rand Index: {std_rand_index:.4f}")
    print("\n")

Monte Carlo FCMdd Clustering Results for Config 1
Mean Rand Index: 0.2469
Standard Deviation of Rand Index: 0.0000


Monte Carlo FCMdd Clustering Results for Config 2
Mean Rand Index: 0.6460
Standard Deviation of Rand Index: 0.0000


Monte Carlo FCMdd Clustering Results for Config 3
Mean Rand Index: 0.1253
Standard Deviation of Rand Index: 0.0000


Monte Carlo FCMdd Clustering Results for Config 4
Mean Rand Index: 0.8190
Standard Deviation of Rand Index: 0.0000


Monte Carlo FCMdd Clustering Results for Config 5
Mean Rand Index: 0.4198
Standard Deviation of Rand Index: 0.0000


Monte Carlo FCMdd Clustering Results for Config 6
Mean Rand Index: 0.4739
Standard Deviation of Rand Index: 0.0000


Monte Carlo FCMdd Clustering Results for Config 7
Mean Rand Index: 0.7627
Standard Deviation of Rand Index: 0.0000




In [14]:
# Per-class parameters of the two noisy configurations. Each entry gives the
# class mean ('mu'), the values placed on the covariance diagonal ('sigma')
# and the total number of points for that class ('n').
params_config_12 = [
    dict(mu=[-16, -5], sigma=[20, 20], n=50),
    dict(mu=[-8, 8], sigma=[13, 13], n=100),
    dict(mu=[0, 0], sigma=[6, 6], n=200),
]

params_config_13 = [
    dict(mu=[7, -6], sigma=[50, 5], n=100),
    dict(mu=[0, 0], sigma=[2, 50], n=100),
    dict(mu=[12, 0], sigma=[50, 5], n=100),
]

# [low, high] bounds of the uniform noise, and the seed for reproducibility.
noise_range = [-100, 50]
np.random.seed(42)

# Data generation with uniform background noise
def generate_data(config_params, noise_percent, noise_bounds=None):
    """Generate Gaussian classes in which a share of each class is replaced by noise.

    For every class, ``int(n * noise_percent / 100)`` of its ``n`` points are
    drawn uniformly from the noise interval (both coordinates) and labelled
    0; the remaining points are drawn from the class's Gaussian and labelled
    with the 1-based class index. Sampling uses NumPy's global random state.

    Args:
        config_params: Iterable of dicts with keys ``'mu'`` (mean),
            ``'sigma'`` (covariance diagonal) and ``'n'`` (class size).
        noise_percent: Percentage (0-100) of each class replaced by noise.
        noise_bounds: Optional ``(low, high)`` interval for the uniform noise.
            When omitted, the module-level ``noise_range`` is used.

    Returns:
        DataFrame with columns ``x1``, ``x2`` and ``class``; for each class
        the Gaussian points come first, followed by that class's noise points.
    """
    low, high = noise_range if noise_bounds is None else noise_bounds

    frames = []
    for class_idx, param in enumerate(config_params, start=1):
        mu = np.array(param['mu'])
        sigma_diag = np.diag(param['sigma'])
        n = param['n']
        n_noise = int(n * noise_percent / 100)
        n_signal = n - n_noise

        # Draw order (Gaussian first, then noise) is kept fixed so that a
        # given seed always produces the same data set.
        real_data = np.random.multivariate_normal(mu, sigma_diag, n_signal)
        noise_data = np.random.uniform(low, high, size=(n_noise, 2))

        df = pd.DataFrame(np.vstack([real_data, noise_data]), columns=['x1', 'x2'])
        df['class'] = np.concatenate([
            np.full((n_signal,), class_idx),
            np.full((n_noise,), 0),  # label 0 marks noise points
        ])
        frames.append(df)

    return pd.concat(frames, ignore_index=True)

# Configurations and titles: each entry pairs a parameter set with the
# percentage of noise to inject; titles[i] describes configs[i].
configs = [
    (params_config_12, 10),
    (params_config_12, 20),
    (params_config_12, 30),
    (params_config_13, 10),
    (params_config_13, 20),
    (params_config_13, 30),
]

titles = [
    "Configuração 12 - 10% ruído",
    "Configuração 12 - 20% ruído",
    "Configuração 12 - 30% ruído",
    "Configuração 13 - 10% ruído",
    "Configuração 13 - 20% ruído",
    "Configuração 13 - 30% ruído",
]

# Generate one DataFrame per configuration, keyed by its title
dfs_por_config = {}

for (params, noise), title in zip(configs, titles):
    df = generate_data(params, noise)
    # Legend-friendly copy of the label column: noise (class 0) gets a name.
    df['classe_legenda'] = df['class'].replace(0, 'ruído')
    dfs_por_config[title] = df

In [None]:
# Evaluate FCMdd on every noisy configuration.
# The frames stored in dfs_por_config are only read, never modified, so the
# cell can be re-run and the label columns stay available afterwards.
num_clusters = 4  # three Gaussian classes plus the noise label 0
num_trials = 100
for nome_config, df in dfs_por_config.items():
    labels = df["class"].values
    dados = df.drop(columns=["class", "classe_legenda"]).to_numpy()
    media_indice_rand, dp_indice_rand = monte_carlo_fuzzy_simulation(dados, labels, num_clusters, 2, num_trials)
    print("\n")
    print(f"Resultados de Monte Carlo para {nome_config} ({num_trials} tentativas)")
    print(f"Média do Índice Rand: {media_indice_rand:.4f}")
    print(f"Desvio Padrão do Índice Rand: {dp_indice_rand:.4f}")

Os centroides das três classes estão localizados próximos uns dos outros:
$\mu_1 = \begin{bmatrix} 20 \\ 20 \end{bmatrix},$
$\mu_2 = \begin{bmatrix} 23 \\ 23 \end{bmatrix},$
$\mu_3 = \begin{bmatrix} 26 \\ 20 \end{bmatrix}$

As classes apresentam diferentes formas e orientações devido às suas matrizes de covariância:
$\Sigma_1 = \begin{bmatrix} 10 & 9 \\ 9 & 10 \end{bmatrix},$
$\Sigma_2 = \begin{bmatrix} 10 & -9 \\ -9 & 10 \end{bmatrix},$
$\Sigma_3 = \begin{bmatrix} 12 & 0 \\ 0 & 1 \end{bmatrix}$

- $\Sigma_1$ e $\Sigma_2$ geram distribuições elípticas com forte inclinação ao longo da diagonal principal e da diagonal secundária, respectivamente.
- $\Sigma_3$ resulta em uma distribuição fortemente alongada no eixo $x$.

Cada classe possui $5\%$ de outliers, gerados a partir dos mesmos centros e covariâncias, mas com deslocamentos adicionais direcionados para regiões distantes dos centros originais. Os deslocamentos aplicados foram:
$\Delta_1 = \begin{bmatrix} -10 \\ 5 \end{bmatrix},$
$\Delta_2 = \begin{bmatrix} 10 \\ -10 \end{bmatrix},$
$\Delta_3 = \begin{bmatrix} 6 \\ 10 \end{bmatrix}$

In [6]:
def config1_outliers(frac_outlier):
    """Build a three-class 2-D data set in which part of each class is displaced.

    NumPy's global random state is reseeded with 42 on every call. Each class
    has 150 Gaussian points; the first ``int(150 * frac_outlier)`` of them are
    turned into outliers by adding the class's displacement vector plus
    Gaussian jitter (standard deviation 1.8). Finally all coordinates are
    shifted so that the minimum of each axis is exactly 1.

    Args:
        frac_outlier: Fraction of each class converted into outliers.

    Returns:
        DataFrame with columns ``x1``, ``x2``, ``Classe`` (``'Classe 1'`` ..
        ``'Classe 3'``) and ``Outlier`` (1 for displaced points, else 0).
    """
    np.random.seed(42)
    n = 150
    n_outliers = int(n * frac_outlier)
    # Number of leading points per class that actually get displaced.
    n_displaced = max(0, min(n_outliers, n))

    mus = [[20, 20], [23, 23], [26, 20]]  # class centres lie close together
    covs = [
        [[10, 9], [9, 10]],     # strongly tilted along the main diagonal
        [[10, -9], [-9, 10]],   # tilted along the opposite diagonal
        [[12, 0], [0, 1]],      # strongly stretched along the x axis
    ]
    deslocamentos_outliers = [[-10, 5], [10, -10], [6, 10]]

    blocks, rotulos, outlier_flags = [], [], []

    for numero, (mu, cov, desloc) in enumerate(
        zip(mus, covs, deslocamentos_outliers), start=1
    ):
        classe = np.random.multivariate_normal(mu, cov, size=n)

        # Jitter is drawn one point at a time to keep the random stream
        # in the same order for a given seed.
        for j in range(n_displaced):
            classe[j] = classe[j] + desloc + np.random.normal(0, 1.8, size=2)

        blocks.append(classe)
        outlier_flags.extend([1] * n_displaced + [0] * (n - n_displaced))
        rotulos.extend([f'Classe {numero}'] * n)

    # Shift everything into the first quadrant (per-axis minimum becomes 1).
    dados = np.vstack(blocks)
    dados -= np.min(dados, axis=0)
    dados += 1

    df = pd.DataFrame(dados, columns=["x1", "x2"])
    df["Classe"] = rotulos
    df["Outlier"] = outlier_flags
    return df

# 🔍 Visualização

In [7]:
# Evaluate FCMdd on the outlier data set for increasing outlier fractions.
class_to_label = {'Classe 1': 0, 'Classe 2': 1, 'Classe 3': 2}
num_clusters = 3
num_trials = 100
for frac_outlier in [0.05, 0.1, 0.15, 0.2, 0.25]:
    df = config1_outliers(frac_outlier)
    # Map the class names to integer labels without modifying the frame:
    # an in-place replace on a selected column is not guaranteed to write
    # back into the DataFrame.
    labels = df["Classe"].map(class_to_label).to_numpy()
    dados = df.drop(columns=["Outlier", "Classe"]).to_numpy()
    media_indice_rand, dp_indice_rand = monte_carlo_fuzzy_simulation(dados, labels, num_clusters, 2, num_trials)
    print("Resultados de Monte Carlo para dados com outliers")
    print(f"Média do Índice Rand: {media_indice_rand:.4f}")
    print(f"Desvio Padrão do Índice Rand: {dp_indice_rand:.4f}")

Resultados de Monte Carlo para dados com outliers
Média do Índice Rand: 0.2686
Desvio Padrão do Índice Rand: 0.0000
Resultados de Monte Carlo para dados com outliers
Média do Índice Rand: 0.1849
Desvio Padrão do Índice Rand: 0.0000
Resultados de Monte Carlo para dados com outliers
Média do Índice Rand: 0.1663
Desvio Padrão do Índice Rand: 0.0000
Resultados de Monte Carlo para dados com outliers
Média do Índice Rand: 0.2540
Desvio Padrão do Índice Rand: 0.0000
Resultados de Monte Carlo para dados com outliers
Média do Índice Rand: 0.2099
Desvio Padrão do Índice Rand: 0.0000
