# 03 - Clustering e Métricas

Este notebook aplica os algoritmos de clustering e calcula todas as métricas de avaliação.


In [1]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm

# Adicionar src ao path
project_root = Path().resolve().parent
sys.path.append(str(project_root))

from src.config import (
    EMBEDDINGS_DIR, CLUSTERING_CONFIGS, N_CLUSTERS, RANDOM_STATE, FIGURES_DIR
)
from src.utils import (
    load_embedding, compute_all_metrics, create_results_dataframe,
    save_results_table, TABLES_DIR
)


## 1. Carregar Embeddings e Labels Verdadeiros

Execute os notebooks anteriores primeiro!


In [2]:
# Carregar dados e labels verdadeiros
from sklearn.datasets import fetch_20newsgroups
from src.config import RAW_DATA_DIR, TWENTY_NG_CATEGORIES, PT6_CLASS_COLUMN_CANDIDATES
from src.utils import detect_class_column

# --- Ground-truth labels for 20NG-6 -----------------------------------------
# NOTE(review): random_state is hard-coded to 42 while RANDOM_STATE is imported
# from src.config in the first cell. The label order must match the order used
# when the embeddings were generated — confirm both notebooks use the same seed.
print("üì• Carregando labels verdadeiros...")
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=TWENTY_NG_CATEGORIES,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42
)
y_true_20ng = newsgroups.target
print(f"   ‚úÖ 20NG-6: {len(y_true_20ng)} documentos, {len(np.unique(y_true_20ng))} classes")

# --- Ground-truth labels for PT-6 -------------------------------------------
# Guard clauses: stop immediately when the CSV or its class column is missing.
pt6_file = RAW_DATA_DIR / "pt6_preprocessed.csv"
if not pt6_file.exists():
    raise FileNotFoundError(f"Arquivo n√£o encontrado: {pt6_file}")

df_pt6 = pd.read_csv(pt6_file, encoding='utf-8-sig')
class_col = detect_class_column(df_pt6, PT6_CLASS_COLUMN_CANDIDATES)
if not class_col:
    raise ValueError("N√£o foi poss√≠vel detectar coluna de classe no PT-6")

# Encode the class column's values as integer labels.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_true_pt6 = le.fit_transform(df_pt6[class_col])
print(f"   ‚úÖ PT-6: {len(y_true_pt6)} documentos, {len(np.unique(y_true_pt6))} classes")

print("\n‚úÖ Labels verdadeiros carregados!")


üì• Carregando labels verdadeiros...
   ‚úÖ 20NG-6: 5906 documentos, 6 classes
   ‚úÖ PT-6: 315 documentos, 6 classes

‚úÖ Labels verdadeiros carregados!


## 2. Definir Funções de Clustering


In [3]:
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, SpectralClustering
from sklearn.mixture import GaussianMixture
import warnings
# Suprimir aviso de sintaxe do hdbscan (problema conhecido na biblioteca)
warnings.filterwarnings('ignore', category=SyntaxWarning, module='hdbscan')
import hdbscan

def apply_kmeans(X, config):
    """Cluster ``X`` with K-Means.

    Args:
        X: Feature matrix handed to ``KMeans.fit_predict``.
        config: Keyword arguments forwarded to the ``KMeans`` constructor.

    Returns:
        The cluster labels produced by ``fit_predict``.
    """
    return KMeans(**config).fit_predict(X)

def apply_gmm(X, config):
    """Cluster ``X`` with a Gaussian Mixture Model.

    Args:
        X: Feature matrix handed to ``GaussianMixture.fit_predict``.
        config: Keyword arguments forwarded to the ``GaussianMixture``
            constructor.

    Returns:
        The component labels produced by ``fit_predict``.
    """
    return GaussianMixture(**config).fit_predict(X)

def apply_agglomerative(X, config):
    """Cluster ``X`` with Agglomerative Clustering.

    Args:
        X: Feature matrix handed to ``AgglomerativeClustering.fit_predict``.
        config: Keyword arguments forwarded to the
            ``AgglomerativeClustering`` constructor.

    Returns:
        The cluster labels produced by ``fit_predict``.
    """
    return AgglomerativeClustering(**config).fit_predict(X)

def find_optimal_eps(X, k=4, min_samples=5, percentile=90):
    """Estimate a DBSCAN ``eps`` from the k-distance distribution of ``X``.

    For every row of ``X`` the distance in the k-th neighbour column is
    collected, and the requested percentile of those distances is returned.
    This is a simple numeric stand-in for reading the "elbow" off a
    k-distance graph.

    Args:
        X: Feature matrix accepted by ``sklearn.neighbors.NearestNeighbors``.
        k: Number of neighbours requested per point. ``X`` is passed
            explicitly as the query set, and scikit-learn documents that query
            points are then not excluded from their own neighbourhood. Each
            point is therefore normally its own first neighbour at distance 0.
        min_samples: Accepted so existing callers keep working; it does not
            take part in the computation.
        percentile: Percentile (0-100) of the k-th neighbour distances used
            as ``eps``. Defaults to 90, the value that was previously fixed.

    Returns:
        The selected percentile of the k-th neighbour distances.
    """
    from sklearn.neighbors import NearestNeighbors

    distances, _ = NearestNeighbors(n_neighbors=k).fit(X).kneighbors(X)
    # Only the k-th column is needed. np.percentile does not need sorted
    # input, so the former sort over the whole distance matrix is dropped.
    kth_distances = distances[:, k - 1]
    return np.percentile(kth_distances, percentile)

def apply_dbscan(X, config, optimize_eps=True):
    """Cluster ``X`` with DBSCAN, optionally estimating ``eps`` from the data.

    Args:
        X: Feature matrix handed to ``DBSCAN.fit_predict``.
        config: Keyword arguments for the ``DBSCAN`` constructor. The caller's
            object is not modified; a copy is used.
        optimize_eps: When true, ``eps`` in the copied config is replaced by
            the value returned by ``find_optimal_eps`` and that value is
            printed.

    Returns:
        The labels produced by ``fit_predict``.
    """
    params = config.copy()
    if optimize_eps:
        optimal_eps = find_optimal_eps(X, k=4, min_samples=params.get('min_samples', 5))
        params['eps'] = optimal_eps
        print(f"      üìä eps otimizado: {optimal_eps:.4f}")
    return DBSCAN(**params).fit_predict(X)

def apply_spectral(X, config):
    """Cluster ``X`` with Spectral Clustering.

    Args:
        X: Feature matrix handed to ``SpectralClustering.fit_predict``.
        config: Keyword arguments forwarded to the ``SpectralClustering``
            constructor.

    Returns:
        The cluster labels produced by ``fit_predict``.
    """
    return SpectralClustering(**config).fit_predict(X)

def apply_hdbscan(X, config):
    """Cluster ``X`` with HDBSCAN.

    Args:
        X: Feature matrix handed to ``hdbscan.HDBSCAN.fit_predict``.
        config: Keyword arguments forwarded to the ``hdbscan.HDBSCAN``
            constructor.

    Returns:
        The labels produced by ``fit_predict``.
    """
    return hdbscan.HDBSCAN(**config).fit_predict(X)

# Registry: algorithm name -> wrapper function with an (X, config) call
# signature. The main loop looks wrappers up here by CLUSTERING_CONFIGS key.
CLUSTERING_FUNCTIONS = dict(
    kmeans=apply_kmeans,
    gmm=apply_gmm,
    agglomerative=apply_agglomerative,
    dbscan=apply_dbscan,
    spectral=apply_spectral,
    hdbscan=apply_hdbscan,
)

print("‚úÖ Fun√ß√µes de clustering definidas!")


‚úÖ Fun√ß√µes de clustering definidas!


## 3. Aplicar Clustering e Calcular Métricas

Para cada combinação de (dataset, embedding, algoritmo), aplicamos o clustering e calculamos todas as métricas.


In [None]:
# Per-dataset containers: embeddings are filled in below, labels come from the
# label-loading cell.
datasets = {
    name: {'embeddings': {}, 'labels': labels}
    for name, labels in (('20ng6', y_true_20ng), ('pt6', y_true_pt6))
}

embedding_types = ['tfidf_svd', 'sbert', 'gte', 'bge']
algorithms = [*CLUSTERING_CONFIGS]

# Load every (dataset, embedding type) pair that is available.
print("=" * 60, "CARREGANDO EMBEDDINGS", "=" * 60, sep="\n")

for dataset_name in datasets:
    print(f"\nüìä {dataset_name.upper()}:")
    for emb_type in embedding_types:
        embedding = load_embedding(dataset_name, emb_type, EMBEDDINGS_DIR)
        if embedding is None:
            # A None result is reported and skipped, so later cells only
            # iterate over embeddings that were actually loaded.
            print(f"   ‚ùå {emb_type}: n√£o encontrado")
            continue
        datasets[dataset_name]['embeddings'][emb_type] = embedding
        print(f"   ‚úÖ {emb_type}: shape {embedding.shape}")

print("\n" + "=" * 60)
print("APLICANDO CLUSTERING E CALCULANDO M√âTRICAS")
print("=" * 60)

# One dict per (dataset, embedding, algorithm) combination, including failures.
all_results = []

# Progress-bar total: only embeddings that were actually loaded are counted.
total_combinations = sum(
    len(datasets[ds]['embeddings']) * len(algorithms)
    for ds in datasets.keys()
)

with tqdm(total=total_combinations, desc="Processando") as pbar:
    for dataset_name, dataset_data in datasets.items():
        y_true = dataset_data['labels']

        for emb_type, X in dataset_data['embeddings'].items():
            for algo_name in algorithms:
                pbar.set_description(f"{dataset_name} | {emb_type} | {algo_name}")

                try:
                    # Fail fast, before the clustering runs, when the
                    # embedding rows and the labels do not line up. The error
                    # is recorded like any other failure below.
                    if X.shape[0] != len(y_true):
                        raise ValueError(
                            f"Embedding com {X.shape[0]} linhas, "
                            f"mas {len(y_true)} labels verdadeiros"
                        )

                    # Copy so a wrapper can never mutate the shared config.
                    config = CLUSTERING_CONFIGS[algo_name].copy()
                    cluster_func = CLUSTERING_FUNCTIONS[algo_name]

                    # Heads-up for the slower algorithms. tqdm.write is used
                    # instead of print so the message does not break the
                    # active progress bar.
                    if algo_name in ['gmm', 'spectral', 'hdbscan']:
                        tqdm.write(f"\n   üîÑ Aplicando {algo_name} em {dataset_name}/{emb_type}... (pode demorar)")

                    if algo_name == 'dbscan':
                        y_pred = cluster_func(X, config, optimize_eps=True)
                    else:
                        y_pred = cluster_func(X, config)

                    # Metrics come from the project helper; see src.utils.
                    metrics = compute_all_metrics(y_true, y_pred, X)

                    result = {
                        'dataset': dataset_name,
                        'embedding': emb_type,
                        'algorithm': algo_name,
                        **metrics
                    }

                    # DBSCAN/HDBSCAN mark noise with -1; it is counted
                    # separately and excluded from the cluster count.
                    unique_labels = np.unique(y_pred)
                    result['n_clusters'] = int(np.sum(unique_labels >= 0))
                    result['n_noise'] = int(np.sum(y_pred == -1))

                    all_results.append(result)

                except Exception as e:
                    # Best effort: keep the benchmark running and record the
                    # failure as a NaN row with the error message attached.
                    tqdm.write(f"\n‚ö†Ô∏è Erro em {dataset_name} | {emb_type} | {algo_name}: {e}")
                    result = {
                        'dataset': dataset_name,
                        'embedding': emb_type,
                        'algorithm': algo_name,
                        'ari': np.nan,
                        'nmi': np.nan,
                        'purity': np.nan,
                        'silhouette': np.nan,
                        'n_clusters': np.nan,
                        'n_noise': np.nan,
                        'error': str(e)
                    }
                    all_results.append(result)

                pbar.update(1)

print("\n‚úÖ Clustering conclu√≠do!")


CARREGANDO EMBEDDINGS

üìä 20NG6:
Embedding carregado de: C:\nlp-clustering-benchmark\data\embeddings\20ng6_tfidf_svd.npy
   ‚úÖ tfidf_svd: shape (5906, 300)
Embedding carregado de: C:\nlp-clustering-benchmark\data\embeddings\20ng6_sbert.npy
   ‚úÖ sbert: shape (5906, 768)
Embedding carregado de: C:\nlp-clustering-benchmark\data\embeddings\20ng6_gte.npy
   ‚úÖ gte: shape (5906, 768)
Embedding carregado de: C:\nlp-clustering-benchmark\data\embeddings\20ng6_bge.npy
   ‚úÖ bge: shape (5906, 1024)

üìä PT6:
Embedding carregado de: C:\nlp-clustering-benchmark\data\embeddings\pt6_tfidf_svd.npy
   ‚úÖ tfidf_svd: shape (315, 300)
Embedding carregado de: C:\nlp-clustering-benchmark\data\embeddings\pt6_sbert.npy
   ‚úÖ sbert: shape (315, 768)
Embedding carregado de: C:\nlp-clustering-benchmark\data\embeddings\pt6_gte.npy
   ‚úÖ gte: shape (315, 768)
Embedding carregado de: C:\nlp-clustering-benchmark\data\embeddings\pt6_bge.npy
   ‚úÖ bge: shape (315, 1024)

APLICANDO CLUSTERING E CALCULANDO M

20ng6 | tfidf_svd | dbscan:   6%|‚ñã         | 3/48 [01:12<17:27, 23.27s/it]       

      üìä eps otimizado: 0.5220


20ng6 | sbert | dbscan:  19%|‚ñà‚ñâ        | 9/48 [01:53<05:37,  8.65s/it]       

      üìä eps otimizado: 2.6762


20ng6 | gte | dbscan:  31%|‚ñà‚ñà‚ñà‚ñè      | 15/48 [02:49<04:54,  8.94s/it]       

      üìä eps otimizado: 0.5902


20ng6 | bge | dbscan:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 21/48 [03:55<05:06, 11.35s/it]       

      üìä eps otimizado: 16.6587


20ng6 | bge | spectral:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 22/48 [03:57<03:39,  8.44s/it]

## 4. Criar Tabela de Resultados e Salvar


In [None]:
# Build the results table from the rows collected by the clustering loop.
results_df = create_results_dataframe(all_results)

# Summary of what the table covers.
print("=" * 60, "RESUMO DOS RESULTADOS", "=" * 60, sep="\n")
print(f"\nTotal de combina√ß√µes: {len(results_df)}")
print(f"\nDatasets: {results_df['dataset'].unique()}")
print(f"Embeddings: {results_df['embedding'].unique()}")
print(f"Algoritmos: {results_df['algorithm'].unique()}")

# Preview of the first rows, printed as plain text.
print("\n" + "=" * 60, "PRIMEIRAS LINHAS DA TABELA", "=" * 60, sep="\n")
print(results_df.head(10).to_string())

# Save the full table first, then one table per dataset.
save_results_table(results_df, "clustering_results", TABLES_DIR)

for dataset in results_df['dataset'].unique():
    df_subset = results_df.loc[results_df['dataset'] == dataset]
    save_results_table(df_subset, f"clustering_results_{dataset}", TABLES_DIR)

print("\n‚úÖ Tabelas salvas com sucesso!")


## 5. Análise e Visualização dos Resultados

Visualização rápida dos melhores resultados por métrica.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling.
plt.style.use('seaborn-v0_8')
sns.set_palette("Set2")

# Metrics shown in the 2x2 heatmap grid and in the top-3 ranking below.
metrics_to_plot = ['ari', 'nmi', 'purity', 'silhouette']

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

for idx, metric in enumerate(metrics_to_plot):
    ax = axes[idx]

    # Same guard as the ranking loop below: skip a metric whose column is
    # missing instead of failing inside pivot_table.
    if metric not in results_df.columns:
        ax.set_visible(False)
        continue

    # embedding x algorithm grid, averaged over datasets.
    pivot_data = results_df.pivot_table(
        values=metric,
        index='embedding',
        columns='algorithm',
        aggfunc='mean'
    )

    # An all-NaN metric (every run failed) gives an empty pivot, which
    # cannot be drawn as a heatmap.
    if pivot_data.empty:
        ax.set_visible(False)
        continue

    sns.heatmap(
        pivot_data,
        annot=True,
        fmt='.3f',
        cmap='YlOrRd',
        ax=ax,
        cbar_kws={'label': metric.upper()}
    )

    ax.set_title(f'{metric.upper()} - M√©dia entre Datasets', fontsize=12, fontweight='bold')
    ax.set_xlabel('Algoritmo', fontsize=10)
    ax.set_ylabel('Embedding', fontsize=10)

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
Path(FIGURES_DIR).mkdir(parents=True, exist_ok=True)
plt.savefig(FIGURES_DIR / 'clustering_metrics_heatmap.png', dpi=300, bbox_inches='tight')
print(f"‚úÖ Heatmap salvo em: {FIGURES_DIR / 'clustering_metrics_heatmap.png'}")
plt.show()

# Top-3 rows per metric.
print("\n" + "=" * 60)
print("MELHORES RESULTADOS POR M√âTRICA")
print("=" * 60)

for metric in metrics_to_plot:
    if metric in results_df.columns:
        best = results_df.nlargest(3, metric)[['dataset', 'embedding', 'algorithm', metric]]
        print(f"\nüèÜ Top 3 - {metric.upper()}:")
        print(best.to_string(index=False))
