## 聚类模型评估

评价clustering模型的方法

1. 如果有真实类别，可以计算 FPR 和 TPR 并绘制 ROC 曲线。

2. Pairwise Similarity 计算样本对是否在同一聚类中，并与真实类别进行对比

3. 如果只是想评估聚类质量，可以使用 ARI 或 NMI（两者都需要真实类别）。

| 适用场景 | 推荐指标 | 说明 |
| --- | --- | --- |
| 真实类别数目已知、类别数相近 | ARI | 调整了随机性，适用于类别稳定的情况 |
| 真实类别数目可能变化 | NMI | 归一化互信息，可用于不同类别数的情况 |
| 无监督聚类，无真实类别 | Pairwise Similarity | 通过样本对计算相似性，可用于无标签数据 |

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, pairwise_distances

def evaluate_clustering_models(labels_true, cluster_results, model_names):
    """
    Compute clustering metrics per model, print them as a table, plot them
    as a bar chart, and return the table.

    When ``labels_true`` is given, each model is scored with ARI and NMI.
    When it is ``None``, each model is scored with "Pairwise Similarity":
    the fraction of sample pairs (each sample paired with itself included)
    that were assigned to the same cluster.

    Args:
        labels_true: Ground-truth labels, or ``None`` if unavailable.
        cluster_results: Mapping from model name to that model's predicted
            cluster labels (array-like).
        model_names: Names of the models to evaluate; each must be a key of
            ``cluster_results``.

    Returns:
        A DataFrame with one row per model and one column per metric.
    """
    model_names = list(model_names)
    has_truth = labels_true is not None
    metrics = {}

    for name in model_names:
        if has_truth:
            metrics[name] = {
                "ARI": adjusted_rand_score(labels_true, cluster_results[name]),
                "NMI": normalized_mutual_info_score(labels_true, cluster_results[name]),
            }
        else:
            # Cluster ids are arbitrary, so the score must not depend on their
            # numeric values. With cluster sizes c_1..c_k and n samples, the
            # share of ordered pairs with equal labels is sum(c_i^2) / n^2.
            # For 0/1 labels this equals the former 1 - mean(|l_i - l_j|),
            # and it avoids materialising an n x n distance matrix.
            labels_pred = np.asarray(cluster_results[name]).ravel()
            _, cluster_sizes = np.unique(labels_pred, return_counts=True)
            similarity = np.sum((cluster_sizes / labels_pred.size) ** 2)
            metrics[name] = {"Pairwise Similarity": similarity}

    # One row per model, one column per metric.
    metrics_df = pd.DataFrame(metrics).T

    print("\nClustering Model Performance Metrics:")
    print(metrics_df)

    # Bar chart comparing the models.
    plt.figure(figsize=(8, 6))
    positions = np.arange(len(model_names))
    if has_truth:
        # Draw ARI and NMI side by side so neither bar hides the other.
        width = 0.4
        plt.bar(positions - width / 2, [metrics[name]["ARI"] for name in model_names],
                width=width, alpha=0.6, label="ARI")
        plt.bar(positions + width / 2, [metrics[name]["NMI"] for name in model_names],
                width=width, alpha=0.6, label="NMI")
    else:
        # The label gives plt.legend() below an entry to show.
        plt.bar(positions, [metrics[name]["Pairwise Similarity"] for name in model_names],
                color='blue', label="Pairwise Similarity")
    plt.xticks(positions, model_names)

    plt.xlabel("Models")
    plt.ylabel("Score")
    plt.title("Comparison of Clustering Performance")
    plt.legend()
    plt.show()

    return metrics_df

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

import matplotlib.pyplot as plt

def _binarize_labels(labels, n_classes):
    """Return an (n_samples, n_classes) 0/1 indicator matrix for labels in range(n_classes).

    label_binarize collapses the two-class case into a single column (the
    indicator of the second class). Expand it back to two columns so that
    column i always refers to class i and the per-class loop below can
    index every class.
    """
    indicator = label_binarize(labels, classes=range(n_classes))
    if n_classes == 2 and indicator.shape[1] == 1:
        indicator = np.hstack([1 - indicator, indicator])
    return indicator


# NOTE(review): the curves below compare cluster id i against class id i
# directly. This presumably relies on cluster ids having been aligned with the
# ground-truth class ids beforehand -- confirm against the cell that builds
# 'KMeans Cluster (Optimal)'.

# Binarize the ground truth labels
n_classes = optimal_k  # Set n_classes to the number of clusters
y_train_binarized = _binarize_labels(CO2_train_labels, n_classes)
y_test_binarized = _binarize_labels(CO2_test_labels, n_classes)

# Binarize the clustering labels
train_kmeans_binarized = _binarize_labels(train_df['KMeans Cluster (Optimal)'], n_classes)
test_kmeans_binarized = _binarize_labels(test_df['KMeans Cluster (Optimal)'], n_classes)

# Compute ROC curve and ROC area for each class (one-vs-rest, training split).
# The test-split matrices above are built but not used in this cell.
fpr = {}
tpr = {}
roc_auc = {}

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_train_binarized[:, i], train_kmeans_binarized[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area by pooling every
# (sample, class) indicator into one binary problem.
fpr["micro"], tpr["micro"], _ = roc_curve(y_train_binarized.ravel(), train_kmeans_binarized.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Plot ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr["micro"], tpr["micro"], label=f'Micro-average ROC curve (area = {roc_auc["micro"]:.2f})', color='deeppink', linestyle=':', linewidth=4)

for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f'ROC curve of class {i} (area = {roc_auc[i]:.2f})')

# Diagonal reference line (TPR == FPR).
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for KMeans Clustering')
plt.legend(loc="lower right")
plt.grid()
plt.show()