In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D

# Load the feature matrix and the encoded labels from disk.
# `data` / `label` are kept as aliases of the same arrays.
X_train = data = np.load('../pro_data/X_train_cluster.npy')
y_train = label = np.load('../pro_data/y_encoded_train_cluster.npy')

# Features and labels are indexed with the same row indices below, so they
# must line up one-to-one.
if len(X_train) != len(y_train):
    raise ValueError(
        f"X_train has {len(X_train)} rows but y_train has {len(y_train)} rows"
    )

# Upper bound on the number of rows handed to the clustering cells.
# The data set has 10888 rows in total, so with this value nothing is dropped;
# lower it to work on a random subset.
sample_size = 15000
if len(X_train) > sample_size:
    # Use a seeded generator so that the subset (and every metric computed
    # from it) is the same after "Restart Kernel -> Run All".
    indices = np.random.default_rng(42).choice(len(X_train), sample_size, replace=False)
    X_sample = X_train[indices]
    y_sample = y_train[indices]
else:
    X_sample = X_train
    y_sample = y_train



In [3]:
from sklearn.cluster import KMeans

def apply_kmeans(X, n_clusters=7, random_state=42):
    """Cluster ``X`` with K-Means and return one cluster id per row.

    Args:
        X: Samples to cluster, passed straight to ``KMeans.fit``.
        n_clusters: Number of clusters to form.
        random_state: Seed forwarded to ``KMeans`` for reproducible centroids.

    Returns:
        The ``labels_`` array of the fitted estimator.
    """
    print("应用 K-Means 聚类...")
    model = KMeans(
        n_clusters=n_clusters,
        random_state=random_state,
        n_init='auto',
    )
    # Fit, then read the assignments back from the estimator.
    model.fit(X)
    return model.labels_

# Cluster the sampled features into 7 groups; `kmeans_labels` is consumed by
# the K-Means evaluation cell further down.
kmeans_labels = apply_kmeans(X_sample, n_clusters=7)

应用 K-Means 聚类...


In [9]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score, v_measure_score
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import numpy as np
from collections import Counter

def purity_score(y_true, y_pred):
    """Compute clustering purity.

    Each predicted cluster is credited with the size of its most frequent
    true class; purity is the sum of those counts divided by the number of
    samples.

    Args:
        y_true: 1-D sequence of ground-truth class labels (list or array).
        y_pred: 1-D sequence of predicted cluster ids, same length as
            ``y_true``.

    Returns:
        Purity in ``(0, 1]``; ``1.0`` means every cluster contains a single
        true class.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Accept plain Python sequences as well as NumPy arrays; boolean-mask
    # indexing below only works on arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {len(y_true)} and {len(y_pred)}"
        )
    if len(y_true) == 0:
        raise ValueError("purity_score is undefined for empty input")

    total_correct = 0
    for cluster in np.unique(y_pred):
        # Count how often each true class occurs inside this cluster and keep
        # the size of the dominant one.
        _, counts = np.unique(y_true[y_pred == cluster], return_counts=True)
        total_correct += counts.max()
    return total_correct / len(y_true)

from scipy.spatial.distance import cdist
import numpy as np

def dunn_index(X, labels):
    """Compute the Dunn index of a clustering (higher is better).

    The index is the smallest distance between points of two different
    clusters divided by the largest cluster diameter (largest distance between
    two points of the same cluster). Distances are whatever ``cdist`` computes
    by default.

    Note: full pairwise distance matrices are materialised per cluster and per
    cluster pair, so memory grows quadratically with cluster size.

    Args:
        X: 2-D array-like of samples, one row per sample.
        labels: 1-D array-like of cluster ids, one per row of ``X``.

    Returns:
        The Dunn index as a float.

    Raises:
        ValueError: If ``X`` and ``labels`` differ in length, or if fewer than
            two distinct clusters are present (the index is undefined then).
    """
    # Accept lists as well as arrays; boolean-mask indexing needs arrays.
    X = np.asarray(X)
    labels = np.asarray(labels)
    if len(X) != len(labels):
        raise ValueError(
            f"X and labels must have the same length, got {len(X)} and {len(labels)}"
        )

    clusters = np.unique(labels)
    if len(clusters) < 2:
        # Without a second cluster there is no inter-cluster distance.
        raise ValueError(
            f"dunn_index requires at least 2 clusters, got {len(clusters)}"
        )
    cluster_data = [X[labels == c] for c in clusters]

    # Largest intra-cluster distance (diameter). Singleton clusters have no
    # pairwise distance and are skipped.
    intra_dists = [np.max(cdist(c, c)) for c in cluster_data if len(c) > 1]
    max_intra = max(intra_dists) if intra_dists else 0.0
    # Floor the denominator so that all-singleton clusterings and clusters made
    # of identical points are treated alike instead of dividing by zero.
    max_intra = max(max_intra, 1e-10)

    # Smallest inter-cluster distance over every unordered pair of clusters.
    min_inter = np.inf
    for i in range(len(cluster_data)):
        for j in range(i + 1, len(cluster_data)):
            dist = np.min(cdist(cluster_data[i], cluster_data[j]))
            min_inter = min(min_inter, dist)

    return float(min_inter / max_intra)


def evaluate_clustering(X, y_true, y_pred):
    """Print external and internal quality metrics for a clustering.

    External metrics compare ``y_pred`` with ``y_true``; internal metrics are
    computed from ``X`` and ``y_pred`` only. Each metric is evaluated right
    before its line is printed. Nothing is returned.

    Args:
        X: Samples that were clustered.
        y_true: Ground-truth class label per sample.
        y_pred: Predicted cluster id per sample.
    """
    # (line prefix, metric taking (y_true, y_pred)); all shown with 4 decimals.
    label_based = (
        ("  ARI  (Adjusted Rand Index):     ", adjusted_rand_score),
        ("  NMI  (Normalized Mutual Info):  ", normalized_mutual_info_score),
        ("  FMI  (Fowlkes-Mallows Index):   ", fowlkes_mallows_score),
        ("  V-measure:                     ", v_measure_score),
        ("  Purity:                        ", purity_score),
    )
    print("📊 外部指标：")
    for prefix, metric in label_based:
        print(f"{prefix}{metric(y_true, y_pred):.4f}")

    # (line prefix, metric taking (X, y_pred), format spec for the value).
    geometry_based = (
        ("  Silhouette Score:               ", silhouette_score, ".4f"),
        ("  Calinski-Harabasz Index:        ", calinski_harabasz_score, ".2f"),
        ("  Davies-Bouldin Index:           ", davies_bouldin_score, ".4f"),
        ("  Dunn Index:                     ", dunn_index, ".4f"),
    )
    print("\n📈 内部指标：")
    for prefix, metric, spec in geometry_based:
        print(f"{prefix}{metric(X, y_pred):{spec}}")


In [10]:
# Reduce each label row to the index of its largest entry, giving one integer
# class id per sample (presumably the rows are one-hot encoded — confirm
# against how the .npy file was produced).
y_true = y_sample.argmax(axis=1)

# Score the K-Means assignment against those class ids.
print("🔵 K-Means 聚类评估结果：")
evaluate_clustering(X=X_sample, y_true=y_true, y_pred=kmeans_labels)


🔵 K-Means 聚类评估结果：
📊 外部指标：
  ARI  (Adjusted Rand Index):     0.6633
  NMI  (Normalized Mutual Info):  0.7095
  FMI  (Fowlkes-Mallows Index):   0.7234
  V-measure:                     0.7095
  Purity:                        0.7954

📈 内部指标：
  Silhouette Score:               0.3089
  Calinski-Harabasz Index:        6219.84
  Davies-Bouldin Index:           1.1016
  Dunn Index:                     0.0080


In [5]:
from sklearn.cluster import AgglomerativeClustering

def apply_hierarchical_clustering(X, n_clusters=10):
    """Cluster ``X`` with agglomerative clustering and return one id per row.

    Args:
        X: Samples to cluster, passed straight to ``AgglomerativeClustering.fit``.
        n_clusters: Number of clusters to produce.

    Returns:
        The ``labels_`` array of the fitted estimator.
    """
    print("应用 Hierarchical Clustering...")
    model = AgglomerativeClustering(n_clusters=n_clusters)
    # Fit, then read the assignments back from the estimator.
    model.fit(X)
    return model.labels_


In [11]:
# Reduce each label row to the index of its largest entry, giving one integer
# class id per sample (recomputed here so the cell does not rely on the
# K-Means evaluation cell having run).
y_true = y_sample.argmax(axis=1)

# Header first, then cluster into 7 groups and score the result.
print("🔵 层次聚类评估结果：")
hierarchical_labels = apply_hierarchical_clustering(X=X_sample, n_clusters=7)
evaluate_clustering(X=X_sample, y_true=y_true, y_pred=hierarchical_labels)

🔵 层次聚类评估结果：
应用 Hierarchical Clustering...
📊 外部指标：
  ARI  (Adjusted Rand Index):     0.6490
  NMI  (Normalized Mutual Info):  0.6963
  FMI  (Fowlkes-Mallows Index):   0.7118
  V-measure:                     0.6963
  Purity:                        0.8090

📈 内部指标：
  Silhouette Score:               0.2622
  Calinski-Harabasz Index:        5562.26
  Davies-Bouldin Index:           1.2477
  Dunn Index:                     0.0124
