### 第十四章 聚类方法

In [19]:
import numpy as np

def bottom_up_clustering(d: np.ndarray) -> np.ndarray:
    """
    Agglomerative (bottom-up) hierarchical clustering with single linkage.

    Every sample starts in its own cluster, labelled 1..n. On each step the
    two clusters with the smallest inter-cluster distance (the minimum
    pairwise sample distance between their members) are merged and given a
    fresh label (n+1, n+2, ...). Ties go to the first pair encountered when
    scanning the current labels in ascending order.

    :param d: square (n, n) matrix of pairwise sample distances
    :return: integer array with one row per step; row t holds the cluster
             label of every sample after t merges (row 0 is the initial
             one-cluster-per-sample state, the last row has a single label)
    """
    d = np.asarray(d)
    n = len(d)
    cluster = np.arange(1, n + 1)

    clusters = [cluster.copy()]
    cluster_id = n + 1
    # Merging n singleton clusters into one takes exactly n - 1 steps.
    for _ in range(n - 1):
        labels = np.unique(cluster)
        # Sample indices of each current cluster, computed once per step
        # instead of once per candidate pair.
        members = [np.flatnonzero(cluster == label) for label in labels]

        # Find the closest pair of clusters. `merge` starts as None so that
        # a pair is always selected, even when every distance is inf/NaN
        # and no strict improvement over `min_dist` is ever observed.
        merge = None
        min_dist = np.inf
        for i in range(len(labels) - 1):
            for j in range(i + 1, len(labels)):
                dist = d[np.ix_(members[i], members[j])].min()
                if merge is None or dist < min_dist:
                    merge = (i, j)
                    min_dist = dist

        # Merge the two closest clusters under a new label.
        i, j = merge
        cluster[np.concatenate((members[i], members[j]))] = cluster_id
        clusters.append(cluster.copy())
        cluster_id += 1
    return np.array(clusters)
            
# Pairwise distance matrix for five samples (symmetric, zero diagonal).
d = np.array(
    [
        [0, 7, 2, 9, 3],
        [7, 0, 5, 4, 6],
        [2, 5, 0, 8, 1],
        [9, 4, 8, 0, 5],
        [3, 6, 1, 5, 0],
    ]
)
# Left as a bare trailing expression so the notebook cell displays the result.
bottom_up_clustering(d)

array([[1, 2, 3, 4, 5],
       [1, 2, 6, 4, 6],
       [7, 2, 7, 4, 7],
       [7, 8, 7, 8, 7],
       [9, 9, 9, 9, 9]])

In [None]:
def k_means(X: np.ndarray, k: int, max_iter: int = 5) -> np.ndarray:
    """
    k-means clustering, seeded with the first k samples as initial centres.

    Each iteration assigns every sample to its nearest centre (squared
    Euclidean distance, ties go to the lowest cluster index) and then moves
    each centre to the mean of its assigned samples. Iteration stops early,
    with a printed message, once two consecutive assignments are identical.

    :param X: sample matrix of shape (n_samples, n_features)
    :param k: number of clusters, 1 <= k <= n_samples
    :param max_iter: maximum number of assign/update iterations (>= 1)
    :return: integer array of length n_samples holding the cluster index
             (0..k-1) assigned to each sample
    :raises ValueError: if k or max_iter is out of range
    """
    X = np.asarray(X, dtype=float)
    n, m = X.shape
    if not 1 <= k <= n:
        raise ValueError(
            f"k must be between 1 and the number of samples ({n}), got {k}"
        )
    if max_iter < 1:
        raise ValueError(f"max_iter must be at least 1, got {max_iter}")

    # Copy so that in-place centre updates never write through to X.
    centers = X[:k].copy()

    # No previous assignment yet. Using None (rather than an all-zero array)
    # prevents a genuine all-zero first assignment from being mistaken for
    # convergence.
    old_cluster = None
    for num_iter in range(1, max_iter + 1):
        # Assignment step: (n, 1, m) - (k, m) broadcasts to (n, k, m).
        dist = np.sum((X[:, np.newaxis, :] - centers) ** 2, axis=-1)
        cluster = np.argmin(dist, axis=-1)

        # Update step: per-cluster sums via a one-hot membership matrix.
        one_hot = np.eye(k)[cluster]          # shape (n_samples, k)
        sum_clusters = one_hot.T @ X          # shape (k, n_features)
        count_clusters = one_hot.sum(axis=0)  # shape (k,)
        # An empty cluster keeps its previous centre; dividing by its zero
        # count would turn the centre into NaN.
        non_empty = count_clusters > 0
        centers[non_empty] = (
            sum_clusters[non_empty] / count_clusters[non_empty, np.newaxis]
        )

        if old_cluster is not None and np.array_equal(old_cluster, cluster):
            print(f"在第{num_iter}步，新的类没有改变，停止")
            break
        old_cluster = cluster
    return cluster

# Five 2-D samples: written feature-by-feature, then transposed so that
# each row of X is one sample.
X = np.transpose(
    np.array(
        [
            [0, 0, 1, 5, 5],
            [2, 0, 0, 0, 2],
        ]
    )
)
# Left as a bare trailing expression so the notebook cell displays the result.
k_means(X, 2)

(2, 2)
在第2步，新的类没有改变，停止


array([0, 1, 1, 1, 0])