## Comparison of the K-Means and MiniBatchKMeans clustering algorithms

In [None]:
import numpy as np

from sklearn.datasets import make_blobs

# Fix the global NumPy seed first so every run of the notebook produces the
# same results.
np.random.seed(0)

# Three 2-D blob centers; the number of clusters is derived from this list.
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)

# Mini-batch size used later by MiniBatchKMeans.
batch_size = 45

# Synthetic data set: 3000 points scattered around the centers above.
X, labels_true = make_blobs(
    n_samples=3000,
    centers=centers,
    cluster_std=0.4,
)

### Generate the data
* np.random.seed(0) 固定随机数种子，让每次运行结果一致
* make_blobs 用来生成模拟的聚类数据（高斯分布点云）

In [None]:
import time

from sklearn.cluster import KMeans

# Full-batch KMeans with k-means++ seeding and 10 initialisations. The cluster
# count comes from `n_clusters` (derived from `centers`) rather than a literal,
# so it stays in sync with the generated data.
k_means = KMeans(init="k-means++", n_clusters=n_clusters, n_init=10)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
k_means.fit(X)
# NOTE: despite its name, t_batch is the fit time of the full KMeans model,
# in seconds.
t_batch = time.perf_counter() - t0

### Compute clustering with KMeans
* KMeans(init="k-means++") 使用改进的初始化算法（比随机选取初始中心更稳定）
* n_init = 10 运行10次不同初始化，选择最佳结果
* fit(X) 执行聚类任务
* 训练前后打时间戳，t_batch 记录KMeans的运行时间

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Mini-batch variant of KMeans, updated from batches of `batch_size` samples.
# The cluster count comes from `n_clusters` (derived from `centers`) rather
# than a literal, so it stays in sync with the generated data.
mbk = MiniBatchKMeans(
    init="k-means++",
    n_clusters=n_clusters,
    batch_size=batch_size,
    n_init=10,
    # Stop early after 10 consecutive mini-batches without improvement.
    max_no_improvement=10,
    verbose=0,
)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
mbk.fit(X)
# Fit time of the mini-batch model, in seconds.
t_mini_batch = time.perf_counter() - t0

### Compute clustering with MiniBatchKMeans
MiniBatchKMeans 是 KMeans 的小批量版本，每次只取一部分样本来更新聚类中心
* max_no_improvement = 10 连续10次没有改进则停止

In [None]:
from sklearn.metrics.pairwise import pairwise_distances_argmin

# The KMeans centers define the reference cluster order.
k_means_cluster_centers = k_means.cluster_centers_

# order[i] is the index of the MiniBatchKMeans center closest to KMeans
# center i; indexing with it puts the mini-batch centers in the same order.
order = pairwise_distances_argmin(k_means_cluster_centers, mbk.cluster_centers_)
mbk_means_cluster_centers = mbk.cluster_centers_[order]

# Label every sample with the index of its nearest center, once per model,
# so that equal indices refer to matching clusters.
k_means_labels, mbk_means_labels = (
    pairwise_distances_argmin(X, aligned_centers)
    for aligned_centers in (k_means_cluster_centers, mbk_means_cluster_centers)
)

### Establishing parity between clusters
重新排序Mini Batch的中心，使之与 KMeans 对齐
* pairwise_distances_argmin(A,B) 会找到A中每个点距离B中哪个最近

In [None]:
import matplotlib.pyplot as plt

# One wide figure holding three side-by-side panels with thin outer margins.
fig = plt.figure(figsize=(8, 3))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
# One colour per cluster, shared by the first two panels.
colors = ["#4EACC5", "#FF9C34", "#4E9A06"]

# Panel 1: KMeans assignments and centers.
ax = fig.add_subplot(1, 3, 1)
for k, col in enumerate(colors[:n_clusters]):
    in_cluster = k_means_labels == k
    center = k_means_cluster_centers[k]
    # Samples of this cluster, drawn as small dots in the cluster colour.
    ax.plot(X[in_cluster, 0], X[in_cluster, 1], "w", markerfacecolor=col, marker=".")
    # The cluster center: larger circle, same colour, black outline.
    center_style = dict(markerfacecolor=col, markeredgecolor="k", markersize=6)
    ax.plot(center[0], center[1], "o", **center_style)
ax.set_title("KMeans")
# Hide the ticks on both axes.
for hide_ticks in (ax.set_xticks, ax.set_yticks):
    hide_ticks(())
# Annotate the panel with the fit time and the final inertia.
kmeans_caption = "train time: %.2fs\ninertia: %f" % (t_batch, k_means.inertia_)
plt.text(-3.5, 1.8, kmeans_caption)

# Panel 2: MiniBatchKMeans assignments, using the centers that were reordered
# to line up with the KMeans centers.
ax = fig.add_subplot(1, 3, 2)
for k in range(min(n_clusters, len(colors))):
    col = colors[k]
    selected = mbk_means_labels == k
    xs, ys = X[selected, 0], X[selected, 1]
    # Samples of this cluster, drawn as small dots in the cluster colour.
    ax.plot(xs, ys, "w", markerfacecolor=col, marker=".")
    # The cluster center: larger circle, same colour, black outline.
    center = mbk_means_cluster_centers[k]
    ax.plot(center[0], center[1], "o", markerfacecolor=col, markeredgecolor="k", markersize=6)
ax.set_title("MiniBatchKMeans")
# Hide the ticks on both axes.
no_ticks = ()
ax.set_xticks(no_ticks)
ax.set_yticks(no_ticks)
# Annotate the panel with the fit time and the final inertia.
mbk_caption = "train time: %.2fs\ninertia: %f" % (t_mini_batch, mbk.inertia_)
plt.text(-3.5, 1.8, mbk_caption)

# Panel 3: highlight the samples on which the two models disagree.
# Both label arrays index the same (aligned) cluster order, so an element-wise
# comparison gives the disagreement mask directly. This works for any number
# of clusters, unlike seeding the mask with a sentinel label that is assumed
# never to occur.
different = k_means_labels != mbk_means_labels
identical = np.logical_not(different)

ax = fig.add_subplot(1, 3, 3)
# Samples assigned to the same cluster by both models: grey.
ax.plot(X[identical, 0], X[identical, 1], "w", markerfacecolor="#bbbbbb", marker=".")
# Samples assigned to different clusters: magenta.
ax.plot(X[different, 0], X[different, 1], "w", markerfacecolor="m", marker=".")
ax.set_title("Difference")
ax.set_xticks(())
ax.set_yticks(())

plt.show()

### Plotting the results
* KMeans 图像
* MiniBatchKMeans 图像
* 对比差异图
    * different 是一个布尔数组， True 表示该点被分到不同的簇
    * identical 表示一致
    * 灰色表示一致，品红色表示不同

一般来说，差异点很少，说明 MiniBatchKMeans 的结果足够接近标准 KMeans。

| 算法                  | 特点          | 适用场景       |
| ------------------- | ----------- | ---------- |
| **KMeans**          | 精度高，计算全量数据  | 小数据或精确分析   |
| **MiniBatchKMeans** | 速度快，用随机批量近似 | 大规模数据或流式数据 |
