In [57]:
import numpy as np  
import matplotlib.pyplot as plt  
import matplotlib as mpl

from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import MiniBatchKMeans, KMeans 

from sklearn import metrics
from sklearn.metrics.pairwise import pairwise_distances_argmin  

import time

In [58]:
# Set matplotlib properties so that Chinese text is not rendered as garbled
# characters: use the SimHei font for the sans-serif family and turn off the
# Unicode minus sign on axes.
mpl.rcParams.update({
    'font.sans-serif': [u'SimHei'],
    'axes.unicode_minus': False,
})

##### 1、产生模型数据

In [59]:
# Blob centres used to generate the data; the number of clusters requested
# from the models is derived from this list.
centers = [
    [1, 1],
    [-1, -1],
    [1, -1],
]
clusters = len(centers)

# Y is the class of each generated sample. In real work such labels are
# assigned by hand; here they serve only to judge the clustering quality.
X, Y = make_blobs(
    n_samples=3000,
    centers=centers,
    cluster_std=0.7,
    random_state=28,
)
X.shape

(3000, 2)

##### 2.1、模型构建 + 训练

In [60]:
# Build the K-Means model: k-means++ initialisation, one cluster per
# generating centre, fixed seed for reproducibility.
k_means = KMeans(init='k-means++', n_clusters=clusters, random_state=28)

# Time only the fit call and report it, so the cell reproduces the training
# time shown in its recorded output. perf_counter() is used because it is a
# high-resolution clock intended for measuring short intervals.
t0 = time.perf_counter()
k_means.fit(X)
km_batch = time.perf_counter() - t0
print("K-Means算法模型训练消耗时间:%.4fs" % km_batch)

K-Means算法模型训练消耗时间:0.0479s


##### 2.2、模型构建 + 训练
—— MiniBatchKMeans()
#batch_size：批处理参数，也可以理解为抽样参数，最大值为训练集样本总数

In [62]:
# Build the Mini Batch K-Means model with the same initialisation, cluster
# count and seed as the K-Means model; batch_size is the number of samples
# used per mini-batch.
batch_size = 100
mbk = MiniBatchKMeans(init='k-means++', n_clusters=clusters, batch_size=batch_size, random_state=28)

# Time only the fit call and report it, so the cell reproduces the training
# time shown in its recorded output. perf_counter() is used because it is a
# high-resolution clock intended for measuring short intervals.
t0 = time.perf_counter()
mbk.fit(X)
mbk_batch = time.perf_counter() - t0
print("Mini Batch K-Means算法模型训练消耗时间:%.4fs" % mbk_batch)

Mini Batch K-Means算法模型训练消耗时间:0.0359s


##### 3、模型预测

labels_ ：获取聚类标签

`k_means.labels_` 和 `k_means.predict(X)`（X 为训练数据时）效果是一样的 ★★

In [None]:
# Predict the cluster index of every sample with each fitted model.
# Alternative: read `k_means.labels_` / `mbk.labels_` instead of calling
# predict (the notebook text states both give the same result here).
km_y_hat, mbkm_y_hat = (model.predict(X) for model in (k_means, mbk))

##### 4.1、簇中心点

—— pairwise_distances_argmin(X,Y)：对X中的每个点，返回点集Y中与其距离最近的点的索引；当Y为簇中心时，也就是获取样本的类标签

In [64]:
# For every sample, the index of the closest cluster centre of each model
# (order1: K-Means, order2: Mini Batch K-Means), printed one array per line.
order1, order2 = (
    pairwise_distances_argmin(X, model.cluster_centers_)
    for model in (k_means, mbk)
)
print(order1, order2, sep="\n")

[0 2 2 ... 1 1 0]
[1 0 0 ... 2 2 1]


In [65]:
# Show the centres found by each model, each listing followed by a blank line.
print("K-Means算法聚类中心点:\ncenter = ", k_means.cluster_centers_, end="\n\n")
print("Mini Batch K-Means算法聚类中心点:\ncenter =", mbk.cluster_centers_, end="\n\n")

# Compare the two clustering results: for each K-Means centre, find the index
# of the nearest Mini Batch K-Means centre, i.e. how one model's cluster
# numbering lines up with the other's.
order = pairwise_distances_argmin(
    X=k_means.cluster_centers_,
    Y=mbk.cluster_centers_,
)
print(order)

K-Means算法聚类中心点:
center =  [[-1.0600799  -1.05662982]
 [ 1.02975208 -1.07435837]
 [ 1.01491055  1.02216649]]

Mini Batch K-Means算法聚类中心点:
center = [[ 0.99602094  1.10688195]
 [-1.00828286 -1.05983915]
 [ 1.07892315 -0.94286826]]

[1 2 0]


##### 4.2、效果评估
——metrics.adjusted_rand_score(label_true,label_predict)：ARI

——metrics.v_measure_score(label_true,label_predict)：均一性（homogeneity）和完整性（completeness）的调和平均

——metrics.adjusted_mutual_info_score(label_true,label_predict)：AMI

——metrics.mutual_info_score(label_true,label_predict)：互信息
#label_true：真实值（分类效果）
#label_predict：预测值

In [66]:
# Clustering quality metrics; each is called as score_func(true_labels, predicted_labels).
score_funcs = [
    metrics.adjusted_rand_score,         # ARI (adjusted Rand index)
    metrics.v_measure_score,             # harmonic mean of homogeneity and completeness
    metrics.adjusted_mutual_info_score,  # AMI (adjusted mutual information)
    metrics.mutual_info_score,           # mutual information
]

# Evaluate both models with every metric and report the score plus the time
# the metric took. perf_counter() is used because these intervals are on the
# order of milliseconds, and the elapsed time is captured immediately after
# the call so string formatting is not included in the measurement.
for score_func in score_funcs:
    t0 = time.perf_counter()
    km_scores = score_func(Y, km_y_hat)   # score of the K-Means labels
    km_elapsed = time.perf_counter() - t0
    print("K-Means算法:%s评估函数计算结果值:%.5f；计算消耗时间:%0.3fs" % (score_func.__name__, km_scores, km_elapsed))

    t0 = time.perf_counter()
    mbkm_scores = score_func(Y, mbkm_y_hat)   # score of the Mini Batch K-Means labels
    mbkm_elapsed = time.perf_counter() - t0
    print("Mini Batch K-Means算法:%s评估函数计算结果值:%.5f；计算消耗时间:%0.3fs\n" % (score_func.__name__, mbkm_scores, mbkm_elapsed))

K-Means算法:adjusted_rand_score评估函数计算结果值:0.72526；计算消耗时间:0.004s
Mini Batch K-Means算法:adjusted_rand_score评估函数计算结果值:0.72421；计算消耗时间:0.003s

K-Means算法:v_measure_score评估函数计算结果值:0.65754；计算消耗时间:0.002s
Mini Batch K-Means算法:v_measure_score评估函数计算结果值:0.65780；计算消耗时间:0.002s

K-Means算法:adjusted_mutual_info_score评估函数计算结果值:0.65726；计算消耗时间:0.003s
Mini Batch K-Means算法:adjusted_mutual_info_score评估函数计算结果值:0.65757；计算消耗时间:0.003s

K-Means算法:mutual_info_score评估函数计算结果值:0.72231；计算消耗时间:0.001s
Mini Batch K-Means算法:mutual_info_score评估函数计算结果值:0.72264；计算消耗时间:0.002s



