### kmeans

#### setup

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats into a list of rows.

    Parameters
    ----------
    fileName : str or path-like
        Path of the file to read. Every non-blank line must contain
        tab-separated values that ``float()`` can parse.

    Returns
    -------
    list[list[float]]
        One inner list per non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager guarantees the handle is closed, even if a
    # malformed field raises part-way through the file.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no sample; float('') would raise on them.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat

In [3]:
def distEculd(vecA,vecB):
    """Return the Euclidean distance between two vectors.

    Used to measure how far a sample vector lies from a centroid. The
    inputs must support element-wise subtraction (NumPy arrays/matrices).
    """
    difference = vecA - vecB
    squared = np.power(difference, 2.0)
    return np.sqrt(np.sum(squared))
    

In [4]:
def randCent(dataSet,k):
    """Draw ``k`` random initial centroids inside the bounding box of the data.

    Parameters
    ----------
    dataSet : numpy.matrix or 2-D numpy.ndarray
        Samples, one per row and one feature per column.
    k : int
        Number of clusters, i.e. number of centroids to generate.

    Returns
    -------
    numpy.matrix
        ``k`` x ``n`` float32 matrix; column ``j`` is drawn uniformly from
        ``[min_j, max_j)`` of feature ``j``. Values come from the global
        ``np.random`` state, so seed it for reproducible results.

    Raises
    ------
    ValueError
        If ``dataSet`` has no rows (min/max of an empty column).
    """
    n = dataSet.shape[1]
    # np.asmatrix instead of np.mat: the `mat` alias was removed in NumPy 2.0.
    centroids = np.asmatrix(np.zeros((k, n), np.float32))  # k rows, n columns
    for j in range(n):  # one feature at a time
        column = dataSet[:, j]
        # .min()/.max() return plain scalars even for np.matrix input; the
        # builtin min()/max() would yield 1x1 matrices, and float() on those
        # is deprecated since NumPy 1.25.
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # k random coordinates for feature j, scaled into [minJ, minJ + rangeJ).
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids
    

In [5]:
# Load the tab-delimited samples and wrap them in a matrix, because
# randCent indexes its input as dataSet[:,j].
# np.asmatrix is used because the np.mat alias was removed in NumPy 2.0.
X_train=np.asmatrix(loadDataSet("testSet.txt"))
# Show three random starting centroids drawn from the data's value range.
randCent(X_train,3)

matrix([[ 0.18961796,  2.45408273],
        [-2.44573784,  4.51775837],
        [-0.0731729 , -3.46519756]], dtype=float32)

In [53]:
# Sanity check: Euclidean distance between the first two rows of X_train.
distEculd(X_train[0],X_train[1])

5.184632816681332

#### kmeans

kmeans简图
![kmeans简图](https://note.youdao.com/yws/api/personal/file/9D61D024B3E745408C249F766CC26692?method=download&shareKey=fe44472dbea390a0bba14907c3a2324c)


In [42]:
def kmeans(dataSet,k,distMeas=distEculd,centSetup=randCent):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Alternates between (1) assigning every sample to its nearest centroid and
    (2) moving every centroid to the mean of its assigned samples, until a
    full pass leaves all assignments unchanged. The initial centroids and
    each updated centroid are printed as the algorithm runs.

    Parameters
    ----------
    dataSet : numpy.matrix
        Samples, one per row.
    k : int
        Number of clusters.
    distMeas : callable, optional
        ``distMeas(sample_row, centroid_row) -> float`` distance function.
    centSetup : callable, optional
        ``centSetup(dataSet, k)`` returning the ``k`` initial centroids as a
        matrix with one centroid per row.

    Returns
    -------
    centroids : numpy.matrix
        The final centroids, one per row (the object returned by
        ``centSetup``, updated in place).
    clusterAssment : numpy.matrix
        ``m`` x 2 float32 matrix; column 0 is the index of the cluster each
        sample belongs to, column 1 the squared distance to that centroid.
    """
    m = dataSet.shape[0]
    # Per-sample bookkeeping: [cluster index, squared distance].
    # np.asmatrix instead of np.mat: the `mat` alias was removed in NumPy 2.0.
    clusterAssment = np.asmatrix(np.zeros((m, 2), np.float32))
    # Start every sample as "unassigned" (-1). With a zero start, a first-pass
    # assignment to cluster 0 would not register as a change, and the loop
    # could stop with distances measured against the random initial centroids.
    clusterAssment[:, 0] = -1
    centroids = centSetup(dataSet, k)  # initial set of centroids
    print("setup:%s"%(centroids))
    clusterChanged = True
    while clusterChanged:  # repeat until no sample switches cluster
        clusterChanged = False
        # Step 1: assign each sample to its nearest centroid.
        for i in range(m):
            mindistV = np.inf
            minIndex = -1
            for j in range(k):
                distV = distMeas(dataSet[i, :], centroids[j, :])
                if mindistV > distV:
                    # Remember the closest centroid seen so far.
                    mindistV = distV
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                # A sample moved, so the centroids need another update pass.
                clusterChanged = True
            # Assign element-wise; a single tuple assignment to the row is
            # mis-parsed by newer NumPy versions.
            clusterAssment[i, 0] = minIndex
            clusterAssment[i, 1] = mindistV**2
        # Step 2: once every sample is assigned, move each centroid to the
        # mean of its members.
        for cent in range(k):
            members = np.nonzero(clusterAssment[:, 0].A == cent)[0]
            if members.size > 0:
                # Column-wise mean of all points in this cluster.
                centroids[cent, :] = np.mean(dataSet[members], axis=0)
            # else: keep the previous centroid; the mean of an empty cluster
            # is NaN and would permanently disable this centroid.
            print("centroids[%s,:]=%s"%(cent,centroids[cent,:]))
    return centroids,clusterAssment

In [43]:
# Cluster X_train into 3 groups. distEculd and randCent are passed
# explicitly here, but they are also kmeans' default arguments.
kmeans(X_train,3,distEculd,randCent)

setup:[[-1.06504762  0.26967973]
 [ 2.2790947  -2.93740654]
 [ 1.25116587  4.33137274]]
centroids[0,:]=[[-3.04212356 -0.1844383 ]]
centroids[1,:]=[[ 2.51621747 -2.9689095 ]]
centroids[2,:]=[[ 2.10377479  3.18830323]]
centroids[0,:]=[[-3.18695354 -0.35938492]]
centroids[1,:]=[[ 2.65077376 -2.79019022]]
centroids[2,:]=[[ 1.98283625  3.14652348]]
centroids[0,:]=[[-3.18695354 -0.35938492]]
centroids[1,:]=[[ 2.65077376 -2.79019022]]
centroids[2,:]=[[ 1.98283625  3.14652348]]


(matrix([[-3.18695354, -0.35938492],
         [ 2.65077376, -2.79019022],
         [ 1.98283625,  3.14652348]], dtype=float32),
 matrix([[  2.        ,   1.40131807],
         [  0.        ,  14.38757706],
         [  1.        ,   7.46973991],
         [  0.        ,  13.8245163 ],
         [  2.        ,   1.07012844],
         [  0.        ,   3.72100019],
         [  1.        ,   5.10287666],
         [  0.        ,   1.95344436],
         [  2.        ,   2.87820554],
         [  0.        ,  12.60713387],
         [  1.        ,   1.72819698],
         [  0.        ,   7.66752386],
         [  2.        ,   4.97829962],
         [  0.        ,  12.03944397],
         [  1.        ,   9.1285305 ],
         [  1.        ,  10.63785839],
         [  2.        ,   3.19328713],
         [  2.        ,   7.72812891],
         [  1.        ,   0.40704465],
         [  0.        ,   3.70393801],
         [  2.        ,   0.1987066 ],
         [  0.        ,  10.45096111],
         [  1.