### kmeans

#### setup

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats into a list of rows.

    Parameters
    ----------
    fileName : str
        Path of the file to read. Every non-blank line must contain
        tab-separated values that ``float()`` can parse.

    Returns
    -------
    list[list[float]]
        One inner list per non-blank line, in file order.

    Raises
    ------
    ValueError
        If a field on a non-blank line cannot be converted to float.
    """
    dataMat = []
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no data; skipping
                # them avoids float('') raising ValueError.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat

In [3]:
# Euclidean distance between a sample vector and a centroid vector.
def distEculd(vecA,vecB):
    """Return the Euclidean (L2) distance between ``vecA`` and ``vecB``.

    Both arguments must support element-wise subtraction with each other
    (NumPy arrays or matrices of compatible shape).
    """
    delta = vecA - vecB
    # np.power is element-wise even for np.matrix inputs (unlike ``**``).
    squared = np.power(delta, 2.0)
    return np.sqrt(np.sum(squared))
    

In [4]:
# Generate k random initial centroids inside the bounding box of dataSet.
def randCent(dataSet,k):
    """Draw ``k`` random centroids, uniformly within each feature's range.

    Parameters
    ----------
    dataSet : 2-D array-like (ndarray or np.matrix), shape (m, n)
        The samples; only the per-column minimum and maximum are used.
    k : int
        Number of centroids to generate.

    Returns
    -------
    np.matrix, shape (k, n), dtype float32
        Row ``i`` is the i-th centroid; column ``j`` lies in
        ``[min_j, max_j)`` of the corresponding data column (before the
        float32 rounding).
    """
    # Work on a plain ndarray so column min/max are scalars rather than
    # 1x1 matrices (which would need a deprecated array->scalar conversion).
    data = np.asarray(dataSet, dtype=float)
    n = data.shape[1]
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    centroids = np.asmatrix(np.zeros((k, n), np.float32))  # k rows, n columns
    for j in range(n):  # one feature at a time (keeps the RNG draw order)
        column = data[:, j]
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # k uniform draws scaled into [minJ, minJ + rangeJ)
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)

    return centroids
    

In [5]:
# Load the samples and wrap them in a matrix in one step, so X_train is
# never rebound from a list to a matrix. np.asmatrix is used because
# np.mat was removed in NumPy 2.0.
X_train=np.asmatrix(loadDataSet("testSet.txt"))
# Display one random draw of 3 initial centroids (differs on every run).
randCent(X_train,3)

matrix([[ 3.80027628, -1.97353005],
        [ 4.11201334,  2.37524199],
        [-3.20596671,  2.57699013]], dtype=float32)

In [6]:
# Distance between the first two rows of X_train, as a quick check of distEculd.
distEculd(X_train[0],X_train[1])

5.184632816681332

#### kmeans

kmeans简图
![kmeans简图](https://note.youdao.com/yws/api/personal/file/9D61D024B3E745408C249F766CC26692?method=download&shareKey=fe44472dbea390a0bba14907c3a2324c)


In [7]:
def kmeans(dataSet,k,distMeas=distEculd,centSetup=randCent,verbose=True):
    """Cluster the rows of ``dataSet`` into ``k`` groups with Lloyd's k-means.

    Parameters
    ----------
    dataSet : 2-D ndarray or np.matrix, shape (m, n)
        One sample per row.
    k : int
        Number of clusters.
    distMeas : callable(vecA, vecB) -> float
        Distance between a sample row and a centroid row.
    centSetup : callable(dataSet, k) -> (k, n) matrix
        Produces the initial centroids.
    verbose : bool, default True
        When True, print every centroid after each update step (the
        original behaviour). Pass False to silence the log.

    Returns
    -------
    centroids : (k, n) matrix
        Final centroid positions.
    clusterAssment : np.matrix, shape (m, 2), dtype float32
        Column 0 is the index of the cluster each sample belongs to,
        column 1 is the squared distance to that cluster's centroid.
    """
    m,n=dataSet.shape
    # Per-sample bookkeeping: [cluster index, squared distance].
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    clusterAssment=np.asmatrix(np.zeros((m,2),np.float32))
    # Start from -1 ("unassigned"). With a 0 start, a first pass that puts
    # every sample in cluster 0 would look like "nothing changed" and the
    # loop would stop after a single centroid update.
    clusterAssment[:,0]=-1
    centroids=centSetup(dataSet,k)  # initial centroids
    clusterChanged=True
    while clusterChanged:  # repeat until no sample switches cluster
        clusterChanged=False

        # Step 1: assign every sample to its nearest centroid.
        for i in range(m):
            mindistV=np.inf
            minIndex=-1
            for j in range(k):
                distV=distMeas(dataSet[i,:],centroids[j,:])
                if mindistV>distV:
                    mindistV=distV
                    minIndex=j
            if clusterAssment[i,0]!=minIndex:
                clusterChanged=True  # a sample moved, so another pass is needed
            clusterAssment[i,0]=minIndex
            clusterAssment[i,1]=mindistV**2

        # Step 2: once every sample is assigned, move each centroid to the
        # mean of its members.
        for cent in range(k):
            members=np.nonzero(clusterAssment[:,0].A==cent)[0]
            if members.size:
                # Column-wise mean of the member rows.
                centroids[cent,:]=np.mean(dataSet[members],axis=0)
            # else: an empty cluster keeps its previous centroid; the mean of
            # zero rows would be NaN and the cluster could never recover.
            if verbose:
                print("centroids[%s,:]=%s"%(cent,centroids[cent,:]))
    return centroids,clusterAssment

In [8]:
# Run k-means on X_train with k=3, passing the distance and initialisation
# functions explicitly; keeps the returned centroids and per-sample assignments.
centroids,clusterAssment=kmeans(X_train,3,distEculd,randCent)

centroids[0,:]=[[ 3.54988217 -2.50520706]]
centroids[1,:]=[[ 0.88931602  3.23979449]]
centroids[2,:]=[[-2.26425862 -1.47397423]]
centroids[0,:]=[[ 2.97753215 -2.65120673]]
centroids[1,:]=[[ 0.38075384  3.12396836]]
centroids[2,:]=[[-3.2466433  -2.07882643]]
centroids[0,:]=[[ 2.86927819 -2.54779124]]
centroids[1,:]=[[ 0.14460655  3.09399199]]
centroids[2,:]=[[-3.35883355 -2.57409072]]
centroids[0,:]=[[ 2.86927819 -2.54779124]]
centroids[1,:]=[[ 0.0469085   3.05287671]]
centroids[2,:]=[[-3.34887719 -2.76960039]]
centroids[0,:]=[[ 2.86927819 -2.54779124]]
centroids[1,:]=[[-0.02298687  2.99472904]]
centroids[2,:]=[[-3.38237047 -2.9473362 ]]
centroids[0,:]=[[ 2.86927819 -2.54779124]]
centroids[1,:]=[[-0.02298687  2.99472904]]
centroids[2,:]=[[-3.38237047 -2.9473362 ]]


In [12]:
# Plot a clustering result.
# -------------
# dataSet: the samples (one per row; the first two columns are drawn)
# k: number of clusters
# clusterAssment: per-sample assignment; column 0 holds the cluster index
# centroids: the k cluster centroids (one per row)
# ax: optional matplotlib Axes to draw on (defaults to the current axes)
def show_clustter_image(dataSet,k,clusterAssment,centroids,ax=None):
    """Scatter-plot each cluster with its own marker and mark the centroids.

    Parameters
    ----------
    dataSet : 2-D array-like, shape (m, >=2)
        Samples; columns 0 and 1 are used as x and y.
    k : int
        Number of clusters to draw (indices ``0 .. k-1``).
    clusterAssment : 2-D array-like, shape (m, >=1)
        Column 0 is the cluster index of each sample.
    centroids : 2-D array-like, shape (k, >=2)
        Centroid coordinates; drawn as red stars.
    ax : matplotlib Axes, optional
        Target axes. When omitted, ``plt.gca()`` is used as before.
    """
    axes = plt.gca() if ax is None else ax
    # Plain ndarrays avoid the matrix/array indexing differences and the
    # np.mat alias that was removed in NumPy 2.0.
    points = np.asarray(dataSet)
    labels = np.asarray(clusterAssment)[:, 0]
    centers = np.asarray(centroids)
    markers=['o','v','+','x','^','<','>','1','2','3','4','s','p','*','h','H','D','d','|','_','.',',']
    for cent in range(k):
        sameClust = points[labels == cent]
        axes.scatter(sameClust[:, 0], sameClust[:, 1],
                     # wrap around so k larger than the marker list cannot raise IndexError
                     marker=markers[cent % len(markers)],
                     s=60, linewidth=2, label="cluster%s"%(cent))
        axes.plot(centers[cent, 0], centers[cent, 1], marker='*', c='r')

    axes.set_xlabel("x feature0")
    axes.set_ylabel("x feature1")
    axes.set_title("kmeans K:%s" %(k))
    axes.legend(frameon=True, fancybox=True)
    
# Draw X_train with k=3 using the clusterAssment and centroids variables.
show_clustter_image(X_train,3,clusterAssment,centroids)

SyntaxError: positional argument follows keyword argument (<ipython-input-12-558455a6f186>, line 24)