#1. K-means 算法

导入数据集

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
iris = pd.read_csv("iris.csv",header = None)

In [3]:
iris.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
iris.shape

(150, 5)

构建计算距离函数（欧式距离）

In [5]:
"""
函数功能：计算两个数据集之间的欧式距离
输入：两个array数据集
返回：两个数据集之间的欧式距离（此处用距离平方和代替距离）
"""
def distEclud(arrA, arrB):
    d = arrA - arrB
    dist = np.sum(np.power(d, 2), axis = 1)
    return dist

编写自动生成随机质心函数

In [6]:
"""
函数功能：随机生成K个质心
参数说明：
    dataSet:包含标签的数据集
    k：簇的个数
返回：
    data_cent:K个质心
"""
def randCent(dataSet, k):
    n = dataSet.shape[1]
    data_min = dataSet.iloc[:, : n-1].min()
    data_max = dataSet.iloc[:, : n-1].max()
    data_cent = np.random.uniform(data_min,data_max,(k, n-1))
    return data_cent

In [12]:
np.random.uniform(1,10,(2,3))

array([[8.75474169, 8.75029957, 5.92501964],
       [5.19314295, 6.87618574, 2.2923776 ]])

In [7]:
iris_cent = randCent(iris,3)
iris_cent

array([[5.57723485, 3.03501986, 1.6305954 , 1.41557182],
       [7.26767893, 2.08227181, 3.12156553, 2.06634424],
       [5.11833516, 4.27393211, 2.50256906, 1.54641281]])

编写K-means 聚类函数

In [10]:
"""
函数功能：k-均值聚类算法
参数说明：
    dataSet:带标签数据集
    k：簇的个数
    distMeas：据类计算函数
    createCent:随机质心生成函数
返回：
    centroids：质心
"""
def kMeans(dataSet, k, distMeas=distEclud, createCent = randCent):
    m, n = dataSet.shape
    centroids = createCent(dataSet,k)
    clusterAssment = np.zeros((m,3))
    clusterAssment[:, 0] = np.inf
    clusterAssment[:, 1:3] = -1
    result_set = pd.concat([dataSet, pd.DataFrame(clusterAssment)], axis = 1, ignore_index = True)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            dist = distMeas(dataSet.iloc[i,:n-1].values,centroids)
            result_set.iloc[i,n] = dist.min()
            result_set.iloc[i,n+1]= np.where(dist == dist.min())[0]
        clusterChanged = not(result_set.iloc[:,-1]==result_set.iloc[:,-2]).all()
        if clusterChanged:
            cent_df = result_set.groupby(n+1).mean()
            centroids = cent_df.iloc[:,n-1].values
            result_set.iloc[:, -1] = result_set.iloc[:,-2]
    return centroids,result_set

In [22]:
dist = distEclud(dataSet.iloc[1,:n-1].values,centroids)

In [23]:
dist

array([31.90934698685661, 37.86231330237488, 4.778526260810095],
      dtype=object)

In [13]:
dataSet = iris
k = 3

In [14]:
m,n = dataSet.shape

In [15]:
centroids = randCent(dataSet,k)

In [16]:
centroids

array([[7.24556735, 3.23467242, 6.42161461, 1.26582191],
       [7.45669777, 4.24474556, 6.72084655, 1.41029346],
       [6.19176857, 3.77006791, 1.32685412, 1.78477296]])

In [17]:
clusterAssment = np.zeros((m,3))
clusterAssment[:, 0] = np.inf
clusterAssment[:, 1:3] = -1

In [19]:
clusterAssment[:10]

array([[inf, -1., -1.],
       [inf, -1., -1.],
       [inf, -1., -1.],
       [inf, -1., -1.],
       [inf, -1., -1.],
       [inf, -1., -1.],
       [inf, -1., -1.],
       [inf, -1., -1.],
       [inf, -1., -1.],
       [inf, -1., -1.]])

In [20]:
result_set = pd.concat([dataSet, pd.DataFrame(clusterAssment)], axis = 1, ignore_index = True)

In [21]:
result_set

Unnamed: 0,0,1,2,3,4,5,6,7
0,5.1,3.5,1.4,0.2,setosa,inf,-1.0,-1.0
1,4.9,3.0,1.4,0.2,setosa,inf,-1.0,-1.0
2,4.7,3.2,1.3,0.2,setosa,inf,-1.0,-1.0
3,4.6,3.1,1.5,0.2,setosa,inf,-1.0,-1.0
4,5.0,3.6,1.4,0.2,setosa,inf,-1.0,-1.0
...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,inf,-1.0,-1.0
146,6.3,2.5,5.0,1.9,virginica,inf,-1.0,-1.0
147,6.5,3.0,5.2,2.0,virginica,inf,-1.0,-1.0
148,6.2,3.4,5.4,2.3,virginica,inf,-1.0,-1.0


In [None]:
iris_cent,iris_result =kM