add: K-means

chgl16 · chgl16 · commit 1bf91b16787f · 2019-06-16T11:47:31.000+08:00
diff --git a/README.md b/README.md
@@ -33,7 +33,7 @@ e, f
 使用经典的Apriori算法，依次扫描交易记录集，计算出 *k-候选集Ck* 然后去除**支持度sup**小的项集获得 *k-频繁集Lk*， 只计算到 *3-频繁集* ，最后计算管理规则可信度即可。
 > 第k个候选集只会从k-1频繁集中的各项目组合连接，然后扫描记录集，以获取Ck中各项集的支持度。       
 
-算法输出
+#### 3.输出
 <center>
 <img alt="算法输出" src="https://i.loli.net/2019/06/16/5d05ad0e8f2e762317.png" width="80%" /> 
 </center>
@@ -77,15 +77,48 @@ clf.fit(x_train, y_train)
 with open("tree.dot", 'w') as f:
     f = tree.export_graphviz(clf, out_file=f)
 ```
-算法输出  
+#### 3.输出  
 <center>
 <img alt="算法输出" src="https://i.loli.net/2019/06/16/5d05b41f3cca371767.png" width="80%" /> 
 </center>
-决策树  
+
 <center>
+<span>决策树</span>  
+<br>
 <img alt="决策树" src="https://i.loli.net/2019/06/16/5d05b41f6850332395.png" width="80%" />
 </center>
 
 <hr>
 
-## 数据聚类K-means算法
+## 数据聚类K-means算法
+#### 1. 数据集
+数据集采用 scikit-learn 自带的经典 iris（鸢尾花）数据集
+```python
+from sklearn import datasets
+
+iris = datasets.load_iris()
+X, y = iris.data, iris.target
+```
+数据集样本如下
+```bash
+[1.5 0.2]
+[3.2 0.2]
+[3.1 0.2]
+[4.6 0.2]
+...
+```
+
+#### 2. [算法实现](数据聚类（K-means）/k-means.py)
+K-means算法需要先指定要分成的类别数k；数据样本只有属性，没有类别标签。  
+大概步骤：  
+1. 从数据集X中随机选取k个数据样本作为聚类的初始代表点，每一个代表点表示一个类别。
+2. 对于数据集中的任一样本点，都计算它与这k个初始化代表点的距离(d可用欧氏距离)，然后划分到距离最近的分类中去。完成一次聚类
+3. 划分好数据后，计算每个聚类的均值，并将之作为该聚类的新代表点，因此得到k个新代表点。
+4. 和第二步一样，再继续计算每个点到代表点的距离，划分到距离最小的类
+5. 重复3和4，直到各个聚类不再发生变化（样本点划分固定了），此时误差平方和准则函数的值达到（局部）最优。
+  
+#### 3.输出
+<center>
+<img alt="K-means聚类结果" src="https://i.loli.net/2019/06/16/5d05bb1a54a9561636.png" width="80%" />
+</center>
+
diff --git a/数据聚类（K-means）/k-means.py b/数据聚类（K-means）/k-means.py
@@ -0,0 +1,103 @@
+from sklearn import datasets
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Load the iris data set: features go to X, targets to y.
+iris = datasets.load_iris()
+X, y = iris.data, iris.target
+# Keep only two feature columns (indices 1 and 3) so the points can be
+# drawn on a 2-D scatter plot.
+data = X[:,[1,3]] 
+print(data)
+# Scatter of the raw, unclustered points.
+plt.scatter(data[:,0],data[:,1])
+# Number of clusters passed as k to the clustering code below.
+ck = 3
+'''
+随机选取k个点为聚类的初始代表点，即质点
+'''
+def rand_center(data,k):
+    """Generate k center within the range of data set."""
+    n = data.shape[1] # features
+    centroids = np.zeros((k,n)) # init with (0,0)....
+    for i in range(n):
+        dmin, dmax = np.min(data[:,i]), np.max(data[:,i])
+        centroids[:,i] = dmin + (dmax - dmin) * np.random.rand(k)
+    return centroids
+
+# Draw one set of random initial centroids and print them.
+centroids = rand_center(data, ck)
+print(centroids)
+
+def kmeans(data,k=2):
+    def _distance(p1,p2):
+        """
+        Return Eclud distance between two points.
+        p1 = np.array([0,0]), p2 = np.array([1,1]) => 1.414
+        """
+        tmp = np.sum((p1-p2)**2)
+        return np.sqrt(tmp)
+    def _rand_center(data,k):
+        """Generate k center within the range of data set."""
+        n = data.shape[1] # features
+        centroids = np.zeros((k,n)) # init with (0,0)....
+        for i in range(n):
+            dmin, dmax = np.min(data[:,i]), np.max(data[:,i])
+            centroids[:,i] = dmin + (dmax - dmin) * np.random.rand(k)
+        return centroids
+    
+    def _converged(centroids1, centroids2):
+        
+        # if centroids not changed, we say 'converged'
+         set1 = set([tuple(c) for c in centroids1])
+         set2 = set([tuple(c) for c in centroids2])
+         return (set1 == set2)
+        
+    
+    n = data.shape[0] # number of entries
+    centroids = _rand_center(data,k)
+    label = np.zeros(n,dtype=np.int) # track the nearest centroid
+    assement = np.zeros(n) # for the assement of our model
+    converged = False
+    
+    while not converged:
+        old_centroids = np.copy(centroids)
+        for i in range(n):
+            # determine the nearest centroid and track it with label
+            min_dist, min_index = np.inf, -1
+            for j in range(k):
+                dist = _distance(data[i],centroids[j])
+                if dist < min_dist:
+                    min_dist, min_index = dist, j
+                    label[i] = j
+            assement[i] = _distance(data[i],centroids[label[i]])**2
+        
+        # update centroid
+        for m in range(k):
+            centroids[m] = np.mean(data[label==m],axis=0)
+        converged = _converged(old_centroids,centroids)    
+    return centroids, label, np.sum(assement)
+
+
+# 多运行
+best_assement = np.inf
+best_centroids = None
+best_label = None
+
+for i in range(10):
+    centroids, label, assement = kmeans(data,ck)
+    if assement < best_assement:
+        best_assement = assement
+        best_centroids = centroids
+        best_label = label
+
+data0 = data[best_label==0]
+data1 = data[best_label==1]
+
+# 打印展示
+fig, (ax1,ax2) = plt.subplots(1,2,figsize=(12,5))
+ax1.scatter(data[:,0],data[:,1],c='c',s=30,marker='o')
+ax2.scatter(data0[:,0],data0[:,1],c='r')
+ax2.scatter(data1[:,0],data1[:,1],c='c')
+ax2.scatter(centroids[:,0],centroids[:,1],c='b',s=120,marker='o')
+plt.show()