Added check to LocalKMeans.scala: kMeansPlusPlus initialization to handle case with fewer distinct data points than clusters k. Added two related unit tests to KMeansSuite.
xiliu82 · Jul 17, 2014 · 6c7a2ec · 6c7a2ec
1 parent 90ca532
commit 6c7a2ec
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 0 deletions.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala
@@ -59,6 +59,11 @@ private[mllib] object LocalKMeans extends Logging {
         cumulativeScore += weights(j) * KMeans.pointCost(curCenters, points(j))
         j += 1
       }
+      if (j == 0) {
+        logWarning("kMeansPlusPlus initialization ran out of distinct points for centers." +
+          s" Using duplicate point for center k = $i.")
+        j = 1
+      }
       centers(i) = points(j-1).toDense
     }
 

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
@@ -61,6 +61,30 @@ class KMeansSuite extends FunSuite with LocalSparkContext {
     assert(model.clusterCenters.head === center)
   }
 
+  test("no distinct points") {
+    // All three input points are identical, so k = 2 exceeds the number of distinct points.
+    val data = sc.parallelize(Array(
+      Vectors.dense(1.0, 2.0, 3.0),
+      Vectors.dense(1.0, 2.0, 3.0),
+      Vectors.dense(1.0, 2.0, 3.0)
+    ))
+
+    // Make sure training runs and the model still reports k centers.
+    val model = KMeans.train(data, k = 2, maxIterations = 1)
+    assert(model.clusterCenters.size === 2)
+  }
+
+  test("more clusters than points") {
+    val data = sc.parallelize(Array(
+      Vectors.dense(1.0, 2.0, 3.0),
+      Vectors.dense(1.0, 3.0, 4.0)
+    ))
+
+    // Only two points for k = 3: make sure training runs and the model reports k centers.
+    val model = KMeans.train(data, k = 3, maxIterations = 1)
+    assert(model.clusterCenters.size === 3)
+  }
+
   test("single cluster with big dataset") {
     val smallData = Array(
       Vectors.dense(1.0, 2.0, 6.0),