Skip to content

Commit

Permalink
Added check to LocalKMeans.scala: kMeansPlusPlus initialization to ha…
Browse files Browse the repository at this point in the history
…ndle case with fewer distinct data points than clusters k. Added two related unit tests to KMeansSuite.
  • Loading branch information
jkbradley committed Jul 17, 2014
1 parent 90ca532 commit 6c7a2ec
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ private[mllib] object LocalKMeans extends Logging {
cumulativeScore += weights(j) * KMeans.pointCost(curCenters, points(j))
j += 1
}
if (j == 0) {
logWarning("kMeansPlusPlus initialization ran out of distinct points for centers." +
s" Using duplicate point for center k = $i.")
j = 1
}
centers(i) = points(j-1).toDense
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,30 @@ class KMeansSuite extends FunSuite with LocalSparkContext {
assert(model.clusterCenters.head === center)
}

test("no distinct points") {
  // All three input points are identical, so there is only one distinct point
  // while two clusters are requested (k = 2).
  val data = sc.parallelize(Array(
    Vectors.dense(1.0, 2.0, 3.0),
    Vectors.dense(1.0, 2.0, 3.0),
    Vectors.dense(1.0, 2.0, 3.0)
  ))

  // Make sure code runs: training must complete and still report k centers.
  val model = KMeans.train(data, k = 2, maxIterations = 1)
  assert(model.clusterCenters.size === 2)
}

test("more clusters than points") {
  // Only two input points are supplied while three clusters are requested (k = 3).
  val data = sc.parallelize(Array(
    Vectors.dense(1.0, 2.0, 3.0),
    Vectors.dense(1.0, 3.0, 4.0)
  ))

  // Make sure code runs: training must complete and still report k centers.
  val model = KMeans.train(data, k = 3, maxIterations = 1)
  assert(model.clusterCenters.size === 3)
}

test("single cluster with big dataset") {
val smallData = Array(
Vectors.dense(1.0, 2.0, 6.0),
Expand Down

0 comments on commit 6c7a2ec

Please sign in to comment.