Skip to content

Commit

Permalink
[SPARK-17389][FOLLOW-UP][ML] Change KMeans k-means|| default init steps from 5 to 2.
Browse files Browse the repository at this point in the history

## What changes were proposed in this pull request?
apache#14956 reduced the default number of k-means|| init steps from 5 to 2, but only for the spark.mllib package; we should make the same change for spark.ml and PySpark.

## How was this patch tested?
Existing tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes apache#15050 from yanboliang/spark-17389.
  • Loading branch information
yanboliang authored and wgtmac committed Sep 19, 2016
1 parent 1d4165f commit 7d309ff
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe

/**
* Param for the number of steps for the k-means|| initialization mode. This is an advanced
* setting -- the default of 5 is almost always enough. Must be > 0. Default: 5.
* setting -- the default of 2 is almost always enough. Must be > 0. Default: 2.
* @group expertParam
*/
@Since("1.5.0")
Expand Down Expand Up @@ -262,7 +262,7 @@ class KMeans @Since("1.5.0") (
k -> 2,
maxIter -> 20,
initMode -> MLlibKMeans.K_MEANS_PARALLEL,
initSteps -> 5,
initSteps -> 2,
tol -> 1e-4)

@Since("1.5.0")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultR
assert(kmeans.getPredictionCol === "prediction")
assert(kmeans.getMaxIter === 20)
assert(kmeans.getInitMode === MLlibKMeans.K_MEANS_PARALLEL)
assert(kmeans.getInitSteps === 5)
assert(kmeans.getInitSteps === 2)
assert(kmeans.getTol === 1e-4)
}

Expand Down
10 changes: 5 additions & 5 deletions python/pyspark/ml/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,14 +254,14 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol

@keyword_only
def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None):
initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None):
"""
__init__(self, featuresCol="features", predictionCol="prediction", k=2, \
initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None)
initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None)
"""
super(KMeans, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.KMeans", self.uid)
self._setDefault(k=2, initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20)
self._setDefault(k=2, initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

Expand All @@ -271,10 +271,10 @@ def _create_model(self, java_model):
@keyword_only
@since("1.5.0")
def setParams(self, featuresCol="features", predictionCol="prediction", k=2,
initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None):
initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None):
"""
setParams(self, featuresCol="features", predictionCol="prediction", k=2, \
initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None)
initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None)
Sets params for KMeans.
"""
Expand Down
6 changes: 3 additions & 3 deletions python/pyspark/mllib/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ class KMeans(object):
@classmethod
@since('0.9.0')
def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||",
seed=None, initializationSteps=5, epsilon=1e-4, initialModel=None):
seed=None, initializationSteps=2, epsilon=1e-4, initialModel=None):
"""
Train a k-means clustering model.
Expand All @@ -330,9 +330,9 @@ def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||"
(default: None)
:param initializationSteps:
Number of steps for the k-means|| initialization mode.
This is an advanced setting -- the default of 5 is almost
This is an advanced setting -- the default of 2 is almost
always enough.
(default: 5)
(default: 2)
:param epsilon:
Distance threshold within which a center will be considered to
have converged. If all centers move less than this Euclidean
Expand Down

0 comments on commit 7d309ff

Please sign in to comment.