Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 6 additions & 18 deletions tensorflow/contrib/factorization/python/ops/kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,8 @@ def __init__(self,
distance_metric=clustering_ops.SQUARED_EUCLIDEAN_DISTANCE,
random_seed=0,
use_mini_batch=True,
batch_size=128,
steps=10,
kmeans_plus_plus_num_retries=2,
continue_training=False,
config=None,
verbose=1):
config=None):
"""Creates a model for running KMeans training and inference.

Args:
Expand All @@ -72,25 +68,17 @@ def __init__(self,
random_seed: Python integer. Seed for PRNG used to initialize centers.
use_mini_batch: If true, use the mini-batch k-means algorithm. Else assume
full batch.
batch_size: See TensorFlowEstimator
steps: See TensorFlowEstimator
kmeans_plus_plus_num_retries: For each point that is sampled during
kmeans++ initialization, this parameter specifies the number of
additional points to draw from the current distribution before selecting
the best. If a negative value is specified, a heuristic is used to
sample O(log(num_to_sample)) additional points.
continue_training: See TensorFlowEstimator
config: See TensorFlowEstimator
verbose: See TensorFlowEstimator
config: See Estimator
"""
super(KMeansClustering, self).__init__(
model_dir=model_dir,
config=config)
self.batch_size = batch_size
self.steps = steps
self.kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
self.continue_training = continue_training
self.verbose = verbose
self._num_clusters = num_clusters
self._training_initial_clusters = initial_clusters
self._training_graph = None
Expand All @@ -99,10 +87,10 @@ def __init__(self,
self._random_seed = random_seed
self._initialized = False

def fit(self, x, y=None, monitors=None, logdir=None, steps=None):
def fit(self, x, y=None, monitors=None, logdir=None, steps=None, batch_size=128):
"""Trains a k-means clustering on x.

Note: See TensorFlowEstimator for logic for continuous training and graph
Note: See Estimator for logic for continuous training and graph
construction across multiple calls to fit.

Args:
Expand All @@ -122,10 +110,10 @@ def fit(self, x, y=None, monitors=None, logdir=None, steps=None):
if logdir is not None:
self._model_dir = logdir
self._data_feeder = data_feeder.setup_train_data_feeder(
x, None, self._num_clusters, self.batch_size)
x, None, self._num_clusters, batch_size)
self._train_model(input_fn=self._data_feeder.input_builder,
feed_fn=self._data_feeder.get_feed_dict_fn(),
steps=steps or self.steps,
steps=steps,
monitors=monitors,
init_feed_fn=self._data_feeder.get_feed_dict_fn())
return self
Expand Down
63 changes: 28 additions & 35 deletions tensorflow/contrib/factorization/python/ops/kmeans_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,14 @@ def setUp(self):

self.kmeans = KMeans(self.num_centers,
initial_clusters=kmeans_ops.RANDOM_INIT,
batch_size=self.batch_size,
use_mini_batch=self.use_mini_batch,
steps=30,
continue_training=True,
config=run_config.RunConfig(tf_random_seed=14),
config=self.config(14),
random_seed=12)

@staticmethod
def config(tf_random_seed):
return run_config.RunConfig(tf_random_seed=tf_random_seed)

@property
def batch_size(self):
return self.num_points
Expand All @@ -86,7 +87,7 @@ def make_random_points(centers, num_points, max_offset=20):

def test_clusters(self):
kmeans = self.kmeans
kmeans.fit(x=self.points, steps=0)
kmeans.fit(x=self.points, steps=1, batch_size=8)
clusters = kmeans.clusters()
self.assertAllEqual(list(clusters.shape),
[self.num_centers, self.num_dims])
Expand All @@ -97,32 +98,33 @@ def test_fit(self):
return
kmeans = self.kmeans
kmeans.fit(x=self.points,
steps=1)
steps=1, batch_size=self.batch_size)
score1 = kmeans.score(x=self.points)
kmeans.fit(x=self.points,
steps=15 * self.num_points // self.batch_size)
steps=15 * self.num_points // self.batch_size,
batch_size=self.batch_size)
score2 = kmeans.score(x=self.points)
self.assertTrue(score1 > score2)
self.assertNear(self.true_score, score2, self.true_score * 0.05)

def test_infer(self):
kmeans = self.kmeans
kmeans.fit(x=self.points)
kmeans.fit(x=self.points, steps=10, batch_size=128)
clusters = kmeans.clusters()

# Make a small test set
points, true_assignments, true_offsets = self.make_random_points(clusters,
10)
# Test predict
assignments = kmeans.predict(points)
assignments = kmeans.predict(points, batch_size=self.batch_size)
self.assertAllEqual(assignments, true_assignments)

# Test score
score = kmeans.score(points)
score = kmeans.score(points, batch_size=128)
self.assertNear(score, np.sum(true_offsets), 0.01 * score)

# Test transform
transform = kmeans.transform(points)
transform = kmeans.transform(points, batch_size=128)
true_transform = np.maximum(
0,
np.sum(np.square(points), axis=1, keepdims=True) -
Expand All @@ -140,12 +142,9 @@ def test_fit_with_cosine_distance(self):
initial_clusters=kmeans_ops.RANDOM_INIT,
distance_metric=kmeans_ops.COSINE_DISTANCE,
use_mini_batch=self.use_mini_batch,
batch_size=4,
steps=30,
continue_training=True,
config=run_config.RunConfig(tf_random_seed=2),
config=self.config(2),
random_seed=12)
kmeans.fit(x=points)
kmeans.fit(x=points, steps=10, batch_size=4)
centers = normalize(kmeans.clusters())
self.assertAllClose(np.sort(centers, axis=0),
np.sort(true_centers, axis=0))
Expand All @@ -163,18 +162,16 @@ def test_transform_with_cosine_distance(self):
initial_clusters=kmeans_ops.RANDOM_INIT,
distance_metric=kmeans_ops.COSINE_DISTANCE,
use_mini_batch=self.use_mini_batch,
batch_size=8,
continue_training=True,
config=run_config.RunConfig(tf_random_seed=3))
kmeans.fit(x=points, steps=30)
config=self.config(3))
kmeans.fit(x=points, steps=30, batch_size=8)

centers = normalize(kmeans.clusters())
self.assertAllClose(np.sort(centers, axis=0),
np.sort(true_centers, axis=0),
atol=1e-2)

true_transform = 1 - cosine_similarity(points, centers)
transform = kmeans.transform(points)
transform = kmeans.transform(points, batch_size=8)
self.assertAllClose(transform, true_transform, atol=1e-3)

def test_predict_with_cosine_distance(self):
Expand All @@ -196,20 +193,18 @@ def test_predict_with_cosine_distance(self):
initial_clusters=kmeans_ops.RANDOM_INIT,
distance_metric=kmeans_ops.COSINE_DISTANCE,
use_mini_batch=self.use_mini_batch,
batch_size=8,
continue_training=True,
config=run_config.RunConfig(tf_random_seed=3))
kmeans.fit(x=points, steps=30)
config=self.config(3))
kmeans.fit(x=points, steps=30, batch_size=8)

centers = normalize(kmeans.clusters())
self.assertAllClose(np.sort(centers, axis=0),
np.sort(true_centers, axis=0), atol=1e-2)

assignments = kmeans.predict(points)
assignments = kmeans.predict(points, batch_size=8)
self.assertAllClose(centers[assignments],
true_centers[true_assignments], atol=1e-2)

score = kmeans.score(points)
score = kmeans.score(points, batch_size=8)
self.assertAllClose(score, true_score, atol=1e-2)

def test_predict_with_cosine_distance_and_kmeans_plus_plus(self):
Expand All @@ -233,29 +228,27 @@ def test_predict_with_cosine_distance_and_kmeans_plus_plus(self):
initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT,
distance_metric=kmeans_ops.COSINE_DISTANCE,
use_mini_batch=self.use_mini_batch,
batch_size=12,
continue_training=True,
config=run_config.RunConfig(tf_random_seed=3))
kmeans.fit(x=points, steps=30)
config=self.config(3))
kmeans.fit(x=points, steps=30, batch_size=12)

centers = normalize(kmeans.clusters())
self.assertAllClose(sorted(centers.tolist()),
sorted(true_centers.tolist()),
atol=1e-2)

assignments = kmeans.predict(points)
assignments = kmeans.predict(points, batch_size=12)
self.assertAllClose(centers[assignments],
true_centers[true_assignments], atol=1e-2)

score = kmeans.score(points)
score = kmeans.score(points, batch_size=12)
self.assertAllClose(score, true_score, atol=1e-2)

def test_fit_raise_if_num_clusters_larger_than_num_points_random_init(self):
points = np.array([[2.0, 3.0], [1.6, 8.2]])

with self.assertRaisesOpError('less'):
kmeans = KMeans(num_clusters=3, initial_clusters=kmeans_ops.RANDOM_INIT)
kmeans.fit(x=points)
kmeans.fit(x=points, steps=10, batch_size=8)

def test_fit_raise_if_num_clusters_larger_than_num_points_kmeans_plus_plus(
self):
Expand All @@ -264,7 +257,7 @@ def test_fit_raise_if_num_clusters_larger_than_num_points_kmeans_plus_plus(
with self.assertRaisesOpError(AssertionError):
kmeans = KMeans(num_clusters=3,
initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT)
kmeans.fit(x=points)
kmeans.fit(x=points, steps=10, batch_size=8)


class MiniBatchKMeansTest(KMeansTest):
Expand Down