Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[tf.data] Check in a strictly faster rejection resampling transformation. #18730

Merged
merged 28 commits into from
May 2, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
f0df670
[tf.data] Check in a strictly faster rejection resampling
joel-shor Apr 20, 2018
b106711
[tf.data] Clean up resampler and update BUILD files.
joel-shor Apr 20, 2018
0cba8b7
[tf.data] Fix `absl` build rule.
joel-shor Apr 20, 2018
8cc506f
[tf.data] Reorder BUILD rule deps and add `xrange` from `six`.
joel-shor Apr 20, 2018
a10708d
[tf.data] Second reorder BUILD rule deps.
joel-shor Apr 20, 2018
81a34fb
[tf.data] Just replace old resample with new.
joel-shor Apr 26, 2018
ddbc654
Merge branch 'master' into master
joel-shor Apr 26, 2018
f1e0068
[tf.data] Make documentation changes, and add correct import.
joel-shor Apr 27, 2018
011a5fd
Merge remote-tracking branch 'origin/master'
joel-shor Apr 27, 2018
dd24a09
[tf.data] Pass a Tensor to `tensor_util.constant_value` instead of po…
joel-shor Apr 27, 2018
ac24161
[tf.data] Fix indentation.
joel-shor Apr 27, 2018
c45b051
[tf.data] A change to use Jenkins to test the Windows build.
joel-shor Apr 28, 2018
b384c33
[tf.data] Possible bug fix to fix Windows build.
joel-shor Apr 28, 2018
9033bb2
[tf.data] Undo previously unsuccessful bugfix, and try another one to…
joel-shor Apr 29, 2018
9310de4
[tf.data] Add a bunch of debugging for Jenkins to run on the Windows …
joel-shor Apr 29, 2018
fc23d94
[tf.data] Add a bunch of debugging for Jenkins to run on the Windows …
joel-shor Apr 30, 2018
44ecd94
[tf.data] Add a bunch of debugging for Jenkins to run on the Windows …
joel-shor Apr 30, 2018
d4aa90c
[tf.data] Fix logging ops debug statement.
joel-shor Apr 30, 2018
19e7b12
[tf.data] Properly format debug statements.
joel-shor Apr 30, 2018
541bd48
[tf.data] Explicitly make test's dataset int64.
joel-shor Apr 30, 2018
5da0d00
[tf.data] Removed debug code.
joel-shor Apr 30, 2018
449b9e5
[tf.data] More debug code, since the previous 'fix' wasn't a fix.
joel-shor May 1, 2018
2364000
[tf.data] More debug code, since the previous 'fix' wasn't a fix.
joel-shor May 1, 2018
03cecc5
[tf.data] Fix BUILD file.
joel-shor May 1, 2018
da02e19
[tf.data] Fix debug output.
joel-shor May 1, 2018
19ad98e
[tf.data] Fix debug output.
joel-shor May 1, 2018
b2aebe0
[tf.data] Try fixing the Windows build by adding the directed interle…
joel-shor May 1, 2018
29cd3f9
[tf.data] Remove debug code.
joel-shor May 1, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions tensorflow/contrib/cmake/tf_core_kernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
"${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc"
"${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc"
"${tensorflow_source_dir}/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc"
"${tensorflow_source_dir}/tensorflow/contrib/data/kernels/prefetching_kernels.cc"
"${tensorflow_source_dir}/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc"
Expand Down
4 changes: 4 additions & 0 deletions tensorflow/contrib/data/python/kernel_tests/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -320,11 +320,15 @@ py_test(
deps = [
"//tensorflow/contrib/data/python/ops:resampling",
"//tensorflow/python:client_testlib",
"//tensorflow/python:dtypes",
"//tensorflow/python:errors",
"//tensorflow/python:math_ops",
"//tensorflow/python:random_ops",
"//tensorflow/python:string_ops",
"//tensorflow/python:util",
"//tensorflow/python/data/ops:dataset_ops",
"//third_party/py/numpy",
"@absl_py//absl/testing:parameterized",
],
)

Expand Down
109 changes: 88 additions & 21 deletions tensorflow/contrib/data/python/kernel_tests/resample_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
from __future__ import print_function

import numpy as np
from six.moves import xrange # pylint: disable=redefined-builtin
import time
from absl.testing import parameterized

from tensorflow.contrib.data.python.ops import resampling
from tensorflow.python.data.ops import dataset_ops
Expand All @@ -30,52 +33,98 @@
from tensorflow.python.util import compat


class ResampleTest(test.TestCase):
def _time_resampling(
test_obj, data_np, target_dist, init_dist, num_to_sample):
dataset = dataset_ops.Dataset.from_tensor_slices(data_np).repeat()

def testInitialKnownDistribution(self):
self._testDistribution(initial_known=True)
# Reshape distribution via rejection sampling.
dataset = dataset.apply(
resampling.rejection_resample(
class_func=lambda x: x,
target_dist=target_dist,
initial_dist=init_dist,
seed=142))

def testInitialNotKnownDistribution(self):
self._testDistribution(initial_known=False)
get_next = dataset.make_one_shot_iterator().get_next()

def _testDistribution(self, initial_known):
with test_obj.test_session() as sess:
start_time = time.time()
for _ in xrange(num_to_sample):
sess.run(get_next)
end_time = time.time()

return end_time - start_time


class ResampleTest(test.TestCase, parameterized.TestCase):

@parameterized.named_parameters(
("InitialDistributionKnown", True),
("InitialDistributionUnknown", False))
def testDistribution(self, initial_known):
classes = np.random.randint(5, size=(20000,)) # Uniformly sampled
target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
initial_dist = [0.2] * 5 if initial_known else None
iterator = (dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
200, seed=21).map(lambda c: (c, string_ops.as_string(c))).apply(
resampling.rejection_resample(
target_dist=target_dist,
initial_dist=initial_dist,
class_func=lambda c, _: c,
seed=27)).make_one_shot_iterator())
get_next = iterator.get_next()
classes = math_ops.to_int64(classes) # needed for Windows build.
dataset = dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
200, seed=21).map(lambda c: (c, string_ops.as_string(c))).repeat()

get_next = dataset.apply(
resampling.rejection_resample(
target_dist=target_dist,
initial_dist=initial_dist,
class_func=lambda c, _: c,
seed=27)).make_one_shot_iterator().get_next()

with self.test_session() as sess:
returned = []
with self.assertRaises(errors.OutOfRangeError):
while True:
returned.append(sess.run(get_next))
while len(returned) < 4000:
returned.append(sess.run(get_next))

returned_classes, returned_classes_and_data = zip(*returned)
_, returned_data = zip(*returned_classes_and_data)
self.assertAllEqual([compat.as_bytes(str(c))
for c in returned_classes], returned_data)
total_returned = len(returned_classes)
# Subsampling rejects a large percentage of the initial data in
# this case.
self.assertGreater(total_returned, 20000 * 0.2)
class_counts = np.array([
len([True for v in returned_classes if v == c])
for c in range(5)])
returned_dist = class_counts / total_returned
self.assertAllClose(target_dist, returned_dist, atol=1e-2)

@parameterized.named_parameters(
("OnlyInitial", True),
("NotInitial", False))
def testEdgeCasesSampleFromInitialDataset(self, only_initial_dist):
init_dist = [0.5, 0.5]
target_dist = [0.5, 0.5] if only_initial_dist else [0.0, 1.0]
num_classes = len(init_dist)
# We don't need many samples to test that this works.
num_samples = 100
data_np = np.random.choice(num_classes, num_samples, p=init_dist)

dataset = dataset_ops.Dataset.from_tensor_slices(data_np)

# Reshape distribution.
dataset = dataset.apply(
resampling.rejection_resample(
class_func=lambda x: x,
target_dist=target_dist,
initial_dist=init_dist))

get_next = dataset.make_one_shot_iterator().get_next()

with self.test_session() as sess:
returned = []
with self.assertRaises(errors.OutOfRangeError):
while True:
returned.append(sess.run(get_next))

def testRandomClasses(self):
init_dist = [0.25, 0.25, 0.25, 0.25]
target_dist = [0.0, 0.0, 0.0, 1.0]
num_classes = len(init_dist)
# We don't need many samples to test a dirac-delta target distribution
# We don't need many samples to test a dirac-delta target distribution.
num_samples = 100
data_np = np.random.choice(num_classes, num_samples, p=init_dist)

Expand Down Expand Up @@ -109,5 +158,23 @@ def _remap_fn(_):

self.assertAllClose(target_dist, bincount, atol=1e-2)


class ResampleDatasetBenchmark(test.Benchmark):

def benchmarkResamplePerformance(self):
init_dist = [0.25, 0.25, 0.25, 0.25]
target_dist = [0.0, 0.0, 0.0, 1.0]
num_classes = len(init_dist)
# We don't need many samples to test a dirac-delta target distribution.
num_samples = 1000
data_np = np.random.choice(num_classes, num_samples, p=init_dist)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might be np.int32 on Windows.


resample_time = _time_resampling(
self, data_np, target_dist, init_dist, num_to_sample=1000)

self.report_benchmark(
iters=1000, wall_time=resample_time, name="benchmark_resample")


if __name__ == "__main__":
test.main()
2 changes: 2 additions & 0 deletions tensorflow/contrib/data/python/ops/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ py_library(
srcs_version = "PY2AND3",
deps = [
":batching",
":interleave_ops",
":scan_ops",
"//tensorflow/python:array_ops",
"//tensorflow/python:control_flow_ops",
Expand All @@ -202,6 +203,7 @@ py_library(
"//tensorflow/python:math_ops",
"//tensorflow/python:random_ops",
"//tensorflow/python/data/ops:dataset_ops",
"//third_party/py/numpy",
],
)

Expand Down