Skip to content

Commit

Permalink
[tf.data] Fixing internal test failures.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 253113807
  • Loading branch information
jsimsa authored and tensorflower-gardener committed Jun 13, 2019
1 parent c026183 commit 0b48315
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 31 deletions.
13 changes: 11 additions & 2 deletions tensorflow/core/kernels/data/cache_dataset_ops.cc
Expand Up @@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/core/framework/partial_tensor_shape.h"
#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h"
Expand Down Expand Up @@ -211,8 +212,9 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
std::vector<Tensor>* out_tensors,
bool* end_of_sequence) override {
mutex_lock l(mu_);
TF_RETURN_IF_ERROR(EnsureLockFileExists());
TF_RETURN_IF_ERROR(writer_->status());
TF_RETURN_IF_ERROR(
HandleEOF(EnsureLockFileExists(), end_of_sequence));
TF_RETURN_IF_ERROR(HandleEOF(writer_->status(), end_of_sequence));
if (cur_index_ >= kMaxItems) {
// As a courtesy, close the [truncated] cache file.
Status s = Finish();
Expand Down Expand Up @@ -417,6 +419,13 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
return Status::OK();
}

Status HandleEOF(Status s, bool* end_of_sequence) {
if (errors::IsOutOfRange(s)) {
*end_of_sequence = true;
}
return s;
}

mutex mu_;
size_t cur_index_ GUARDED_BY(mu_);
// Index of the current shard. This gets incremented whenever a new
Expand Down
10 changes: 0 additions & 10 deletions tensorflow/core/kernels/data/name_utils.h
Expand Up @@ -27,16 +27,6 @@ namespace name_utils {
extern const char kDelimiter[];
extern const char kDefaultDatasetDebugStringPrefix[];

// Merges the given strings or numbers with the specified delimiter.
//
// e.g. StrJoin("_", "A", 1, "B") -> "A_1_B".
template <typename... Args>
string StrJoin(const string& separator, const Args&... args) {
auto args_list = std::vector<StringPiece>{
static_cast<const strings::AlphaNum&>(args).Piece()...};
return absl::StrJoin(args_list, separator);
}

// Merge the given args in the format of "(arg1, arg2, ..., argn)".
//
// e.g. ArgsToString({"1", "2", "3"}) -> "(1, 2, 3)".
Expand Down
7 changes: 0 additions & 7 deletions tensorflow/core/kernels/data/name_utils_test.cc
Expand Up @@ -40,13 +40,6 @@ TEST(NameUtilsTest, DatasetDebugString) {
"ShuffleDatasetOp(10, 1, 2)::FixedSeedDataset");
}

TEST(NameUtilsTest, StrJoin) {
EXPECT_EQ(name_utils::StrJoin("_"), "");
EXPECT_EQ(name_utils::StrJoin("_", "A"), "A");
EXPECT_EQ(name_utils::StrJoin("_", "A", 1, "B"), "A_1_B");
EXPECT_EQ(name_utils::StrJoin(", ", "a", "b", "c"), "a, b, c");
}

} // namespace
} // namespace data
} // namespace tensorflow
32 changes: 20 additions & 12 deletions tensorflow/core/kernels/data/shuffle_dataset_op.cc
Expand Up @@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/core/kernels/data/shuffle_dataset_op.h"

#include <deque>
#include <tuple>
#include <vector>

#include "tensorflow/core/framework/partial_tensor_shape.h"
Expand Down Expand Up @@ -254,20 +255,23 @@ class ShuffleDatasetOpBase::ShuffleDatasetBase : public DatasetBase {
TF_RETURN_IF_ERROR(
writer->WriteScalar(this->full_name(kSlicesSize), slices_.size()));
for (size_t i = 0; i < slices_.size(); ++i) {
TF_RETURN_IF_ERROR(
writer->WriteScalar(this->full_name(absl::StrJoin(
std::make_tuple(kSlicesStart, i), "_")),
slices_[i]->start));
TF_RETURN_IF_ERROR(writer->WriteScalar(
this->full_name(name_utils::StrJoin("_", kSlicesStart, i)),
slices_[i]->start));
TF_RETURN_IF_ERROR(writer->WriteScalar(
this->full_name(name_utils::StrJoin("_", kSlicesEnd, i)),
this->full_name(absl::StrJoin(std::make_tuple(kSlicesEnd, i), "_")),
slices_[i]->end));
for (size_t j = slices_[i]->start; j < slices_[i]->end; ++j) {
size_t index = j % this->dataset()->buffer_size_;
TF_RETURN_IF_ERROR(writer->WriteScalar(
this->full_name(name_utils::StrJoin("_", kBuffer, index, kSize)),
this->full_name(
absl::StrJoin(std::make_tuple(kBuffer, index, kSize), "_")),
buffer_[index].size()));
for (size_t k = 0; k < buffer_[index].size(); ++k) {
TF_RETURN_IF_ERROR(writer->WriteTensor(
this->full_name(name_utils::StrJoin("_", kBuffer, index, k)),
this->full_name(
absl::StrJoin(std::make_tuple(kBuffer, index, k), "_")),
buffer_[index][k]));
}
}
Expand Down Expand Up @@ -310,23 +314,27 @@ class ShuffleDatasetOpBase::ShuffleDatasetBase : public DatasetBase {
this->dataset()->buffer_size_);
for (size_t i = 0; i < slices_size; ++i) {
int64 start;
TF_RETURN_IF_ERROR(reader->ReadScalar(
this->full_name(name_utils::StrJoin("_", kSlicesStart, i)),
&start));
TF_RETURN_IF_ERROR(
reader->ReadScalar(this->full_name(absl::StrJoin(
std::make_tuple(kSlicesStart, i), "_")),
&start));
int64 end;
TF_RETURN_IF_ERROR(reader->ReadScalar(
this->full_name(name_utils::StrJoin("_", kSlicesEnd, i)), &end));
this->full_name(absl::StrJoin(std::make_tuple(kSlicesEnd, i), "_")),
&end));
slices_.push_back(absl::make_unique<Slice>(start, end));
for (size_t j = start; j < end; ++j) {
size_t index = j % this->dataset()->buffer_size_;
int64 list_size;
TF_RETURN_IF_ERROR(reader->ReadScalar(
this->full_name(name_utils::StrJoin("_", kBuffer, index, kSize)),
this->full_name(
absl::StrJoin(std::make_tuple(kBuffer, index, kSize), "_")),
&list_size));
buffer_[index] = std::vector<Tensor>(list_size);
for (int k = 0; k < list_size; ++k) {
TF_RETURN_IF_ERROR(reader->ReadTensor(
this->full_name(name_utils::StrJoin("_", kBuffer, index, k)),
this->full_name(
absl::StrJoin(std::make_tuple(kBuffer, index, k), "_")),
&buffer_[index][k]));
}
}
Expand Down

5 comments on commit 0b48315

@feihugis
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jsimsa Thanks for your fix! It looks like the internal checks are very helpful for finding the potential issues. Do you know what's the issue here?

@jsimsa
Copy link
Contributor Author

@jsimsa jsimsa commented on 0b48315 Jun 14, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My best guess is that static_cast<const strings::AlphaNum&>(args).Piece()... goes out of scope by the time absl::StrJoin is called. That aside, there is no need to create a helper for this as you can achieve the same behavior by passing the values directly into absl::StrJoin as a std::tuple.

@jsimsa
Copy link
Contributor Author

@jsimsa jsimsa commented on 0b48315 Jun 14, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be clear, one of the internal checks (AddressSanitizer) was failing with "stack-use-after-scope" which indicates that an object allocated on stack is being used after it went out of scope.

@feihugis
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jsimsa Thanks for the information! Yes, the helper func is not needed. I will try if I can reproduce the issue on my laptop.

@feihugis
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error can be reproduced when using bazel test //tensorflow/core/kernels/data:name_utils_test --copt=-fsanitize=address --linkopt=-fsanitize=address --copt=-fno-omit-frame-pointer --copt=-O1. Will add AddressSanitizer check in the future when I run the tests on my laptop. Thanks for your help, @jsimsa!

Please sign in to comment.