Branch 161598138 #11439
Merged · 21 commits · Jul 12, 2017

Commits
ea3d9ab
Removed the _tensor_summary_v2 op.
tensorflower-gardener Jul 11, 2017
8281e23
tfprof further clean ups
tensorflower-gardener Jul 11, 2017
9e89636
[XLA:CPU] Support for CPU outfeed and an xfeed (infeed/outfeed) test.
tensorflower-gardener Jul 11, 2017
ba45775
[TF:XLA] Fix copy-and-paste bug in CHECK statement.
hawkinsp Jul 11, 2017
18a5510
Update toolchain configuration artifacts to work with latest version …
tensorflower-gardener Jul 11, 2017
b1f9e2c
Add an axis parameter to tf.gather. Fixes GitHub issue #11223.
rryan Jul 11, 2017
ad814f9
Move the tf.contrib.data README to the programmers' guide.
mrry Jul 11, 2017
0478774
Slight changes and improvements of style of comment to tf.contrib.lay…
tensorflower-gardener Jul 11, 2017
dbe029d
Update ops-related pbtxt files.
tensorflower-gardener Jul 11, 2017
e6df395
Go: Update generated wrapper functions for TensorFlow ops.
tensorflower-gardener Jul 11, 2017
082ede6
Changed title (first-level header).
tensorflower-gardener Jul 11, 2017
c1b6f48
Switch FlatMap and FlatSet to use a non-identity hasher for pointers.
Jul 11, 2017
c332254
Remove HashStr and HashStringPiece (a one-off).
Jul 11, 2017
75b936e
Speed up HeapSimulator's UniqueOperandSourceBuffers.
Jul 11, 2017
1de8cce
fix #11372, #11396
tensorflower-gardener Jul 11, 2017
a80035d
Make ScheduledEmbeddingTrainingHelper more readable and consistent wi…
adarob Jul 11, 2017
ad7fb4d
TFTS: Increase test timeouts to avoid ASan failures
allenlavoie Jul 11, 2017
cbe1ef0
Add missing deprecation warnings.
MarkDaoust Jul 11, 2017
8e82134
Use C API to implement Operation._input_types
tensorflower-gardener Jul 11, 2017
14f7d7f
Fix lint errors in ops.py and ops_test.py
tensorflower-gardener Jul 11, 2017
65e02b2
Merge commit for internal changes
Jul 11, 2017
62 changes: 50 additions & 12 deletions tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -29,6 +29,7 @@
class GatherOp : public XlaOpKernel {

void Compile(XlaOpKernelContext* ctx) override {
const TensorShape params_shape = ctx->InputShape(0);
const auto params_dims = params_shape.dims();
const TensorShape indices_shape = ctx->InputShape(1);
OP_REQUIRES(
ctx, TensorShapeUtils::IsVectorOrHigher(params_shape),
@@ -38,20 +39,51 @@ class GatherOp : public XlaOpKernel {
OP_REQUIRES(ctx, index_type == DT_INT32 || index_type == DT_INT64,
errors::InvalidArgument("index must be int32 or int64"));

// GatherV2 added an axis argument. We support both Gather and GatherV2 in
// this kernel by defaulting axis to 0 if there are 2 inputs.
int64 axis = 0;
if (ctx->num_inputs() == 3) {
const TensorShape axis_shape = ctx->InputShape(2);
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(axis_shape),
errors::InvalidArgument("axis must be scalar"));
DataType axis_type = input_type(2);
OP_REQUIRES(ctx, axis_type == DT_INT32 || axis_type == DT_INT64,
errors::InvalidArgument("axis must be int32 or int64"));

xla::Literal literal;
OP_REQUIRES_OK(ctx, ctx->ConstantInput(2, &literal));
int64 axis_input = axis_type == DT_INT32 ? literal.Get<int32>({})
: literal.Get<int64>({});
axis = axis_input < 0 ? axis_input + params_dims : axis_input;
OP_REQUIRES(ctx, 0 <= axis && axis < params_dims,
errors::InvalidArgument("Expected axis in the range [",
-params_dims, ", ", params_dims,
"), but got ", axis_input));
}

// Check that we have enough index space.
const int64 limit = index_type == DT_INT32
? std::numeric_limits<int32>::max()
: std::numeric_limits<int64>::max();
OP_REQUIRES(ctx, params_shape.dim_size(axis) <= limit,
errors::InvalidArgument(
"params.shape[", axis, "] too large for ",
DataTypeString(index_type),
" indexing: ", params_shape.dim_size(axis), " > ", limit));

// The result shape is params.shape[0:axis] + indices.shape +
// params.shape[axis + 1:].
TensorShape result_shape;
int64 outer_size = 1;
int64 inner_size = 1;
for (int i = 0; i < axis; i++) {
result_shape.AddDim(params_shape.dim_size(i));
outer_size *= params_shape.dim_size(i);
}
result_shape.AppendShape(indices_shape);
for (int i = axis + 1; i < params_dims; i++) {
result_shape.AddDim(params_shape.dim_size(i));
inner_size *= params_shape.dim_size(i);
}

XlaContext& tc = XlaContext::Get(ctx);
@@ -67,10 +99,12 @@ class GatherOp : public XlaOpKernel {
args.push_back(tc.GetOrCreateRuntimeContextParameter());
args.push_back(b.ConstantLiteral(
*xla::Literal::CreateR0<int64>(indices_shape.num_elements())));
args.push_back(
b.ConstantLiteral(*xla::Literal::CreateR0<int64>(outer_size)));
args.push_back(b.ConstantLiteral(
*xla::Literal::CreateR0<int64>(params_shape.dim_size(axis))));
args.push_back(
b.ConstantLiteral(*xla::Literal::CreateR0<int64>(inner_size)));
args.push_back(ctx->Input(0));
args.push_back(ctx->Input(1));

@@ -97,6 +131,10 @@
REGISTER_XLA_OP(Name("Gather")
.TypeConstraint("Tparams", DT_FLOAT)
.Device(DEVICE_CPU_XLA_JIT),
GatherOp);
REGISTER_XLA_OP(Name("GatherV2")
.TypeConstraint("Tparams", DT_FLOAT)
.Device(DEVICE_CPU_XLA_JIT),
GatherOp);

} // namespace
} // namespace tensorflow
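For intuition, here is a minimal standalone sketch of the shape arithmetic above, under assumed shapes (params.shape = [2, 3, 5], indices.shape = [4], axis = 1); the kernel computes the same quantities with TensorShape:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::vector<int64_t> params_shape = {2, 3, 5};
  const std::vector<int64_t> indices_shape = {4};
  const int64_t axis = 1;
  const int64_t params_dims = static_cast<int64_t>(params_shape.size());

  // result shape = params.shape[0:axis] + indices.shape + params.shape[axis+1:]
  std::vector<int64_t> result_shape;
  int64_t outer_size = 1;  // product of the dims before `axis`
  int64_t inner_size = 1;  // product of the dims after `axis`
  for (int64_t i = 0; i < axis; ++i) {
    result_shape.push_back(params_shape[i]);
    outer_size *= params_shape[i];
  }
  result_shape.insert(result_shape.end(), indices_shape.begin(),
                      indices_shape.end());
  for (int64_t i = axis + 1; i < params_dims; ++i) {
    result_shape.push_back(params_shape[i]);
    inner_size *= params_shape[i];
  }

  // Prints: result_shape = [ 2 4 5 ], outer_size = 2, inner_size = 5
  std::cout << "result_shape = [";
  for (int64_t d : result_shape) std::cout << ' ' << d;
  std::cout << " ], outer_size = " << outer_size
            << ", inner_size = " << inner_size << std::endl;
  return 0;
}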
21 changes: 12 additions & 9 deletions tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc
@@ -26,28 +26,31 @@ namespace tensorflow {

EIGEN_STRONG_INLINE void gather_float_int32_xla_impl(float* out, void** data) {
// data is managed by the JIT code so msan can't tell it's initialized.
TF_ANNOTATE_MEMORY_IS_INITIALIZED(data, 7 * sizeof(void*));

int64 indices_size = *static_cast<int64*>(data[1]);
int64 params_x = *static_cast<int64*>(data[2]);
int64 params_y = *static_cast<int64*>(data[3]);
int64 params_z = *static_cast<int64*>(data[4]);

float* in = static_cast<float*>(data[5]);

int32* indices = static_cast<int32*>(data[6]);
Eigen::DSizes<Eigen::DenseIndex, 3> in_eig_sizes;
in_eig_sizes[0] = params_x;
in_eig_sizes[1] = params_y;
in_eig_sizes[2] = params_z;
tensorflow::TTypes<float, 3>::ConstTensor in_eig(in, in_eig_sizes);

Eigen::DSizes<Eigen::DenseIndex, 1> indices_eig_sizes;
indices_eig_sizes[0] = indices_size;
tensorflow::TTypes<int32>::ConstFlat indices_eig(indices, indices_eig_sizes);

Eigen::DSizes<Eigen::DenseIndex, 3> out_eig_sizes;
out_eig_sizes[0] = params_x;
out_eig_sizes[1] = indices_size;
out_eig_sizes[2] = params_z;
tensorflow::TTypes<float, 3>::Tensor out_eig(out, out_eig_sizes);

tensorflow::functor::GatherFunctorCPU<float, int32> f;
const int64 bad_i = f(in_eig, indices_eig, out_eig);
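Conceptually, the rewritten kernel gathers along the middle axis of a [params_x, params_y, params_z] = [outer, gathered, inner] view of params. A hedged reference loop with the same semantics (illustration only; the kernel itself dispatches through Eigen via GatherFunctorCPU, and the int64 variant below is identical up to the index type):

#include <cstdint>

// out has shape [params_x, indices_size, params_z].
void gather_reference(const float* in, const int32_t* indices, float* out,
                      int64_t params_x, int64_t params_y, int64_t params_z,
                      int64_t indices_size) {
  for (int64_t x = 0; x < params_x; ++x) {
    for (int64_t i = 0; i < indices_size; ++i) {
      const int32_t idx = indices[i];  // assumed to lie in [0, params_y)
      for (int64_t z = 0; z < params_z; ++z) {
        out[(x * indices_size + i) * params_z + z] =
            in[(x * params_y + idx) * params_z + z];
      }
    }
  }
}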
21 changes: 12 additions & 9 deletions tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc
@@ -26,28 +26,31 @@ namespace tensorflow {

EIGEN_STRONG_INLINE void gather_float_int64_xla_impl(float* out, void** data) {
// data is managed by the JIT code so msan can't tell it's initialized.
TF_ANNOTATE_MEMORY_IS_INITIALIZED(data, 7 * sizeof(void*));

int64 indices_size = *static_cast<int64*>(data[1]);
int64 params_x = *static_cast<int64*>(data[2]);
int64 params_y = *static_cast<int64*>(data[3]);
int64 params_z = *static_cast<int64*>(data[4]);

float* in = static_cast<float*>(data[5]);

int64* indices = static_cast<int64*>(data[6]);
Eigen::DSizes<Eigen::DenseIndex, 3> in_eig_sizes;
in_eig_sizes[0] = params_x;
in_eig_sizes[1] = params_y;
in_eig_sizes[2] = params_z;
tensorflow::TTypes<float, 3>::ConstTensor in_eig(in, in_eig_sizes);

Eigen::DSizes<Eigen::DenseIndex, 1> indices_eig_sizes;
indices_eig_sizes[0] = indices_size;
tensorflow::TTypes<int64>::ConstFlat indices_eig(indices, indices_eig_sizes);

Eigen::DSizes<Eigen::DenseIndex, 3> out_eig_sizes;
out_eig_sizes[0] = params_x;
out_eig_sizes[1] = indices_size;
out_eig_sizes[2] = params_z;
tensorflow::TTypes<float, 3>::Tensor out_eig(out, out_eig_sizes);

tensorflow::functor::GatherFunctorCPU<float, int64> f;
const int64 bad_i = f(in_eig, indices_eig, out_eig);
2 changes: 1 addition & 1 deletion tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -180,7 +180,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
} else {
CHECK(!body.xla_input_shapes.empty());
body_input_shape = body.xla_input_shapes[0];
CHECK(!cond.xla_input_shapes.empty());
cond_input_shape = cond.xla_input_shapes[0];
}

12 changes: 12 additions & 0 deletions tensorflow/compiler/xla/literal_util.cc
@@ -631,6 +631,18 @@ string Literal::ToString() const {
return literal;
}

/* static */ std::unique_ptr<Literal> Literal::MakeTupleOwned(
std::vector<std::unique_ptr<Literal>> elements) {
auto literal = MakeUnique<Literal>();
std::vector<Shape> shape;
for (auto& tuple_element : elements) {
shape.push_back(tuple_element->shape());
literal->add_tuple_literals()->Swap(tuple_element.get());
}
*literal->mutable_shape() = ShapeUtil::MakeTupleShape(shape);
return literal;
}

const void* Literal::InternalData() const {
return const_cast<const void*>(
const_cast<Literal*>(this)->MutableInternalData());
10 changes: 10 additions & 0 deletions tensorflow/compiler/xla/literal_util.h
@@ -481,6 +481,16 @@ class Literal {
static std::unique_ptr<Literal> MakeTuple(
tensorflow::gtl::ArraySlice<const Literal*> elements);

// As above, but intended to be invoked with move semantics; i.e.
//
// std::vector<std::unique_ptr<Literal>> elements = ...;
// auto result = Literal::MakeTupleOwned(std::move(elements));
//
// This would have been declared as an overload, but there is ambiguity
// in invocation between the above signature and this one.
static std::unique_ptr<Literal> MakeTupleOwned(
std::vector<std::unique_ptr<Literal>> elements);

// Validates that the data payload of the literal matches the literal shape;
// if it does not, an appropriate status is returned.
tensorflow::Status ValidateLiteral() const;
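A short usage sketch for the new factory (an assumption-laden illustration: it relies on Literal::CreateR0 returning std::unique_ptr<Literal>, consistent with its use in gather_op.cc above):

std::vector<std::unique_ptr<xla::Literal>> elements;
elements.push_back(xla::Literal::CreateR0<xla::int64>(42));
elements.push_back(xla::Literal::CreateR0<float>(2.5f));
// Ownership of each element literal moves into the tuple literal.
auto tuple = xla::Literal::MakeTupleOwned(std::move(elements));
// tuple->shape() should now be the tuple shape (s64[], f32[]).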
8 changes: 4 additions & 4 deletions tensorflow/compiler/xla/service/cpu/BUILD
@@ -326,11 +326,11 @@
cc_library(
name = "cpu_runtime",
srcs = [
"cpu_runtime.cc",
"infeed_manager.cc",
"xfeed_manager.cc",
],
hdrs = [
"cpu_runtime.h",
"infeed_manager.h",
"xfeed_manager.h",
],
copts = runtime_copts(),
deps = [
@@ -416,9 +416,9 @@ cc_test(
)

cc_test(
name = "infeed_manager_test",
name = "xfeed_manager_test",
size = "small",
srcs = ["infeed_manager_test.cc"],
srcs = ["xfeed_manager_test.cc"],
deps = [
":cpu_runtime",
"//tensorflow/core:lib",
35 changes: 27 additions & 8 deletions tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -24,8 +24,8 @@ namespace xla {
namespace cpu {
namespace runtime {

XfeedManager* GetXfeedManager() {
static XfeedManager* manager = new XfeedManager;
return manager;
}

@@ -35,17 +35,36 @@ InfeedManager* GetInfeedManager() {

void* __xla_cpu_runtime_AcquireInfeedBufferForDequeue(
xla::int32 buffer_length) {
VLOG(2) << "AcquireInfeedBufferForDequeue";
xla::cpu::runtime::XfeedManager* xfeed = xla::cpu::runtime::GetXfeedManager();
// Wait until there's a buffer to dequeue.
xla::cpu::runtime::XfeedBuffer* buffer =
xfeed->infeed()->BlockingDequeueBuffer();
CHECK_EQ(buffer->length(), buffer_length);
return buffer->data();
}

void __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue(xla::int32 buffer_length,
void* buffer_ptr) {
VLOG(2) << "ReleaseInfeedBufferAfterDequeue";
xla::cpu::runtime::XfeedManager* xfeed = xla::cpu::runtime::GetXfeedManager();
xfeed->infeed()->ReleaseCurrentBuffer(buffer_length, buffer_ptr);
}

void* __xla_cpu_runtime_AcquireOutfeedBufferForPopulation(
xla::int32 buffer_length) {
VLOG(2) << "AcquireOutfeedBufferForPopulation";
xla::cpu::runtime::XfeedManager* xfeed = xla::cpu::runtime::GetXfeedManager();
// Wait until there's a buffer to dequeue.
xla::cpu::runtime::XfeedBuffer* buffer =
xfeed->outfeed()->BlockingDequeueBuffer();
CHECK_EQ(buffer->length(), buffer_length);
return buffer->data();
}

void __xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation(
xla::int32 buffer_length, void* buffer_ptr) {
VLOG(2) << "ReleaseOutfeedBufferAfterPopulation";
xla::cpu::runtime::XfeedManager* xfeed = xla::cpu::runtime::GetXfeedManager();
xfeed->outfeed()->ReleaseCurrentBuffer(buffer_length, buffer_ptr);
}
25 changes: 23 additions & 2 deletions tensorflow/compiler/xla/service/cpu/cpu_runtime.h
@@ -26,7 +26,7 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_H_

#include "tensorflow/compiler/xla/service/cpu/infeed_manager.h"
#include "tensorflow/compiler/xla/service/cpu/xfeed_manager.h"
#include "tensorflow/compiler/xla/types.h"

namespace xla {
@@ -54,9 +54,13 @@
constexpr char kAcquireInfeedBufferForDequeueSymbolName[] =
"__xla_cpu_runtime_AcquireInfeedBufferForDequeue";
constexpr char kReleaseInfeedBufferAfterDequeueSymbolName[] =
"__xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue";
constexpr char kAcquireOutfeedBufferForPopulationSymbolName[] =
"__xla_cpu_runtime_AcquireOutfeedBufferForPopulation";
constexpr char kReleaseOutfeedBufferAfterPopulationSymbolName[] =
"__xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation";

// Returns the xfeed manager used by the CPU runtime.
XfeedManager* GetXfeedManager();

} // namespace runtime
} // namespace cpu
@@ -86,6 +90,23 @@ extern void* __xla_cpu_runtime_AcquireInfeedBufferForDequeue(
// that can be returned out of order.
extern void __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue(
xla::int32 buffer_length, void* buffer_ptr);

// Blocks until the next outfeed buffer is available to be populated, then
// returns it.
extern void* __xla_cpu_runtime_AcquireOutfeedBufferForPopulation(
xla::int32 buffer_length);

// Relinquishes the outfeed buffer after it has been populated.
// buffer_ptr must have been previously returned by
// __xla_cpu_runtime_AcquireOutfeedBufferForPopulation.
// Once this call completes, buffer_ptr may no longer be accessed.
// buffer_length must match the length passed to the call to
// __xla_cpu_runtime_AcquireOutfeedBufferForPopulation that returned
// buffer_ptr. This function must be called before the next buffer is
// acquired, i.e., there may only be one outstanding outfeed buffer in
// use by the runtime.
extern void __xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation(
xla::int32 buffer_length, void* buffer_ptr);
}

#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_H_
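To make the acquire/release contract concrete, a hedged sketch of one outfeed transfer from the compiled side (the length and float payload are illustrative assumptions; in practice the JIT-compiled code emits these calls):

void PopulateOneOutfeedBuffer() {
  // Must match the length of the buffer the consumer enqueued on the outfeed.
  const xla::int32 length = static_cast<xla::int32>(4 * sizeof(float));
  void* buffer = __xla_cpu_runtime_AcquireOutfeedBufferForPopulation(length);
  float* out = static_cast<float*>(buffer);
  for (int i = 0; i < 4; ++i) out[i] = static_cast<float>(i);
  // `buffer` may not be touched after this call, and no second buffer may be
  // acquired while this one is outstanding.
  __xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation(length, buffer);
}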