Branch 184376425 #16726

Merged · 10 commits · Feb 3, 2018
26 changes: 15 additions & 11 deletions tensorflow/c/eager/c_api.cc
@@ -253,27 +253,31 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,

void TFE_DeleteOp(TFE_Op* op) { delete op; }

static void TFE_OpSetDeviceHelper(TFE_Op* op, tensorflow::Device* device,
TF_Status* status) {
// Questionable heuristic: Place the op on the same device as the first input
// placed outside of host memory?
if (IsCPU(op->device) && !IsCPU(device)) {
op->device = device;
}
}

void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) {
tensorflow::Device* d = nullptr;
if (device_name != nullptr && strlen(device_name) > 0) {
status->status =
op->ctx->session->device_mgr->LookupDevice(device_name, &d);
if (!status->status.ok()) return;
}
TFE_OpSetDeviceHelper(op, d, status);
op->device = d;
}

const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) {
tensorflow::Device* device =
(op->device == nullptr) ? op->ctx->devices()[0] : op->device;
return device->name().c_str();
}

void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
TFE_OpSetDeviceHelper(op, h->d, status);
// Questionable heuristic ...
//
// Motivation: After an 'op' is placed on GPU because some of its earlier
// inputs are on GPU, we want to keep the 'op' there, even if some later
// inputs of it are not on GPU.
if (IsCPU(op->device) && !IsCPU(h->d)) {
op->device = h->d;
}
if (!status->status.ok()) return;
op->inputs.push_back(h->t);
op->input_devices.push_back(h->d);
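Not part of the diff: a minimal caller-side sketch of the placement heuristic above, assuming a GPU is present. It borrows the test helpers TestMatrixTensorHandle, MatMulOp and GetGPUDeviceName from the c_api_test.cc changes further down, so it would live alongside those tests. Once an input that lives outside host memory is added, the op stays on that device even if later inputs are host-resident.

void PlacementHeuristicSketch() {
  TF_Status* status = TF_NewStatus();
  TFE_ContextOptions* opts = TFE_NewContextOptions();
  TFE_Context* ctx = TFE_NewContext(opts, status);
  TFE_DeleteContextOptions(opts);

  TFE_TensorHandle* hcpu = TestMatrixTensorHandle();  // host-resident input
  string gpu_device_name;
  if (GetGPUDeviceName(ctx, &gpu_device_name)) {
    TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice(
        hcpu, ctx, gpu_device_name.c_str(), status);

    // MatMulOp adds hcpu first (the op keeps its default CPU placement) and
    // hgpu second (the op moves to the GPU and stays there).
    TFE_Op* matmul = MatMulOp(ctx, hcpu, hgpu);
    const char* device_name = TFE_OpGetDevice(matmul, status);
    // device_name is expected to name the GPU device.

    TFE_DeleteOp(matmul);
    TFE_DeleteTensorHandle(hgpu);
  }
  TFE_DeleteTensorHandle(hcpu);
  TFE_DeleteContext(ctx, status);
  TF_DeleteStatus(status);
}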
3 changes: 3 additions & 0 deletions tensorflow/c/eager/c_api.h
@@ -154,6 +154,9 @@ TF_CAPI_EXPORT extern void TFE_DeleteOp(TFE_Op* op);

TF_CAPI_EXPORT extern void TFE_OpSetDevice(TFE_Op* op, const char* device_name,
TF_Status* status);
// The returned string remains valid throughout the lifetime of 'op'.
TF_CAPI_EXPORT extern const char* TFE_OpGetDevice(TFE_Op* op,
TF_Status* status);

TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status);

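Not part of the diff: a short usage sketch for the new TFE_OpGetDevice entry point. The "MatMul" op name and "CPU:0" device string are only illustrative; the point is that the returned pointer stays valid for as long as the op exists, per the comment above.

#include <stdio.h>

#include "tensorflow/c/eager/c_api.h"

void GetDeviceSketch(TFE_Context* ctx) {
  TF_Status* status = TF_NewStatus();
  TFE_Op* op = TFE_NewOp(ctx, "MatMul", status);
  if (TF_GetCode(status) == TF_OK) {
    TFE_OpSetDevice(op, "CPU:0", status);
    // Safe to hold on to until TFE_DeleteOp(op); no immediate copy needed.
    const char* device_name = TFE_OpGetDevice(op, status);
    printf("op placed on %s\n", device_name);
    TFE_DeleteOp(op);
  }
  TF_DeleteStatus(status);
}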
92 changes: 66 additions & 26 deletions tensorflow/c/eager/c_api_test.cc
@@ -60,6 +60,31 @@ TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) {
return op;
}

// If there is a GPU device, returns true and sets 'gpu_device_name'
// accordingly.
bool GetGPUDeviceName(TFE_Context* ctx, string* gpu_device_name) {
std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
TF_NewStatus(), TF_DeleteStatus);
TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get());
CHECK_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());

const int num_devices = TF_DeviceListCount(devices);
for (int i = 0; i < num_devices; ++i) {
const string device_type(TF_DeviceListType(devices, i, status.get()));
CHECK_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get());
const string device_name(TF_DeviceListName(devices, i, status.get()));
CHECK_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get());
if (device_type == "GPU") {
*gpu_device_name = device_name;
LOG(INFO) << "Found GPU device " << device_name;
TF_DeleteDeviceList(devices);
return true;
}
}
TF_DeleteDeviceList(devices);
return false;
}

void BM_InitOp(int iters) {
tensorflow::testing::StopTiming();
TF_Status* status = TF_NewStatus();
@@ -288,22 +313,15 @@ TEST(CAPI, TensorHandleSilentCopy) {
TF_Tensor* t = TFE_TensorHandleResolve(hcpu, status.get());
ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());

TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get());
ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
const int num_devices = TF_DeviceListCount(devices);

// Disable the test if no GPU is present.
if (num_devices > 1) {
const int device_to_use = 1;
const string name(TF_DeviceListName(devices, device_to_use, status.get()));
ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());

TFE_TensorHandle* hgpu =
TFE_TensorHandleCopyToDevice(hcpu, ctx, name.c_str(), status.get());
string gpu_device_name;
if (GetGPUDeviceName(ctx, &gpu_device_name)) {
TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice(
hcpu, ctx, gpu_device_name.c_str(), status.get());
ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());

TFE_Op* matmul = MatMulOp(ctx, hcpu, hgpu);
TFE_OpSetDevice(matmul, name.c_str(), status.get());
TFE_OpSetDevice(matmul, gpu_device_name.c_str(), status.get());
ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
TFE_TensorHandle* retvals[1];
int num_retvals = 1;
@@ -314,7 +332,6 @@ TEST(CAPI, TensorHandleSilentCopy) {
TFE_DeleteTensorHandle(hgpu);
}

TF_DeleteDeviceList(devices);
TF_DeleteTensor(t);
TFE_DeleteTensorHandle(hcpu);
TFE_DeleteContext(ctx, status.get());
@@ -337,22 +354,15 @@ TEST(CAPI, TensorHandleSilentCopyLocal) {
TF_Tensor* t = TFE_TensorHandleResolve(hcpu, status.get());
ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());

TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get());
ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
const int num_devices = TF_DeviceListCount(devices);

// Disable the test if no GPU is present.
if (num_devices > 1) {
const int device_to_use = 1;
const string name(TF_DeviceListName(devices, device_to_use, status.get()));
ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());

TFE_TensorHandle* hgpu =
TFE_TensorHandleCopyToDevice(hcpu, ctx, name.c_str(), status.get());
string gpu_device_name;
if (GetGPUDeviceName(ctx, &gpu_device_name)) {
TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice(
hcpu, ctx, gpu_device_name.c_str(), status.get());
ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());

TFE_Op* matmul = MatMulOp(ctx, hcpu, hgpu);
TFE_OpSetDevice(matmul, name.c_str(), status.get());
TFE_OpSetDevice(matmul, gpu_device_name.c_str(), status.get());
ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
TFE_TensorHandle* retvals[1];
int num_retvals = 1;
@@ -363,13 +373,43 @@ TEST(CAPI, TensorHandleSilentCopyLocal) {
TFE_DeleteTensorHandle(hgpu);
}

TF_DeleteDeviceList(devices);
TF_DeleteTensor(t);
TFE_DeleteTensorHandle(hcpu);
TFE_DeleteContext(ctx, status.get());
EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
}

TEST(CAPI, SetAndGetOpDevices) {
TF_Status* status = TF_NewStatus();
TFE_ContextOptions* opts = TFE_NewContextOptions();
TFE_Context* ctx = TFE_NewContext(opts, status);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TFE_DeleteContextOptions(opts);

TFE_TensorHandle* m = TestMatrixTensorHandle();
TFE_Op* matmul = MatMulOp(ctx, m, m);

// Disable the test if no GPU is present.
string gpu_device_name;
if (GetGPUDeviceName(ctx, &gpu_device_name)) {
TFE_OpSetDevice(matmul, "GPU:0", status);
ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
const char* device_name = TFE_OpGetDevice(matmul, status);
ASSERT_TRUE(strstr(device_name, "GPU:0") != nullptr);

TFE_OpSetDevice(matmul, "CPU:0", status);
ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
device_name = TFE_OpGetDevice(matmul, status);
ASSERT_TRUE(strstr(device_name, "CPU:0") != nullptr);
}

TFE_DeleteOp(matmul);
TFE_DeleteTensorHandle(m);
TFE_DeleteContext(ctx, status);
ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_DeleteStatus(status);
}

TEST(CAPI, Execute) {
TF_Status* status = TF_NewStatus();
TFE_ContextOptions* opts = TFE_NewContextOptions();
6 changes: 5 additions & 1 deletion tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
@@ -191,7 +191,11 @@ static bool BuffersInvariantWithinConsumer(
llvm_ir::IrArray HloToIrBindings::GetIrArray(const HloInstruction& hlo,
const HloInstruction& consumer,
const ShapeIndex& shape_index) {
llvm_ir::IrArray ir_array(GetBasePointer(hlo, shape_index),
llvm::Value* base_ptr = GetBasePointer(hlo, shape_index);
CHECK_NE(base_ptr, nullptr)
<< "Buffer not assigned for shape_index " << shape_index.ToString()
<< " of " << hlo.ToString();
llvm_ir::IrArray ir_array(base_ptr,
ShapeUtil::GetSubshape(hlo.shape(), shape_index));
alias_analysis_.AddAliasingInformationToIrArray(hlo, &ir_array);

22 changes: 11 additions & 11 deletions tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -1657,24 +1657,24 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
}

Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) {
tensorflow::gtl::ArraySlice<HloInstruction*> operands(tuple->operands());
bool all_tuple_elements_have_buffer = std::all_of(
operands.begin(), operands.end(), [this](HloInstruction* tuple_element) {
bool all_tuple_elements_have_buffer =
c_all_of(tuple->operands(), [&](HloInstruction* tuple_element) {
return ir_emitter_context_->buffer_assignment().HasTopLevelAllocation(
tuple_element);
});
// Tuples (especially output tuples) can take too many tuple elements,
// causing the kernel emitted exceeds the parameter space limit
// (b/31336476). As an optimization, if all tuple elements have a buffer, we
// collect their buffer addresses in a host array, and then copy that array
// to the tuple's buffer.
// Tuples (especially tuples that are the final result of a computation) can
// be so huge that if we were to emit a kernel that took each tuple element as
// a parameter, we would exceed the max allowable number of parameters to a
// GPU kernel, b/31336476. As an optimization, if all tuple elements have a
// buffer, we collect their buffer addresses in a host array, and then copy
// that array to the tuple's buffer.
//
// Some tuple elements (e.g. const or bitcast of const) might not have a
// buffer -- their contents are stored in code. In that case, we fall back
// to emitting kernels which have access to their buffer addresses in code.
// buffer -- their contents are stored in code. In that case, we fall back to
// emitting kernels which have access to their buffer addresses in code.
if (all_tuple_elements_have_buffer) {
std::vector<BufferAllocation::Slice> tuple_element_buffers;
for (const HloInstruction* tuple_element : operands) {
for (const HloInstruction* tuple_element : tuple->operands()) {
tuple_element_buffers.push_back(GetAllocationSlice(*tuple_element));
}
thunk_sequence_->emplace_back(MakeUnique<TupleThunk>(
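The comment in HandleTuple above describes writing a host-side array of element addresses into the tuple's buffer rather than passing every tuple element to the kernel as a separate parameter. A rough standalone illustration of that idea, not part of the diff, using plain CUDA runtime calls instead of the TupleThunk machinery the code actually emits:

#include <vector>

#include <cuda_runtime.h>

// Each slot of the tuple buffer holds the device address of one tuple
// element, so a kernel only needs the tuple buffer itself, not one parameter
// per element.
cudaError_t WriteTupleIndexTable(void* tuple_buffer_on_device,
                                 const std::vector<void*>& element_buffers) {
  return cudaMemcpy(tuple_buffer_on_device, element_buffers.data(),
                    element_buffers.size() * sizeof(void*),
                    cudaMemcpyHostToDevice);
}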
5 changes: 5 additions & 0 deletions tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
@@ -42,6 +42,11 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
const LaunchDimensions& launch_dimensions,
llvm::IRBuilder<>* ir_builder);

// Constructs a loop emitter for a loop that generates one element of each of N
// arrays on each iteration.
//
// This is used in multi-output fusion. target_element_generator should
// produce a struct with N elements, one for each of target_arrays.
ParallelLoopEmitter(
const llvm_ir::ElementGenerator& target_element_generator,
tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
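Not part of the diff: a sketch of the kind of value such a target_element_generator is expected to return in the multi-output case — an LLVM struct with one member per target array — assuming the per-output element values have already been computed elsewhere.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

// Packs two already-computed element values into the struct that a
// multi-output fusion generator returns; the loop emitter later unpacks it
// with CreateExtractValue (see loop_emitter.cc below).
llvm::Value* PackMultiOutputElements(llvm::IRBuilder<>* b, llvm::Value* elem0,
                                     llvm::Value* elem1) {
  llvm::StructType* struct_ty = llvm::StructType::get(
      b->getContext(), {elem0->getType(), elem1->getType()});
  llvm::Value* result = llvm::UndefValue::get(struct_ty);
  result = b->CreateInsertValue(result, elem0, 0);
  result = b->CreateInsertValue(result, elem1, 1);
  return result;
}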
4 changes: 2 additions & 2 deletions tensorflow/compiler/xla/service/hlo_instruction.h
@@ -874,8 +874,8 @@ class HloInstruction {
// Returns true if this instruction is a fusion instruction that generates
// multiple outputs.
const bool IsMultiOutputFusion() const {
return (opcode() == HloOpcode::kFusion &&
fused_expression_root()->opcode() == HloOpcode::kTuple);
return opcode() == HloOpcode::kFusion &&
fused_expression_root()->opcode() == HloOpcode::kTuple;
}

FusionKind fusion_kind() const {
6 changes: 6 additions & 0 deletions tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
@@ -34,6 +34,12 @@ namespace xla {

// Unlike IrEmitter, this creates host functions which emit IR to generate the
// output element at the given index. It is used to generate fused operations.
//
// This class handles both vanilla fusion and multi-output fusion. In the MOF
// case, the fusion node ends with a kTuple instruction, and the root generator
// returned by this emitter returns an LLVM struct with N elements, one for each
// element of the arrays in the tuple. It follows that the arrays in the tuple
// must have the same length.
class FusedIrEmitter : public DfsHloVisitorWithDefault {
public:
using Generator = llvm_ir::ElementGenerator;
57 changes: 30 additions & 27 deletions tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -51,37 +51,40 @@ LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
shape_(target_array.GetShape()),
ir_builder_(ir_builder) {}

static LoopEmitter::BodyEmitter MakeBodyEmitterForMultiOutputFusion(
const ElementGenerator& target_element_generator,
const std::vector<IrArray>& target_arrays, llvm::IRBuilder<>* ir_builder) {
return [=](const llvm_ir::IrArray::Index array_index) {
TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
target_element_generator(array_index));
CHECK(target_element->getType()->isStructTy())
<< "This BodyEmitter is for multi-output fusion, but target element "
"generator does not produce values of struct type.";
CHECK_EQ(target_element->getType()->getStructNumElements(),
target_arrays.size());

for (int64 i = 0; i < target_arrays.size(); ++i) {
target_arrays[i].EmitWriteArrayElement(
array_index, ir_builder->CreateExtractValue(target_element, i),
ir_builder);
}
return Status::OK();
};
}

LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
tensorflow::gtl::ArraySlice<IrArray> target_arrays,
llvm::IRBuilder<>* ir_builder)
: body_emitter_([=](const llvm_ir::IrArray::Index array_index)
-> ::tensorflow::Status {
// Convert target_element_generator to a BodyEmitter.
TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
target_element_generator(array_index));
if (target_arrays.size() == 1) {
target_arrays[0].EmitWriteArrayElement(array_index, target_element,
ir_builder);
return tensorflow::Status::OK();
}

for (int64 i = 0; i < target_arrays.size(); ++i) {
target_arrays[i].EmitWriteArrayElement(
array_index, ir_builder_->CreateExtractValue(target_element, i),
ir_builder);
}
return tensorflow::Status::OK();
}),
: body_emitter_(MakeBodyEmitterForMultiOutputFusion(
target_element_generator,
std::vector<IrArray>(target_arrays.begin(), target_arrays.end()),
ir_builder)),
shape_(target_arrays[0].GetShape()),
ir_builder_(ir_builder) {
if (target_arrays.size() > 1) {
// The sanity check for multiple outputs.
shape_ = target_arrays[0].GetShape();
for (int64 i = 1; i < target_arrays.size(); ++i) {
const Shape& element_shape = target_arrays[i].GetShape();
CHECK(ShapeUtil::SameDimensions(shape_, element_shape));
}
} else {
shape_ = target_arrays[0].GetShape();
// Sanity check: In multi-output fusion, all shapes produced must have the
// same dimensions.
for (const IrArray& array : target_arrays) {
CHECK(ShapeUtil::SameDimensions(shape_, array.GetShape()));
}
}

8 changes: 7 additions & 1 deletion tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
@@ -47,10 +47,16 @@ class LoopEmitter {
// element of the given target array.
LoopEmitter(const ElementGenerator& target_element_generator,
const IrArray& target_array, llvm::IRBuilder<>* ir_builder);
// Same as previous method except emits multiple targets in an array.

// Constructs a LoopEmitter that emits one element into each of N separate
// arrays on each iteration of the loop.
//
// This is used for multi-output fusion. target_element_generator must
// produce an LLVM struct with N elements.
LoopEmitter(const ElementGenerator& target_element_generator,
tensorflow::gtl::ArraySlice<IrArray> target_arrays,
llvm::IRBuilder<>* ir_builder);

LoopEmitter(const LoopEmitter&) = delete;
LoopEmitter& operator=(const LoopEmitter&) = delete;
virtual ~LoopEmitter() = default;
6 changes: 3 additions & 3 deletions tensorflow/compiler/xla/service/service.cc
@@ -1453,9 +1453,9 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
handle_status = computation->AddInfeedInstruction(arg->infeed_request());
break;
case OpRequest::kOutfeedRequest:
TF_RETURN_IF_ERROR(
computation->AddOutfeedInstruction(arg->outfeed_request()));
return tensorflow::Status::OK();
handle_status =
computation->AddOutfeedInstruction(arg->outfeed_request());
break;
case OpRequest::kMapRequest: {
TF_ASSIGN_OR_RETURN(
UserComputation * to_apply,