
Try to find an allocator when the engine is not assigned a device. #21508

Merged
24 changes: 24 additions & 0 deletions tensorflow/contrib/tensorrt/BUILD
@@ -280,6 +280,7 @@ tf_cuda_library(
"//tensorflow/core/grappler:grappler_item",
"//tensorflow/core/grappler:utils",
"//tensorflow/core:framework_lite",
"//tensorflow/core:gpu_runtime",
"//tensorflow/core:graph",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
@@ -293,6 +294,29 @@ tf_cuda_library(
]) + tf_custom_op_library_additional_deps(),
)

tf_cuda_cc_test(
name = "convert_graph_test",
size = "medium",
srcs = ["convert/convert_graph_test.cc"],
tags = [
"no_cuda_on_cpu_tap",
"no_windows",
"nomac",
],
deps = [
":trt_conversion",
"//tensorflow/core/grappler:grappler_item",
"//tensorflow/core/grappler/clusters:cluster",
"//tensorflow/core:core_cpu",
"//tensorflow/core:direct_session",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]),
)
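
Aside: assuming a build configured with CUDA and TensorRT (per the if_tensorrt clause above), this target should be runnable with something like "bazel test //tensorflow/contrib/tensorrt:convert_graph_test"; the exact invocation is an assumption based on the target name, not taken from the PR.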

# Library for the segmenting portion of TensorRT operation creation
cc_library(
name = "segment",
73 changes: 49 additions & 24 deletions tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -31,6 +31,9 @@ limitations under the License.
#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
#include "tensorflow/contrib/tensorrt/segment/segment.h"
#include "tensorflow/contrib/tensorrt/test/utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/graph_to_functiondef.h"
#include "tensorflow/core/framework/node_def_builder.h"
@@ -772,33 +775,55 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
const ConversionParams& params, const EngineInfo& engine) {
int cuda_device_id = -1;
tensorflow::Allocator* dev_allocator = nullptr;
if (params.cluster) {
std::vector<tensorflow::Device*> devices;
if (!engine.device.empty() && params.cluster->GetDeviceSet()) {
DeviceNameUtils::ParsedName parsed_name;
if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name) &&
parsed_name.has_id) {
params.cluster->GetDeviceSet()->FindMatchingDevices(parsed_name,
&devices);
if (params.cluster == nullptr || params.cluster->GetDeviceSet() == nullptr ||
engine.device.empty()) {
// If device is not set, use the first found GPU device for the conversion.
for (int tf_gpu_id_value = 0; tf_gpu_id_value < 100; ++tf_gpu_id_value) {

Reviewer:
The max is hard coded to 100. How would this work in a virtual environment (e.g. with kubernetes) where a single GPU can be shared by possibly more than 100 users?

Member Author:
That's a good point. Currently TfGpuId always starts from 0, so if any gpu devices were initialized before, gpu 0 should always be available. Note that TfGpuId is a virtual identifier of the gpu device owned by the process, not the physical gpu id. But if we hard-coded 0 here, changes to the BaseGpuDevice initialization flow could break the integration, so I added the loop here to reduce that risk.

Reviewer:
Got it. Looks okay.

TfGpuId tf_gpu_id(tf_gpu_id_value);
CudaGpuId cuda_gpu_id;
Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
if (s.ok()) {
VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device "
<< cuda_gpu_id.value();
cuda_device_id = cuda_gpu_id.value();
GPUOptions gpu_options;
// If the TF to Cuda gpu id mapping exist, the device and corresponding
// allocator must have been initialized already, so the
// GetGPUAllocator() call won't create a new allocator.
dev_allocator = GPUProcessState::singleton()->GetGPUAllocator(
gpu_options, tf_gpu_id, 1);
break;
}
LOG(ERROR) << "TF GPU with id " << tf_gpu_id_value << " does not exist "
<< s;
}
if (!devices.empty()) {
if (devices.size() > 1) {
string msg = "Found multiple matching devices using name '";
StrAppend(&msg, engine.device, "': ");
for (auto d : devices) StrAppend(&msg, d->name(), ", ");
StrAppend(&msg, ". Will get the allocator from first one.");
LOG(WARNING) << msg;
}
tensorflow::AllocatorAttributes alloc_attr;
cuda_device_id = devices[0]->tensorflow_gpu_device_info()->gpu_id;
dev_allocator = devices[0]->GetAllocator(alloc_attr);
VLOG(1) << "Using allocator " << dev_allocator->Name()
<< " and cuda_device_id " << cuda_device_id;
} else {
LOG(WARNING) << "Cluster is set but device '" << engine.device
<< "' is not found in the cluster";
return std::make_pair(cuda_device_id, dev_allocator);
}

// Use the device requested by the engine.
auto device_set = params.cluster->GetDeviceSet();
std::vector<tensorflow::Device*> devices;
DeviceNameUtils::ParsedName parsed_name;
if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name) &&
parsed_name.has_id) {
device_set->FindMatchingDevices(parsed_name, &devices);
}
if (!devices.empty()) {
if (devices.size() > 1) {
string msg = "Found multiple matching devices using name '";
StrAppend(&msg, engine.device, "': ");
for (auto d : devices) StrAppend(&msg, d->name(), ", ");
StrAppend(&msg, ". Will get the allocator from first one.");
LOG(WARNING) << msg;
}
tensorflow::AllocatorAttributes alloc_attr;
cuda_device_id = devices[0]->tensorflow_gpu_device_info()->gpu_id;
dev_allocator = devices[0]->GetAllocator(alloc_attr);
VLOG(1) << "Using allocator " << dev_allocator->Name()
<< " and cuda_device_id " << cuda_device_id;
} else {
LOG(WARNING) << "Cluster is set but device '" << engine.device
<< "' is not found in the cluster";

Reviewer:
Don't we need to return here after LOG(WARNING)?

Member Author:
In that case cuda_device_id remains -1 and dev_allocator is nullptr.

Reviewer:
Okay. Makes sense.

}
return std::make_pair(cuda_device_id, dev_allocator);
}
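Aside: an illustrative sketch (not code from this PR) of the fallback pattern used by the new GetDeviceAndAllocator branch above. TF GPU ids are virtual, process-local ids starting at 0, each mapping to a physical CUDA device; the loop probes ids in order and stops at the first one whose mapping (and therefore allocator) is already initialized. FindCudaDevice, kMaxProbedIds, and the fake mapping are hypothetical stand-ins for the real GpuIdManager/GPUProcessState calls.

    #include <functional>
    #include <iostream>

    constexpr int kMaxProbedIds = 100;  // Mirrors the hard-coded bound from the review thread.

    // Returns the physical CUDA id of the first initialized TF GPU, or -1 if none.
    int FindCudaDevice(const std::function<bool(int, int*)>& tf_to_cuda) {
      for (int tf_gpu_id = 0; tf_gpu_id < kMaxProbedIds; ++tf_gpu_id) {
        int cuda_gpu_id = -1;
        if (tf_to_cuda(tf_gpu_id, &cuda_gpu_id)) {
          return cuda_gpu_id;  // Mapping exists: device and allocator were initialized.
        }
      }
      return -1;  // No GPU found; the caller falls back to {-1, nullptr} as above.
    }

    int main() {
      // Two virtual TF devices that both live on physical CUDA device 0,
      // matching the setup in convert_graph_test.cc below.
      auto fake_mapping = [](int tf_id, int* cuda_id) {
        if (tf_id > 1) return false;
        *cuda_id = 0;
        return true;
      };
      std::cout << FindCudaDevice(fake_mapping) << "\n";  // Prints 0.
      return 0;
    }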
6 changes: 6 additions & 0 deletions tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -17,6 +17,7 @@ limitations under the License.

#include <vector>

#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/grappler/clusters/cluster.h"
#include "tensorflow/core/grappler/costs/graph_properties.h"
@@ -84,6 +85,11 @@ std::vector<int> GetLinkedTensorRTVersion();

// Return the runtime TensorRT library version information.
std::vector<int> GetLoadedTensorRTVersion();

// Helper method for the conversion, exposed for testing.
std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
const ConversionParams& params, const EngineInfo& engine);

} // namespace convert
} // namespace tensorrt
} // namespace tensorflow
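Exposing GetDeviceAndAllocator in the header above is what lets the new convert_graph_test.cc below exercise the device-selection logic directly, without running the full graph conversion.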
140 changes: 140 additions & 0 deletions tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
@@ -0,0 +1,140 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"

#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
#include "tensorflow/core/common_runtime/device_mgr.h"
#include "tensorflow/core/common_runtime/device_set.h"
#include "tensorflow/core/grappler/clusters/cluster.h"
#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/protobuf/config.pb.h" // NOLINT
#include "tensorflow/core/public/session.h"

#if GOOGLE_CUDA
#if GOOGLE_TENSORRT

namespace tensorflow {
namespace tensorrt {
namespace convert {

class FakeCluster : public grappler::Cluster {
public:
FakeCluster() : Cluster(0) {}

void SetDeviceSet(const DeviceSet* device_set) { device_set_ = device_set; }

const DeviceSet* GetDeviceSet() const override { return device_set_; }

string type() const override { return ""; }
Status Provision() override { return Status::OK(); }
Status Initialize(const grappler::GrapplerItem& item) override {
return Status::OK();
}
Status Run(const GraphDef& graph_def,
const std::vector<std::pair<string, Tensor>>& feed,
const std::vector<string>& fetch,
RunMetadata* metadata) override {
return Status::OK();
}

private:
// Initialized to null so GetDeviceSet() is well defined before SetDeviceSet()
// is called, which the first test cases below rely on.
const DeviceSet* device_set_ = nullptr;
};

TEST(ConvertGraphTest, GetDeviceAndAllocator) {
ConversionParams params;
EngineInfo engine_info;
{
// params.cluster is not set, and no gpu device is available.
auto result = GetDeviceAndAllocator(params, engine_info);
EXPECT_EQ(-1, result.first);
EXPECT_EQ(nullptr, result.second);
}

// Create a session with two (virtual) gpu devices.
SessionOptions options;
ConfigProto* config = &options.config;
GPUOptions* gpu_options = config->mutable_gpu_options();
auto virtual_devices =
gpu_options->mutable_experimental()->add_virtual_devices();
virtual_devices->add_memory_limit_mb(200);
virtual_devices->add_memory_limit_mb(200);
std::unique_ptr<Session> session(NewSession(options));

{
// params.cluster is not set, should find and return first gpu id and
// corresponding allocator.
auto result = GetDeviceAndAllocator(params, engine_info);
EXPECT_EQ(0, result.first);
EXPECT_NE(nullptr, result.second);
EXPECT_EQ("GPU_0_bfc", result.second->Name());
}

FakeCluster cluster;
params.cluster = &cluster;
{
// params.cluster->GetDeviceSet() returns null, should find and return first
// gpu id and corresponding allocator.
auto result = GetDeviceAndAllocator(params, engine_info);
EXPECT_EQ(0, result.first);
EXPECT_NE(nullptr, result.second);
EXPECT_EQ("GPU_0_bfc", result.second->Name());
}

// Build the DeviceSet.
DeviceSet device_set;
const DeviceMgr* device_mgr = nullptr;
TF_ASSERT_OK(session->LocalDeviceManager(&device_mgr));
for (auto d : device_mgr->ListDevices()) {
device_set.AddDevice(d);
}
cluster.SetDeviceSet(&device_set);
{
// engine_info.device is not set, should find and return first gpu id and
// corresponding allocator.
auto result = GetDeviceAndAllocator(params, engine_info);
EXPECT_EQ(0, result.first);
EXPECT_NE(nullptr, result.second);
EXPECT_EQ("GPU_0_bfc", result.second->Name());
}

engine_info.device = "/GPU:1";
{
// Set to use the second virtual device. Both virtual devices share physical
// cuda device 0, so the returned id stays 0 while the allocator is GPU 1's.
auto result = GetDeviceAndAllocator(params, engine_info);
EXPECT_EQ(0, result.first);
EXPECT_NE(nullptr, result.second);
EXPECT_EQ("GPU_1_bfc", result.second->Name());
}

engine_info.device = "/GPU:3";
{
// Set to use nonexistent device.
auto result = GetDeviceAndAllocator(params, engine_info);
EXPECT_EQ(-1, result.first);
EXPECT_EQ(nullptr, result.second);
}
}

} // namespace convert
} // namespace tensorrt
} // namespace tensorflow

#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA
4 changes: 4 additions & 0 deletions tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -77,6 +77,10 @@ limitations under the License.

namespace tensorflow {
namespace tensorrt {
// TODO(aaroey): put these constants into some class.
const char* const kInputPHName = "TensorRTInputPH_";
const char* const kOutputPHName = "TensorRTOutputPH_";

namespace convert {
using ::tensorflow::str_util::Split;
using ::tensorflow::strings::StrAppend;
5 changes: 3 additions & 2 deletions tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -36,8 +36,9 @@ limitations under the License.

namespace tensorflow {
namespace tensorrt {
static const char* kInputPHName = "TensorRTInputPH_";
static const char* kOutputPHName = "TensorRTOutputPH_";
extern const char* const kInputPHName;
extern const char* const kOutputPHName;

namespace convert {

struct EngineConnection {
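
For context on this last hunk: a static const char* defined in a header creates a separate internal-linkage pointer in every translation unit that includes it, so the string data may be duplicated and pointer comparisons across files are not guaranteed to match. The PR therefore switches the placeholder-name constants to extern declarations, with the single definitions added in convert_nodes.cc (previous hunk). A minimal sketch of the pattern, using a hypothetical constant name:

    // names.h: declaration only; every includer refers to one shared object.
    extern const char* const kExamplePHName;

    // names.cc: the single definition (convert_nodes.cc plays this role above).
    const char* const kExamplePHName = "ExamplePH_";

    // The replaced pattern, `static const char* kExamplePHName = "ExamplePH_";`
    // in the header, would instead give each including .cc its own pointer.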