
Optimization pass and Memory allocator integration #18909

Merged
merged 11 commits on May 2, 2018
11 changes: 9 additions & 2 deletions tensorflow/contrib/tensorrt/BUILD
@@ -197,10 +197,12 @@ tf_py_wrap_cc(
tf_cuda_library(
name = "trt_resources",
srcs = [
"resources/trt_allocator.cc",
"resources/trt_int8_calibrator.cc",
"resources/trt_resource_manager.cc",
],
hdrs = [
"resources/trt_allocator.h",
"resources/trt_int8_calibrator.h",
"resources/trt_resource_manager.h",
"resources/trt_resources.h",
@@ -221,18 +223,24 @@ tf_cuda_library(
srcs = [
"convert/convert_graph.cc",
"convert/convert_nodes.cc",
"convert/trt_optimization_pass.cc",
],
hdrs = [
"convert/convert_graph.h",
"convert/convert_nodes.h",
"convert/trt_optimization_pass.h",
],
deps = [
":segment",
":trt_logging",
":trt_resources",
"//tensorflow/core/grappler/clusters:cluster",
"//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
"//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
"//tensorflow/core/grappler:grappler_item",
"//tensorflow/core/grappler:utils",
"//tensorflow/core:framework",
"//tensorflow/core:gpu_runtime",
"//tensorflow/core:framework_lite",
"//tensorflow/core:graph",
"//tensorflow/core:lib",
@@ -241,8 +249,7 @@ tf_cuda_library(
"//tensorflow/core/grappler:devices",
"//tensorflow/core/grappler/clusters:virtual_cluster",
"//tensorflow/core/grappler/costs:graph_properties",
"//tensorflow/core/grappler/optimizers:constant_folding",
"//tensorflow/core/grappler/optimizers:layout_optimizer",
"//tensorflow/core/grappler/optimizers:meta_optimizer",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]) + tf_custom_op_library_additional_deps(),
119 changes: 88 additions & 31 deletions tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -24,15 +24,17 @@ limitations under the License.

#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
#include "tensorflow/contrib/tensorrt/segment/segment.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/common_runtime/gpu/process_state.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/grappler/devices.h"
#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/grappler/optimizers/constant_folding.h"
#include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
@@ -144,15 +146,19 @@ struct ConvertGraphParams {
size_t max_supported_batch_size, size_t max_consumed_workspace_size_bytes,
const tensorflow::grappler::GraphProperties& current_graph_properties,
std::unordered_map<string, std::pair<int, string>>* output_edges,
int engine_precision_mode)
int engine_precision_mode, const string& device_name,
std::shared_ptr<nvinfer1::IGpuAllocator> allocator, int cuda_gpu_id)
: graph(inp_graph),
output_names(output_node_names),
subgraph_node_ids(subgraph_node_id_numbers),
max_batch_size(max_supported_batch_size),
max_workspace_size_bytes(max_consumed_workspace_size_bytes),
graph_properties(current_graph_properties),
output_edge_map(output_edges),
precision_mode(engine_precision_mode) {}
precision_mode(engine_precision_mode),
device_name_(device_name),
allocator_(allocator),
cuda_gpu_id_(cuda_gpu_id) {}
tensorflow::Graph& graph;
const std::vector<string>& output_names;
const std::set<int>& subgraph_node_ids;
@@ -161,6 +167,9 @@ struct ConvertGraphParams {
const tensorflow::grappler::GraphProperties& graph_properties;
std::unordered_map<string, std::pair<int, string>>* output_edge_map;
int precision_mode;
string device_name_;
std::shared_ptr<nvinfer1::IGpuAllocator> allocator_;
int cuda_gpu_id_;
std::vector<std::pair<int, int>> subgraph_inputs;
std::vector<std::pair<int, int>> subgraph_outputs;
tensorflow::EdgeSet subgraph_incoming_edges;
@@ -194,7 +203,7 @@ static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) {
subgraph_outputs_set.begin(),
subgraph_outputs_set.end());
return tensorflow::Status::OK();
};
}

tensorflow::Status GetCalibNode(ConvertGraphParams* params) {
TF_RETURN_IF_ERROR(FillSubGraphEdgeSets(params));
@@ -203,7 +212,8 @@ tensorflow::Status GetCalibNode(ConvertGraphParams* params) {
params->subgraph_inputs, params->subgraph_outputs,
params->max_batch_size, params->max_workspace_size_bytes,
params->graph_properties, params->output_edge_map,
&trt_node_def, params->precision_mode);
&trt_node_def, params->precision_mode, params->device_name_,
params->allocator_, params->cuda_gpu_id_);
TF_RETURN_IF_ERROR(InjectCalibrationNode(s));
tensorflow::Status status;
tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status);
@@ -233,7 +243,8 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
params->subgraph_inputs, params->subgraph_outputs,
params->max_batch_size, params->max_workspace_size_bytes,
params->graph_properties, params->output_edge_map,
&trt_node_def, params->precision_mode);
&trt_node_def, params->precision_mode, params->device_name_,
params->allocator_, params->cuda_gpu_id_);
TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(s));
tensorflow::Status status;
tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status);
@@ -331,47 +342,51 @@ tensorflow::Status ConvertGraphDefToTensorRT(
// optimization pass
tensorflow::grappler::GrapplerItem item;
item.fetch = output_names;
tensorflow::GraphDef gdef;

// Layout optimization
item.graph = graph_def;
tensorflow::grappler::LayoutOptimizer optimizer;
tensorflow::grappler::Cluster* cluster;

// virtual cluster
tensorflow::DeviceProperties device_properties;

device_properties.set_type("GPU");
device_properties.mutable_environment()->insert({"architecture", "6"});
cluster =
tensorflow::grappler::Cluster* cluster =
new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});

// single machine
int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
VLOG(2) << "cpu_cores: " << num_cpu_cores;
VLOG(2) << "gpus: " << num_gpus;

TF_RETURN_IF_ERROR(optimizer.Optimize(cluster, item, &gdef));
Review comment (Member):
Please remove the definition of optimizer since it's not used.
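For context, a minimal sketch of how this block reads once the unused LayoutOptimizer object is dropped; it only reuses names already present in this hunk (rw_cfg, meta_opt, cluster, item, gdef) and is not a separate change:

// Sketch: a single MetaOptimizer driven by RewriterConfig replaces the
// standalone LayoutOptimizer and ConstantFolding passes.
tensorflow::RewriterConfig rw_cfg;
tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg);
tensorflow::GraphDef gdef;
TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster, item, &gdef));
item.graph = gdef;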


// constant folding
tensorflow::RewriterConfig rw_cfg;
tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg);
tensorflow::GraphDef gdef;
TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster, item, &gdef));
item.graph = gdef;
tensorflow::grappler::ConstantFolding fold(nullptr);
TF_RETURN_IF_ERROR(fold.Optimize(nullptr, item, &gdef));

// AJ refactoring shape inference through grappler/GraphProperties.
tensorflow::grappler::GraphProperties static_graph_properties(item);
TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(false));
TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true));
// Build full graph

return ConvertAfterShapes(gdef, output_names, max_batch_size,
max_workspace_size_bytes, new_graph_def,
precision_mode, minimum_segment_size,
static_graph_properties, nullptr);
}

tensorflow::Status ConvertAfterShapes(
const tensorflow::GraphDef& gdef, const std::vector<string>& output_names,
size_t max_batch_size, size_t max_workspace_size_bytes,
tensorflow::GraphDef* new_graph_def, int precision_mode,
int minimum_segment_size,
const tensorflow::grappler::GraphProperties& graph_properties,
const tensorflow::grappler::Cluster* cluster) {
// Segment the graph into subgraphs that can be converted to TensorRT
tensorflow::tensorrt::segment::SegmentOptions segment_options;
tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
gdef.library());
tensorflow::Graph graph(flib);
TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
tensorflow::GraphConstructorOptions(), gdef, &graph));

// Segment the graph into subgraphs that can be converted to TensorRT
tensorflow::tensorrt::segment::SegmentOptions segment_options;

// TODO(ben,jie,sami): exclude output nodes (DISCUSS IT)
for (auto node : output_names) {
segment_options.exclude_node_list.insert(node);
@@ -381,7 +396,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
segment_options.minimum_segment_size = minimum_segment_size;
tensorflow::tensorrt::segment::SegmentNodesVector segments;
TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
gdef, IsTensorRTCandidate, segment_options, &segments));
&graph, IsTensorRTCandidate, segment_options, &segments));
if (segments.size() > 1) {
VLOG(0) << "MULTIPLE tensorrt candidate conversion: " << segments.size();
}
@@ -391,9 +406,21 @@
int count = 0;
float total_num_nodes_in_segments = 0.;
for (auto s : segments) {
total_num_nodes_in_segments += s.size();
total_num_nodes_in_segments += s.first.size();
}
for (const std::set<string>& subgraph_node_names : segments) {
// We create the map here since cluster may not be available in all cases.
std::map<string, tensorflow::Device*> name_to_device_map;
if (cluster) {
// TODO(aaroey): consider using DeviceSet::FindDeviceByName(), as in a
// distributed environment, devices from different workers can have same
// short name.
for (const auto dm : cluster->GetDeviceSet()->devices()) {
name_to_device_map[dm->name()] = dm;
}
}
for (const auto& segment_nodes_and_device : segments) {
const std::set<string>& subgraph_node_names =
segment_nodes_and_device.first;
std::set<int> subgraph_node_ids;
size_t max_mem_per_engine =
max_workspace_size_bytes *
@@ -403,10 +430,40 @@
oss << " " << node_name;
subgraph_node_ids.insert(node_map.at(node_name)->id());
}
VLOG(2) << "Subgraph nodes" << oss.str();
VLOG(1) << "Subgraph nodes at device " << segment_nodes_and_device.second
<< " : " << oss.str();
auto target_device =
name_to_device_map.find(segment_nodes_and_device.second);
std::shared_ptr<nvinfer1::IGpuAllocator> allocator(0);

int cuda_device_id = 0;
if (target_device != name_to_device_map.end()) {
tensorflow::TfGpuId tf_gpu_id(target_device->second->parsed_name().id);
CudaGpuId cuda_gpu_id;
Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
if (!s.ok()) {
LOG(ERROR)
<< "Cuda device identification failed, using device 0. Error= "
<< s;
} else {
cuda_device_id = cuda_gpu_id.value();
}
tensorflow::GPUOptions gpuoptions;
// We need to use ProcessState here since in the Python path there is no
// way to get to the allocators.
auto pm = tensorflow::ProcessState::singleton();
// this should be instantiated by now
auto dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1);
VLOG(1) << "Got an allocator for device tf_device=" << tf_gpu_id.value()
<< " cuda device= " << cuda_device_id << " at " << dev_allocator;
allocator = std::make_shared<TRTDeviceAllocator>(dev_allocator);
} else { // device unknown or not available
allocator = std::make_shared<TRTCudaAllocator>();
}
ConvertGraphParams p(graph, output_names, subgraph_node_ids, max_batch_size,
max_mem_per_engine, static_graph_properties,
&output_edge_map, precision_mode);
max_mem_per_engine, graph_properties, &output_edge_map,
precision_mode, segment_nodes_and_device.second,
allocator, cuda_device_id);
if (precision_mode == INT8MODE) {
tensorflow::Status status = GetCalibNode(&p);
if (status != tensorflow::Status::OK()) {
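As background for the allocator selection above: the allocator handed to ConvertGraphParams is a TensorRT nvinfer1::IGpuAllocator that wraps the TensorFlow GPU allocator obtained from ProcessState. The sketch below is illustrative only; it assumes the TensorRT 3/4 IGpuAllocator interface and is not necessarily the exact contents of the PR's resources/trt_allocator.h.

#include "NvInfer.h"
#include "tensorflow/core/framework/allocator.h"

// Illustrative wrapper: forwards TensorRT device-memory requests to the
// per-device TensorFlow allocator selected in the loop above.
class TRTDeviceAllocatorSketch : public nvinfer1::IGpuAllocator {
 public:
  explicit TRTDeviceAllocatorSketch(tensorflow::Allocator* allocator)
      : allocator_(allocator) {}
  void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) override {
    // TensorFlow's raw allocation takes (alignment, num_bytes).
    return allocator_->AllocateRaw(alignment, size);
  }
  void free(void* memory) override { allocator_->DeallocateRaw(memory); }

 private:
  tensorflow::Allocator* allocator_;  // Not owned.
};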
10 changes: 10 additions & 0 deletions tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -18,6 +18,8 @@ limitations under the License.
#include <vector>

#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/grappler/clusters/cluster.h"
#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/types.h"

@@ -43,6 +45,14 @@ tensorflow::Status ConvertGraphDefToTensorRT(
size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
int precision_mode, int minimum_segment_size);

// Method to call from optimization pass
tensorflow::Status ConvertAfterShapes(
const tensorflow::GraphDef& graph, const std::vector<string>& output_names,
size_t max_batch_size, size_t max_workspace_size_bytes,
tensorflow::GraphDef* new_graph_def, int precision_mode,
int minimum_segment_size,
const tensorflow::grappler::GraphProperties& graph_properties,
const tensorflow::grappler::Cluster* cluster);
} // namespace convert
} // namespace tensorrt
} // namespace tensorflow
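Usage-wise, the new ConvertAfterShapes() entry point is meant to be driven from the Grappler pass added in this PR (convert/trt_optimization_pass.cc). Below is a hedged sketch of such a driver; the function name RunTensorRTConversion and the numeric defaults are illustrative assumptions, not the PR's actual code.

#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
#include "tensorflow/core/grappler/clusters/cluster.h"
#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/lib/core/errors.h"

// Hypothetical driver: infer shapes once, then hand the graph to
// ConvertAfterShapes() together with the cluster so the conversion can look
// up a per-device allocator (see convert_graph.cc above).
tensorflow::Status RunTensorRTConversion(
    tensorflow::grappler::Cluster* cluster,
    const tensorflow::grappler::GrapplerItem& item,
    tensorflow::GraphDef* optimized_graph) {
  tensorflow::grappler::GraphProperties static_graph_properties(item);
  TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true));
  return tensorflow::tensorrt::convert::ConvertAfterShapes(
      item.graph, item.fetch, /*max_batch_size=*/128,
      /*max_workspace_size_bytes=*/1 << 30, optimized_graph,
      /*precision_mode=*/0 /* assumed FP32 */, /*minimum_segment_size=*/3,
      static_graph_properties, cluster);
}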