341 lines (286 sloc) 13.2 KB
syntax = "proto3";
package tensorflow;
option cc_enable_arenas = true;
option java_outer_classname = "ConfigProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/cluster.proto";
import "tensorflow/core/protobuf/rewriter_config.proto";
message GPUOptions {
// A value between 0 and 1 that indicates what fraction of the
// available GPU memory to pre-allocate for each process. 1 means
// to pre-allocate all of the GPU memory, 0.5 means the process
// allocates ~50% of the available GPU memory.
double per_process_gpu_memory_fraction = 1;
// The type of GPU allocation strategy to use.
// Allowed values:
// "": The empty string (default) uses a system-chosen default
// which may change over time.
// "BFC": A "Best-fit with coalescing" algorithm, simplified from a
// version of dlmalloc.
string allocator_type = 2;
// Delay deletion of up to this many bytes to reduce the number of
// interactions with gpu driver code. If 0, the system chooses
// a reasonable default (several MBs).
int64 deferred_deletion_bytes = 3;
// If true, the allocator does not pre-allocate the entire specified
// GPU memory region, instead starting small and growing as needed.
bool allow_growth = 4;
// A comma-separated list of GPU ids that determines the 'visible'
// to 'virtual' mapping of GPU devices. For example, if TensorFlow
// can see 8 GPU devices in the process, and one wanted to map
// visible GPU devices 5 and 3 as "/gpu:0", and "/gpu:1", then one
// would specify this field as "5,3". This field is similar in
// spirit to the CUDA_VISIBLE_DEVICES environment variable, except
// it applies to the visible GPU devices in the process.
// NOTE: The GPU driver provides the process with the visible GPUs
// in an order which is not guaranteed to have any correlation to
// the *physical* GPU id in the machine. This field is used for
// remapping "visible" to "virtual", which means this operates only
// after the process starts. Users are required to use vendor
// specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
// physical to visible device mapping prior to invoking TensorFlow.
string visible_device_list = 5;
// In the event polling loop sleep this many microseconds between
// PollEvents calls, when the queue is not empty. If value is not
// set or set to 0, gets set to a non-zero default.
int32 polling_active_delay_usecs = 6;
// In the event polling loop sleep this many millisconds between
// PollEvents calls, when the queue is empty. If value is not
// set or set to 0, gets set to a non-zero default.
int32 polling_inactive_delay_msecs = 7;
// Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
// enabling this option forces all CPU tensors to be allocated with Cuda
// pinned memory. Normally, TensorFlow will infer which tensors should be
// allocated as the pinned memory. But in case where the inference is
// incomplete, this option can significantly speed up the cross-device memory
// copy performance as long as it fits the memory.
// Note that this option is not something that should be
// enabled by default for unknown or very large models, since all Cuda pinned
// memory is unpageable, having too much pinned memory might negatively impact
// the overall host system performance.
bool force_gpu_compatible = 8;
// Options passed to the graph optimizer
message OptimizerOptions {
// If true, optimize the graph using common subexpression elimination.
bool do_common_subexpression_elimination = 1;
// If true, perform constant folding optimization on the graph.
bool do_constant_folding = 2;
// If true, perform function inlining on the graph.
bool do_function_inlining = 4;
// Optimization level
enum Level {
// L1 is the default level.
// Optimization performed at L1 :
// 1. Common subexpression elimination
// 2. Constant folding
L1 = 0;
// No optimizations
L0 = -1;
Level opt_level = 3;
// Control the use of the compiler/jit. Experimental.
enum GlobalJitLevel {
DEFAULT = 0; // Default setting ("off" now, but later expected to be "on")
OFF = -1;
// The following settings turn on compilation, with higher values being
// more aggressive. Higher values may reduce opportunities for parallelism
// and may use more memory. (At present, there is no distinction, but this
// is expected to change.)
ON_1 = 1;
ON_2 = 2;
GlobalJitLevel global_jit_level = 5;
message GraphOptions {
// Removed, use optimizer_options below.
reserved "skip_common_subexpression_elimination";
reserved 1;
// If true, use control flow to schedule the activation of Recv nodes.
// (Currently ignored.)
bool enable_recv_scheduling = 2;
// Options controlling how graph is optimized.
OptimizerOptions optimizer_options = 3;
// The number of steps to run before returning a cost model detailing
// the memory usage and performance of each node of the graph. 0 means
// no cost model.
int64 build_cost_model = 4;
// The number of steps to skip before collecting statistics for the
// cost model.
int64 build_cost_model_after = 9;
// Annotate each Node with Op output shape data, to the extent it can
// be statically inferred.
bool infer_shapes = 5;
// Only place the subgraphs that are run, rather than the entire graph.
// This is useful for interactive graph building, where one might
// produce graphs that cannot be placed during the debugging
// process. In particular, it allows the client to continue work in
// a session after adding a node to a graph whose placement
// constraints are unsatisfiable.
bool place_pruned_graph = 6;
// If true, transfer float values between processes as bfloat16.
bool enable_bfloat16_sendrecv = 7;
// If > 0, record a timeline every this many steps.
// EXPERIMENTAL: This currently has no effect in MasterSession.
int32 timeline_step = 8;
// Options that control the type and amount of graph rewriting.
// Not currently configurable via the public Python API (i.e. there is no API
// stability guarantee if you import RewriterConfig explicitly).
RewriterConfig rewrite_options = 10;
message ThreadPoolOptionProto {
// The number of threads in the pool.
// 0 means the system picks a value based on where this option proto is used
// (see the declaration of the specific field for more info).
int32 num_threads = 1;
// The global name of the threadpool.
// If empty, then the threadpool is made and used according to the scope it's
// in - e.g., for a session threadpool, it is used by that session only.
// If non-empty, then:
// - a global threadpool associated with this name is looked
// up or created. This allows, for example, sharing one threadpool across
// many sessions (e.g., like the default behavior, if
// inter_op_parallelism_threads is not configured), but still partitioning
// into a large and small pool.
// - if the threadpool for this global_name already exists, then it is an
// error if the existing pool was created using a different num_threads
// value as is specified on this call.
// - threadpools created this way are never garbage collected.
string global_name = 2;
message RPCOptions {
// If true, always use RPC to contact the session target.
// If false (the default option), TensorFlow may use an optimized
// transport for client-master communication that avoids the RPC
// stack. This option is primarily for used testing the RPC stack.
bool use_rpc_for_inprocess_master = 1;
// Session configuration parameters.
// The system picks appropriate values for fields that are not set.
message ConfigProto {
// Map from device type name (e.g., "CPU" or "GPU" ) to maximum
// number of devices of that type to use. If a particular device
// type is not found in the map, the system picks an appropriate
// number.
map<string, int32> device_count = 1;
// The execution of an individual op (for some op types) can be
// parallelized on a pool of intra_op_parallelism_threads.
// 0 means the system picks an appropriate number.
int32 intra_op_parallelism_threads = 2;
// Nodes that perform blocking operations are enqueued on a pool of
// inter_op_parallelism_threads available in each process.
// 0 means the system picks an appropriate number.
// Note that the first Session created in the process sets the
// number of threads for all future sessions unless use_per_session_threads is
// true or session_inter_op_thread_pool is configured.
int32 inter_op_parallelism_threads = 5;
// If true, use a new set of threads for this session rather than the global
// pool of threads. Only supported by direct sessions.
// If false, use the global threads created by the first session, or the
// per-session thread pools configured by session_inter_op_thread_pool.
// This option is deprecated. The same effect can be achieved by setting
// session_inter_op_thread_pool to have one element, whose num_threads equals
// inter_op_parallelism_threads.
bool use_per_session_threads = 9;
// This option is experimental - it may be replaced with a different mechanism
// in the future.
// Configures session thread pools. If this is configured, then RunOptions for
// a Run call can select the thread pool to use.
// The intended use is for when some session invocations need to run in a
// background pool limited to a small number of threads:
// - For example, a session may be configured to have one large pool (for
// regular compute) and one small pool (for periodic, low priority work);
// using the small pool is currently the mechanism for limiting the inter-op
// parallelism of the low priority work. Note that it does not limit the
// parallelism of work spawned by a single op kernel implementation.
// - Using this setting is normally not needed in training, but may help some
// serving use cases.
// - It is also generally recommended to set the global_name field of this
// proto, to avoid creating multiple large pools. It is typically better to
// run the non-low-priority work, even across sessions, in a single large
// pool.
repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;
// Assignment of Nodes to Devices is recomputed every placement_period
// steps until the system warms up (at which point the recomputation
// typically slows down automatically).
int32 placement_period = 3;
// When any filters are present sessions will ignore all devices which do not
// match the filters. Each filter can be partially specified, e.g. "/job:ps"
// "/job:worker/replica:3", etc.
repeated string device_filters = 4;
// Options that apply to all GPUs.
GPUOptions gpu_options = 6;
// Whether soft placement is allowed. If allow_soft_placement is true,
// an op will be placed on CPU if
// 1. there's no GPU implementation for the OP
// or
// 2. no GPU devices are known or registered
// or
// 3. need to co-locate with reftype input(s) which are from CPU.
bool allow_soft_placement = 7;
// Whether device placements should be logged.
bool log_device_placement = 8;
// Options that apply to all graphs.
GraphOptions graph_options = 10;
// Global timeout for all blocking operations in this session. If non-zero,
// and not overridden on a per-operation basis, this value will be used as the
// deadline for all blocking operations.
int64 operation_timeout_in_ms = 11;
// Options that apply when this session uses the distributed runtime.
RPCOptions rpc_options = 13;
// Optional list of all workers to use in this session.
ClusterDef cluster_def = 14;
// Next: 15
// Options for a single Run() call.
message RunOptions {
// TODO(pbar) Turn this into a TraceOptions proto which allows
// tracing to be controlled in a more orthogonal manner?
enum TraceLevel {
TraceLevel trace_level = 1;
// Time to wait for operation to complete in milliseconds.
int64 timeout_in_ms = 2;
// The thread pool to use, if session_inter_op_thread_pool is configured.
int32 inter_op_thread_pool = 3;
// Whether the partition graph(s) executed by the executor(s) should be
// outputted via RunMetadata.
bool output_partition_graphs = 5;
// EXPERIMENTAL. Options used to initialize DebuggerState, if enabled.
DebugOptions debug_options = 6;
reserved 4;
// Metadata output (i.e., non-Tensor) for a single Run() call.
message RunMetadata {
// Statistics traced for this step. Populated if tracing is turned on via the
// "RunOptions" proto.
// EXPERIMENTAL: The format and set of events may change in future versions.
StepStats step_stats = 1;
// The cost graph for the computation defined by the run call.
CostGraphDef cost_graph = 2;
// Graphs of the partitions executed by executors.
repeated GraphDef partition_graphs = 3;