tensorflow-master/tensorflow/python/training/server_lib.py

# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A Python interface for creating TensorFlow servers."""

from tensorflow.core.protobuf import cluster_pb2
from tensorflow.core.protobuf import device_filters_pb2
from tensorflow.core.protobuf import tensorflow_server_pb2
from tensorflow.python.client import pywrap_tf_session as c_api
from tensorflow.python.framework import errors
from tensorflow.python.util import compat
from tensorflow.python.util import deprecation
from tensorflow.python.util.tf_export import tf_export


def _make_server_def(server_or_cluster_def, job_name, task_index, protocol,
                     config):
  """Creates a `tf.train.ServerDef` protocol buffer.

  Args:
    server_or_cluster_def: A `tf.train.ServerDef` or `tf.train.ClusterDef`
      protocol buffer, or a `tf.train.ClusterSpec` object, describing the server
      to be defined and/or the cluster of which it is a member.
    job_name: (Optional.) Specifies the name of the job of which the server is a
      member. Defaults to the value in `server_or_cluster_def`, if specified.
    task_index: (Optional.) Specifies the task index of the server in its job.
      Defaults to the value in `server_or_cluster_def`, if specified. Otherwise
      defaults to 0 if the server's job has only one task.
    protocol: (Optional.) Specifies the protocol to be used by the server.
      Acceptable values include `"grpc", "grpc+verbs"`. Defaults to the value in
      `server_or_cluster_def`, if specified. Otherwise defaults to `"grpc"`.
    config: (Options.) A `tf.compat.v1.ConfigProto` that specifies default
      configuration options for all sessions that run on this server.

  Returns:
    A `tf.train.ServerDef`.

  Raises:
    TypeError: If the arguments do not have the appropriate type.
    ValueError: If an argument is not specified and cannot be inferred.
  """
  server_def = tensorflow_server_pb2.ServerDef()
  if isinstance(server_or_cluster_def, tensorflow_server_pb2.ServerDef):
    server_def.MergeFrom(server_or_cluster_def)
    if job_name is not None:
      server_def.job_name = job_name
    if task_index is not None:
      server_def.task_index = task_index
    if protocol is not None:
      server_def.protocol = protocol
    if config is not None:
      server_def.default_session_config.MergeFrom(config)
  else:
    try:
      cluster_spec = ClusterSpec(server_or_cluster_def)
    except TypeError:
      raise TypeError("Could not convert `server_or_cluster_def` to a "
                      "`tf.train.ServerDef` or `tf.train.ClusterSpec`.")
    if job_name is None:
      if len(cluster_spec.jobs) == 1:
        job_name = cluster_spec.jobs[0]
      else:
        raise ValueError("Must specify an explicit `job_name`.")
    if task_index is None:
      task_indices = cluster_spec.task_indices(job_name)
      if len(task_indices) == 1:
        task_index = task_indices[0]
      else:
        raise ValueError("Must specify an explicit `task_index`.")
    if protocol is None:
      protocol = "grpc"

    server_def = tensorflow_server_pb2.ServerDef(
        cluster=cluster_spec.as_cluster_def(),
        job_name=job_name,
        task_index=task_index,
        protocol=protocol)
    if config is not None:
      server_def.default_session_config.MergeFrom(config)
  return server_def


@tf_export("distribute.Server", v1=["distribute.Server", "train.Server"])
@deprecation.deprecated_endpoints("train.Server")
class Server:
  """An in-process TensorFlow server, for use in distributed training.

  A `tf.distribute.Server` instance encapsulates a set of devices and a
  `tf.compat.v1.Session` target that
  can participate in distributed training. A server belongs to a
  cluster (specified by a `tf.train.ClusterSpec`), and
  corresponds to a particular task in a named job. The server can
  communicate with any other server in the same cluster.
  """

  def __init__(self,
               server_or_cluster_def,
               job_name=None,
               task_index=None,
               protocol=None,
               config=None,
               start=True):
    """Creates a new server with the given definition.

    The `job_name`, `task_index`, and `protocol` arguments are optional, and
    override any information provided in `server_or_cluster_def`.

    Args:
      server_or_cluster_def: A `tf.train.ServerDef` or `tf.train.ClusterDef`
        protocol buffer, or a `tf.train.ClusterSpec` object, describing the
        server to be created and/or the cluster of which it is a member.
      job_name: (Optional.) Specifies the name of the job of which the server is
        a member. Defaults to the value in `server_or_cluster_def`, if
        specified.
      task_index: (Optional.) Specifies the task index of the server in its job.
        Defaults to the value in `server_or_cluster_def`, if specified.
        Otherwise defaults to 0 if the server's job has only one task.
      protocol: (Optional.) Specifies the protocol to be used by the server.
        Acceptable values include `"grpc", "grpc+verbs"`. Defaults to the value
        in `server_or_cluster_def`, if specified. Otherwise defaults to
        `"grpc"`.
      config: (Options.) A `tf.compat.v1.ConfigProto` that specifies default
        configuration options for all sessions that run on this server.
      start: (Optional.) Boolean, indicating whether to start the server after
        creating it. Defaults to `True`.

    Raises:
      tf.errors.OpError: Or one of its subclasses if an error occurs while
        creating the TensorFlow server.
    """
    self._server_def = _make_server_def(server_or_cluster_def, job_name,
                                        task_index, protocol, config)
    self._server = c_api.TF_NewServer(self._server_def.SerializeToString())
    if start:
      self.start()

  def __del__(self):
    # At shutdown, `errors` may have been garbage collected.
    if errors is not None:
      exception = errors.UnimplementedError
    else:
      exception = Exception
    try:
      c_api.TF_ServerStop(self._server)
      # Clean shutdown of servers is not yet implemented, so
      # we leak instead of calling c_api.TF_DeleteServer here.
      # See:
      # https://github.com/tensorflow/tensorflow/blob/0495317a6e9dd4cac577b9d5cf9525e62b571018/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h#L73
    except AttributeError:
      # At shutdown, `c_api` may have been garbage collected.
      pass
    except exception:
      pass
    self._server = None

  def start(self):
    """Starts this server.

    Raises:
      tf.errors.OpError: Or one of its subclasses if an error occurs while
        starting the TensorFlow server.
    """
    c_api.TF_ServerStart(self._server)

  def join(self):
    """Blocks until the server has shut down.

    This method currently blocks forever.

    Raises:
      tf.errors.OpError: Or one of its subclasses if an error occurs while
        joining the TensorFlow server.
    """
    c_api.TF_ServerJoin(self._server)

  @property
  def server_def(self):
    """Returns the `tf.train.ServerDef` for this server.

    Returns:
      A `tf.train.ServerDef` protocol buffer that describes the configuration
      of this server.
    """
    return self._server_def

  @property
  def target(self):
    """Returns the target for a `tf.compat.v1.Session` to connect to this server.

    To create a
    `tf.compat.v1.Session` that
    connects to this server, use the following snippet:

    ```python
    server = tf.distribute.Server(...)
    with tf.compat.v1.Session(server.target):
      # ...
    ```

    Returns:
      A string containing a session target for this server.
    """
    return c_api.TF_ServerTarget(self._server)

  @staticmethod
  def create_local_server(config=None, start=True):
    """Creates a new single-process cluster running on the local host.

    This method is a convenience wrapper for creating a
    `tf.distribute.Server` with a `tf.train.ServerDef` that specifies a
    single-process cluster containing a single task in a job called
    `"local"`.

    Args:
      config: (Options.) A `tf.compat.v1.ConfigProto` that specifies default
        configuration options for all sessions that run on this server.
      start: (Optional.) Boolean, indicating whether to start the server after
        creating it. Defaults to `True`.

    Returns:
      A local `tf.distribute.Server`.
    """
    # Specifying port 0 means that the OS will choose a free port for the
    # server.
    return Server({"localhost": ["localhost:0"]},
                  protocol="grpc",
                  config=config,
                  start=start)


@tf_export("train.ClusterSpec")
class ClusterSpec:
  """Represents a cluster as a set of "tasks", organized into "jobs".

  A `tf.train.ClusterSpec` represents the set of processes that
  participate in a distributed TensorFlow computation. Every
  `tf.distribute.Server` is constructed in a particular cluster.

  To create a cluster with two jobs and five tasks, you specify the
  mapping from job names to lists of network addresses (typically
  hostname-port pairs).

  ```python
  cluster = tf.train.ClusterSpec({"worker": ["worker0.example.com:2222",
                                             "worker1.example.com:2222",
                                             "worker2.example.com:2222"],
                                  "ps": ["ps0.example.com:2222",
                                         "ps1.example.com:2222"]})
  ```

  Each job may also be specified as a sparse mapping from task indices
  to network addresses. This enables a server to be configured without
  needing to know the identity of (for example) all other worker
  tasks:

  ```python
  cluster = tf.train.ClusterSpec({"worker": {1: "worker1.example.com:2222"},
                                  "ps": ["ps0.example.com:2222",
                                         "ps1.example.com:2222"]})
  ```
  """

  def __init__(self, cluster):
    """Creates a `ClusterSpec`.

    Args:
      cluster: A dictionary mapping one or more job names to (i) a list of
        network addresses, or (ii) a dictionary mapping integer task indices to
        network addresses; or a `tf.train.ClusterDef` protocol buffer.

    Raises:
      TypeError: If `cluster` is not a dictionary mapping strings to lists
        of strings, and not a `tf.train.ClusterDef` protobuf.
    """
    if isinstance(cluster, dict):
      self._cluster_spec = {}
      for job_name, tasks in cluster.items():
        if isinstance(tasks, (list, tuple)):
          job_tasks = {i: task for i, task in enumerate(tasks)}
        elif isinstance(tasks, dict):
          job_tasks = {int(i): task for i, task in tasks.items()}
        else:
          raise TypeError("The tasks for job %r must be a list or a dictionary "
                          "from integers to strings." % job_name)
        self._cluster_spec[job_name] = job_tasks
      self._make_cluster_def()
    elif isinstance(cluster, cluster_pb2.ClusterDef):
      self._cluster_def = cluster
      self._cluster_spec = {}
      for job_def in self._cluster_def.job:
        self._cluster_spec[job_def.name] = {
            i: t for i, t in job_def.tasks.items()
        }
    elif isinstance(cluster, ClusterSpec):
      self._cluster_def = cluster_pb2.ClusterDef()
      self._cluster_def.MergeFrom(cluster.as_cluster_def())
      self._cluster_spec = {}
      for job_def in self._cluster_def.job:
        self._cluster_spec[job_def.name] = {
            i: t for i, t in job_def.tasks.items()
        }
    else:
      raise TypeError("`cluster` must be a dictionary mapping one or more "
                      "job names to lists of network addresses, or a "
                      "`ClusterDef` protocol buffer")

  def __bool__(self):
    return bool(self._cluster_spec)

  # Python 2.x
  __nonzero__ = __bool__

  def __eq__(self, other):
    return self._cluster_spec == other

  def __ne__(self, other):
    return self._cluster_spec != other

  def __repr__(self):
    key_values = self.as_dict()
    string_items = [
        repr(k) + ": " + repr(key_values[k]) for k in sorted(key_values)
    ]
    return "ClusterSpec({" + ", ".join(string_items) + "})"

  def as_dict(self):
    """Returns a dictionary from job names to their tasks.

    For each job, if the task index space is dense, the corresponding
    value will be a list of network addresses; otherwise it will be a
    dictionary mapping (sparse) task indices to the corresponding
    addresses.

    Returns:
      A dictionary mapping job names to lists or dictionaries
      describing the tasks in those jobs.
    """
    ret = {}
    for job in self.jobs:
      task_indices = self.task_indices(job)
      if len(task_indices) == 0:
        ret[job] = {}
        continue
      if max(task_indices) + 1 == len(task_indices):
        # Return a list because the task indices are dense. This
        # matches the behavior of `as_dict()` before support for
        # sparse jobs was added.
        ret[job] = self.job_tasks(job)
      else:
        ret[job] = {i: self.task_address(job, i) for i in task_indices}
    return ret

  def as_cluster_def(self):
    """Returns a `tf.train.ClusterDef` protocol buffer based on this cluster."""
    return self._cluster_def

  @property
  def jobs(self):
    """Returns a list of job names in this cluster.

    Returns:
      A list of strings, corresponding to the names of jobs in this cluster.
    """
    return list(self._cluster_spec.keys())

  def num_tasks(self, job_name):
    """Returns the number of tasks defined in the given job.

    Args:
      job_name: The string name of a job in this cluster.

    Returns:
      The number of tasks defined in the given job.

    Raises:
      ValueError: If `job_name` does not name a job in this cluster.
    """
    try:
      job = self._cluster_spec[job_name]
    except KeyError:
      raise ValueError("No such job in cluster: %r" % job_name)
    return len(job)

  def task_indices(self, job_name):
    """Returns a list of valid task indices in the given job.

    Args:
      job_name: The string name of a job in this cluster.

    Returns:
      A list of valid task indices in the given job.

    Raises:
      ValueError: If `job_name` does not name a job in this cluster,
      or no task with index `task_index` is defined in that job.
    """
    try:
      job = self._cluster_spec[job_name]
    except KeyError:
      raise ValueError("No such job in cluster: %r" % job_name)
    return list(sorted(job.keys()))

  def task_address(self, job_name, task_index):
    """Returns the address of the given task in the given job.

    Args:
      job_name: The string name of a job in this cluster.
      task_index: A non-negative integer.

    Returns:
      The address of the given task in the given job.

    Raises:
      ValueError: If `job_name` does not name a job in this cluster,
      or no task with index `task_index` is defined in that job.
    """
    try:
      job = self._cluster_spec[job_name]
    except KeyError:
      raise ValueError("No such job in cluster: %r" % job_name)
    try:
      return job[task_index]
    except KeyError:
      raise ValueError("No task with index %r in job %r" %
                       (task_index, job_name))

  def job_tasks(self, job_name):
    """Returns a mapping from task ID to address in the given job.

    NOTE: For backwards compatibility, this method returns a list. If
    the given job was defined with a sparse set of task indices, the
    length of this list may not reflect the number of tasks defined in
    this job. Use the `tf.train.ClusterSpec.num_tasks` method
    to find the number of tasks defined in a particular job.

    Args:
      job_name: The string name of a job in this cluster.

    Returns:
      A list of task addresses, where the index in the list
      corresponds to the task index of each task. The list may contain
      `None` if the job was defined with a sparse set of task indices.

    Raises:
      ValueError: If `job_name` does not name a job in this cluster.
    """
    try:
      job = self._cluster_spec[job_name]
    except KeyError:
      raise ValueError("No such job in cluster: %r" % job_name)
    ret = [None for _ in range(max(job.keys()) + 1)]
    for i, task in job.items():
      ret[i] = task
    return ret

  def _make_cluster_def(self):
    """Creates a `tf.train.ClusterDef` based on the given `cluster_spec`.

    Raises:
      TypeError: If `cluster_spec` is not a dictionary mapping strings to lists
        of strings.
    """
    self._cluster_def = cluster_pb2.ClusterDef()

    # NOTE(mrry): Sort by job_name to produce deterministic protobufs.
    for job_name, tasks in sorted(self._cluster_spec.items()):
      try:
        job_name = compat.as_bytes(job_name)
      except TypeError:
        raise TypeError("Job name %r must be bytes or unicode" % job_name)

      job_def = self._cluster_def.job.add()
      job_def.name = job_name

      for i, task_address in sorted(tasks.items()):
        try:
          task_address = compat.as_bytes(task_address)
        except TypeError:
          raise TypeError("Task address %r must be bytes or unicode" %
                          task_address)
        job_def.tasks[i] = task_address


@tf_export("config.experimental.ClusterDeviceFilters")
class ClusterDeviceFilters:
  """Represent a collection of device filters for the remote workers in cluster.

  NOTE: this is an experimental API and subject to changes.

  Set device filters for selective jobs and tasks. For each remote worker, the
  device filters are a list of strings. When any filters are present, the remote
  worker will ignore all devices which do not match any of its filters. Each
  filter can be partially specified, e.g. "/job:ps", "/job:worker/replica:3",
  etc. Note that a device is always visible to the worker it is located on.

  For example, to set the device filters for a parameter server cluster:

  ```python
  cdf = tf.config.experimental.ClusterDeviceFilters()
  for i in range(num_workers):
    cdf.set_device_filters('worker', i, ['/job:ps'])
  for i in range(num_ps):
    cdf.set_device_filters('ps', i, ['/job:worker'])

  tf.config.experimental_connect_to_cluster(cluster_def,
                                            cluster_device_filters=cdf)
  ```

  The device filters can be partically specified. For remote tasks that do not
  have device filters specified, all devices will be visible to them.
  """

  def __init__(self):
    # `_device_filters` is a dict mapping job names to job device filters.
    # Job device filters further maps task IDs to task device filters.
    # Task device filters are a list of strings, each one is a device filter.
    self._device_filters = {}

    # Serialized protobuf for cluster device filters.
    self._cluster_device_filters = None

  def set_device_filters(self, job_name, task_index, device_filters):
    """Set the device filters for given job name and task id."""
    assert all(isinstance(df, str) for df in device_filters)
    self._device_filters.setdefault(job_name, {})
    self._device_filters[job_name][task_index] = [df for df in device_filters]
    # Due to updates in data, invalidate the serialized proto cache.
    self._cluster_device_filters = None

  def _as_cluster_device_filters(self):
    """Returns a serialized protobuf of cluster device filters."""
    if self._cluster_device_filters:
      return self._cluster_device_filters

    self._make_cluster_device_filters()
    return self._cluster_device_filters

  def _make_cluster_device_filters(self):
    """Creates `ClusterDeviceFilters` proto based on the `_device_filters`.

    Raises:
      TypeError: If `_device_filters` is not a dictionary mapping strings to
      a map of task indices and device filters.
    """
    self._cluster_device_filters = device_filters_pb2.ClusterDeviceFilters()

    # Sort by job_name to produce deterministic protobufs.
    for job_name, tasks in sorted(self._device_filters.items()):
      try:
        job_name = compat.as_bytes(job_name)
      except TypeError:
        raise TypeError("Job name %r must be bytes or unicode" % job_name)

      jdf = self._cluster_device_filters.jobs.add()
      jdf.name = job_name

      for i, task_device_filters in sorted(tasks.items()):
        for tdf in task_device_filters:
          try:
            tdf = compat.as_bytes(tdf)
          except TypeError:
            raise TypeError("Device filter %r must be bytes or unicode" % tdf)
          jdf.tasks[i].device_filters.append(tdf)