diff --git a/docs/.buildinfo b/docs/.buildinfo
index 36fb09ab..54c36710 100644
--- a/docs/.buildinfo
+++ b/docs/.buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: abbb35398bf3c41c0f421213a6263bf9
+config: c31f3f4c132de601af2c7dd3d80ea76f
 tags: 645f666f9bcd5a90fca523b33c5a78b7

diff --git a/docs/_modules/index.html b/docs/_modules/index.html
index 10c1b7d0..821b0c21 100644
--- a/docs/_modules/index.html
+++ b/docs/_modules/index.html
@@ -6,7 +6,7 @@
-  Overview: module code — TensorFlowOnSpark 1.2.1 documentation
+  Overview: module code — TensorFlowOnSpark 1.3.0 documentation


Source code for tensorflowonspark.TFCluster

+  return tb_url

[docs]def run(sc, map_fun, tf_args, num_executors, num_ps, tensorboard=False, input_mode=InputMode.TENSORFLOW,
-        log_dir=None, driver_ps_nodes=False, queues=['input', 'output']):
+        log_dir=None, driver_ps_nodes=False, master_node=None, reservation_timeout=600, queues=['input', 'output', 'error']):
  """Starts the TensorFlowOnSpark cluster and runs the TensorFlow "main" function on the Spark executors

  Args:
@@ -247,6 +252,8 @@
         :input_mode: TFCluster.InputMode
         :log_dir: directory to save tensorboard event logs.  If None, defaults to a fixed path on local filesystem.
     :driver_ps_nodes: run the PS nodes on the driver locally instead of on the spark executors; this helps maximize computing resources (esp. GPU). You will need to set cluster_size = num_executors + num_ps
    +    :master_node: name of the "master" or "chief" node in the cluster_template, used for `tf.estimator` applications.
    +    :reservation_timeout: number of seconds after which cluster reservation times out (600 sec default)
         :queues: *INTERNAL_USE*
     
       Returns:
    @@ -261,8 +268,13 @@ 
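The two new arguments control how the cluster is assembled and how long the driver waits for executors to check in. A minimal usage sketch (the application function and argument names are assumptions, not from this source):

    from tensorflowonspark import TFCluster

    cluster = TFCluster.run(sc, main_fun, tf_args, num_executors=4, num_ps=1,
                            input_mode=TFCluster.InputMode.TENSORFLOW,
                            master_node='master',        # adds a 'master' job for tf.estimator apps
                            reservation_timeout=600)     # give up if nodes don't register in 10 min
    cluster.shutdown()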


       # build a cluster_spec template using worker_nums
       cluster_template = {}
       cluster_template['ps'] = range(num_ps)
    -  cluster_template['worker'] = range(num_ps, num_executors)
    -  logging.info("worker node range %s, ps node range %s" % (cluster_template['worker'], cluster_template['ps']))
    +  if master_node is None:
    +    cluster_template['worker'] = range(num_ps, num_executors)
    +  else:
    +    cluster_template[master_node] = range(num_ps, num_ps + 1)
    +    if num_executors > num_ps + 1:
    +      cluster_template['worker'] = range(num_ps + 1, num_executors)
    +  logging.info("cluster_template: {}".format(cluster_template))
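For illustration, with num_executors=4, num_ps=1 and master_node='master' (example values), the code above produces:

    cluster_template = {
      'ps': range(0, 1),       # executor 0
      'master': range(1, 2),   # executor 1
      'worker': range(2, 4),   # executors 2 and 3
    }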
     
       # get default filesystem from spark
       defaultFS = sc._jsc.hadoopConfiguration().get("fs.defaultFS")
    @@ -308,20 +320,28 @@ 


           ps_thread.start()
     
       # start TF on a background thread (on Spark driver) to allow for feeding job
    -  def _start():
    -    nodeRDD.foreachPartition(TFSparkNode.run(map_fun,
    -                                              tf_args,
    -                                              cluster_meta,
    -                                              tensorboard,
    -                                              log_dir,
    -                                              queues,
    -                                              background=(input_mode == InputMode.SPARK)))
    -  t = threading.Thread(target=_start)
    +  def _start(status):
    +    try:
    +      nodeRDD.foreachPartition(TFSparkNode.run(map_fun,
    +                                                tf_args,
    +                                                cluster_meta,
    +                                                tensorboard,
    +                                                log_dir,
    +                                                queues,
    +                                                background=(input_mode == InputMode.SPARK)))
    +    except Exception as e:
    +      logging.error("Exception in TF background thread")
    +      status['error'] = str(e)
    +
    +  t = threading.Thread(target=_start, args=(tf_status,))
    +  # run as daemon thread so that in spark mode main thread can exit
    +  # if feeder spark stage fails and main thread can't do explicit shutdown
    +  t.daemon = True
       t.start()
     
       # wait for executors to register and start TFNodes before continuing
       logging.info("Waiting for TFSparkNodes to start")
    -  cluster_info = server.await_reservations()
    +  cluster_info = server.await_reservations(sc, tf_status, reservation_timeout)
       logging.info("All TFSparkNodes started")
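The daemon thread reports feeder failures through the shared tf_status dict instead of raising, since an exception would otherwise die silently with the background thread; the reservation wait can then presumably poll the dict and abort early. A minimal sketch of the pattern (names are illustrative, not from this source):

    import threading

    tf_status = {}

    def _work(status):
      try:
        pass  # e.g. nodeRDD.foreachPartition(...)
      except Exception as e:
        status['error'] = str(e)   # surface the failure to the main thread

    t = threading.Thread(target=_work, args=(tf_status,))
    t.daemon = True                # allow the main thread to exit if feeding fails
    t.start()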
     
       # print cluster_info and extract TensorBoard URL
    @@ -338,13 +358,17 @@ 


         logging.info("")
         logging.info("========================================================================================")
     
    -  # since our "primary key" for each executor's TFManager is (host, ppid), sanity check for duplicates
    +  # since our "primary key" for each executor's TFManager is (host, executor_id), sanity check for duplicates
       # Note: this may occur if Spark retries failed Python tasks on the same executor.
       tb_nodes = set()
       for node in cluster_info:
    -    node_id = (node['host'],node['ppid'])
    +    node_id = (node['host'], node['executor_id'])
         if node_id in tb_nodes:
    -      raise Exception("Duplicate cluster node id detected (host={0}, ppid={1}).  Please ensure that (1) the number of executors >= number of TensorFlow nodes, (2) the number of tasks per executors == 1, and (3) TFCluster.shutdown() is successfully invoked when done.".format(node_id[0], node_id[1]))
+      raise Exception("Duplicate cluster node id detected (host={0}, executor_id={1}).\n".format(node_id[0], node_id[1]) +
+                      "Please ensure that:\n" +
+                      "1. Number of executors >= number of TensorFlow nodes\n" +
+                      "2. Number of tasks per executor is 1\n" +
+                      "3. TFCluster.shutdown() is successfully invoked when done.")
         else:
           tb_nodes.add(node_id)
     
    @@ -358,8 +382,8 @@ 


       cluster.input_mode = input_mode
       cluster.queues = queues
       cluster.server = server
    -
    -  return cluster
    +
    + return cluster
    @@ -369,12 +393,14 @@


             
diff --git a/docs/_modules/tensorflowonspark/TFManager.html b/docs/_modules/tensorflowonspark/TFManager.html
index e61dbfba..05236afc 100644
--- a/docs/_modules/tensorflowonspark/TFManager.html
+++ b/docs/_modules/tensorflowonspark/TFManager.html
-  tensorflowonspark.TFManager — TensorFlowOnSpark 1.2.1 documentation
+  tensorflowonspark.TFManager — TensorFlowOnSpark 1.3.0 documentation

diff --git a/docs/_modules/tensorflowonspark/TFNode.html b/docs/_modules/tensorflowonspark/TFNode.html
index 68d592f0..4f68f2df 100644
--- a/docs/_modules/tensorflowonspark/TFNode.html
+++ b/docs/_modules/tensorflowonspark/TFNode.html
-  tensorflowonspark.TFNode — TensorFlowOnSpark 1.2.1 documentation
+  tensorflowonspark.TFNode — TensorFlowOnSpark 1.3.0 documentation
@@ -59,6 +59,7 @@

    Source code for tensorflowonspark.TFNode

     from six.moves.queue import Empty
     from . import marker
     
    +
     
[docs]def hdfs_path(ctx, path):
  """Convenience function to create a Tensorflow-compatible absolute HDFS path from relative paths
@@ -85,6 +86,7 @@
           logging.warn("Unknown scheme {0} with relative path: {1}".format(ctx.defaultFS, path))
  return "{0}/{1}".format(ctx.defaultFS, path)
+

[docs]def start_cluster_server(ctx, num_gpus=1, rdma=False):
  """Function that wraps the creation of TensorFlow ``tf.train.Server`` for a node in a distributed TensorFlow cluster.
@@ -109,7 +111,8 @@
       if tf.test.is_built_with_cuda():
         # GPU
         gpu_initialized = False
    -    while not gpu_initialized:
    +    retries = 3
    +    while not gpu_initialized and retries > 0:
           try:
             # override PS jobs to only reserve one GPU
             if ctx.job_name == 'ps':
    @@ -135,7 +138,10 @@ 


           except Exception as e:
             print(e)
             logging.error("{0}: Failed to allocate GPU, trying again...".format(ctx.worker_num))
    +        retries -= 1
             time.sleep(10)
    +    if not gpu_initialized:
    +      raise Exception("Failed to allocate GPU")
       else:
         # CPU
         os.environ['CUDA_VISIBLE_DEVICES'] = ''
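A hedged sketch of how a map_fun typically calls this helper (graph construction elided; function and argument names are assumptions):

    def map_fun(args, ctx):
      from tensorflowonspark import TFNode

      # reserves GPUs (with the bounded retries shown above) and builds a tf.train.Server
      cluster, server = TFNode.start_cluster_server(ctx, num_gpus=1, rdma=False)
      if ctx.job_name == 'ps':
        server.join()    # parameter servers block here, serving variables
      else:
        pass             # workers build the graph against server.target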
    @@ -149,10 +155,12 @@ 


     
  return (cluster, server)
+

[docs]def next_batch(mgr, batch_size, qname='input'):
  """*DEPRECATED*. Use TFNode.DataFeed class instead."""
  raise Exception("DEPRECATED: Use TFNode.DataFeed class instead")
+

[docs]def export_saved_model(sess, export_dir, tag_set, signatures):
  """Convenience function to export a saved_model using provided arguments
@@ -186,25 +194,29 @@
       signature_def_map = {}
       for key, sig in signatures.items():
         signature_def_map[key] = tf.saved_model.signature_def_utils.build_signature_def(
    -              inputs={ name:tf.saved_model.utils.build_tensor_info(tensor) for name, tensor in sig['inputs'].items() },
    -              outputs={ name:tf.saved_model.utils.build_tensor_info(tensor) for name, tensor in sig['outputs'].items() },
    -              method_name=sig['method_name'] if 'method_name' in sig else key)
    +        inputs={name: tf.saved_model.utils.build_tensor_info(tensor) for name, tensor in sig['inputs'].items()},
    +        outputs={name: tf.saved_model.utils.build_tensor_info(tensor) for name, tensor in sig['outputs'].items()},
    +        method_name=sig['method_name'] if 'method_name' in sig else key)
       logging.info("===== signature_def_map: {}".format(signature_def_map))
    -  builder.add_meta_graph_and_variables(sess,
    -              tag_set.split(','),
    -              signature_def_map=signature_def_map,
    -              clear_devices=True)
    +  builder.add_meta_graph_and_variables(
    +      sess,
    +      tag_set.split(','),
    +      signature_def_map=signature_def_map,
    +      clear_devices=True)
       g.finalize()
  builder.save()
+

[docs]def batch_results(mgr, results, qname='output'):
  """*DEPRECATED*. Use TFNode.DataFeed class instead."""
  raise Exception("DEPRECATED: Use TFNode.DataFeed class instead")
+

[docs]def terminate(mgr, qname='input'):
  """*DEPRECATED*. Use TFNode.DataFeed class instead."""
  raise Exception("DEPRECATED: Use TFNode.DataFeed class instead")
+

[docs]class DataFeed(object):
  """This class manages the *InputMode.SPARK* data feeding process from the perspective of the TensorFlow application.
@@ -222,7 +234,7 @@
         self.qname_in = qname_in
         self.qname_out = qname_out
         self.done_feeding = False
    -    self.input_tensors = [ tensor for col, tensor in sorted(input_mapping.items()) ] if input_mapping is not None else None
    +    self.input_tensors = [tensor for col, tensor in sorted(input_mapping.items())] if input_mapping is not None else None
     
     
[docs] def next_batch(self, batch_size):
    """Gets a batch of items from the input RDD.
@@ -244,7 +256,7 @@
         """
         logging.debug("next_batch() invoked")
         queue = self.mgr.get_queue(self.qname_in)
    -    tensors = [] if self.input_tensors is None else { tensor:[] for tensor in self.input_tensors }
    +    tensors = [] if self.input_tensors is None else {tensor: [] for tensor in self.input_tensors}
         count = 0
         while count < batch_size:
           item = queue.get(block=True)
    @@ -314,7 +326,6 @@ 


           except Empty:
             logging.info("dropped {0} items from queue".format(count))
        done = True
-
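A minimal sketch of the feeding loop this class supports on the TensorFlow side (the batch handling and batch_size value are assumptions, not from this source):

    def map_fun(args, ctx):
      from tensorflowonspark import TFNode

      tf_feed = TFNode.DataFeed(ctx.mgr, train_mode=True)
      while not tf_feed.should_stop():
        batch = tf_feed.next_batch(batch_size=100)   # rows fed from the Spark RDD
        if len(batch) == 0:
          break
        # ... run one training step on `batch` ...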
diff --git a/docs/_modules/tensorflowonspark/TFSparkNode.html b/docs/_modules/tensorflowonspark/TFSparkNode.html
index d90c7127..c4ac9c9d 100644
--- a/docs/_modules/tensorflowonspark/TFSparkNode.html
+++ b/docs/_modules/tensorflowonspark/TFSparkNode.html
-  tensorflowonspark.TFSparkNode — TensorFlowOnSpark 1.2.1 documentation
+  tensorflowonspark.TFSparkNode — TensorFlowOnSpark 1.3.0 documentation
    @@ -46,6 +46,7 @@

    Source code for tensorflowonspark.TFSparkNode

from __future__ import nested_scopes
from __future__ import print_function
+import json
import logging
import multiprocessing
import os
@@ -60,10 +61,12 @@
from . import TFManager
from . import TFNode
+from . import gpu_info
from . import marker
from . import reservation
from . import util
+

[docs]class TFNodeContext:
  """Encapsulates unique metadata for a TensorFlowOnSpark node/executor and provides methods to interact with Spark and HDFS.
@@ -71,7 +74,7 @@
  To simplify the end-user API, this class now mirrors the functions of the TFNode module.

  Args:
-    :worker_num: integer identifier for this executor, per ``nodeRDD = sc.parallelize(range(num_executors), num_executors).``
+    :executor_id: integer identifier for this executor, per ``nodeRDD = sc.parallelize(range(num_executors), num_executors).``
    :job_name: TensorFlow job name (e.g. 'ps' or 'worker') of this TF node, per cluster_spec.
    :task_index: integer rank per job_name, e.g. "worker:0", "worker:1", "ps:0".
    :cluster_spec: dictionary for constructing a tf.train.ClusterSpec.
@@ -79,8 +82,9 @@
    :working_dir: the current working directory for local filesystems, or YARN containers.
    :mgr: TFManager instance for this Python worker.
  """
-  def __init__(self, worker_num, job_name, task_index, cluster_spec, defaultFS, working_dir, mgr):
-    self.worker_num = worker_num
+  def __init__(self, executor_id, job_name, task_index, cluster_spec, defaultFS, working_dir, mgr):
+    self.worker_num = executor_id  # for backwards-compatibility
+    self.executor_id = executor_id
    self.job_name = job_name
    self.task_index = task_index
    self.cluster_spec = cluster_spec
@@ -121,22 +125,23 @@
    mgr = None #: TFManager instance
cluster_id = None   #: Unique ID for a given TensorFlowOnSpark cluster, used for invalidating state for new clusters.

-def _get_manager(cluster_info, host, ppid):
+
+def _get_manager(cluster_info, host, executor_id):
  """Returns this executor's "singleton" instance of the multiprocessing.Manager, reconnecting per python-worker if needed.

  Args:
    :cluster_info: cluster node reservations
-    :host: host IP
-    :ppid: parent (executor JVM) PID
+    :host: host IP address
+    :executor_id: unique id per executor (created during initial call to run())

  Returns:
    TFManager instance for this executor/python-worker
  """
  for node in cluster_info:
-    if node['host'] == host and node['ppid'] == ppid:
+    if node['host'] == host and node['executor_id'] == executor_id:
      addr = node['addr']
      authkey = node['authkey']
-      TFSparkNode.mgr = TFManager.connect(addr,authkey)
+      TFSparkNode.mgr = TFManager.connect(addr, authkey)
      break

  if TFSparkNode.mgr is None:
@@ -146,9 +151,10 @@
          "3. Spark dynamic allocation is disabled."
    raise Exception(msg)

-  logging.info("Connected to TFSparkNode.mgr on {0}, ppid={1}, state={2}".format(host, ppid, str(TFSparkNode.mgr.get('state'))))
+  logging.info("Connected to TFSparkNode.mgr on {0}, executor={1}, state={2}".format(host, executor_id, str(TFSparkNode.mgr.get('state'))))
  return TFSparkNode.mgr
+

[docs]def run(fn, tf_args, cluster_meta, tensorboard, log_dir, queues, background):
  """Wraps the user-provided TensorFlow main function in a Spark mapPartitions function.
@@ -165,9 +171,15 @@
    A nodeRDD.mapPartitions() function.
  """
  def _mapfn(iter):
+    import tensorflow as tf
+
    # Note: consuming the input iterator helps Pyspark re-use this worker,
    for i in iter:
-      worker_num = i
+      executor_id = i
+
+    # run quick check of GPU infrastructure if using tensorflow-gpu
+    if tf.test.is_built_with_cuda():
+      gpus_to_use = gpu_info.get_gpus(1)

    # assign TF job/task based on provided cluster_spec template (or use default/null values)
    job_name = 'default'
@@ -176,21 +188,21 @@
    cluster_template = cluster_meta['cluster_template']
    for jobtype in cluster_template:
      nodes = cluster_template[jobtype]
-      if worker_num in nodes:
+      if executor_id in nodes:
        job_name = jobtype
-        task_index = nodes.index(worker_num)
+        task_index = nodes.index(executor_id)
        break

-    # get unique id (hostname,ppid) for this executor's JVM
+    # get unique key (hostname, executor_id) for this executor
    host = util.get_ip_address()
-    ppid = os.getppid()
+    util.write_executor_id(executor_id)
    port = 0

    # check for existing TFManagers
    if TFSparkNode.mgr is not None and str(TFSparkNode.mgr.get('state')) != "'stopped'":
      if TFSparkNode.cluster_id == cluster_id:
        # raise an exception to force Spark to retry this "reservation" task on another executor
-        raise Exception("TFManager already started on {0}, ppid={1}, state={2}".format(host, ppid, str(TFSparkNode.mgr.get("state"))))
+        raise Exception("TFManager already started on {0}, executor={1}, state={2}".format(host, executor_id, str(TFSparkNode.mgr.get("state"))))
      else:
        # old state, just continue with creating new manager
        logging.warn("Ignoring old TFManager with cluster_id {0}, requested cluster_id {1}".format(TFSparkNode.cluster_id, cluster_id))
@@ -225,10 +237,10 @@
    tb_port = 0
    if tensorboard and job_name == 'worker' and task_index == 0:
      tb_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-      tb_sock.bind(('',0))
+      tb_sock.bind(('', 0))
      tb_port = tb_sock.getsockname()[1]
      tb_sock.close()
-      logdir = log_dir if log_dir else "tensorboard_%d" % worker_num
+      logdir = log_dir if log_dir else "tensorboard_%d" % executor_id

      # search for tensorboard in python/bin, PATH, and PYTHONPATH
      pypath = sys.executable
@@ -252,8 +264,8 @@
    tmp_sock = None
    node_meta = None
    for node in cluster_info:
-      (nhost, nppid) = (node['host'], node['ppid'])
-      if nhost == host and nppid == ppid:
+      (nhost, nexec) = (node['host'], node['executor_id'])
+      if nhost == host and nexec == executor_id:
        node_meta = node
        port = node['port']
@@ -262,13 +274,12 @@
      # first, find a free port for TF
      tmp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
      tmp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-      tmp_sock.bind(('',port))
+      tmp_sock.bind(('', port))
      port = tmp_sock.getsockname()[1]

      node_meta = {
-        'worker_num': worker_num,
+        'executor_id': executor_id,
        'host': host,
-        'ppid': ppid,
        'job_name': job_name,
        'task_index': task_index,
        'port': port,
@@ -285,21 +296,36 @@
    client.close()

    # construct a TensorFlow clusterspec from cluster_info
-    sorted_cluster_info = sorted(cluster_info, key=lambda k: k['worker_num'])
+    sorted_cluster_info = sorted(cluster_info, key=lambda k: k['executor_id'])
    spec = {}
-    last_worker_num = -1
+    last_executor_id = -1
    for node in sorted_cluster_info:
-      if (node['worker_num'] == last_worker_num):
+      if (node['executor_id'] == last_executor_id):
        raise Exception("Duplicate worker/task in cluster_info")
-      last_worker_num = node['worker_num']
+      last_executor_id = node['executor_id']
      logging.info("node: {0}".format(node))
      (njob, nhost, nport) = (node['job_name'], node['host'], node['port'])
      hosts = [] if njob not in spec else spec[njob]
      hosts.append("{0}:{1}".format(nhost, nport))
      spec[njob] = hosts

+    # update TF_CONFIG and reserve GPU for tf.estimator based code
+    # Note: this will execute but be ignored by non-tf.estimator code
+    tf_config = json.dumps({
+      'cluster': spec,
+      'task': {'type': job_name, 'index': task_index},
+      'environment': 'cloud'
+    })
+    os.environ['TF_CONFIG'] = tf_config
+    if tf.test.is_built_with_cuda():
+      num_gpus = tf_args.num_gpus if 'num_gpus' in tf_args else 1
+      gpus_to_use = gpu_info.get_gpus(num_gpus)
+      gpu_str = "GPUs" if num_gpus > 1 else "GPU"
+      logging.debug("Requested {} {}, setting CUDA_VISIBLE_DEVICES={}".format(num_gpus, gpu_str, gpus_to_use))
+      os.environ['CUDA_VISIBLE_DEVICES'] = gpus_to_use

    # create a context object to hold metadata for TF
-    ctx = TFNodeContext(worker_num, job_name, task_index, spec, cluster_meta['default_fs'], cluster_meta['working_dir'], TFSparkNode.mgr)
+    ctx = TFNodeContext(executor_id, job_name, task_index, spec, cluster_meta['default_fs'], cluster_meta['working_dir'], TFSparkNode.mgr)

    # release port reserved for TF as late as possible
    if tmp_sock is not None:
@@ -333,7 +359,7 @@
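For reference, TF_CONFIG is the standard environment variable that tf.estimator reads for its cluster/task definition; a hypothetical three-node layout would be published as:

    import json, os

    os.environ['TF_CONFIG'] = json.dumps({
      'cluster': {'ps': ['10.0.0.1:2222'],          # hosts/ports are made up
                  'master': ['10.0.0.2:2222'],
                  'worker': ['10.0.0.3:2222']},
      'task': {'type': 'worker', 'index': 0},
      'environment': 'cloud'
    })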

    if job_name == 'ps' or background:
      # invoke the TensorFlow main function in a background thread
      logging.info("Starting TensorFlow {0}:{1} as {2} on cluster node {3} on background process".format(
-        job_name, task_index, job_name, worker_num))
+        job_name, task_index, job_name, executor_id))

      p = multiprocessing.Process(target=wrapper_fn_background, args=(tf_args, ctx))
      if job_name == 'ps':
@@ -361,17 +387,18 @@
          queue.task_done()
    else:
      # otherwise, just run TF function in the main executor/worker thread
-      logging.info("Starting TensorFlow {0}:{1} on cluster node {2} on foreground thread".format(job_name, task_index, worker_num))
+      logging.info("Starting TensorFlow {0}:{1} on cluster node {2} on foreground thread".format(job_name, task_index, executor_id))
      wrapper_fn(tf_args, ctx)
-      logging.info("Finished TensorFlow {0}:{1} on cluster node {2}".format(job_name, task_index, worker_num))
+      logging.info("Finished TensorFlow {0}:{1} on cluster node {2}".format(job_name, task_index, executor_id))

  return _mapfn
+

[docs]def train(cluster_info, cluster_meta, qname='input'):
  """Feeds Spark partitions into the shared multiprocessing.Queue.

  Args:
-    :cluster_info: node reservation information for the cluster (e.g. host, ppid, pid, ports, etc)
+    :cluster_info: node reservation information for the cluster (e.g. host, executor_id, pid, ports, etc)
    :cluster_meta: dictionary of cluster metadata (e.g. cluster_id, reservation.Server address, etc)
    :qname: *INTERNAL_USE*
@@ -380,7 +407,7 @@
  """
  def _train(iter):
    # get shared queue, reconnecting if necessary
-    mgr = _get_manager(cluster_info, util.get_ip_address(), os.getppid())
+    mgr = _get_manager(cluster_info, util.get_ip_address(), util.read_executor_id())
    try:
      queue = mgr.get_queue(qname)
      equeue = mgr.get_queue('error')
@@ -432,11 +459,12 @@
  return _train
+

[docs]def inference(cluster_info, qname='input'):
  """Feeds Spark partitions into the shared multiprocessing.Queue and returns inference results.

  Args:
-    :cluster_info: node reservation information for the cluster (e.g. host, ppid, pid, ports, etc)
+    :cluster_info: node reservation information for the cluster (e.g. host, executor_id, pid, ports, etc)
    :qname: *INTERNAL_USE*

  Returns:
@@ -444,7 +472,7 @@
  """
  def _inference(iter):
    # get shared queue, reconnecting if necessary
-    mgr = _get_manager(cluster_info, util.get_ip_address(), os.getppid())
+    mgr = _get_manager(cluster_info, util.get_ip_address(), util.read_executor_id())
    try:
      queue_in = mgr.get_queue(qname)
      equeue = mgr.get_queue('error')
@@ -491,11 +519,12 @@
  return _inference
+

[docs]def shutdown(cluster_info, queues=['input']):
  """Stops all TensorFlow nodes by feeding ``None`` into the multiprocessing.Queues.

  Args:
-    :cluster_info: node reservation information for the cluster (e.g. host, ppid, pid, ports, etc).
+    :cluster_info: node reservation information for the cluster (e.g. host, executor_id, pid, ports, etc).
    :queues: *INTERNAL_USE*

  Returns:
@@ -503,14 +532,14 @@
  """
  def _shutdown(iter):
    host = util.get_ip_address()
-    ppid = os.getppid()
+    executor_id = util.read_executor_id()

    # reconnect to shared queue
-    mgr = _get_manager(cluster_info, host, ppid)
+    mgr = _get_manager(cluster_info, host, executor_id)

    # send SIGTERM to Tensorboard proc (if running)
    for node in cluster_info:
-      if node['host'] == host and node['ppid'] == ppid:
+      if node['host'] == host and node['executor_id'] == executor_id:
        tb_pid = node['tb_pid']
        if tb_pid != 0:
          logging.info("Stopping tensorboard (pid={0})".format(tb_pid))
@@ -532,7 +561,6 @@
    return [True]
  return _shutdown
-
diff --git a/docs/_modules/tensorflowonspark/dfutil.html b/docs/_modules/tensorflowonspark/dfutil.html
index 94fe65bd..c1acbcd2 100644
--- a/docs/_modules/tensorflowonspark/dfutil.html
+++ b/docs/_modules/tensorflowonspark/dfutil.html
-  tensorflowonspark.dfutil — TensorFlowOnSpark 1.2.0 documentation
+  tensorflowonspark.dfutil — TensorFlowOnSpark 1.3.0 documentation
@@ -69,8 +60,8 @@

    Source code for tensorflowonspark.dfutil

     
       Args:
         :df: Spark Dataframe
    -  """
    -  return df in loadedDF
    + """
    + return df in loadedDF
[docs]def saveAsTFRecords(df, output_dir):
@@ -84,8 +75,8 @@
       """
       tf_rdd = df.rdd.mapPartitions(toTFExample(df.dtypes))
       tf_rdd.saveAsNewAPIHadoopFile(output_dir, "org.tensorflow.hadoop.io.TFRecordFileOutputFormat",
    -                            keyClass="org.apache.hadoop.io.BytesWritable",
    -                            valueClass="org.apache.hadoop.io.NullWritable")
    + keyClass="org.apache.hadoop.io.BytesWritable",
    + valueClass="org.apache.hadoop.io.NullWritable")
[docs]def loadTFRecords(sc, input_dir, binary_features=[]):
@@ -108,8 +99,8 @@
       import tensorflow as tf
     
       tfr_rdd = sc.newAPIHadoopFile(input_dir, "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
    -                              keyClass="org.apache.hadoop.io.BytesWritable",
    -                              valueClass="org.apache.hadoop.io.NullWritable")
    +                                keyClass="org.apache.hadoop.io.BytesWritable",
    +                                valueClass="org.apache.hadoop.io.NullWritable")
     
       # infer Spark SQL types from tf.Example
       record = tfr_rdd.take(1)[0]
    @@ -124,8 +115,8 @@ 


       df = example_rdd.toDF(schema)
     
       # save reference of this dataframe
    -  loadedDF[df] = input_dir
    -  return df
    + loadedDF[df] = input_dir
    + return df
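A minimal sketch of the round-trip these two helpers provide (paths are placeholders, not from this source):

    from tensorflowonspark import dfutil

    dfutil.saveAsTFRecords(df, "hdfs:///tmp/tfr")        # DataFrame -> TFRecord files
    df2 = dfutil.loadTFRecords(sc, "hdfs:///tmp/tfr")    # TFRecord files -> DataFrame, schema inferred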
[docs]def toTFExample(dtypes):
@@ -171,8 +162,8 @@
           example = tf.train.Example(features=tf.train.Features(feature=features))
           results.append((bytearray(example.SerializeToString()), None))
         return results
    -
    -  return _toTFExample
    +
    + return _toTFExample
[docs]def infer_schema(example, binary_features=[]):
@@ -208,8 +199,8 @@
           return ArrayType(sql_type)
         else:                           # represent everything else as base types (and empty tensors as StringType())
           return sql_type
    -
    -  return StructType([ StructField(k, _infer_sql_type(k, v), True) for k,v in sorted(example.features.feature.items()) ])
    +
    + return StructType([StructField(k, _infer_sql_type(k, v), True) for k, v in sorted(example.features.feature.items())])
[docs]def fromTFExample(iter, binary_features=[]):
@@ -250,12 +241,11 @@
       for record in iter:
         example = tf.train.Example()
         example.ParseFromString(bytes(record[0]))       # record is (bytestr, None)
    -    d = { k: _get_value(k, v) for k,v in sorted(example.features.feature.items()) }
    +    d = {k: _get_value(k, v) for k, v in sorted(example.features.feature.items())}
         row = Row(**d)
         results.append(row)
    -
    -  return results
+
+  return results
diff --git a/docs/_modules/tensorflowonspark/gpu_info.html b/docs/_modules/tensorflowonspark/gpu_info.html
index 24fd6b9d..58479d66 100644
--- a/docs/_modules/tensorflowonspark/gpu_info.html
+++ b/docs/_modules/tensorflowonspark/gpu_info.html
-  tensorflowonspark.gpu_info — TensorFlowOnSpark 1.2.1 documentation
+  tensorflowonspark.gpu_info — TensorFlowOnSpark 1.3.0 documentation

@@ -54,6 +54,7 @@

    Source code for tensorflowonspark.gpu_info

     
     MAX_RETRIES = 3           #: Maximum retries to allocate GPUs
     
    +
     def _get_gpu():
       """*DEPRECATED*. Allocates first available GPU using cudaSetDevice(), or returns 0 otherwise."""
       # Note: this code executes, but Tensorflow subsequently complains that the "current context was not created by the StreamExecutor cuda_driver API"
    @@ -76,6 +77,7 @@ 


           break
       return gpu
     
    +
     
[docs]def get_gpus(num_gpu=1):
  """Get list of free GPUs according to nvidia-smi.
@@ -87,49 +89,46 @@
       Returns:
         Comma-delimited string of GPU ids, or raises an Exception if the requested number of GPUs could not be found.
       """
    +  # get list of gpus (index, uuid)
    +  list_gpus = subprocess.check_output(["nvidia-smi", "--list-gpus"]).decode()
    +  logging.debug("all GPUs:\n{0}".format(list_gpus))
    +
    +  # parse index and guid
    +  gpus = [x for x in list_gpus.split('\n') if len(x) > 0]
    +
    +  def parse_gpu(gpu_str):
    +    cols = gpu_str.split(' ')
    +    return cols[5].split(')')[0], cols[1].split(':')[0]
    +  gpu_list = [parse_gpu(gpu) for gpu in gpus]
    +
    +  # randomize the search order to get a better distribution of GPUs
    +  random.shuffle(gpu_list)
    +
    +  free_gpus = []
    +  retries = 0
    +  while len(free_gpus) < num_gpu and retries < MAX_RETRIES:
    +    smi_output = subprocess.check_output(["nvidia-smi", "--format=csv,noheader,nounits", "--query-compute-apps=gpu_uuid"]).decode()
    +    logging.debug("busy GPUs:\n{0}".format(smi_output))
    +    busy_uuids = [x for x in smi_output.split('\n') if len(x) > 0]
    +    for uuid, index in gpu_list:
    +      if uuid not in busy_uuids:
    +        free_gpus.append(index)
     
    -  try:
    -    # get list of gpus (index, uuid)
    -    list_gpus = subprocess.check_output(["nvidia-smi", "--list-gpus"]).decode()
    -    logging.debug("all GPUs:\n{0}".format(list_gpus))
    -
    -    # parse index and guid
    -    gpus = [ x for x in list_gpus.split('\n') if len(x) > 0 ]
    -
    -    def parse_gpu(gpu_str):
    -      cols = gpu_str.split(' ')
    -      return cols[5].split(')')[0], cols[1].split(':')[0]
    -    gpu_list = [parse_gpu(gpu) for gpu in gpus]
    -
    -    # randomize the search order to get a better distribution of GPUs
    -    random.shuffle(gpu_list)
    -
    -    free_gpus = []
    -    retries = 0
    -    while len(free_gpus) < num_gpu and retries < MAX_RETRIES:
    -      smi_output = subprocess.check_output(["nvidia-smi", "--format=csv,noheader,nounits", "--query-compute-apps=gpu_uuid"]).decode()
    -      logging.debug("busy GPUs:\n{0}".format(smi_output))
    -      busy_uuids = [x for x in smi_output.split('\n') if len(x) > 0 ]
    -      for uuid, index in gpu_list:
    -        if uuid not in busy_uuids:
    -          free_gpus.append(index)
    -
    -      if len(free_gpus) < num_gpu:
    -        # keep trying indefinitely
    -        logging.warn("Unable to find available GPUs: requested={0}, available={1}".format(num_gpu, len(free_gpus)))
    -        retries += 1
    -        time.sleep(30 * retries)
    -        free_gpus = []
    -
    -    # if still can't find GPUs, raise exception
         if len(free_gpus) < num_gpu:
    -      smi_output = subprocess.check_output(["nvidia-smi", "--format=csv", "--query-compute-apps=gpu_uuid,pid,process_name,used_gpu_memory"]).decode()
    -      logging.info(": {0}".format(smi_output))
    -      raise Exception("Unable to find free GPU:\n{0}".format(smi_output))
    +      # keep trying indefinitely
    +      logging.warn("Unable to find available GPUs: requested={0}, available={1}".format(num_gpu, len(free_gpus)))
    +      retries += 1
    +      time.sleep(30 * retries)
    +      free_gpus = []
    +
    +  # if still can't find GPUs, raise exception
    +  if len(free_gpus) < num_gpu:
    +    smi_output = subprocess.check_output(["nvidia-smi", "--format=csv", "--query-compute-apps=gpu_uuid,pid,process_name,used_gpu_memory"]).decode()
    +    logging.info(": {0}".format(smi_output))
    +    raise Exception("Unable to find free GPU:\n{0}".format(smi_output))
    +
+  return ','.join(free_gpus[:num_gpu])

-    return ','.join(free_gpus[:num_gpu])
-  except subprocess.CalledProcessError as e:
-    print("nvidia-smi error", e.output)


# Function to get the gpu information
def _get_free_gpu(max_gpu_utilization=40, min_free_memory=0.5, num_gpu=1):
@@ -164,7 +163,7 @@
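For reference, parse_gpu above assumes nvidia-smi --list-gpus lines of roughly this shape (example output with a two-word device name; the UUID is made up):

    line = "GPU 0: Tesla K80 (UUID: GPU-8932d52c-7805-4a64-a9b1-0d5a28d10cc4)"
    cols = line.split(' ')
    uuid = cols[5].split(')')[0]    # 'GPU-8932d52c-7805-4a64-a9b1-0d5a28d10cc4'
    index = cols[1].split(':')[0]   # '0'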


       # Read the gpu information multiple times
       num_times_to_average = 5
       current_array = []
    -  for ind in xrange(num_times_to_average):
    +  for ind in range(num_times_to_average):
         current_array.append(get_gpu_info())
         time.sleep(1)
     
    @@ -172,12 +171,12 @@ 


       num_gpus = len(current_array[0])
     
       # Average the gpu information
    -  avg_array = [(0,0,str(x)) for x in xrange(num_gpus)]
    -  for ind in xrange(num_times_to_average):
    -    for gpu_ind in xrange(num_gpus):
    +  avg_array = [(0, 0, str(x)) for x in range(num_gpus)]
    +  for ind in range(num_times_to_average):
    +    for gpu_ind in range(num_gpus):
           avg_array[gpu_ind] = (avg_array[gpu_ind][0] + current_array[ind][gpu_ind][0], avg_array[gpu_ind][1] + current_array[ind][gpu_ind][1], avg_array[gpu_ind][2])
     
    -  for gpu_ind in xrange(num_gpus):
    +  for gpu_ind in range(num_gpus):
         avg_array[gpu_ind] = (float(avg_array[gpu_ind][0]) / num_times_to_average, float(avg_array[gpu_ind][1]) / num_times_to_average, avg_array[gpu_ind][2])
     
       avg_array.sort()
    @@ -202,7 +201,6 @@ 


           break
     
       return gpus_to_use, free_memory
    -
     
diff --git a/docs/_modules/tensorflowonspark/marker.html b/docs/_modules/tensorflowonspark/marker.html
index 782a355f..7e12be10 100644
--- a/docs/_modules/tensorflowonspark/marker.html
+++ b/docs/_modules/tensorflowonspark/marker.html
-  tensorflowonspark.marker — TensorFlowOnSpark 1.2.1 documentation
+  tensorflowonspark.marker — TensorFlowOnSpark 1.3.0 documentation

    @@ -45,14 +45,15 @@

    Source code for tensorflowonspark.marker

     from __future__ import nested_scopes
     from __future__ import print_function
     
    +
     
[docs]class Marker(object):
  """Base class for special marker objects in the data queue"""
  pass
+

[docs]class EndPartition(Marker):
  """Marks the end of an RDD Partition during data feeding"""
  pass
-
diff --git a/docs/_modules/tensorflowonspark/pipeline.html b/docs/_modules/tensorflowonspark/pipeline.html
index 88a2b366..a81b546e 100644
--- a/docs/_modules/tensorflowonspark/pipeline.html
+++ b/docs/_modules/tensorflowonspark/pipeline.html
-  tensorflowonspark.pipeline — TensorFlowOnSpark 1.2.0 documentation
+  tensorflowonspark.pipeline — TensorFlowOnSpark 1.3.0 documentation
@@ -81,7 +72,8 @@

    Source code for tensorflowonspark.pipeline

     import subprocess
     import sys
     
    -##### TensorFlowOnSpark Params
    +
    +# TensorFlowOnSpark Params
     
     
[docs]class TFTypeConverters(object):
  """Custom DataFrame TypeConverter for dictionary types (since this is not provided by Spark core)."""
@@ -89,160 +81,227 @@
       def toDict(value):
         if type(value) == dict:
           return value
    else:
      raise TypeError("Could not convert %s to OrderedDict" % value)
[docs]class HasBatchSize(Params):
  batch_size = Param(Params._dummy(), "batch_size", "Number of records per batch", typeConverter=TypeConverters.toInt)

  def __init__(self):
    super(HasBatchSize, self).__init__()

[docs] def setBatchSize(self, value):
    return self._set(batch_size=value)

[docs] def getBatchSize(self):
    return self.getOrDefault(self.batch_size)


[docs]class HasClusterSize(Params):
  cluster_size = Param(Params._dummy(), "cluster_size", "Number of nodes in the cluster", typeConverter=TypeConverters.toInt)

  def __init__(self):
    super(HasClusterSize, self).__init__()

[docs] def setClusterSize(self, value):
    return self._set(cluster_size=value)

[docs] def getClusterSize(self):
    return self.getOrDefault(self.cluster_size)


[docs]class HasEpochs(Params):
  epochs = Param(Params._dummy(), "epochs", "Number of epochs to train", typeConverter=TypeConverters.toInt)

  def __init__(self):
    super(HasEpochs, self).__init__()

[docs] def setEpochs(self, value):
    return self._set(epochs=value)

[docs] def getEpochs(self):
    return self.getOrDefault(self.epochs)


[docs]class HasInputMapping(Params):
  input_mapping = Param(Params._dummy(), "input_mapping", "Mapping of input DataFrame column to input tensor", typeConverter=TFTypeConverters.toDict)

  def __init__(self):
    super(HasInputMapping, self).__init__()

[docs] def setInputMapping(self, value):
    return self._set(input_mapping=value)

[docs] def getInputMapping(self):
    return self.getOrDefault(self.input_mapping)


[docs]class HasInputMode(Params):
  input_mode = Param(Params._dummy(), "input_mode", "Input data feeding mode (0=TENSORFLOW, 1=SPARK)", typeConverter=TypeConverters.toInt)

  def __init__(self):
    super(HasInputMode, self).__init__()

[docs] def setInputMode(self, value):
    return self._set(input_mode=value)

[docs] def getInputMode(self):
    return self.getOrDefault(self.input_mode)


[docs]class HasModelDir(Params):
  model_dir = Param(Params._dummy(), "model_dir", "Path to save/load model checkpoints", typeConverter=TypeConverters.toString)

  def __init__(self):
    super(HasModelDir, self).__init__()

[docs] def setModelDir(self, value):
    return self._set(model_dir=value)

[docs] def getModelDir(self):
    return self.getOrDefault(self.model_dir)


[docs]class HasNumPS(Params):
  num_ps = Param(Params._dummy(), "num_ps", "Number of PS nodes in cluster", typeConverter=TypeConverters.toInt)
  driver_ps_nodes = Param(Params._dummy(), "driver_ps_nodes", "Run PS nodes on driver locally", typeConverter=TypeConverters.toBoolean)

  def __init__(self):
    super(HasNumPS, self).__init__()

[docs] def setNumPS(self, value):
    return self._set(num_ps=value)

[docs] def getNumPS(self):
    return self.getOrDefault(self.num_ps)

[docs] def setDriverPSNodes(self, value):
    return self._set(driver_ps_nodes=value)

[docs] def getDriverPSNodes(self):
    return self.getOrDefault(self.driver_ps_nodes)


[docs]class HasOutputMapping(Params):
  output_mapping = Param(Params._dummy(), "output_mapping", "Mapping of output tensor to output DataFrame column", typeConverter=TFTypeConverters.toDict)

  def __init__(self):
    super(HasOutputMapping, self).__init__()

[docs] def setOutputMapping(self, value):
    return self._set(output_mapping=value)

[docs] def getOutputMapping(self):
    return self.getOrDefault(self.output_mapping)


[docs]class HasProtocol(Params):
  protocol = Param(Params._dummy(), "protocol", "Network protocol for Tensorflow (grpc|rdma)", typeConverter=TypeConverters.toString)

  def __init__(self):
    super(HasProtocol, self).__init__()

[docs] def setProtocol(self, value):
    return self._set(protocol=value)

[docs] def getProtocol(self):
    return self.getOrDefault(self.protocol)


[docs]class HasReaders(Params):
  readers = Param(Params._dummy(), "readers", "number of reader/enqueue threads", typeConverter=TypeConverters.toInt)

  def __init__(self):
    super(HasReaders, self).__init__()

[docs] def setReaders(self, value):
    return self._set(readers=value)

[docs] def getReaders(self):
    return self.getOrDefault(self.readers)


[docs]class HasSteps(Params):
  steps = Param(Params._dummy(), "steps", "Maximum number of steps to train", typeConverter=TypeConverters.toInt)

  def __init__(self):
    super(HasSteps, self).__init__()

[docs] def setSteps(self, value):
    return self._set(steps=value)

[docs] def getSteps(self):
    return self.getOrDefault(self.steps)


[docs]class HasTensorboard(Params):
  tensorboard = Param(Params._dummy(), "tensorboard", "Launch tensorboard process", typeConverter=TypeConverters.toBoolean)

  def __init__(self):
    super(HasTensorboard, self).__init__()

[docs] def setTensorboard(self, value):
    return self._set(tensorboard=value)

[docs] def getTensorboard(self):
    return self.getOrDefault(self.tensorboard)


[docs]class HasTFRecordDir(Params):
  tfrecord_dir = Param(Params._dummy(), "tfrecord_dir", "Path to temporarily export a DataFrame as TFRecords (for InputMode.TENSORFLOW apps)", typeConverter=TypeConverters.toString)

  def __init__(self):
    super(HasTFRecordDir, self).__init__()

[docs] def setTFRecordDir(self, value):
    return self._set(tfrecord_dir=value)

[docs] def getTFRecordDir(self):
    return self.getOrDefault(self.tfrecord_dir)


# SavedModelBuilder Params

[docs]class HasExportDir(Params):
  export_dir = Param(Params._dummy(), "export_dir", "Directory to export saved_model", typeConverter=TypeConverters.toString)

  def __init__(self):
    super(HasExportDir, self).__init__()

[docs] def setExportDir(self, value):
    return self._set(export_dir=value)

[docs] def getExportDir(self):
    return self.getOrDefault(self.export_dir)


[docs]class HasSignatureDefKey(Params):
  signature_def_key = Param(Params._dummy(), "signature_def_key", "Identifier for a specific saved_model signature", typeConverter=TypeConverters.toString)

  def __init__(self):
    super(HasSignatureDefKey, self).__init__()
    self._setDefault(signature_def_key=None)

[docs] def setSignatureDefKey(self, value):
    return self._set(signature_def_key=value)

[docs] def getSignatureDefKey(self):
    return self.getOrDefault(self.signature_def_key)


[docs]class HasTagSet(Params):
  tag_set = Param(Params._dummy(), "tag_set", "Comma-delimited list of tags identifying a saved_model metagraph", typeConverter=TypeConverters.toString)

  def __init__(self):
    super(HasTagSet, self).__init__()

[docs] def setTagSet(self, value):
    return self._set(tag_set=value)

[docs] def getTagSet(self):
    return self.getOrDefault(self.tag_set)
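Together these mix-ins give TFEstimator and TFModel a SparkML-style setter API; a hedged usage sketch (map_fun, tf_args, and the mapping values are application-specific assumptions):

    estimator = TFEstimator(map_fun, tf_args) \
        .setClusterSize(4) \
        .setNumPS(1) \
        .setInputMapping({'image': 'x', 'label': 'y_'}) \
        .setBatchSize(100) \
        .setSteps(1000)
    model = estimator.fit(train_df)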
[docs]class Namespace(object):
  """
@@ -251,6 +310,7 @@
       Based on https://docs.python.org/dev/library/types.html#types.SimpleNamespace
       """
       argv = None
    +
       def __init__(self, d):
         if isinstance(d, list):
           self.argv = d
    @@ -282,18 +342,21 @@ 


       def __eq__(self, other):
         if self.argv:
           return self.argv == other
    else:
      return self.__dict__ == other.__dict__
[docs]class TFParams(Params):
  """Mix-in class to store namespace-style args and merge w/ SparkML-style params."""
  args = None

[docs] def merge_args_params(self):
    local_args = copy.copy(self.args)    # make a local copy of args
    args_dict = vars(local_args)         # get dictionary view
    for p in self.params:
      args_dict[p.name] = self.getOrDefault(p.name)   # update with params
    return local_args
[docs]class TFEstimator(Estimator, TFParams, HasInputMapping, HasClusterSize, HasNumPS, HasInputMode, HasProtocol, HasTensorboard, HasModelDir, HasExportDir, HasTFRecordDir,
@@ -326,19 +389,19 @@
         self.export_fn = export_fn
         self.args = Namespace(tf_args)
         self._setDefault(input_mapping={},
    -                    cluster_size=1,
    -                    num_ps=0,
    -                    driver_ps_nodes=False,
    -                    input_mode=TFCluster.InputMode.SPARK,
    -                    protocol='grpc',
    -                    tensorboard=False,
    -                    model_dir=None,
    -                    export_dir=None,
    -                    tfrecord_dir=None,
    -                    batch_size=100,
    -                    epochs=1,
    -                    readers=1,
    -                    steps=1000)
    +                     cluster_size=1,
    +                     num_ps=0,
    +                     driver_ps_nodes=False,
    +                     input_mode=TFCluster.InputMode.SPARK,
    +                     protocol='grpc',
    +                     tensorboard=False,
    +                     model_dir=None,
    +                     export_dir=None,
    +                     tfrecord_dir=None,
    +                     batch_size=100,
    +                     epochs=1,
    +                     readers=1,
    +                     steps=1000)
     
       def _fit(self, dataset):
         """Trains a TensorFlow model and returns a TFModel instance with the same args/params pointing to a checkpoint or saved_model on disk.
    @@ -391,8 +454,9 @@ 


     
      # Run on a single executor
           sc.parallelize([1], 1).foreachPartition(lambda it: _export(it, self.export_fn, tf_args))
    +
+    return self._copyValues(TFModel(self.args))

-    return self._copyValues(TFModel(self.args))
[docs]class TFModel(Model, TFParams, HasInputMapping, HasOutputMapping,
@@ -411,11 +475,13 @@
       def __init__(self, tf_args):
         super(TFModel, self).__init__()
         self.args = Namespace(tf_args)
    -    self._setDefault(batch_size=100,
    -                    model_dir=None,
    -                    export_dir=None,
    -                    signature_def_key=None,
    -                    tag_set=None)
    +    self._setDefault(input_mapping={},
    +                     output_mapping={},
    +                     batch_size=100,
    +                     model_dir=None,
    +                     export_dir=None,
    +                     signature_def_key=None,
    +                     tag_set=None)
     
       def _transform(self, dataset):
         """Transforms the input DataFrame by applying the _run_model() mapPartitions function.
    @@ -425,47 +491,50 @@ 


         """
         spark = SparkSession.builder.getOrCreate()
     
    -    logging.info("===== 1. inference args: {0}".format(self.args))
    -    logging.info("===== 2. inference params: {0}".format(self._paramMap))
    -    local_args = self.merge_args_params()
    -    logging.info("===== 3. inference args + params: {0}".format(local_args))
    -
         # set a deterministic order for input/output columns (lexicographic by key)
    -    input_cols = [ col for col, tensor in sorted(self.getInputMapping().items()) ]      # input col => input tensor
    -    output_cols = [ col for tensor, col in sorted(self.getOutputMapping().items()) ]    # output tensor => output col
    +    input_cols = [col for col, tensor in sorted(self.getInputMapping().items())]      # input col => input tensor
    +    output_cols = [col for tensor, col in sorted(self.getOutputMapping().items())]    # output tensor => output col
     
         # run single-node inferencing on each executor
         logging.info("input_cols: {}".format(input_cols))
         logging.info("output_cols: {}".format(output_cols))
     
    +    # merge args + params
    +    logging.info("===== 1. inference args: {0}".format(self.args))
    +    logging.info("===== 2. inference params: {0}".format(self._paramMap))
    +    local_args = self.merge_args_params()
    +    logging.info("===== 3. inference args + params: {0}".format(local_args))
    +
         tf_args = self.args.argv if self.args.argv else local_args
    -    rdd_out = dataset.select(input_cols).rdd.mapPartitions(lambda it: _run_model(it, tf_args))
    +    rdd_out = dataset.select(input_cols).rdd.mapPartitions(lambda it: _run_model(it, local_args, tf_args))
     
         # convert to a DataFrame-friendly format
-    rows_out = rdd_out.map(lambda x: Row(*x))
-    return spark.createDataFrame(rows_out, output_cols)
+    rows_out = rdd_out.map(lambda x: Row(*x))
+    return spark.createDataFrame(rows_out, output_cols)


# global to each python worker process on the executors
global_sess = None    # tf.Session cache
global_args = None    # args provided to the _run_model() method. Any change will invalidate the global_sess cache.

-def _run_model(iterator, args):
+
+def _run_model(iterator, args, tf_args):
  """mapPartitions function to run single-node inferencing from a checkpoint/saved_model, using the model's input/output mappings.

  Args:
    :iterator: input RDD partition iterator.
-    :args: a merged view of command-line args and ML Params.
+    :args: arguments for TFModel, in argparse format
+    :tf_args: arguments for TensorFlow inferencing code, in argparse or ARGV format.

  Returns:
    An iterator of result data.
  """
-  single_node_env(args)
+  single_node_env(tf_args)

  logging.info("===== input_mapping: {}".format(args.input_mapping))
  logging.info("===== output_mapping: {}".format(args.output_mapping))
-  input_tensor_names = [ tensor for col,tensor in sorted(args.input_mapping.items()) ]
-  output_tensor_names = [ tensor for tensor,col in sorted(args.output_mapping.items()) ]
+  input_tensor_names = [tensor for col, tensor in sorted(args.input_mapping.items())]
+  output_tensor_names = [tensor for tensor, col in sorted(args.output_mapping.items())]

  # if using a signature_def_key, get input/output tensor info from the requested signature
  if args.signature_def_key:
@@ -473,11 +542,11 @@
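A corresponding inference sketch (tensor and column names, paths, and tags are assumptions, not from this source):

    model = TFModel(tf_args) \
        .setExportDir('hdfs:///tmp/saved_model') \
        .setTagSet('serve') \
        .setSignatureDefKey('serving_default') \
        .setInputMapping({'image': 'x'}) \
        .setOutputMapping({'prediction': 'col_out'})
    preds = model.transform(test_df)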


         logging.info("===== loading meta_graph_def for tag_set ({0}) from saved_model: {1}".format(args.tag_set, args.export_dir))
         meta_graph_def = get_meta_graph_def(args.export_dir, args.tag_set)
         signature = signature_def_utils.get_signature_def_by_key(meta_graph_def, args.signature_def_key)
    -    logging.info("signature: {}".format(signature))
    +    logging.debug("signature: {}".format(signature))
         inputs_tensor_info = signature.inputs
    -    logging.info("inputs_tensor_info: {0}".format(inputs_tensor_info))
    +    logging.debug("inputs_tensor_info: {0}".format(inputs_tensor_info))
         outputs_tensor_info = signature.outputs
    -    logging.info("outputs_tensor_info: {0}".format(outputs_tensor_info))
    +    logging.debug("outputs_tensor_info: {0}".format(outputs_tensor_info))
     
       result = []
     
    @@ -524,22 +593,24 @@ 

    Source code for tensorflowonspark.pipeline

           inputs_feed_dict[input_tensors[i]] = tensors[i]
     
         outputs = sess.run(output_tensors, feed_dict=inputs_feed_dict)
    -    lengths = [ len(output) for output in outputs ]
    +    lengths = [len(output) for output in outputs]
         input_size = len(tensors[0])
    -    assert all([ l == input_size for l in lengths ]), "Output array sizes {} must match input size: {}".format(lengths, input_size)
    -    python_outputs = [ output.tolist() for output in outputs ]      # convert from numpy to standard python types
    -    result.extend(zip(*python_outputs))                             # convert to an array of tuples of "output columns"
    +    assert all([length == input_size for length in lengths]), "Output array sizes {} must match input size: {}".format(lengths, input_size)
    +    python_outputs = [output.tolist() for output in outputs]      # convert from numpy to standard python types
    +    result.extend(zip(*python_outputs))                           # convert to an array of tuples of "output columns"
     
       return result
     
    +
     
[docs]def single_node_env(args):
  """Sets up environment for a single-node TF session.

  Args:
-    :args: command line arguments as argparse args.
-    :argv: command line arguments as ARGV (array of string).
+    :args: command line arguments as either argparse args or argv list
  """
-  if args.argv:
+  if isinstance(args, list):
+    sys.argv = args
+  elif args.argv:
    sys.argv = args.argv

  # ensure expanded CLASSPATH w/o glob characters (required for Spark 2.1 + JNI)
@@ -561,8 +632,9 @@


         # Note: if there is a GPU conflict (CUDA_ERROR_INVALID_DEVICE), the entire task will fail and retry.
       else:
         # CPU
    -    logging.info("Using CPU")
    -    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    + logging.info("Using CPU")
    + os.environ['CUDA_VISIBLE_DEVICES'] = '' +
[docs]def get_meta_graph_def(saved_model_dir, tag_set):
  """Utility function to read a meta_graph_def from disk.
@@ -580,8 +652,9 @@


       set_of_tags = set(tag_set.split(','))
       for meta_graph_def in saved_model.meta_graphs:
         if set(meta_graph_def.meta_info_def.tags) == set_of_tags:
    -      return meta_graph_def
    -  raise RuntimeError("MetaGraphDef associated with tag-set {0} could not be found in SavedModel".format(tag_set))
+      return meta_graph_def
+  raise RuntimeError("MetaGraphDef associated with tag-set {0} could not be found in SavedModel".format(tag_set))
+
[docs]def yield_batch(iterable, batch_size, num_tensors=1):
  """Generator that yields batches of a DataFrame iterator.
@@ -594,7 +667,7 @@


       Returns:
         An array of ``num_tensors`` arrays, each of length `batch_size`
       """
    -  tensors = [ [] for i in range(num_tensors) ]
    +  tensors = [[] for i in range(num_tensors)]
       for item in iterable:
         if item is None:
           break
    @@ -603,9 +676,9 @@ 


           tensors[i].append(tmp)
         if len(tensors[0]) >= batch_size:
           yield tensors
    -      tensors = [ [] for i in range(num_tensors) ]
    -  if len(tensors[0]) > 0:
    -      yield tensors
+      tensors = [[] for i in range(num_tensors)]
+  if len(tensors[0]) > 0:
+    yield tensors
    @@ -615,12 +688,14 @@

    @@ -636,13 +711,13 @@

\ No newline at end of file
diff --git a/docs/_modules/tensorflowonspark/reservation.html b/docs/_modules/tensorflowonspark/reservation.html
index 65dee53f..7326a130 100644
--- a/docs/_modules/tensorflowonspark/reservation.html
+++ b/docs/_modules/tensorflowonspark/reservation.html
@@ -6,7 +6,7 @@
- tensorflowonspark.reservation — TensorFlowOnSpark 1.2.1 documentation
+ tensorflowonspark.reservation — TensorFlowOnSpark 1.3.0 documentation
@@ -25,7 +25,7 @@

    @@ -60,6 +60,7 @@


BUFSIZE = 1024
MAX_RETRIES = 3
+
[docs]class Reservations:
  """Thread-safe store for node reservations.
@@ -96,6 +97,7 @@


    with self.lock:
      return self.required - len(self.reservations)
+
[docs]class MessageSocket(object):
  """Abstract class w/ length-prefixed socket send/receive functions."""
@@ -184,13 +186,13 @@


    """ server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) server_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_sock.bind(('',0)) + server_sock.bind(('', 0)) server_sock.listen(10) # hostname may not be resolvable but IP address probably will be host = util.get_ip_address() port = server_sock.getsockname()[1] - addr = (host,port) + addr = (host, port) logging.info("listening for reservations at {0}".format(addr)) def _listen(self, sock): @@ -225,6 +227,7 @@


    """Stop the Server's socket listener."""
    self.done = True
+
[docs]class Client(MessageSocket):
  """Client to register and await node reservations.
@@ -325,7 +328,7 @@

diff --git a/docs/_modules/tensorflowonspark/util.html b/docs/_modules/tensorflowonspark/util.html
index 17b5a4ba..5e3e8307 100644
--- a/docs/_modules/tensorflowonspark/util.html
+++ b/docs/_modules/tensorflowonspark/util.html
@@ -6,7 +6,7 @@
- tensorflowonspark.util — TensorFlowOnSpark 1.2.1 documentation
+ tensorflowonspark.util — TensorFlowOnSpark 1.3.0 documentation
@@ -25,7 +25,7 @@

    @@ -62,6 +62,18 @@


         if os.path.exists(candidate) and os.path.isfile(candidate):
           return candidate
  return False
+
+
+[docs]def write_executor_id(num):
+  """Write executor_id into a local file in the executor's current working directory"""
+  with open("executor_id", "w") as f:
+    f.write(str(num))
+
+
+[docs]def read_executor_id():
+  """Read worker id from a local file in the executor's current working directory"""
+  with open("executor_id", "r") as f:
+    return int(f.read())
    @@ -94,7 +106,7 @@

tensorflowonspark.TFCluster module

    This module provides a high-level API to manage the TensorFlowOnSpark cluster.

    +

    There are three main phases of operation:

    +
1. Reservation/Startup - reserves a port for the TensorFlow process on each executor, starts a multiprocessing.Manager to listen for data/control messages, and then launches the TensorFlow main function on the executors.
2. Data feeding - For InputMode.SPARK only. Sends RDD data to the TensorFlow nodes via each executor’s multiprocessing.Manager. PS nodes will tie up their executors, so they won’t receive any subsequent data feeding tasks.
3. Shutdown - sends a shutdown control message to the multiprocessing.Managers of the PS nodes and pushes end-of-feed markers into the data queues of the worker nodes.
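A minimal sketch of this lifecycle, assuming an existing SparkContext sc, a user-supplied main_fun, parsed args, and a dataRDD (all hypothetical names):

  from tensorflowonspark import TFCluster

  cluster = TFCluster.run(sc, main_fun, args, num_executors=4, num_ps=1,
                          tensorboard=False, input_mode=TFCluster.InputMode.SPARK)  # 1. reservation/startup
  cluster.train(dataRDD, num_epochs=1)   # 2. data feeding (InputMode.SPARK only)
  cluster.shutdown()                     # 3. shutdown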
    +
    +
    +class InputMode[source]
    +

    Bases: object

    +

    Enum for the input modes of data feeding.

    +
    +
    +SPARK = 1
    +

    Spark is responsible for feeding data to the TensorFlow application via an RDD.

    +
    + +
    +
    +TENSORFLOW = 0
    +

    TensorFlow application is responsible for reading any data.

    +
    + +
    + +
    +
    +class TFCluster[source]
    +

    Bases: object

    +
    +
    +cluster_id = None
    +

    Unique ID for this cluster, used to invalidate state for new clusters.

    +
    + +
    +
    +cluster_info = None
    +

    Cluster node reservations

    +
    + +
    +
    +cluster_meta = None
    +

    Cluster metadata dictionary, e.g. cluster_id, defaultFS, reservation.Server address, etc.

    +
    + +
    +
    +defaultFS = None
    +

    Default FileSystem string, e.g. file:// or hdfs://<namenode>/

    +
    + +
    +
    +inference(dataRDD, qname='input')[source]
    +

    For InputMode.SPARK only: Feeds Spark RDD partitions into the TensorFlow worker nodes and returns an RDD of results

    +

    It is the responsibility of the TensorFlow “main” function to interpret the rows of the RDD and provide valid data for the output RDD.

    +

This will use the distributed TensorFlow cluster for inferencing, so the TensorFlow “main” function should be capable of inferencing.
Per Spark design, the output RDD will be lazily executed only when a Spark action is invoked on the RDD.

    +
    +
    Args:
    +
dataRDD: input data as a Spark RDD
qname: INTERNAL_USE
    +
    +
    Returns:
    +
    A Spark RDD representing the output of the TensorFlow inferencing
    +
    +
    + +
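A hedged usage sketch (imageRDD and output_path are placeholders):

  resultRDD = cluster.inference(imageRDD)   # lazily defines the output RDD
  resultRDD.saveAsTextFile(output_path)     # this Spark action triggers the actual inferencing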
    +
    +input_mode = None
    +

    TFCluster.InputMode for this cluster

    +
    + +
    +
    +nodeRDD = None
    +

    RDD representing the nodes of the cluster, i.e. sc.parallelize(range(num_executors), num_executors)

    +
    + +
    +
    +num_executors = None
    +

    Number of executors in the Spark job (and therefore, the number of nodes in the TensorFlow cluster).

    +
    + +
    +
    +queues = None
    +

    INTERNAL_USE

    +
    + +
    +
    +sc = None
    +

    SparkContext

    +
    + +
    +
    +server = None
    +

    reservation.Server for this cluster

    +
    + +
    +
    +shutdown(ssc=None)[source]
    +

    Stops the distributed TensorFlow cluster.

    +
    +
    Args:
    +
ssc: For Streaming applications only. Spark StreamingContext
    +
    +
    +
    + +
    +
    +tensorboard_url()[source]
    +

    Utility function to get the Tensorboard URL

    +
    + +
    +
    +train(dataRDD, num_epochs=0, qname='input')[source]
    +

    For InputMode.SPARK only. Feeds Spark RDD partitions into the TensorFlow worker nodes

    +

    It is the responsibility of the TensorFlow “main” function to interpret the rows of the RDD.

    +

Since epochs are implemented via RDD.union() and the entire RDD must generally be processed in full, it is recommended to set num_epochs to closely match your training termination condition (e.g. steps or accuracy). See TFNode.DataFeed for more details.

    +
    +
    Args:
    +
dataRDD: input data as a Spark RDD.
num_epochs: number of times to repeat the dataset during training.
qname: INTERNAL USE.
    +
    +
    +
    + +
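For example (dataRDD is a placeholder), a sketch of feeding three epochs:

  cluster.train(dataRDD, num_epochs=3)   # internally equivalent to feeding dataRDD.union(dataRDD).union(dataRDD)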
    +
    +working_dir = None
    +

    Current working directory

    +
    + +
    + +
    +
    +run(sc, map_fun, tf_args, num_executors, num_ps, tensorboard=False, input_mode=0, log_dir=None, driver_ps_nodes=False, master_node=None, reservation_timeout=600, queues=['input', 'output', 'error'])[source]
    +

    Starts the TensorFlowOnSpark cluster and Runs the TensorFlow “main” function on the Spark executors

    +
    +
    Args:
    +
sc: SparkContext
map_fun: user-supplied TensorFlow “main” function
tf_args: argparse args, or command-line ARGV. These will be passed to the map_fun.
num_executors: number of Spark executors. This should match your Spark job’s --num_executors.
num_ps: number of Spark executors which are reserved for TensorFlow PS nodes. All other executors will be used as TensorFlow worker nodes.
tensorboard: boolean indicating if the chief worker should spawn a Tensorboard server.
input_mode: TFCluster.InputMode
log_dir: directory to save tensorboard event logs. If None, defaults to a fixed path on local filesystem.
driver_ps_nodes: run the PS nodes on the driver locally instead of on the Spark executors; this helps maximize computing resources (esp. GPU). You will need to set cluster_size = num_executors + num_ps.
master_node: name of the “master” or “chief” node in the cluster_template, used for tf.estimator applications.
reservation_timeout: number of seconds after which cluster reservation times out (600 sec default)
queues: INTERNAL_USE
    +
    +
    Returns:
    +
    A TFCluster object representing the started cluster.
    +
    +
    +
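A sketch of starting a cluster for a tf.estimator-style application, where main_fun, args, and the timeout value are placeholders; master_node names the chief:

  cluster = TFCluster.run(sc, main_fun, args, num_executors=4, num_ps=1,
                          input_mode=TFCluster.InputMode.TENSORFLOW,
                          master_node='master', reservation_timeout=300)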
    @@ -97,7 +317,7 @@

diff --git a/docs/tensorflowonspark.TFManager.html b/docs/tensorflowonspark.TFManager.html
index bc1348fc..4514bb1f 100644
--- a/docs/tensorflowonspark.TFManager.html
+++ b/docs/tensorflowonspark.TFManager.html
@@ -6,7 +6,7 @@
- tensorflowonspark.TFManager module — TensorFlowOnSpark 1.2.1 documentation
+ tensorflowonspark.TFManager module — TensorFlowOnSpark 1.3.0 documentation
@@ -33,7 +33,7 @@

    @@ -150,7 +150,7 @@

diff --git a/docs/tensorflowonspark.TFNode.html b/docs/tensorflowonspark.TFNode.html
index 2258e898..5a9d7306 100644
--- a/docs/tensorflowonspark.TFNode.html
+++ b/docs/tensorflowonspark.TFNode.html
@@ -6,7 +6,7 @@
- tensorflowonspark.TFNode module — TensorFlowOnSpark 1.2.1 documentation
+ tensorflowonspark.TFNode module — TensorFlowOnSpark 1.3.0 documentation
@@ -33,7 +33,7 @@

@@ -297,7 +297,7 @@

diff --git a/docs/tensorflowonspark.TFSparkNode.html b/docs/tensorflowonspark.TFSparkNode.html
index c494aa55..19aef37c 100644
--- a/docs/tensorflowonspark.TFSparkNode.html
+++ b/docs/tensorflowonspark.TFSparkNode.html
@@ -6,7 +6,7 @@
- tensorflowonspark.TFSparkNode module — TensorFlowOnSpark 1.2.1 documentation
+ tensorflowonspark.TFSparkNode module — TensorFlowOnSpark 1.3.0 documentation
@@ -33,7 +33,7 @@

@@ -48,7 +48,7 @@


    This module provides low-level functions for managing the TensorFlowOnSpark cluster.

    -class TFNodeContext(worker_num, job_name, task_index, cluster_spec, defaultFS, working_dir, mgr)[source]
    +class TFNodeContext(executor_id, job_name, task_index, cluster_spec, defaultFS, working_dir, mgr)[source]

    Bases: object

    Encapsulates unique metadata for a TensorFlowOnSpark node/executor and provides methods to interact with Spark and HDFS.

An instance of this object will be passed to the TensorFlow “main” function via the ctx argument.
@@ -59,7 +59,7 @@


-worker_num: integer identifier for this executor, per nodeRDD = sc.parallelize(range(num_executors), num_executors).
+executor_id: integer identifier for this executor, per nodeRDD = sc.parallelize(range(num_executors), num_executors).
 job_name: TensorFlow job name (e.g. ‘ps’ or ‘worker’) of this TF node, per cluster_spec.
@@ -138,7 +138,7 @@


-cluster_info: node reservation information for the cluster (e.g. host, ppid, pid, ports, etc)
+cluster_info: node reservation information for the cluster (e.g. host, executor_id, pid, ports, etc)
 qname: INTERNAL_USE
@@ -192,7 +192,7 @@


-cluster_info: node reservation information for the cluster (e.g. host, ppid, pid, ports, etc).
+cluster_info: node reservation information for the cluster (e.g. host, executor_id, pid, ports, etc).
 queues: INTERNAL_USE
@@ -214,7 +214,7 @@


-cluster_info: node reservation information for the cluster (e.g. host, ppid, pid, ports, etc)
+cluster_info: node reservation information for the cluster (e.g. host, executor_id, pid, ports, etc)
 cluster_meta: dictionary of cluster metadata (e.g. cluster_id, reservation.Server address, etc)
@@ -280,7 +280,7 @@

diff --git a/docs/tensorflowonspark.dfutil.html b/docs/tensorflowonspark.dfutil.html
index 22da42d6..3b64749e 100644
--- a/docs/tensorflowonspark.dfutil.html
+++ b/docs/tensorflowonspark.dfutil.html
@@ -6,7 +6,7 @@
- tensorflowonspark.dfutil module — TensorFlowOnSpark 1.2.1 documentation
+ tensorflowonspark.dfutil module — TensorFlowOnSpark 1.3.0 documentation
@@ -33,7 +33,7 @@

@@ -43,8 +43,152 @@

tensorflowonspark.dfutil module

    A collection of utility functions for loading/saving TensorFlow TFRecords files as Spark DataFrames.

    +
    +
    +fromTFExample(iter, binary_features=[])[source]
    +

    mapPartition function to convert an RDD of serialized tf.train.Example bytestring into an RDD of Row.

    +

Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a “hint” from the caller in the binary_features argument.

    +
    +
    Args:
    +
iter: the RDD partition iterator
binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.
    +
    +
    Returns:
    +
    An array/iterator of DataFrame Row with features converted into columns.
    +
    +
    + +
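For example, a hedged sketch (tfr_rdd and the 'image' feature name are assumptions):

  rows_rdd = tfr_rdd.mapPartitions(lambda it: fromTFExample(it, binary_features=['image']))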
    +
    +infer_schema(example, binary_features=[])[source]
    +

    Given a tf.train.Example, infer the Spark DataFrame schema (StructFields).

    +

Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a “hint” from the caller in the binary_features argument.

    +
    +
    Args:
    +
example: a tf.train.Example
binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.
    +
    +
    Returns:
    +
    A DataFrame StructType schema
    +
    +
    + +
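A brief sketch (example and the 'image' feature are placeholders):

  schema = infer_schema(example, binary_features=['image'])   # a StructType with one StructField per feature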
    +
    +isLoadedDF(df)[source]
    +

    Returns True if the input DataFrame was produced by the loadTFRecords() method.

    +

    This is primarily used by the Spark ML Pipelines APIs.

    +
    +
    Args:
    +
df: Spark DataFrame
    +
    +
    +
    + +
    +
    +loadTFRecords(sc, input_dir, binary_features=[])[source]
    +

    Load TFRecords from disk into a Spark DataFrame.

    +

    This will attempt to automatically convert the tf.train.Example features into Spark DataFrame columns of equivalent types.

    +

Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a “hint” from the caller in the binary_features argument.

    +
    +
    Args:
    +
sc: SparkContext
input_dir: location of TFRecords on disk.
binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.
    +
    +
    Returns:
    +
    A Spark DataFrame mirroring the tf.train.Example schema.
    +
    +
    + +
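For example, assuming TFRecords previously saved under a hypothetical path:

  df = dfutil.loadTFRecords(sc, 'hdfs:///data/mnist/tfr', binary_features=['image'])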
    +
    +saveAsTFRecords(df, output_dir)[source]
    +

    Save a Spark DataFrame as TFRecords.

    +

    This will convert the DataFrame rows to TFRecords prior to saving.

    +
    +
    Args:
    +
df: Spark DataFrame
output_dir: Path to save TFRecords
    +
    +
    +
    + +
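A matching sketch for the save direction (df and the output path are placeholders):

  dfutil.saveAsTFRecords(df, 'hdfs:///data/mnist/tfr')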
    +
    +toTFExample(dtypes)[source]
    +

    mapPartition function to convert a Spark RDD of Row into an RDD of serialized tf.train.Example bytestring.

    +

Note that tf.train.Example is a fairly flat structure with limited datatypes, e.g. tf.train.FloatList, tf.train.Int64List, and tf.train.BytesList, so most DataFrame types will be coerced into one of these types.

    +
    +
    Args:
    +
dtypes: the DataFrame.dtypes of the source DataFrame.
    +
    +
    Returns:
    +
    A mapPartition function which converts the source DataFrame into tf.train.Example bytestrings.
    +
    +
    +
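This is the same conversion used internally by saveAsTFRecords(); a hedged sketch of calling it directly:

  tf_rdd = df.rdd.mapPartitions(toTFExample(df.dtypes))   # RDD of serialized tf.train.Example bytestrings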
    @@ -97,7 +241,7 @@

diff --git a/docs/tensorflowonspark.gpu_info.html b/docs/tensorflowonspark.gpu_info.html
index 9601ec04..11d0ed40 100644
--- a/docs/tensorflowonspark.gpu_info.html
+++ b/docs/tensorflowonspark.gpu_info.html
@@ -6,7 +6,7 @@
- tensorflowonspark.gpu_info module — TensorFlowOnSpark 1.2.1 documentation
+ tensorflowonspark.gpu_info module — TensorFlowOnSpark 1.3.0 documentation
@@ -33,7 +33,7 @@

    @@ -124,7 +124,7 @@

diff --git a/docs/tensorflowonspark.html b/docs/tensorflowonspark.html
index 794d2eda..c8ee6eb6 100644
--- a/docs/tensorflowonspark.html
+++ b/docs/tensorflowonspark.html
@@ -6,7 +6,7 @@
- tensorflowonspark package — TensorFlowOnSpark 1.2.1 documentation
+ tensorflowonspark package — TensorFlowOnSpark 1.3.0 documentation
@@ -33,7 +33,7 @@

@@ -122,7 +122,7 @@

@@ -111,7 +111,7 @@

diff --git a/docs/tensorflowonspark.pipeline.html b/docs/tensorflowonspark.pipeline.html
index 12dcdbbb..1ab4f1d5 100644
--- a/docs/tensorflowonspark.pipeline.html
+++ b/docs/tensorflowonspark.pipeline.html
@@ -6,7 +6,7 @@
- tensorflowonspark.pipeline module — TensorFlowOnSpark 1.2.1 documentation
+ tensorflowonspark.pipeline module — TensorFlowOnSpark 1.3.0 documentation
@@ -33,7 +33,7 @@

@@ -43,8 +43,538 @@

tensorflowonspark.pipeline module

    This module extends the TensorFlowOnSpark API to support Spark ML Pipelines.

    +

It provides a TFEstimator class to fit a TFModel using TensorFlow. The TFEstimator will actually spawn a TensorFlowOnSpark cluster to conduct distributed training, but due to architectural limitations, the TFModel will only run single-node TensorFlow instances when inferencing on the executors. The executors will run in parallel, but the TensorFlow model must fit in the memory of each executor.

    +

There is also an option to provide a separate “export” function, which allows users to export a different graph for inferencing vs. training. This is useful when the training graph uses InputMode.TENSORFLOW with queue_runners, but the inferencing graph needs placeholders. And this is especially useful for exporting saved_models for TensorFlow Serving.

    +
    +
    +class HasBatchSize[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +batch_size = Param(parent='undefined', name='batch_size', doc='Number of records per batch')
    +
    + +
    +
    +getBatchSize()[source]
    +
    + +
    +
    +setBatchSize(value)[source]
    +
    + +
    + +
    +
    +class HasClusterSize[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +cluster_size = Param(parent='undefined', name='cluster_size', doc='Number of nodes in the cluster')
    +
    + +
    +
    +getClusterSize()[source]
    +
    + +
    +
    +setClusterSize(value)[source]
    +
    + +
    + +
    +
    +class HasEpochs[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +epochs = Param(parent='undefined', name='epochs', doc='Number of epochs to train')
    +
    + +
    +
    +getEpochs()[source]
    +
    + +
    +
    +setEpochs(value)[source]
    +
    + +
    + +
    +
    +class HasExportDir[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +export_dir = Param(parent='undefined', name='export_dir', doc='Directory to export saved_model')
    +
    + +
    +
    +getExportDir()[source]
    +
    + +
    +
    +setExportDir(value)[source]
    +
    + +
    + +
    +
    +class HasInputMapping[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +getInputMapping()[source]
    +
    + +
    +
    +input_mapping = Param(parent='undefined', name='input_mapping', doc='Mapping of input DataFrame column to input tensor')
    +
    + +
    +
    +setInputMapping(value)[source]
    +
    + +
    + +
    +
    +class HasInputMode[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +getInputMode()[source]
    +
    + +
    +
    +input_mode = Param(parent='undefined', name='input_mode', doc='Input data feeding mode (0=TENSORFLOW, 1=SPARK)')
    +
    + +
    +
    +setInputMode(value)[source]
    +
    + +
    + +
    +
    +class HasModelDir[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +getModelDir()[source]
    +
    + +
    +
    +model_dir = Param(parent='undefined', name='model_dir', doc='Path to save/load model checkpoints')
    +
    + +
    +
    +setModelDir(value)[source]
    +
    + +
    + +
    +
    +class HasNumPS[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +driver_ps_nodes = Param(parent='undefined', name='driver_ps_nodes', doc='Run PS nodes on driver locally')
    +
    + +
    +
    +getDriverPSNodes()[source]
    +
    + +
    +
    +getNumPS()[source]
    +
    + +
    +
    +num_ps = Param(parent='undefined', name='num_ps', doc='Number of PS nodes in cluster')
    +
    + +
    +
    +setDriverPSNodes(value)[source]
    +
    + +
    +
    +setNumPS(value)[source]
    +
    + +
    + +
    +
    +class HasOutputMapping[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +getOutputMapping()[source]
    +
    + +
    +
    +output_mapping = Param(parent='undefined', name='output_mapping', doc='Mapping of output tensor to output DataFrame column')
    +
    + +
    +
    +setOutputMapping(value)[source]
    +
    + +
    + +
    +
    +class HasProtocol[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +getProtocol()[source]
    +
    + +
    +
    +protocol = Param(parent='undefined', name='protocol', doc='Network protocol for Tensorflow (grpc|rdma)')
    +
    + +
    +
    +setProtocol(value)[source]
    +
    + +
    + +
    +
    +class HasReaders[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +getReaders()[source]
    +
    + +
    +
    +readers = Param(parent='undefined', name='readers', doc='number of reader/enqueue threads')
    +
    + +
    +
    +setReaders(value)[source]
    +
    + +
    + +
    +
    +class HasSignatureDefKey[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +getSignatureDefKey()[source]
    +
    + +
    +
    +setSignatureDefKey(value)[source]
    +
    + +
    +
    +signature_def_key = Param(parent='undefined', name='signature_def_key', doc='Identifier for a specific saved_model signature')
    +
    + +
    + +
    +
    +class HasSteps[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +getSteps()[source]
    +
    + +
    +
    +setSteps(value)[source]
    +
    + +
    +
    +steps = Param(parent='undefined', name='steps', doc='Maximum number of steps to train')
    +
    + +
    + +
    +
    +class HasTFRecordDir[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +getTFRecordDir()[source]
    +
    + +
    +
    +setTFRecordDir(value)[source]
    +
    + +
    +
    +tfrecord_dir = Param(parent='undefined', name='tfrecord_dir', doc='Path to temporarily export a DataFrame as TFRecords (for InputMode.TENSORFLOW apps)')
    +
    + +
    + +
    +
    +class HasTagSet[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +getTagSet()[source]
    +
    + +
    +
    +setTagSet(value)[source]
    +
    + +
    +
    +tag_set = Param(parent='undefined', name='tag_set', doc='Comma-delimited list of tags identifying a saved_model metagraph')
    +
    + +
    + +
    +
    +class HasTensorboard[source]
    +

    Bases: pyspark.ml.param.Params

    +
    +
    +getTensorboard()[source]
    +
    + +
    +
    +setTensorboard(value)[source]
    +
    + +
    +
    +tensorboard = Param(parent='undefined', name='tensorboard', doc='Launch tensorboard process')
    +
    + +
    + +
    +
    +class Namespace(d)[source]
    +

    Bases: object

    +

    Utility class to convert dictionaries to Namespace-like objects.

    +

    Based on https://docs.python.org/dev/library/types.html#types.SimpleNamespace

    +
    +
    +argv = None
    +
    + +
    + +
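A tiny illustrative sketch of the dictionary-to-attribute conversion:

  args = Namespace({'batch_size': 100, 'epochs': 1})
  print(args.batch_size)   # 100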
    +
    +class TFEstimator(train_fn, tf_args, export_fn=None)[source]
    +

    Bases: pyspark.ml.base.Estimator, tensorflowonspark.pipeline.TFParams, tensorflowonspark.pipeline.HasInputMapping, tensorflowonspark.pipeline.HasClusterSize, tensorflowonspark.pipeline.HasNumPS, tensorflowonspark.pipeline.HasInputMode, tensorflowonspark.pipeline.HasProtocol, tensorflowonspark.pipeline.HasTensorboard, tensorflowonspark.pipeline.HasModelDir, tensorflowonspark.pipeline.HasExportDir, tensorflowonspark.pipeline.HasTFRecordDir, tensorflowonspark.pipeline.HasBatchSize, tensorflowonspark.pipeline.HasEpochs, tensorflowonspark.pipeline.HasReaders, tensorflowonspark.pipeline.HasSteps

    +

    Spark ML Estimator which launches a TensorFlowOnSpark cluster for distributed training.

    +

    The columns of the DataFrame passed to the fit() method will be mapped to TensorFlow tensors according to the setInputMapping() method.

    +

If an export_fn was provided to the constructor, it will be run on a single executor immediately after the distributed training has completed. This allows users to export a TensorFlow saved_model with a different execution graph for inferencing, e.g. replacing an input graph of TFReaders and QueueRunners with Placeholders.

    +

For InputMode.TENSORFLOW, the input DataFrame will be exported as TFRecords to a temporary location specified by the tfrecord_dir. The TensorFlow application will then be expected to read directly from this location during training. However, if the input DataFrame was produced by the dfutil.loadTFRecords() method, i.e. originated from TFRecords on disk, then the tfrecord_dir will be set to the original source location of the TFRecords, skipping the additional export step.

    +
    +
    Args:
    +
train_fn: TensorFlow “main” function for training.
tf_args: Arguments specific to the TensorFlow “main” function.
export_fn: TensorFlow function for exporting a saved_model.
    +
    +
    +
    +
    +export_fn = None
    +
    + +
    +
    +train_fn = None
    +
    + +
    + +
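A hedged end-to-end sketch (main_fun, export_fun, args, trainDF, num_executors, and the HDFS paths are placeholders):

  estimator = TFEstimator(main_fun, args, export_fn=export_fun) \
      .setInputMapping({'image': 'image', 'label': 'label'}) \
      .setClusterSize(num_executors) \
      .setNumPS(1) \
      .setModelDir('hdfs:///model') \
      .setExportDir('hdfs:///export')
  model = estimator.fit(trainDF)   # launches the TensorFlowOnSpark cluster and returns a TFModel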
    +
    +class TFModel(tf_args)[source]
    +

    Bases: pyspark.ml.base.Model, tensorflowonspark.pipeline.TFParams, tensorflowonspark.pipeline.HasInputMapping, tensorflowonspark.pipeline.HasOutputMapping, tensorflowonspark.pipeline.HasBatchSize, tensorflowonspark.pipeline.HasModelDir, tensorflowonspark.pipeline.HasExportDir, tensorflowonspark.pipeline.HasSignatureDefKey, tensorflowonspark.pipeline.HasTagSet

    +

    Spark ML Model backed by a TensorFlow model checkpoint/saved_model on disk.

    +

During transform(), each executor will run an independent, single-node instance of TensorFlow in parallel, so the model must fit in memory. The model/session will be loaded/initialized just once for each Spark Python worker, and the session will be cached for subsequent tasks/partitions to avoid re-loading the model for each partition.

    +
    +
    Args:
    +
tf_args: Dictionary of arguments specific to TensorFlow “main” function.
    +
    +
    +
    + +
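A hedged transform() sketch against an exported saved_model (tensor/column names, paths, and testDF are assumptions):

  model = TFModel(args) \
      .setExportDir('hdfs:///export') \
      .setTagSet('serve') \
      .setSignatureDefKey('serving_default') \
      .setInputMapping({'image': 'x'}) \
      .setOutputMapping({'prediction': 'col_out'})
  preds = model.transform(testDF)   # single-node TF inferencing, run in parallel on the executors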
    +
    +class TFParams[source]
    +

    Bases: pyspark.ml.param.Params

    +

    Mix-in class to store namespace-style args and merge w/ SparkML-style params.

    +
    +
    +args = None
    +
    + +
    +
    +merge_args_params()[source]
    +
    + +
    + +
    +
    +class TFTypeConverters[source]
    +

    Bases: object

    +

    Custom DataFrame TypeConverter for dictionary types (since this is not provided by Spark core).

    +
    +
    +static toDict()[source]
    +
    + +
    + +
    +
    +get_meta_graph_def(saved_model_dir, tag_set)[source]
    +

    Utility function to read a meta_graph_def from disk.

    +

    From saved_model_cli.py

    +
    +
    Args:
    +
saved_model_dir: path to saved_model.
tag_set: list of string tags identifying the TensorFlow graph within the saved_model.
    +
    +
    Returns:
    +
    A TensorFlow meta_graph_def, or raises an Exception otherwise.
    +
    +
    + +
    +
    +single_node_env(args)[source]
    +

    Sets up environment for a single-node TF session.

    +
    +
    Args:
    +
args: command line arguments as either argparse args or argv list
    +
    +
    +
    + +
    +
    +yield_batch(iterable, batch_size, num_tensors=1)[source]
    +

    Generator that yields batches of a DataFrame iterator.

    +
    +
    Args:
    +
iterable: Spark partition iterator.
batch_size: number of items to retrieve per invocation.
num_tensors: number of tensors (columns) expected in each item.
    +
    +
    Returns:
    +
    An array of num_tensors arrays, each of length batch_size
    +
    +
    +
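Roughly, for a two-column iterator (a toy, hypothetical input):

  for batch in yield_batch(iter([(1, 'a'), (2, 'b'), (3, 'c')]), batch_size=2, num_tensors=2):
      print(batch)   # [[1, 2], ['a', 'b']], then the partial batch [[3], ['c']]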
    @@ -97,7 +627,7 @@

diff --git a/docs/tensorflowonspark.reservation.html b/docs/tensorflowonspark.reservation.html
index 6e28df54..1152ea15 100644
--- a/docs/tensorflowonspark.reservation.html
+++ b/docs/tensorflowonspark.reservation.html
@@ -6,7 +6,7 @@
- tensorflowonspark.reservation module — TensorFlowOnSpark 1.2.1 documentation
+ tensorflowonspark.reservation module — TensorFlowOnSpark 1.3.0 documentation
@@ -33,7 +33,7 @@

    @@ -286,7 +286,7 @@

diff --git a/docs/tensorflowonspark.reservation_client.html b/docs/tensorflowonspark.reservation_client.html
index d9d87f6c..a34d1a84 100644
--- a/docs/tensorflowonspark.reservation_client.html
+++ b/docs/tensorflowonspark.reservation_client.html
@@ -6,7 +6,7 @@
- tensorflowonspark.reservation_client module — TensorFlowOnSpark 1.2.1 documentation
+ tensorflowonspark.reservation_client module — TensorFlowOnSpark 1.3.0 documentation
@@ -33,7 +33,7 @@

@@ -97,7 +97,7 @@

diff --git a/docs/tensorflowonspark.util.html b/docs/tensorflowonspark.util.html
index fb2895af..8a6751d3 100644
--- a/docs/tensorflowonspark.util.html
+++ b/docs/tensorflowonspark.util.html
@@ -6,7 +6,7 @@
- tensorflowonspark.util module — TensorFlowOnSpark 1.2.1 documentation
+ tensorflowonspark.util module — TensorFlowOnSpark 1.3.0 documentation
@@ -29,7 +29,7 @@

@@ -53,6 +53,18 @@


    Simple utility to get host IP address.

    +
    +
    +read_executor_id()[source]
    +

    Read worker id from a local file in the executor’s current working directory

    +
    + +
    +
    +write_executor_id(num)[source]
    +

    Write executor_id into a local file in the executor’s current working directory

    +
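A small sketch of how these two utilities pair up within an executor's working directory:

  write_executor_id(3)             # persists "3" to a local file named executor_id
  assert read_executor_id() == 3   # later Spark tasks on this executor recover the id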
    + @@ -99,7 +111,7 @@

  • - + diff --git a/setup.py b/setup.py index e373784e..0a423c72 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name = 'tensorflowonspark', packages = ['tensorflowonspark'], - version = '1.2.1', + version = '1.3.0', description = 'Deep learning with TensorFlow on Apache Spark clusters', author = 'Yahoo, Inc.', url = 'https://github.com/yahoo/TensorFlowOnSpark', diff --git a/tensorflowonspark/TFCluster.py b/tensorflowonspark/TFCluster.py index 102fa96f..de922e10 100644 --- a/tensorflowonspark/TFCluster.py +++ b/tensorflowonspark/TFCluster.py @@ -36,11 +36,13 @@ # status of TF background job tf_status = {} + class InputMode(object): """Enum for the input modes of data feeding.""" TENSORFLOW = 0 #: TensorFlow application is responsible for reading any data. SPARK = 1 #: Spark is responsible for feeding data to the TensorFlow application via an RDD. + class TFCluster(object): sc = None #: SparkContext @@ -197,8 +199,9 @@ def tensorboard_url(self): tb_url = "http://{0}:{1}".format(node['host'], node['tb_port']) return tb_url + def run(sc, map_fun, tf_args, num_executors, num_ps, tensorboard=False, input_mode=InputMode.TENSORFLOW, - log_dir=None, driver_ps_nodes=False, reservation_timeout=600, queues=['input', 'output', 'error']): + log_dir=None, driver_ps_nodes=False, master_node=None, reservation_timeout=600, queues=['input', 'output', 'error']): """Starts the TensorFlowOnSpark cluster and Runs the TensorFlow "main" function on the Spark executors Args: @@ -211,6 +214,7 @@ def run(sc, map_fun, tf_args, num_executors, num_ps, tensorboard=False, input_mo :input_mode: TFCluster.InputMode :log_dir: directory to save tensorboard event logs. If None, defaults to a fixed path on local filesystem. :driver_ps_nodes: run the PS nodes on the driver locally instead of on the spark executors; this help maximizing computing resources (esp. GPU). You will need to set cluster_size = num_executors + num_ps + :master_node: name of the "master" or "chief" node in the cluster_template, used for `tf.estimator` applications. :reservation_timeout: number of seconds after which cluster reservation times out (600 sec default) :queues: *INTERNAL_USE* @@ -226,8 +230,13 @@ def run(sc, map_fun, tf_args, num_executors, num_ps, tensorboard=False, input_mo # build a cluster_spec template using worker_nums cluster_template = {} cluster_template['ps'] = range(num_ps) - cluster_template['worker'] = range(num_ps, num_executors) - logging.info("worker node range %s, ps node range %s" % (cluster_template['worker'], cluster_template['ps'])) + if master_node is None: + cluster_template['worker'] = range(num_ps, num_executors) + else: + cluster_template[master_node] = range(num_ps, num_ps + 1) + if num_executors > num_ps + 1: + cluster_template['worker'] = range(num_ps + 1, num_executors) + logging.info("cluster_template: {}".format(cluster_template)) # get default filesystem from spark defaultFS = sc._jsc.hadoopConfiguration().get("fs.defaultFS") @@ -311,13 +320,17 @@ def _start(status): logging.info("") logging.info("========================================================================================") - # since our "primary key" for each executor's TFManager is (host, ppid), sanity check for duplicates + # since our "primary key" for each executor's TFManager is (host, executor_id), sanity check for duplicates # Note: this may occur if Spark retries failed Python tasks on the same executor. 
tb_nodes = set() for node in cluster_info: - node_id = (node['host'],node['ppid']) + node_id = (node['host'], node['executor_id']) if node_id in tb_nodes: - raise Exception("Duplicate cluster node id detected (host={0}, ppid={1}). Please ensure that (1) the number of executors >= number of TensorFlow nodes, (2) the number of tasks per executors == 1, and (3) TFCluster.shutdown() is successfully invoked when done.".format(node_id[0], node_id[1])) + raise Exception("Duplicate cluster node id detected (host={0}, executor_id={1})".format(node_id[0], node_id[1]) + + "Please ensure that:\n" + + "1. Number of executors >= number of TensorFlow nodes\n" + + "2. Number of tasks per executors is 1\n" + + "3, TFCluster.shutdown() is successfully invoked when done.") else: tb_nodes.add(node_id) diff --git a/tensorflowonspark/TFNode.py b/tensorflowonspark/TFNode.py index 2f65e0ac..2997481e 100755 --- a/tensorflowonspark/TFNode.py +++ b/tensorflowonspark/TFNode.py @@ -21,6 +21,7 @@ from six.moves.queue import Empty from . import marker + def hdfs_path(ctx, path): """Convenience function to create a Tensorflow-compatible absolute HDFS path from relative paths @@ -47,6 +48,7 @@ def hdfs_path(ctx, path): logging.warn("Unknown scheme {0} with relative path: {1}".format(ctx.defaultFS, path)) return "{0}/{1}".format(ctx.defaultFS, path) + def start_cluster_server(ctx, num_gpus=1, rdma=False): """Function that wraps the creation of TensorFlow ``tf.train.Server`` for a node in a distributed TensorFlow cluster. @@ -71,7 +73,8 @@ def start_cluster_server(ctx, num_gpus=1, rdma=False): if tf.test.is_built_with_cuda(): # GPU gpu_initialized = False - while not gpu_initialized: + retries = 3 + while not gpu_initialized and retries > 0: try: # override PS jobs to only reserve one GPU if ctx.job_name == 'ps': @@ -97,7 +100,10 @@ def start_cluster_server(ctx, num_gpus=1, rdma=False): except Exception as e: print(e) logging.error("{0}: Failed to allocate GPU, trying again...".format(ctx.worker_num)) + retries -= 1 time.sleep(10) + if not gpu_initialized: + raise Exception("Failed to allocate GPU") else: # CPU os.environ['CUDA_VISIBLE_DEVICES'] = '' @@ -111,10 +117,12 @@ def start_cluster_server(ctx, num_gpus=1, rdma=False): return (cluster, server) + def next_batch(mgr, batch_size, qname='input'): """*DEPRECATED*. 
Use TFNode.DataFeed class instead.""" raise Exception("DEPRECATED: Use TFNode.DataFeed class instead") + def export_saved_model(sess, export_dir, tag_set, signatures): """Convenience function to export a saved_model using provided arguments @@ -148,25 +156,29 @@ def export_saved_model(sess, export_dir, tag_set, signatures): signature_def_map = {} for key, sig in signatures.items(): signature_def_map[key] = tf.saved_model.signature_def_utils.build_signature_def( - inputs={ name:tf.saved_model.utils.build_tensor_info(tensor) for name, tensor in sig['inputs'].items() }, - outputs={ name:tf.saved_model.utils.build_tensor_info(tensor) for name, tensor in sig['outputs'].items() }, - method_name=sig['method_name'] if 'method_name' in sig else key) + inputs={name: tf.saved_model.utils.build_tensor_info(tensor) for name, tensor in sig['inputs'].items()}, + outputs={name: tf.saved_model.utils.build_tensor_info(tensor) for name, tensor in sig['outputs'].items()}, + method_name=sig['method_name'] if 'method_name' in sig else key) logging.info("===== signature_def_map: {}".format(signature_def_map)) - builder.add_meta_graph_and_variables(sess, - tag_set.split(','), - signature_def_map=signature_def_map, - clear_devices=True) + builder.add_meta_graph_and_variables( + sess, + tag_set.split(','), + signature_def_map=signature_def_map, + clear_devices=True) g.finalize() builder.save() + def batch_results(mgr, results, qname='output'): """*DEPRECATED*. Use TFNode.DataFeed class instead.""" raise Exception("DEPRECATED: Use TFNode.DataFeed class instead") + def terminate(mgr, qname='input'): """*DEPRECATED*. Use TFNode.DataFeed class instead.""" raise Exception("DEPRECATED: Use TFNode.DataFeed class instead") + class DataFeed(object): """This class manages the *InputMode.SPARK* data feeding process from the perspective of the TensorFlow application. @@ -184,7 +196,7 @@ def __init__(self, mgr, train_mode=True, qname_in='input', qname_out='output', i self.qname_in = qname_in self.qname_out = qname_out self.done_feeding = False - self.input_tensors = [ tensor for col, tensor in sorted(input_mapping.items()) ] if input_mapping is not None else None + self.input_tensors = [tensor for col, tensor in sorted(input_mapping.items())] if input_mapping is not None else None def next_batch(self, batch_size): """Gets a batch of items from the input RDD. @@ -206,7 +218,7 @@ def next_batch(self, batch_size): """ logging.debug("next_batch() invoked") queue = self.mgr.get_queue(self.qname_in) - tensors = [] if self.input_tensors is None else { tensor:[] for tensor in self.input_tensors } + tensors = [] if self.input_tensors is None else {tensor: [] for tensor in self.input_tensors} count = 0 while count < batch_size: item = queue.get(block=True) @@ -276,4 +288,3 @@ def terminate(self): except Empty: logging.info("dropped {0} items from queue".format(count)) done = True - diff --git a/tensorflowonspark/TFSparkNode.py b/tensorflowonspark/TFSparkNode.py index b79316dc..ad8ed9aa 100644 --- a/tensorflowonspark/TFSparkNode.py +++ b/tensorflowonspark/TFSparkNode.py @@ -8,6 +8,7 @@ from __future__ import nested_scopes from __future__ import print_function +import json import logging import multiprocessing import os @@ -22,10 +23,12 @@ from . import TFManager from . import TFNode +from . import gpu_info from . import marker from . import reservation from . import util + class TFNodeContext: """Encapsulates unique metadata for a TensorFlowOnSpark node/executor and provides methods to interact with Spark and HDFS. 
@@ -33,7 +36,7 @@ class TFNodeContext: To simply the end-user API, this class now mirrors the functions of the TFNode module. Args: - :worker_num: integer identifier for this executor, per ``nodeRDD = sc.parallelize(range(num_executors), num_executors).`` + :executor_id: integer identifier for this executor, per ``nodeRDD = sc.parallelize(range(num_executors), num_executors).`` :job_name: TensorFlow job name (e.g. 'ps' or 'worker') of this TF node, per cluster_spec. :task_index: integer rank per job_name, e.g. "worker:0", "worker:1", "ps:0". :cluster_spec: dictionary for constructing a tf.train.ClusterSpec. @@ -41,8 +44,9 @@ class TFNodeContext: :working_dir: the current working directory for local filesystems, or YARN containers. :mgr: TFManager instance for this Python worker. """ - def __init__(self, worker_num, job_name, task_index, cluster_spec, defaultFS, working_dir, mgr): - self.worker_num = worker_num + def __init__(self, executor_id, job_name, task_index, cluster_spec, defaultFS, working_dir, mgr): + self.worker_num = executor_id # for backwards-compatibility + self.executor_id = executor_id self.job_name = job_name self.task_index = task_index self.cluster_spec = cluster_spec @@ -83,22 +87,23 @@ class TFSparkNode(object): mgr = None #: TFManager instance cluster_id = None #: Unique ID for a given TensorFlowOnSpark cluster, used for invalidating state for new clusters. -def _get_manager(cluster_info, host, ppid): + +def _get_manager(cluster_info, host, executor_id): """Returns this executor's "singleton" instance of the multiprocessing.Manager, reconnecting per python-worker if needed. Args: :cluster_info: cluster node reservations - :host: host IP - :ppid: parent (executor JVM) PID + :host: host IP address + :executor_id: unique id per executor (created during initial call to run()) Returns: TFManager instance for this executor/python-worker """ for node in cluster_info: - if node['host'] == host and node['ppid'] == ppid: + if node['host'] == host and node['executor_id'] == executor_id: addr = node['addr'] authkey = node['authkey'] - TFSparkNode.mgr = TFManager.connect(addr,authkey) + TFSparkNode.mgr = TFManager.connect(addr, authkey) break if TFSparkNode.mgr is None: @@ -108,9 +113,10 @@ def _get_manager(cluster_info, host, ppid): "3. Spark dynamic allocation is disabled." raise Exception(msg) - logging.info("Connected to TFSparkNode.mgr on {0}, ppid={1}, state={2}".format(host, ppid, str(TFSparkNode.mgr.get('state')))) + logging.info("Connected to TFSparkNode.mgr on {0}, executor={1}, state={2}".format(host, executor_id, str(TFSparkNode.mgr.get('state')))) return TFSparkNode.mgr + def run(fn, tf_args, cluster_meta, tensorboard, log_dir, queues, background): """Wraps the user-provided TensorFlow main function in a Spark mapPartitions function. @@ -127,9 +133,15 @@ def run(fn, tf_args, cluster_meta, tensorboard, log_dir, queues, background): A nodeRDD.mapPartitions() function. 
""" def _mapfn(iter): + import tensorflow as tf + # Note: consuming the input iterator helps Pyspark re-use this worker, for i in iter: - worker_num = i + executor_id = i + + # run quick check of GPU infrastructure if using tensorflow-gpu + if tf.test.is_built_with_cuda(): + gpus_to_use = gpu_info.get_gpus(1) # assign TF job/task based on provided cluster_spec template (or use default/null values) job_name = 'default' @@ -138,21 +150,21 @@ def _mapfn(iter): cluster_template = cluster_meta['cluster_template'] for jobtype in cluster_template: nodes = cluster_template[jobtype] - if worker_num in nodes: + if executor_id in nodes: job_name = jobtype - task_index = nodes.index(worker_num) + task_index = nodes.index(executor_id) break - # get unique id (hostname,ppid) for this executor's JVM + # get unique key (hostname, executor_id) for this executor host = util.get_ip_address() - ppid = os.getppid() + util.write_executor_id(executor_id) port = 0 # check for existing TFManagers if TFSparkNode.mgr is not None and str(TFSparkNode.mgr.get('state')) != "'stopped'": if TFSparkNode.cluster_id == cluster_id: # raise an exception to force Spark to retry this "reservation" task on another executor - raise Exception("TFManager already started on {0}, ppid={1}, state={2}".format(host, ppid, str(TFSparkNode.mgr.get("state")))) + raise Exception("TFManager already started on {0}, executor={1}, state={2}".format(host, executor_id, str(TFSparkNode.mgr.get("state")))) else: # old state, just continue with creating new manager logging.warn("Ignoring old TFManager with cluster_id {0}, requested cluster_id {1}".format(TFSparkNode.cluster_id, cluster_id)) @@ -187,10 +199,10 @@ def _mapfn(iter): tb_port = 0 if tensorboard and job_name == 'worker' and task_index == 0: tb_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - tb_sock.bind(('',0)) + tb_sock.bind(('', 0)) tb_port = tb_sock.getsockname()[1] tb_sock.close() - logdir = log_dir if log_dir else "tensorboard_%d" % worker_num + logdir = log_dir if log_dir else "tensorboard_%d" % executor_id # search for tensorboard in python/bin, PATH, and PYTHONPATH pypath = sys.executable @@ -214,8 +226,8 @@ def _mapfn(iter): tmp_sock = None node_meta = None for node in cluster_info: - (nhost, nppid) = (node['host'], node['ppid']) - if nhost == host and nppid == ppid: + (nhost, nexec) = (node['host'], node['executor_id']) + if nhost == host and nexec == executor_id: node_meta = node port = node['port'] @@ -224,13 +236,12 @@ def _mapfn(iter): # first, find a free port for TF tmp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) tmp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - tmp_sock.bind(('',port)) + tmp_sock.bind(('', port)) port = tmp_sock.getsockname()[1] node_meta = { - 'worker_num': worker_num, + 'executor_id': executor_id, 'host': host, - 'ppid': ppid, 'job_name': job_name, 'task_index': task_index, 'port': port, @@ -247,21 +258,36 @@ def _mapfn(iter): client.close() # construct a TensorFlow clusterspec from cluster_info - sorted_cluster_info = sorted(cluster_info, key=lambda k: k['worker_num']) + sorted_cluster_info = sorted(cluster_info, key=lambda k: k['executor_id']) spec = {} - last_worker_num = -1 + last_executor_id = -1 for node in sorted_cluster_info: - if (node['worker_num'] == last_worker_num): + if (node['executor_id'] == last_executor_id): raise Exception("Duplicate worker/task in cluster_info") - last_worker_num = node['worker_num'] + last_executor_id = node['executor_id'] logging.info("node: {0}".format(node)) (njob, nhost, 
nport) = (node['job_name'], node['host'], node['port']) hosts = [] if njob not in spec else spec[njob] hosts.append("{0}:{1}".format(nhost, nport)) spec[njob] = hosts + # update TF_CONFIG and reserve GPU for tf.estimator based code + # Note: this will execute but be ignored by non-tf.estimator code + tf_config = json.dumps({ + 'cluster': spec, + 'task': {'type': job_name, 'index': task_index}, + 'environment': 'cloud' + }) + os.environ['TF_CONFIG'] = tf_config + if tf.test.is_built_with_cuda(): + num_gpus = tf_args.num_gpus if 'num_gpus' in tf_args else 1 + gpus_to_use = gpu_info.get_gpus(num_gpus) + gpu_str = "GPUs" if num_gpus > 1 else "GPU" + logging.debug("Requested {} {}, setting CUDA_VISIBLE_DEVICES={}".format(num_gpus, gpu_str, gpus_to_use)) + os.environ['CUDA_VISIBLE_DEVICES'] = gpus_to_use + # create a context object to hold metadata for TF - ctx = TFNodeContext(worker_num, job_name, task_index, spec, cluster_meta['default_fs'], cluster_meta['working_dir'], TFSparkNode.mgr) + ctx = TFNodeContext(executor_id, job_name, task_index, spec, cluster_meta['default_fs'], cluster_meta['working_dir'], TFSparkNode.mgr) # release port reserved for TF as late as possible if tmp_sock is not None: @@ -295,7 +321,7 @@ def wrapper_fn_background(args, context): if job_name == 'ps' or background: # invoke the TensorFlow main function in a background thread logging.info("Starting TensorFlow {0}:{1} as {2} on cluster node {3} on background process".format( - job_name, task_index, job_name, worker_num)) + job_name, task_index, job_name, executor_id)) p = multiprocessing.Process(target=wrapper_fn_background, args=(tf_args, ctx)) if job_name == 'ps': @@ -323,17 +349,18 @@ def wrapper_fn_background(args, context): queue.task_done() else: # otherwise, just run TF function in the main executor/worker thread - logging.info("Starting TensorFlow {0}:{1} on cluster node {2} on foreground thread".format(job_name, task_index, worker_num)) + logging.info("Starting TensorFlow {0}:{1} on cluster node {2} on foreground thread".format(job_name, task_index, executor_id)) wrapper_fn(tf_args, ctx) - logging.info("Finished TensorFlow {0}:{1} on cluster node {2}".format(job_name, task_index, worker_num)) + logging.info("Finished TensorFlow {0}:{1} on cluster node {2}".format(job_name, task_index, executor_id)) return _mapfn + def train(cluster_info, cluster_meta, qname='input'): """Feeds Spark partitions into the shared multiprocessing.Queue. Args: - :cluster_info: node reservation information for the cluster (e.g. host, ppid, pid, ports, etc) + :cluster_info: node reservation information for the cluster (e.g. host, executor_id, pid, ports, etc) :cluster_meta: dictionary of cluster metadata (e.g. cluster_id, reservation.Server address, etc) :qname: *INTERNAL_USE* @@ -342,7 +369,7 @@ def train(cluster_info, cluster_meta, qname='input'): """ def _train(iter): # get shared queue, reconnecting if necessary - mgr = _get_manager(cluster_info, util.get_ip_address(), os.getppid()) + mgr = _get_manager(cluster_info, util.get_ip_address(), util.read_executor_id()) try: queue = mgr.get_queue(qname) equeue = mgr.get_queue('error') @@ -394,11 +421,12 @@ def _train(iter): return _train + def inference(cluster_info, qname='input'): """Feeds Spark partitions into the shared multiprocessing.Queue and returns inference results. Args: - :cluster_info: node reservation information for the cluster (e.g. host, ppid, pid, ports, etc) + :cluster_info: node reservation information for the cluster (e.g. 
host, executor_id, pid, ports, etc) :qname: *INTERNAL_USE* Returns: @@ -406,7 +434,7 @@ def inference(cluster_info, qname='input'): """ def _inference(iter): # get shared queue, reconnecting if necessary - mgr = _get_manager(cluster_info, util.get_ip_address(), os.getppid()) + mgr = _get_manager(cluster_info, util.get_ip_address(), util.read_executor_id()) try: queue_in = mgr.get_queue(qname) equeue = mgr.get_queue('error') @@ -453,11 +481,12 @@ def _inference(iter): return _inference + def shutdown(cluster_info, queues=['input']): """Stops all TensorFlow nodes by feeding ``None`` into the multiprocessing.Queues. Args: - :cluster_info: node reservation information for the cluster (e.g. host, ppid, pid, ports, etc). + :cluster_info: node reservation information for the cluster (e.g. host, executor_id, pid, ports, etc). :queues: *INTERNAL_USE* Returns: @@ -465,14 +494,14 @@ def shutdown(cluster_info, queues=['input']): """ def _shutdown(iter): host = util.get_ip_address() - ppid = os.getppid() + executor_id = util.read_executor_id() # reconnect to shared queue - mgr = _get_manager(cluster_info, host, ppid) + mgr = _get_manager(cluster_info, host, executor_id) # send SIGTERM to Tensorboard proc (if running) for node in cluster_info: - if node['host'] == host and node['ppid'] == ppid: + if node['host'] == host and node['executor_id'] == executor_id: tb_pid = node['tb_pid'] if tb_pid != 0: logging.info("Stopping tensorboard (pid={0})".format(tb_pid)) @@ -494,4 +523,3 @@ def _shutdown(iter): return [True] return _shutdown - diff --git a/tensorflowonspark/dfutil.py b/tensorflowonspark/dfutil.py index 0654c0bd..59c47ece 100644 --- a/tensorflowonspark/dfutil.py +++ b/tensorflowonspark/dfutil.py @@ -37,8 +37,8 @@ def saveAsTFRecords(df, output_dir): """ tf_rdd = df.rdd.mapPartitions(toTFExample(df.dtypes)) tf_rdd.saveAsNewAPIHadoopFile(output_dir, "org.tensorflow.hadoop.io.TFRecordFileOutputFormat", - keyClass="org.apache.hadoop.io.BytesWritable", - valueClass="org.apache.hadoop.io.NullWritable") + keyClass="org.apache.hadoop.io.BytesWritable", + valueClass="org.apache.hadoop.io.NullWritable") def loadTFRecords(sc, input_dir, binary_features=[]): @@ -61,8 +61,8 @@ def loadTFRecords(sc, input_dir, binary_features=[]): import tensorflow as tf tfr_rdd = sc.newAPIHadoopFile(input_dir, "org.tensorflow.hadoop.io.TFRecordFileInputFormat", - keyClass="org.apache.hadoop.io.BytesWritable", - valueClass="org.apache.hadoop.io.NullWritable") + keyClass="org.apache.hadoop.io.BytesWritable", + valueClass="org.apache.hadoop.io.NullWritable") # infer Spark SQL types from tf.Example record = tfr_rdd.take(1)[0] @@ -162,7 +162,7 @@ def _infer_sql_type(k, v): else: # represent everything else as base types (and empty tensors as StringType()) return sql_type - return StructType([ StructField(k, _infer_sql_type(k, v), True) for k,v in sorted(example.features.feature.items()) ]) + return StructType([StructField(k, _infer_sql_type(k, v), True) for k, v in sorted(example.features.feature.items())]) def fromTFExample(iter, binary_features=[]): @@ -203,9 +203,8 @@ def _get_value(k, v): for record in iter: example = tf.train.Example() example.ParseFromString(bytes(record[0])) # record is (bytestr, None) - d = { k: _get_value(k, v) for k,v in sorted(example.features.feature.items()) } + d = {k: _get_value(k, v) for k, v in sorted(example.features.feature.items())} row = Row(**d) results.append(row) return results - diff --git a/tensorflowonspark/gpu_info.py b/tensorflowonspark/gpu_info.py index 1e73789e..8ef6dd8d 100644 --- 
--- a/tensorflowonspark/gpu_info.py
+++ b/tensorflowonspark/gpu_info.py
@@ -16,6 +16,7 @@
 
 MAX_RETRIES = 3         #: Maximum retries to allocate GPUs
 
+
 def _get_gpu():
   """*DEPRECATED*. Allocates first available GPU using cudaSetDevice(), or returns 0 otherwise."""
   # Note: this code executes, but Tensorflow subsequently complains that the "current context was not created by the StreamExecutor cuda_driver API"
@@ -38,6 +39,7 @@ def _get_gpu():
       break
   return gpu
 
+
 def get_gpus(num_gpu=1):
   """Get list of free GPUs according to nvidia-smi.
 
@@ -49,49 +51,46 @@ def get_gpus(num_gpu=1):
   Returns:
     Comma-delimited string of GPU ids, or raises an Exception if the requested number of GPUs could not be found.
   """
+  # get list of gpus (index, uuid)
+  list_gpus = subprocess.check_output(["nvidia-smi", "--list-gpus"]).decode()
+  logging.debug("all GPUs:\n{0}".format(list_gpus))
+
+  # parse index and guid
+  gpus = [x for x in list_gpus.split('\n') if len(x) > 0]
+
+  def parse_gpu(gpu_str):
+    cols = gpu_str.split(' ')
+    return cols[5].split(')')[0], cols[1].split(':')[0]
+  gpu_list = [parse_gpu(gpu) for gpu in gpus]
+
+  # randomize the search order to get a better distribution of GPUs
+  random.shuffle(gpu_list)
+
+  free_gpus = []
+  retries = 0
+  while len(free_gpus) < num_gpu and retries < MAX_RETRIES:
+    smi_output = subprocess.check_output(["nvidia-smi", "--format=csv,noheader,nounits", "--query-compute-apps=gpu_uuid"]).decode()
+    logging.debug("busy GPUs:\n{0}".format(smi_output))
+    busy_uuids = [x for x in smi_output.split('\n') if len(x) > 0]
+    for uuid, index in gpu_list:
+      if uuid not in busy_uuids:
+        free_gpus.append(index)
-  try:
-    # get list of gpus (index, uuid)
-    list_gpus = subprocess.check_output(["nvidia-smi", "--list-gpus"]).decode()
-    logging.debug("all GPUs:\n{0}".format(list_gpus))
-
-    # parse index and guid
-    gpus = [ x for x in list_gpus.split('\n') if len(x) > 0 ]
-
-    def parse_gpu(gpu_str):
-      cols = gpu_str.split(' ')
-      return cols[5].split(')')[0], cols[1].split(':')[0]
-    gpu_list = [parse_gpu(gpu) for gpu in gpus]
-
-    # randomize the search order to get a better distribution of GPUs
-    random.shuffle(gpu_list)
-
-    free_gpus = []
-    retries = 0
-    while len(free_gpus) < num_gpu and retries < MAX_RETRIES:
-      smi_output = subprocess.check_output(["nvidia-smi", "--format=csv,noheader,nounits", "--query-compute-apps=gpu_uuid"]).decode()
-      logging.debug("busy GPUs:\n{0}".format(smi_output))
-      busy_uuids = [x for x in smi_output.split('\n') if len(x) > 0 ]
-      for uuid, index in gpu_list:
-        if uuid not in busy_uuids:
-          free_gpus.append(index)
-
-      if len(free_gpus) < num_gpu:
-        # keep trying indefinitely
-        logging.warn("Unable to find available GPUs: requested={0}, available={1}".format(num_gpu, len(free_gpus)))
-        retries += 1
-        time.sleep(30 * retries)
-        free_gpus = []
-
-    # if still can't find GPUs, raise exception
     if len(free_gpus) < num_gpu:
-      smi_output = subprocess.check_output(["nvidia-smi", "--format=csv", "--query-compute-apps=gpu_uuid,pid,process_name,used_gpu_memory"]).decode()
-      logging.info(": {0}".format(smi_output))
-      raise Exception("Unable to find free GPU:\n{0}".format(smi_output))
+      # keep trying indefinitely
+      logging.warn("Unable to find available GPUs: requested={0}, available={1}".format(num_gpu, len(free_gpus)))
+      retries += 1
+      time.sleep(30 * retries)
+      free_gpus = []
+
+  # if still can't find GPUs, raise exception
+  if len(free_gpus) < num_gpu:
+    smi_output = subprocess.check_output(["nvidia-smi", "--format=csv", "--query-compute-apps=gpu_uuid,pid,process_name,used_gpu_memory"]).decode()
+    logging.info(": {0}".format(smi_output))
+    raise Exception("Unable to find free GPU:\n{0}".format(smi_output))
+
+  return ','.join(free_gpus[:num_gpu])
-    return ','.join(free_gpus[:num_gpu])
-  except subprocess.CalledProcessError as e:
-    print ("nvidia-smi error", e.output)
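The `parse_gpu()` helper retained in the rewritten `get_gpus()` splits one line of `nvidia-smi --list-gpus` output on spaces and picks out the UUID (column 5, trailing ')' stripped) and the device index (column 1, trailing ':' stripped). A sketch against a hypothetical output line; note this parse assumes a two-word GPU model name such as "Tesla K80":

    line = "GPU 0: Tesla K80 (UUID: GPU-1fb8e1b6-...)"   # illustrative nvidia-smi output
    cols = line.split(' ')
    uuid = cols[5].split(')')[0]     # "GPU-1fb8e1b6-..."
    index = cols[1].split(':')[0]    # "0"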
"--query-compute-apps=gpu_uuid,pid,process_name,used_gpu_memory"]).decode() + logging.info(": {0}".format(smi_output)) + raise Exception("Unable to find free GPU:\n{0}".format(smi_output)) + + return ','.join(free_gpus[:num_gpu]) - return ','.join(free_gpus[:num_gpu]) - except subprocess.CalledProcessError as e: - print ("nvidia-smi error", e.output) # Function to get the gpu information def _get_free_gpu(max_gpu_utilization=40, min_free_memory=0.5, num_gpu=1): @@ -126,7 +125,7 @@ def get_gpu_info(): # Read the gpu information multiple times num_times_to_average = 5 current_array = [] - for ind in xrange(num_times_to_average): + for ind in range(num_times_to_average): current_array.append(get_gpu_info()) time.sleep(1) @@ -134,12 +133,12 @@ def get_gpu_info(): num_gpus = len(current_array[0]) # Average the gpu information - avg_array = [(0,0,str(x)) for x in xrange(num_gpus)] - for ind in xrange(num_times_to_average): - for gpu_ind in xrange(num_gpus): + avg_array = [(0, 0, str(x)) for x in range(num_gpus)] + for ind in range(num_times_to_average): + for gpu_ind in range(num_gpus): avg_array[gpu_ind] = (avg_array[gpu_ind][0] + current_array[ind][gpu_ind][0], avg_array[gpu_ind][1] + current_array[ind][gpu_ind][1], avg_array[gpu_ind][2]) - for gpu_ind in xrange(num_gpus): + for gpu_ind in range(num_gpus): avg_array[gpu_ind] = (float(avg_array[gpu_ind][0]) / num_times_to_average, float(avg_array[gpu_ind][1]) / num_times_to_average, avg_array[gpu_ind][2]) avg_array.sort() @@ -164,4 +163,3 @@ def get_gpu_info(): break return gpus_to_use, free_memory - diff --git a/tensorflowonspark/marker.py b/tensorflowonspark/marker.py index a7b957cf..d87a365e 100755 --- a/tensorflowonspark/marker.py +++ b/tensorflowonspark/marker.py @@ -7,11 +7,12 @@ from __future__ import nested_scopes from __future__ import print_function + class Marker(object): """Base class for special marker objects in the data queue""" pass + class EndPartition(Marker): """Marks the end of an RDD Partition during data feeding""" pass - diff --git a/tensorflowonspark/pipeline.py b/tensorflowonspark/pipeline.py index 1a4f5c9c..825af1d4 100755 --- a/tensorflowonspark/pipeline.py +++ b/tensorflowonspark/pipeline.py @@ -34,7 +34,8 @@ import subprocess import sys -##### TensorFlowOnSpark Params + +# TensorFlowOnSpark Params class TFTypeConverters(object): """Custom DataFrame TypeConverter for dictionary types (since this is not provided by Spark core).""" @@ -45,158 +46,225 @@ def toDict(value): else: raise TypeError("Could not convert %s to OrderedDict" % value) + class HasBatchSize(Params): batch_size = Param(Params._dummy(), "batch_size", "Number of records per batch", typeConverter=TypeConverters.toInt) + def __init__(self): super(HasBatchSize, self).__init__() + def setBatchSize(self, value): return self._set(batch_size=value) + def getBatchSize(self): return self.getOrDefault(self.batch_size) + class HasClusterSize(Params): cluster_size = Param(Params._dummy(), "cluster_size", "Number of nodes in the cluster", typeConverter=TypeConverters.toInt) + def __init__(self): super(HasClusterSize, self).__init__() + def setClusterSize(self, value): return self._set(cluster_size=value) + def getClusterSize(self): return self.getOrDefault(self.cluster_size) + class HasEpochs(Params): epochs = Param(Params._dummy(), "epochs", "Number of epochs to train", typeConverter=TypeConverters.toInt) + def __init__(self): super(HasEpochs, self).__init__() + def setEpochs(self, value): return self._set(epochs=value) + def getEpochs(self): return 
+
 class HasInputMapping(Params):
   input_mapping = Param(Params._dummy(), "input_mapping", "Mapping of input DataFrame column to input tensor", typeConverter=TFTypeConverters.toDict)
+
   def __init__(self):
     super(HasInputMapping, self).__init__()
+
   def setInputMapping(self, value):
     return self._set(input_mapping=value)
+
   def getInputMapping(self):
     return self.getOrDefault(self.input_mapping)
 
+
 class HasInputMode(Params):
   input_mode = Param(Params._dummy(), "input_mode", "Input data feeding mode (0=TENSORFLOW, 1=SPARK)", typeConverter=TypeConverters.toInt)
+
   def __init__(self):
     super(HasInputMode, self).__init__()
+
   def setInputMode(self, value):
     return self._set(input_mode=value)
+
   def getInputMode(self):
     return self.getOrDefault(self.input_mode)
 
+
 class HasModelDir(Params):
   model_dir = Param(Params._dummy(), "model_dir", "Path to save/load model checkpoints", typeConverter=TypeConverters.toString)
+
   def __init__(self):
     super(HasModelDir, self).__init__()
+
   def setModelDir(self, value):
     return self._set(model_dir=value)
+
   def getModelDir(self):
     return self.getOrDefault(self.model_dir)
 
+
 class HasNumPS(Params):
   num_ps = Param(Params._dummy(), "num_ps", "Number of PS nodes in cluster", typeConverter=TypeConverters.toInt)
   driver_ps_nodes = Param(Params._dummy(), "driver_ps_nodes", "Run PS nodes on driver locally", typeConverter=TypeConverters.toBoolean)
+
   def __init__(self):
     super(HasNumPS, self).__init__()
+
   def setNumPS(self, value):
     return self._set(num_ps=value)
+
   def getNumPS(self):
     return self.getOrDefault(self.num_ps)
+
   def setDriverPSNodes(self, value):
     return self._set(driver_ps_nodes=value)
+
   def getDriverPSNodes(self):
     return self.getOrDefault(self.driver_ps_nodes)
 
+
 class HasOutputMapping(Params):
   output_mapping = Param(Params._dummy(), "output_mapping", "Mapping of output tensor to output DataFrame column", typeConverter=TFTypeConverters.toDict)
+
   def __init__(self):
     super(HasOutputMapping, self).__init__()
+
   def setOutputMapping(self, value):
     return self._set(output_mapping=value)
+
   def getOutputMapping(self):
     return self.getOrDefault(self.output_mapping)
 
+
 class HasProtocol(Params):
   protocol = Param(Params._dummy(), "protocol", "Network protocol for Tensorflow (grpc|rdma)", typeConverter=TypeConverters.toString)
+
   def __init__(self):
     super(HasProtocol, self).__init__()
+
   def setProtocol(self, value):
     return self._set(protocol=value)
+
   def getProtocol(self):
     return self.getOrDefault(self.protocol)
 
+
 class HasReaders(Params):
   readers = Param(Params._dummy(), "readers", "number of reader/enqueue threads", typeConverter=TypeConverters.toInt)
+
   def __init__(self):
     super(HasReaders, self).__init__()
+
   def setReaders(self, value):
     return self._set(readers=value)
+
   def getReaders(self):
     return self.getOrDefault(self.readers)
 
+
 class HasSteps(Params):
   steps = Param(Params._dummy(), "steps", "Maximum number of steps to train", typeConverter=TypeConverters.toInt)
+
   def __init__(self):
     super(HasSteps, self).__init__()
+
   def setSteps(self, value):
     return self._set(steps=value)
+
   def getSteps(self):
     return self.getOrDefault(self.steps)
 
+
 class HasTensorboard(Params):
   tensorboard = Param(Params._dummy(), "tensorboard", "Launch tensorboard process", typeConverter=TypeConverters.toBoolean)
+
   def __init__(self):
     super(HasTensorboard, self).__init__()
+
   def setTensorboard(self, value):
     return self._set(tensorboard=value)
+
   def getTensorboard(self):
     return self.getOrDefault(self.tensorboard)
 
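The two dict-valued params above are the ones that need `TFTypeConverters.toDict`, and their orientation is deliberately opposite: columns key the input map, tensors key the output map. An illustrative pair (column and tensor names are hypothetical):

    input_mapping = {'image': 'x', 'label': 'y_'}    # DataFrame column -> input tensor name
    output_mapping = {'prediction': 'argmax'}        # output tensor name -> DataFrame column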
"tfrecord_dir", "Path to temporarily export a DataFrame as TFRecords (for InputMode.TENSORFLOW apps)", typeConverter=TypeConverters.toString) + def __init__(self): super(HasTFRecordDir, self).__init__() + def setTFRecordDir(self, value): return self._set(tfrecord_dir=value) + def getTFRecordDir(self): return self.getOrDefault(self.tfrecord_dir) -##### SavedModelBuilder Params + +# SavedModelBuilder Params class HasExportDir(Params): export_dir = Param(Params._dummy(), "export_dir", "Directory to export saved_model", typeConverter=TypeConverters.toString) + def __init__(self): super(HasExportDir, self).__init__() + def setExportDir(self, value): return self._set(export_dir=value) + def getExportDir(self): return self.getOrDefault(self.export_dir) + class HasSignatureDefKey(Params): signature_def_key = Param(Params._dummy(), "signature_def_key", "Identifier for a specific saved_model signature", typeConverter=TypeConverters.toString) + def __init__(self): super(HasSignatureDefKey, self).__init__() self._setDefault(signature_def_key=None) + def setSignatureDefKey(self, value): return self._set(signature_def_key=value) + def getSignatureDefKey(self): return self.getOrDefault(self.signature_def_key) + class HasTagSet(Params): tag_set = Param(Params._dummy(), "tag_set", "Comma-delimited list of tags identifying a saved_model metagraph", typeConverter=TypeConverters.toString) + def __init__(self): super(HasTagSet, self).__init__() + def setTagSet(self, value): return self._set(tag_set=value) + def getTagSet(self): return self.getOrDefault(self.tag_set) + class Namespace(object): """ Utility class to convert dictionaries to Namespace-like objects. @@ -204,6 +272,7 @@ class Namespace(object): Based on https://docs.python.org/dev/library/types.html#types.SimpleNamespace """ argv = None + def __init__(self, d): if isinstance(d, list): self.argv = d @@ -238,9 +307,11 @@ def __eq__(self, other): else: return self.__dict__ == other.__dict__ + class TFParams(Params): """Mix-in class to store namespace-style args and merge w/ SparkML-style params.""" args = None + def merge_args_params(self): local_args = copy.copy(self.args) # make a local copy of args args_dict = vars(local_args) # get dictionary view @@ -248,6 +319,7 @@ def merge_args_params(self): args_dict[p.name] = self.getOrDefault(p.name) # update with params return local_args + class TFEstimator(Estimator, TFParams, HasInputMapping, HasClusterSize, HasNumPS, HasInputMode, HasProtocol, HasTensorboard, HasModelDir, HasExportDir, HasTFRecordDir, HasBatchSize, HasEpochs, HasReaders, HasSteps): @@ -279,19 +351,19 @@ def __init__(self, train_fn, tf_args, export_fn=None): self.export_fn = export_fn self.args = Namespace(tf_args) self._setDefault(input_mapping={}, - cluster_size=1, - num_ps=0, - driver_ps_nodes=False, - input_mode=TFCluster.InputMode.SPARK, - protocol='grpc', - tensorboard=False, - model_dir=None, - export_dir=None, - tfrecord_dir=None, - batch_size=100, - epochs=1, - readers=1, - steps=1000) + cluster_size=1, + num_ps=0, + driver_ps_nodes=False, + input_mode=TFCluster.InputMode.SPARK, + protocol='grpc', + tensorboard=False, + model_dir=None, + export_dir=None, + tfrecord_dir=None, + batch_size=100, + epochs=1, + readers=1, + steps=1000) def _fit(self, dataset): """Trains a TensorFlow model and returns a TFModel instance with the same args/params pointing to a checkpoint or saved_model on disk. 
   def _fit(self, dataset):
     """Trains a TensorFlow model and returns a TFModel instance with the same args/params pointing to a checkpoint or saved_model on disk.
@@ -347,6 +419,7 @@ def _export(iterator, fn, args):
 
     return self._copyValues(TFModel(self.args))
 
+
 class TFModel(Model, TFParams,
               HasInputMapping, HasOutputMapping,
               HasBatchSize,
@@ -365,12 +438,12 @@ def __init__(self, tf_args):
     super(TFModel, self).__init__()
     self.args = Namespace(tf_args)
     self._setDefault(input_mapping={},
-                    output_mapping={},
-                    batch_size=100,
-                    model_dir=None,
-                    export_dir=None,
-                    signature_def_key=None,
-                    tag_set=None)
+                     output_mapping={},
+                     batch_size=100,
+                     model_dir=None,
+                     export_dir=None,
+                     signature_def_key=None,
+                     tag_set=None)
 
   def _transform(self, dataset):
     """Transforms the input DataFrame by applying the _run_model() mapPartitions function.
@@ -381,8 +454,8 @@ def _transform(self, dataset):
     spark = SparkSession.builder.getOrCreate()
 
     # set a deterministic order for input/output columns (lexicographic by key)
-    input_cols = [ col for col, tensor in sorted(self.getInputMapping().items()) ]      # input col => input tensor
-    output_cols = [ col for tensor, col in sorted(self.getOutputMapping().items()) ]    # output tensor => output col
+    input_cols = [col for col, tensor in sorted(self.getInputMapping().items())]        # input col => input tensor
+    output_cols = [col for tensor, col in sorted(self.getOutputMapping().items())]      # output tensor => output col
 
     # run single-node inferencing on each executor
     logging.info("input_cols: {}".format(input_cols))
@@ -406,6 +479,7 @@ def _transform(self, dataset):
 
 global_sess = None    # tf.Session cache
 global_args = None    # args provided to the _run_model() method.  Any change will invalidate the global_sess cache.
 
+
 def _run_model(iterator, args, tf_args):
   """mapPartitions function to run single-node inferencing from a checkpoint/saved_model, using the model's input/output mappings.
 
@@ -421,8 +495,8 @@ def _run_model(iterator, args, tf_args):
   logging.info("===== input_mapping: {}".format(args.input_mapping))
   logging.info("===== output_mapping: {}".format(args.output_mapping))
-  input_tensor_names = [ tensor for col,tensor in sorted(args.input_mapping.items()) ]
-  output_tensor_names = [ tensor for tensor,col in sorted(args.output_mapping.items()) ]
+  input_tensor_names = [tensor for col, tensor in sorted(args.input_mapping.items())]
+  output_tensor_names = [tensor for tensor, col in sorted(args.output_mapping.items())]
 
   # if using a signature_def_key, get input/output tensor info from the requested signature
   if args.signature_def_key:
@@ -481,14 +555,15 @@ def _run_model(iterator, args, tf_args):
           inputs_feed_dict[input_tensors[i]] = tensors[i]
 
         outputs = sess.run(output_tensors, feed_dict=inputs_feed_dict)
-        lengths = [ len(output) for output in outputs ]
+        lengths = [len(output) for output in outputs]
         input_size = len(tensors[0])
-        assert all([ l == input_size for l in lengths ]), "Output array sizes {} must match input size: {}".format(lengths, input_size)
-        python_outputs = [ output.tolist() for output in outputs ]      # convert from numpy to standard python types
-        result.extend(zip(*python_outputs))                             # convert to an array of tuples of "output columns"
+        assert all([length == input_size for length in lengths]), "Output array sizes {} must match input size: {}".format(lengths, input_size)
+        python_outputs = [output.tolist() for output in outputs]        # convert from numpy to standard python types
+        result.extend(zip(*python_outputs))                             # convert to an array of tuples of "output columns"
 
   return result
 
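The `_run_model()` mapPartitions function above is driven entirely by the TFModel params. A sketch of the inference side, with illustrative paths and mappings ("serve" is the standard `tf.saved_model` serving tag, and `args`/`test_df` are placeholders):

    model = TFModel(args) \
        .setExportDir('mnist_export') \
        .setTagSet('serve') \
        .setSignatureDefKey('serving_default') \
        .setInputMapping({'image': 'x'}) \
        .setOutputMapping({'prediction': 'col_out'})
    preds = model.transform(test_df)   # _run_model() caches the tf.Session per executor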
 def single_node_env(args):
   """Sets up environment for a single-node TF session.
@@ -522,6 +597,7 @@ def single_node_env(args):
     logging.info("Using CPU")
     os.environ['CUDA_VISIBLE_DEVICES'] = ''
 
+
 def get_meta_graph_def(saved_model_dir, tag_set):
   """Utility function to read a meta_graph_def from disk.
 
@@ -541,6 +617,7 @@ def get_meta_graph_def(saved_model_dir, tag_set):
       return meta_graph_def
   raise RuntimeError("MetaGraphDef associated with tag-set {0} could not be found in SavedModel".format(tag_set))
 
+
 def yield_batch(iterable, batch_size, num_tensors=1):
   """Generator that yields batches of a DataFrame iterator.
 
@@ -552,7 +629,7 @@ def yield_batch(iterable, batch_size, num_tensors=1):
   Returns:
     An array of ``num_tensors`` arrays, each of length `batch_size`
   """
-  tensors = [ [] for i in range(num_tensors) ]
+  tensors = [[] for i in range(num_tensors)]
   for item in iterable:
     if item is None:
       break
@@ -561,6 +638,6 @@ def yield_batch(iterable, batch_size, num_tensors=1):
       tensors[i].append(tmp)
     if len(tensors[0]) >= batch_size:
       yield tensors
-      tensors = [ [] for i in range(num_tensors) ]
+      tensors = [[] for i in range(num_tensors)]
   if len(tensors[0]) > 0:
     yield tensors
diff --git a/tensorflowonspark/reservation.py b/tensorflowonspark/reservation.py
index 78e4cc5e..8f204520 100644
--- a/tensorflowonspark/reservation.py
+++ b/tensorflowonspark/reservation.py
@@ -22,6 +22,7 @@
 BUFSIZE = 1024
 MAX_RETRIES = 3
 
+
 class Reservations:
   """Thread-safe store for node reservations.
 
@@ -58,6 +59,7 @@ def remaining(self):
     with self.lock:
       return self.required - len(self.reservations)
 
+
 class MessageSocket(object):
   """Abstract class w/ length-prefixed socket send/receive functions."""
 
@@ -146,13 +148,13 @@ def start(self):
     """
     server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     server_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-    server_sock.bind(('',0))
+    server_sock.bind(('', 0))
     server_sock.listen(10)
 
     # hostname may not be resolvable but IP address probably will be
     host = util.get_ip_address()
     port = server_sock.getsockname()[1]
-    addr = (host,port)
+    addr = (host, port)
     logging.info("listening for reservations at {0}".format(addr))
 
     def _listen(self, sock):
@@ -187,6 +189,7 @@ def stop(self):
     """Stop the Server's socket listener."""
     self.done = True
 
+
 class Client(MessageSocket):
   """Client to register and await node reservations.
 
diff --git a/tensorflowonspark/util.py b/tensorflowonspark/util.py
index fd649e8a..a8270a25 100644
--- a/tensorflowonspark/util.py
+++ b/tensorflowonspark/util.py
@@ -24,3 +24,15 @@ def find_in_path(path, file):
     if os.path.exists(candidate) and os.path.isfile(candidate):
       return candidate
   return False
+
+
+def write_executor_id(num):
+  """Write executor_id into a local file in the executor's current working directory"""
+  with open("executor_id", "w") as f:
+    f.write(str(num))
+
+
+def read_executor_id():
+  """Read executor_id from a local file in the executor's current working directory"""
+  with open("executor_id", "r") as f:
+    return int(f.read())
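These two helpers are the pivot for the ppid-to-executor_id migration seen throughout this patch: the id is written once into the executor's working directory when a TF node starts, and later Spark jobs (train, inference, shutdown) re-read it to locate their reservation. A minimal round-trip sketch:

    from tensorflowonspark import util

    util.write_executor_id(3)                # persisted as ./executor_id in the executor's cwd
    assert util.read_executor_id() == 3      # later stages rejoin the same node by this id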