diff --git a/docs/.buildinfo b/docs/.buildinfo
index 4ce8b70f..36fb09ab 100644
--- a/docs/.buildinfo
+++ b/docs/.buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 874109c9e8f56215fdcb46cac4aab9f9
+config: abbb35398bf3c41c0f421213a6263bf9
 tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/docs/_modules/index.html b/docs/_modules/index.html
index a086bc32..10c1b7d0 100644
--- a/docs/_modules/index.html
+++ b/docs/_modules/index.html
@@ -4,27 +4,18 @@
- Overview: module code — TensorFlowOnSpark 1.2.0 documentation
+ Overview: module code — TensorFlowOnSpark 1.2.1 documentation
@@ -44,15 +35,11 @@

Navigation

All modules for which code is available

-
\ No newline at end of file diff --git a/docs/_modules/tensorflowonspark/TFManager.html b/docs/_modules/tensorflowonspark/TFManager.html index fe686d8d..e61dbfba 100644 --- a/docs/_modules/tensorflowonspark/TFManager.html +++ b/docs/_modules/tensorflowonspark/TFManager.html @@ -4,27 +4,18 @@ + - tensorflowonspark.TFManager — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.TFManager — TensorFlowOnSpark 1.2.1 documentation - + - - + @@ -58,8 +49,8 @@

Source code for tensorflowonspark.TFManager

 from multiprocessing import JoinableQueue
 
 
[docs]class TFManager(BaseManager): - """Python multiprocessing.Manager for distributed, multi-process communication.""" - pass
+ """Python multiprocessing.Manager for distributed, multi-process communication."""
+ pass # global to each Spark executor's python worker @@ -73,6 +64,12 @@

Source code for tensorflowonspark.TFManager

 def _set(key, value):
   kdict[key] = value
 
+def _get_queue(qname):
+  try:
+    return qdict[qname]
+  except KeyError:
+    return None
+
 
[docs]def start(authkey, queues, mode='local'): """Create a new multiprocess.Manager (or return existing one). @@ -89,15 +86,16 @@

Source code for tensorflowonspark.TFManager

   kdict.clear()
   for q in queues:
     qdict[q] = JoinableQueue()
-  TFManager.register('get_queue', callable=lambda qname: qdict[qname])
+
+  TFManager.register('get_queue', callable=lambda qname: _get_queue(qname))
   TFManager.register('get', callable=lambda key: _get(key))
   TFManager.register('set', callable=lambda key, value: _set(key, value))
   if mode == 'remote':
     mgr = TFManager(address=('',0), authkey=authkey)
   else:
     mgr = TFManager(authkey=authkey)
-  mgr.start()
-  return mgr
+ mgr.start()
+ return mgr
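With the new _get_queue helper, an unknown queue name no longer raises KeyError inside the manager server. A minimal local sketch of start() and get_queue(), assuming the package is importable as tensorflowonspark and using a placeholder authkey:

from tensorflowonspark import TFManager

# Minimal local sketch (not part of the diff): create the shared queues and
# use the guard pattern that later TFSparkNode changes rely on when a queue
# is missing on a node.
mgr = TFManager.start(authkey=b'secret', queues=['input', 'error'], mode='local')
try:
    q = mgr.get_queue('input')     # proxy to the shared JoinableQueue
    q.put('hello', block=True)
    print(q.get())
    q.task_done()
except (AttributeError, KeyError):
    print("queue not found on this node")
finally:
    mgr.shutdown()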
[docs]def connect(address, authkey): """Connect to a multiprocess.Manager. @@ -113,8 +111,8 @@

Source code for tensorflowonspark.TFManager

   TFManager.register('get')
   TFManager.register('set')
   m = TFManager(address, authkey=authkey)
-  m.connect()
-  return m
+ m.connect()
+ return m
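For the 'remote' mode used by PS nodes, the manager binds to an ephemeral port and other processes attach with connect(). A hedged single-process sketch, where 'localhost' stands in for the executor host recorded in cluster_info:

from tensorflowonspark import TFManager

# Side A (normally a PS executor): remotely accessible manager.
mgr = TFManager.start(authkey=b'secret', queues=['control', 'error'], mode='remote')
addr = ('localhost', mgr.address[1])   # mode='remote' binds to ('', <ephemeral port>)

# Side B (normally the Spark driver during shutdown): attach and signal stop.
remote = TFManager.connect(addr, authkey=b'secret')
remote.get_queue('control').put(None)  # None is the "stop" message used during shutdown
print(mgr.get_queue('control').get())  # the PS-side loop would receive it here
mgr.shutdown()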
@@ -125,12 +123,14 @@

Source code for tensorflowonspark.TFManager

         
@@ -146,13 +146,13 @@

Navigation

  • modules |
  • - +
    \ No newline at end of file diff --git a/docs/_modules/tensorflowonspark/TFNode.html b/docs/_modules/tensorflowonspark/TFNode.html index e1588097..68d592f0 100644 --- a/docs/_modules/tensorflowonspark/TFNode.html +++ b/docs/_modules/tensorflowonspark/TFNode.html @@ -4,27 +4,18 @@ + - tensorflowonspark.TFNode — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.TFNode — TensorFlowOnSpark 1.2.1 documentation - + - - + @@ -91,8 +82,8 @@

    Source code for tensorflowonspark.TFNode

         elif ctx.defaultFS.startswith("file://"):
           return "{0}/{1}/{2}".format(ctx.defaultFS, ctx.working_dir[1:], path)
         else:
    -      logging.warn("Unknown scheme {0} with relative path: {1}".format(ctx.defaultFS, path))
    -      return "{0}/{1}".format(ctx.defaultFS, path)
+ logging.warn("Unknown scheme {0} with relative path: {1}".format(ctx.defaultFS, path))
+ return "{0}/{1}".format(ctx.defaultFS, path)
    [docs]def start_cluster_server(ctx, num_gpus=1, rdma=False): """Function that wraps the creation of TensorFlow ``tf.train.Server`` for a node in a distributed TensorFlow cluster. @@ -155,12 +146,12 @@

    Source code for tensorflowonspark.TFNode

     
         # Create and start a server for the local task.
         server = tf.train.Server(cluster, ctx.job_name, ctx.task_index)
    -
    -  return (cluster, server)
    +
    + return (cluster, server)
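A hedged sketch of how a map_fun typically calls this wrapper (TensorFlow 1.x API assumed; the model code is a placeholder):

def map_fun(args, ctx):
    import tensorflow as tf

    # Each executor builds its tf.train.Server from the TFNodeContext.
    cluster, server = ctx.start_cluster_server(num_gpus=1, rdma=False)
    if ctx.job_name == "ps":
        server.join()        # parameter servers block here, serving variables
    elif ctx.job_name == "worker":
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % ctx.task_index,
                cluster=cluster)):
            pass             # ... build the model graph here ...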
    [docs]def next_batch(mgr, batch_size, qname='input'): - """*DEPRECATED*. Use TFNode.DataFeed class instead.""" - raise Exception("DEPRECATED: Use TFNode.DataFeed class instead")
+ """*DEPRECATED*. Use TFNode.DataFeed class instead."""
+ raise Exception("DEPRECATED: Use TFNode.DataFeed class instead")
    [docs]def export_saved_model(sess, export_dir, tag_set, signatures): """Convenience function to export a saved_model using provided arguments @@ -203,16 +194,16 @@

    Source code for tensorflowonspark.TFNode

                   tag_set.split(','),
                   signature_def_map=signature_def_map,
                   clear_devices=True)
    -  g.finalize()
    -  builder.save()
    + g.finalize()
    + builder.save()
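A hedged, self-contained sketch (TensorFlow 1.x assumed) of the nested signatures dict this convenience function expects; the export directory must not already exist:

import tensorflow as tf
from tensorflowonspark import TFNode

x = tf.placeholder(tf.float32, [None, 2], name='x')
w = tf.Variable([[1.0], [1.0]], name='w')
y = tf.matmul(x, w, name='y')

signatures = {
    'serving_default': {
        'inputs': {'features': x},
        'outputs': {'prediction': y},
        'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME,
    }
}

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    TFNode.export_saved_model(sess, '/tmp/exported_model', 'serve', signatures)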
    [docs]def batch_results(mgr, results, qname='output'): - """*DEPRECATED*. Use TFNode.DataFeed class instead.""" - raise Exception("DEPRECATED: Use TFNode.DataFeed class instead")
+ """*DEPRECATED*. Use TFNode.DataFeed class instead."""
+ raise Exception("DEPRECATED: Use TFNode.DataFeed class instead")
    [docs]def terminate(mgr, qname='input'): - """*DEPRECATED*. Use TFNode.DataFeed class instead.""" - raise Exception("DEPRECATED: Use TFNode.DataFeed class instead")
+ """*DEPRECATED*. Use TFNode.DataFeed class instead."""
+ raise Exception("DEPRECATED: Use TFNode.DataFeed class instead")
    [docs]class DataFeed(object): """This class manages the *InputMode.SPARK* data feeding process from the perspective of the TensorFlow application. @@ -278,12 +269,12 @@

    Source code for tensorflowonspark.TFNode

                 tensors[self.input_tensors[i]].append(item[i])
             count += 1
             queue.task_done()
    -    logging.debug("next_batch() returning {0} items".format(count))
    -    return tensors
    + logging.debug("next_batch() returning {0} items".format(count))
    + return tensors
    [docs] def should_stop(self): - """Check if the feed process was told to stop (by a call to ``terminate``).""" - return self.done_feeding
+ """Check if the feed process was told to stop (by a call to ``terminate``)."""
+ return self.done_feeding
    [docs] def batch_results(self, results): """Push a batch of output results to the Spark output RDD of ``TFCluster.inference()``. @@ -297,8 +288,8 @@

    Source code for tensorflowonspark.TFNode

         logging.debug("batch_results() invoked")
         queue = self.mgr.get_queue(self.qname_out)
         for item in results:
    -      queue.put(item, block=True)
    -    logging.debug("batch_results() returning data")
    + queue.put(item, block=True)
    + logging.debug("batch_results() returning data")
    [docs] def terminate(self): """Terminate data feeding early. @@ -321,8 +312,8 @@

    Source code for tensorflowonspark.TFNode

             queue.task_done()
             count += 1
           except Empty:
    -        logging.info("dropped {0} items from queue".format(count))
    -        done = True
+ logging.info("dropped {0} items from queue".format(count))
+ done = True
@@ -333,12 +324,14 @@
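For reference, the worker-side loop that DataFeed supports looks roughly like this (a hedged sketch; args and the training step are placeholders):

def map_fun(args, ctx):
    tf_feed = ctx.get_data_feed(train_mode=True)
    while not tf_feed.should_stop():
        batch = tf_feed.next_batch(batch_size=100)
        if len(batch) == 0:
            break
        # ... run one training step on `batch` ...
    # if training stops early (e.g. target accuracy reached):
    # tf_feed.terminate()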

    Source code for tensorflowonspark.TFNode

             
    @@ -354,13 +347,13 @@

    Navigation

  • modules |
  • - +
    \ No newline at end of file diff --git a/docs/_modules/tensorflowonspark/TFSparkNode.html b/docs/_modules/tensorflowonspark/TFSparkNode.html index 353f2fb0..d90c7127 100644 --- a/docs/_modules/tensorflowonspark/TFSparkNode.html +++ b/docs/_modules/tensorflowonspark/TFSparkNode.html @@ -4,27 +4,18 @@ + - tensorflowonspark.TFSparkNode — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.TFSparkNode — TensorFlowOnSpark 1.2.1 documentation - + - - + @@ -56,18 +47,21 @@

    Source code for tensorflowonspark.TFSparkNode

    from __future__ import print_function import logging +import multiprocessing import os -import sys import platform import socket import subprocess -import multiprocessing +import sys import uuid +import time +import traceback +from threading import Thread from . import TFManager from . import TFNode -from . import reservation from . import marker +from . import reservation from . import util
    [docs]class TFNodeContext: @@ -95,20 +89,20 @@

    Source code for tensorflowonspark.TFSparkNode

    self.mgr = mgr
    [docs] def absolute_path(self, path): - """Convenience function to access ``TFNode.hdfs_path`` directly from this object instance.""" - return TFNode.hdfs_path(self, path)
    + """Convenience function to access ``TFNode.hdfs_path`` directly from this object instance."""
    + return TFNode.hdfs_path(self, path)
    [docs] def start_cluster_server(self, num_gpus=1, rdma=False): - """Convenience function to access ``TFNode.start_cluster_server`` directly from this object instance.""" - return TFNode.start_cluster_server(self, num_gpus, rdma)
    + """Convenience function to access ``TFNode.start_cluster_server`` directly from this object instance."""
    + return TFNode.start_cluster_server(self, num_gpus, rdma)
    [docs] def export_saved_model(self, sess, export_dir, tag_set, signatures): - """Convenience function to access ``TFNode.export_saved_model`` directly from this object instance.""" - TFNode.export_saved_model(sess, export_dir, tag_set, signatures)
    + """Convenience function to access ``TFNode.export_saved_model`` directly from this object instance."""
    + TFNode.export_saved_model(sess, export_dir, tag_set, signatures)
    [docs] def get_data_feed(self, train_mode=True, qname_in='input', qname_out='output', input_mapping=None): - """Convenience function to access ``TFNode.DataFeed`` directly from this object instance.""" - return TFNode.DataFeed(self.mgr, train_mode, qname_in, qname_out, input_mapping)
+ """Convenience function to access ``TFNode.DataFeed`` directly from this object instance."""
+ return TFNode.DataFeed(self.mgr, train_mode, qname_in, qname_out, input_mapping)
    [docs]class TFSparkNode(object): @@ -124,8 +118,8 @@

    Source code for tensorflowonspark.TFSparkNode

    This also manages a reference to the TFManager "singleton" per executor. Since Spark can spawn more than one python-worker per executor, this will reconnect to the "singleton" instance as needed. """ - mgr = None #: TFManager instance - cluster_id = None #: Unique ID for a given TensorFlowOnSpark cluster, used for invalidating state for new clusters.
    + mgr = None #: TFManager instance
    + cluster_id = None #: Unique ID for a given TensorFlowOnSpark cluster, used for invalidating state for new clusters. def _get_manager(cluster_info, host, ppid): """Returns this executor's "singleton" instance of the multiprocessing.Manager, reconnecting per python-worker if needed. @@ -144,6 +138,14 @@

    Source code for tensorflowonspark.TFSparkNode

    authkey = node['authkey'] TFSparkNode.mgr = TFManager.connect(addr,authkey) break + + if TFSparkNode.mgr is None: + msg = "No TFManager found on this node, please ensure that:\n" + \ + "1. Spark num_executors matches TensorFlow cluster_size\n" + \ + "2. Spark cores/tasks per executor is 1.\n" + \ + "3. Spark dynamic allocation is disabled." + raise Exception(msg) + logging.info("Connected to TFSparkNode.mgr on {0}, ppid={1}, state={2}".format(host, ppid, str(TFSparkNode.mgr.get('state')))) return TFSparkNode.mgr @@ -199,7 +201,7 @@
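A hedged sketch of Spark settings that satisfy the three conditions in the new error message (values are examples; the executor count must equal the TensorFlow cluster_size):

from pyspark import SparkConf, SparkContext

conf = (SparkConf()
        .setAppName("tfos-app")
        .set("spark.dynamicAllocation.enabled", "false")   # 3. no dynamic allocation
        .set("spark.executor.instances", "4")              # 1. == TensorFlow cluster_size
        .set("spark.executor.cores", "1")                  # 2. one task slot per executor
        .set("spark.task.cpus", "1"))
sc = SparkContext(conf=conf)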

    Source code for tensorflowonspark.TFSparkNode

    addr = None if job_name == 'ps': # PS nodes must be remotely accessible in order to shutdown from Spark driver. - TFSparkNode.mgr = TFManager.start(authkey, ['control'], 'remote') + TFSparkNode.mgr = TFManager.start(authkey, ['control', 'error'], 'remote') addr = (host, TFSparkNode.mgr.address[1]) else: # worker nodes only need to be locally accessible within the executor for data feeding @@ -285,7 +287,11 @@

    Source code for tensorflowonspark.TFSparkNode

    # construct a TensorFlow clusterspec from cluster_info sorted_cluster_info = sorted(cluster_info, key=lambda k: k['worker_num']) spec = {} + last_worker_num = -1 for node in sorted_cluster_info: + if (node['worker_num'] == last_worker_num): + raise Exception("Duplicate worker/task in cluster_info") + last_worker_num = node['worker_num'] logging.info("node: {0}".format(node)) (njob, nhost, nport) = (node['job_name'], node['host'], node['port']) hosts = [] if njob not in spec else spec[njob] @@ -315,11 +321,21 @@

    Source code for tensorflowonspark.TFSparkNode

    sys.argv = args fn(args, context) + def wrapper_fn_background(args, context): + """Wrapper function that signals exceptions to foreground process.""" + errq = TFSparkNode.mgr.get_queue('error') + try: + wrapper_fn(args, context) + except Exception: + errq.put(traceback.format_exc()) + errq.join() + if job_name == 'ps' or background: # invoke the TensorFlow main function in a background thread logging.info("Starting TensorFlow {0}:{1} as {2} on cluster node {3} on background process".format( job_name, task_index, job_name, worker_num)) - p = multiprocessing.Process(target=wrapper_fn, args=(tf_args, ctx)) + + p = multiprocessing.Process(target=wrapper_fn_background, args=(tf_args, ctx)) if job_name == 'ps': p.daemon = True p.start() @@ -327,8 +343,15 @@

    Source code for tensorflowonspark.TFSparkNode

    # for ps nodes only, wait indefinitely in foreground thread for a "control" event (None == "stop") if job_name == 'ps': queue = TFSparkNode.mgr.get_queue('control') + equeue = TFSparkNode.mgr.get_queue('error') done = False while not done: + while (queue.empty() and equeue.empty()): + time.sleep(1) + if (not equeue.empty()): + e_str = equeue.get() + equeue.task_done() + raise Exception("exception in ps:\n" + e_str) msg = queue.get(block=True) logging.info("Got msg: {0}".format(msg)) if msg is None: @@ -341,8 +364,8 @@
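The error-signalling pattern introduced here, reduced to a standalone hedged sketch: a background process reports its traceback on a JoinableQueue and the foreground loop surfaces it instead of blocking forever.

import multiprocessing
import time
import traceback

def background(equeue):
    try:
        raise RuntimeError("simulated failure in the TF main function")
    except Exception:
        equeue.put(traceback.format_exc())
        equeue.join()            # wait until the foreground acknowledges the error

if __name__ == '__main__':
    equeue = multiprocessing.JoinableQueue()
    p = multiprocessing.Process(target=background, args=(equeue,))
    p.start()
    while equeue.empty():
        time.sleep(1)
    e_str = equeue.get()
    equeue.task_done()
    p.join()
    raise Exception("exception in background process:\n" + e_str)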

    Source code for tensorflowonspark.TFSparkNode

    logging.info("Starting TensorFlow {0}:{1} on cluster node {2} on foreground thread".format(job_name, task_index, worker_num)) wrapper_fn(tf_args, ctx) logging.info("Finished TensorFlow {0}:{1} on cluster node {2}".format(job_name, task_index, worker_num)) - - return _mapfn
    +
    + return _mapfn
    [docs]def train(cluster_info, cluster_meta, qname='input'): """Feeds Spark partitions into the shared multiprocessing.Queue. @@ -358,7 +381,13 @@

    Source code for tensorflowonspark.TFSparkNode

    def _train(iter): # get shared queue, reconnecting if necessary mgr = _get_manager(cluster_info, util.get_ip_address(), os.getppid()) - queue = mgr.get_queue(qname) + try: + queue = mgr.get_queue(qname) + equeue = mgr.get_queue('error') + except (AttributeError, KeyError): + msg = "Queue '{}' not found on this node, check for exceptions on other nodes.".format(qname) + raise Exception(msg) + state = str(mgr.get('state')) logging.info("mgr.state={0}".format(state)) terminating = state == "'terminating'" @@ -368,15 +397,23 @@

    Source code for tensorflowonspark.TFSparkNode

    for item in iter: count += 1 logging.info("Skipped {0} items from partition".format(count)) - else: logging.info("Feeding partition {0} into {1} queue {2}".format(iter, qname, queue)) count = 0 for item in iter: count += 1 queue.put(item, block=True) + # wait for consumers to finish processing all items in queue before "finishing" this iterator - queue.join() + joinThr = Thread(target=queue.join) + joinThr.start() + while (joinThr.isAlive()): + if (not equeue.empty()): + e_str = equeue.get() + equeue.task_done() + raise Exception("exception in worker:\n" + e_str) + time.sleep(1) +# queue.join() logging.info("Processed {0} items in partition".format(count)) # check if TF is terminating feed after this partition @@ -392,8 +429,8 @@

    Source code for tensorflowonspark.TFSparkNode

    # ignore any errors while requesting stop logging.debug("Error while requesting stop: {0}".format(e)) return [terminating] - - return _train
    +
    + return _train
    [docs]def inference(cluster_info, qname='input'): """Feeds Spark partitions into the shared multiprocessing.Queue and returns inference results. @@ -408,7 +445,12 @@

    Source code for tensorflowonspark.TFSparkNode

    def _inference(iter): # get shared queue, reconnecting if necessary mgr = _get_manager(cluster_info, util.get_ip_address(), os.getppid()) - queue_in = mgr.get_queue(qname) + try: + queue_in = mgr.get_queue(qname) + equeue = mgr.get_queue('error') + except (AttributeError, KeyError): + msg = "Queue '{}' not found on this node, check for exceptions on other nodes.".format(qname) + raise Exception(msg) logging.info("Feeding partition {0} into {1} queue {2}".format(iter, qname, queue_in)) count = 0 @@ -424,7 +466,15 @@

    Source code for tensorflowonspark.TFSparkNode

    return [] # wait for consumers to finish processing all items in queue before "finishing" this iterator - queue_in.join() + joinThr = Thread(target=queue_in.join) + joinThr.start() + while (joinThr.isAlive()): + if (not equeue.empty()): + e_str = equeue.get() + equeue.task_done() + raise Exception("exception in worker:\n" + e_str) + time.sleep(1) + logging.info("Processed {0} items in partition".format(count)) # read result queue @@ -438,8 +488,8 @@

    Source code for tensorflowonspark.TFSparkNode

    logging.info("Finished processing partition") return results - - return _inference
    +
    + return _inference
    [docs]def shutdown(cluster_info, queues=['input']): """Stops all TensorFlow nodes by feeding ``None`` into the multiprocessing.Queues. @@ -469,15 +519,19 @@

    Source code for tensorflowonspark.TFSparkNode

    # terminate any listening queues logging.info("Stopping all queues") for q in queues: - queue = mgr.get_queue(q) - logging.info("Feeding None into {0} queue".format(q)) - queue.put(None, block=True) + try: + queue = mgr.get_queue(q) + logging.info("Feeding None into {0} queue".format(q)) + queue.put(None, block=True) + except (AttributeError, KeyError): + msg = "Queue '{}' not found on this node, check for exceptions on other nodes.".format(q) + raise Exception(msg) logging.info("Setting mgr.state to 'stopped'") mgr.set('state', 'stopped') return [True] - - return _shutdown
    +
    + return _shutdown
    @@ -488,12 +542,14 @@

    Source code for tensorflowonspark.TFSparkNode

    @@ -509,13 +565,13 @@

    Navigation

  • modules |
  • - +
    \ No newline at end of file diff --git a/docs/_modules/tensorflowonspark/gpu_info.html b/docs/_modules/tensorflowonspark/gpu_info.html index ec909beb..24fd6b9d 100644 --- a/docs/_modules/tensorflowonspark/gpu_info.html +++ b/docs/_modules/tensorflowonspark/gpu_info.html @@ -4,27 +4,18 @@ + - tensorflowonspark.gpu_info — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.gpu_info — TensorFlowOnSpark 1.2.1 documentation - + - - + @@ -137,8 +128,8 @@

    Source code for tensorflowonspark.gpu_info

           raise Exception("Unable to find free GPU:\n{0}".format(smi_output))
     
         return ','.join(free_gpus[:num_gpu])
    -  except subprocess.CalledProcessError as e:
    -    print ("nvidia-smi error", e.output)
    + except subprocess.CalledProcessError as e:
    + print ("nvidia-smi error", e.output) # Function to get the gpu information def _get_free_gpu(max_gpu_utilization=40, min_free_memory=0.5, num_gpu=1): @@ -221,12 +212,14 @@
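A hedged usage sketch (requires nvidia-smi on the host), mirroring how TFNode.start_cluster_server pins TensorFlow to the allocated GPUs:

import os
from tensorflowonspark import gpu_info

gpus_to_use = gpu_info.get_gpus(1)               # e.g. "0", or "0,3" when requesting several
os.environ['CUDA_VISIBLE_DEVICES'] = gpus_to_use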

    Source code for tensorflowonspark.gpu_info

             
    @@ -242,13 +235,13 @@

    Navigation

  • modules |
  • - +
    \ No newline at end of file diff --git a/docs/_modules/tensorflowonspark/marker.html b/docs/_modules/tensorflowonspark/marker.html index a914e583..782a355f 100644 --- a/docs/_modules/tensorflowonspark/marker.html +++ b/docs/_modules/tensorflowonspark/marker.html @@ -4,27 +4,18 @@ + - tensorflowonspark.marker — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.marker — TensorFlowOnSpark 1.2.1 documentation - + - - + @@ -55,12 +46,12 @@

    Source code for tensorflowonspark.marker

     from __future__ import print_function
     
     
    [docs]class Marker(object): - """Base class for special marker objects in the data queue""" - pass
    + """Base class for special marker objects in the data queue"""
    + pass
    [docs]class EndPartition(Marker): - """Marks the end of an RDD Partition during data feeding""" - pass
    + """Marks the end of an RDD Partition during data feeding"""
    + pass
    @@ -71,12 +62,14 @@

    Source code for tensorflowonspark.marker

             
    @@ -92,13 +85,13 @@

    Navigation

  • modules |
  • - +
    \ No newline at end of file diff --git a/docs/_modules/tensorflowonspark/reservation.html b/docs/_modules/tensorflowonspark/reservation.html index 3fc3020a..65dee53f 100644 --- a/docs/_modules/tensorflowonspark/reservation.html +++ b/docs/_modules/tensorflowonspark/reservation.html @@ -4,27 +4,18 @@ + - tensorflowonspark.reservation — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.reservation — TensorFlowOnSpark 1.2.1 documentation - + - - + @@ -60,6 +51,7 @@

    Source code for tensorflowonspark.reservation

    import select import socket import struct +import sys import threading import time @@ -86,23 +78,23 @@

    Source code for tensorflowonspark.reservation

    Args: :meta: a dictonary of metadata about a node """ - with self.lock: - self.reservations.append(meta)
    + with self.lock:
    + self.reservations.append(meta)
    [docs] def done(self): """Returns True if the ``required`` number of reservations have been fulfilled.""" - with self.lock: - return len(self.reservations) >= self.required
    + with self.lock:
    + return len(self.reservations) >= self.required
    [docs] def get(self): """Get the list of current reservations.""" - with self.lock: - return self.reservations
    + with self.lock:
    + return self.reservations
    [docs] def remaining(self): """Get a count of remaining/unfulfilled reservations.""" - with self.lock: - return self.required - len(self.reservations)
    + with self.lock: + return self.required - len(self.reservations)
    [docs]class MessageSocket(object): """Abstract class w/ length-prefixed socket send/receive functions.""" @@ -126,14 +118,14 @@

    Source code for tensorflowonspark.reservation

    recv_len -= len(buf) recv_done = (recv_len == 0) - msg = pickle.loads(data) - return msg
    + msg = pickle.loads(data)
    + return msg
    [docs] def send(self, sock, msg): """Send ``msg`` to destination ``sock``.""" data = pickle.dumps(msg) - buf = struct.pack('>I', len(data)) + data - sock.sendall(buf)
    + buf = struct.pack('>I', len(data)) + data + sock.sendall(buf)
    [docs]class Server(MessageSocket): @@ -149,13 +141,22 @@

    Source code for tensorflowonspark.reservation

    assert count > 0 self.reservations = Reservations(count) -
    [docs] def await_reservations(self): +
    [docs] def await_reservations(self, sc, status={}, timeout=600): """Block until all reservations are received.""" + timespent = 0 while not self.reservations.done(): logging.info("waiting for {0} reservations".format(self.reservations.remaining())) + # check status flags for any errors + if 'error' in status: + sc.cancelAllJobs() + sc.stop() + sys.exit(1) time.sleep(1) - logging.info("all reservations completed") - return self.reservations.get()
    + timespent += 1 + if (timespent > timeout): + raise Exception("timed out waiting for reservations to complete") + logging.info("all reservations completed")
    + return self.reservations.get() def _handle_message(self, sock, msg): logging.debug("received: {0}".format(msg)) @@ -217,12 +218,12 @@

    Source code for tensorflowonspark.reservation

    t = threading.Thread(target=_listen, args=(self, server_sock)) t.daemon = True t.start() - - return addr
    +
    + return addr
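A hedged single-process sketch of the reservation handshake; in real use the Client runs on each executor, and the metadata dict below is only an example:

from tensorflowonspark import reservation

server = reservation.Server(1)             # expect one reservation
addr = server.start()                      # (host, port) of the listener thread

client = reservation.Client(addr)
client.register({'host': 'executor-host', 'port': 2222})
print(client.await_reservations())         # executor-side wait
client.request_stop()
client.close()

# Driver-side wait now takes the SparkContext, a shared status dict and a timeout:
# cluster_info = server.await_reservations(sc, status={}, timeout=600)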
    [docs] def stop(self): - """Stop the Server's socket listener.""" - self.done = True
    + """Stop the Server's socket listener.""" + self.done = True
    [docs]class Client(MessageSocket): """Client to register and await node reservations. @@ -267,31 +268,31 @@

    Source code for tensorflowonspark.reservation

    return resp
    [docs] def close(self): - """Close the client socket.""" - self.sock.close()
    + """Close the client socket."""
    + self.sock.close()
    [docs] def register(self, reservation): """Register ``reservation`` with server.""" - resp = self._request('REG', reservation) - return resp
    + resp = self._request('REG', reservation)
    + return resp
    [docs] def get_reservations(self): """Get current list of reservations.""" - cluster_info = self._request('QINFO') - return cluster_info
    + cluster_info = self._request('QINFO') + return cluster_info
    [docs] def await_reservations(self): """Poll until all reservations completed, then return cluster_info.""" done = False while not done: done = self._request('QUERY') - time.sleep(1) - return self.get_reservations()
    + time.sleep(1) + return self.get_reservations()
    [docs] def request_stop(self): """Request server stop.""" - resp = self._request('STOP') - return resp
    + resp = self._request('STOP') + return resp @@ -301,12 +302,14 @@

    Source code for tensorflowonspark.reservation

    @@ -322,13 +325,13 @@

    Navigation

  • modules |
  • - + \ No newline at end of file diff --git a/docs/_modules/tensorflowonspark/util.html b/docs/_modules/tensorflowonspark/util.html index b8e36b97..17b5a4ba 100644 --- a/docs/_modules/tensorflowonspark/util.html +++ b/docs/_modules/tensorflowonspark/util.html @@ -4,27 +4,18 @@ + - tensorflowonspark.util — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.util — TensorFlowOnSpark 1.2.1 documentation - + - - + @@ -60,8 +51,8 @@

    Source code for tensorflowonspark.util

     
    [docs]def get_ip_address(): """Simple utility to get host IP address.""" s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - s.connect(("8.8.8.8", 80)) - return s.getsockname()[0]
    + s.connect(("8.8.8.8", 80))
    + return s.getsockname()[0]
    [docs]def find_in_path(path, file): @@ -69,8 +60,8 @@

    Source code for tensorflowonspark.util

       for p in path.split(os.pathsep):
         candidate = os.path.join(p, file)
         if os.path.exists(candidate) and os.path.isfile(candidate):
    -      return candidate
    -  return False
    + return candidate
    + return False @@ -80,12 +71,14 @@
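A hedged usage sketch of the two helpers above:

import os
from tensorflowonspark import util

print(util.get_ip_address())                            # local IP, discovered via a UDP socket to 8.8.8.8
print(util.find_in_path(os.environ['PATH'], 'python'))  # absolute path, or False if not found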

    Source code for tensorflowonspark.util

             
    @@ -101,13 +94,13 @@

    Navigation

  • modules |
  • - +
    \ No newline at end of file diff --git a/docs/_static/basic.css b/docs/_static/basic.css index 607b5f55..19ced105 100644 --- a/docs/_static/basic.css +++ b/docs/_static/basic.css @@ -82,9 +82,21 @@ div.sphinxsidebar input { } div.sphinxsidebar #searchbox input[type="text"] { - width: 170px; + float: left; + width: 80%; + padding: 0.25em; + box-sizing: border-box; +} + +div.sphinxsidebar #searchbox input[type="submit"] { + float: left; + width: 20%; + border-left: none; + padding: 0.25em; + box-sizing: border-box; } + img { border: 0; max-width: 100%; @@ -199,6 +211,11 @@ table.modindextable td { /* -- general body styles --------------------------------------------------- */ +div.body { + min-width: 450px; + max-width: 800px; +} + div.body p, div.body dd, div.body li, div.body blockquote { -moz-hyphens: auto; -ms-hyphens: auto; diff --git a/docs/_static/websupport.js b/docs/_static/websupport.js index 79b18e38..78e14bb4 100644 --- a/docs/_static/websupport.js +++ b/docs/_static/websupport.js @@ -301,7 +301,7 @@ li.hide(); // Determine where in the parents children list to insert this comment. - for(i=0; i < siblings.length; i++) { + for(var i=0; i < siblings.length; i++) { if (comp(comment, siblings[i]) <= 0) { $('#cd' + siblings[i].id) .parent() diff --git a/docs/genindex.html b/docs/genindex.html index 8422ec34..065b2317 100644 --- a/docs/genindex.html +++ b/docs/genindex.html @@ -5,27 +5,18 @@ + - Index — TensorFlowOnSpark 1.2.0 documentation + Index — TensorFlowOnSpark 1.2.1 documentation - + - - + @@ -57,17 +48,11 @@

    Index

    | G | H | I - | L | M | N - | O - | P - | Q | R | S | T - | W - | Y

    A

    @@ -76,13 +61,9 @@

    A

  • absolute_path() (TFNodeContext method)
  • add() (Reservations method) -
  • -
  • args (TFParams attribute)
  • -

    C

    @@ -115,19 +92,9 @@

    C

  • close() (Client method)
  • -
  • cluster_id (TFCluster attribute) - -
  • @@ -341,8 +192,6 @@

    M

    N

    - -
    - -

    O

    - - -
    - -

    P

    - - -
    - -

    Q

    - -

    R

    S

    - + @@ -501,91 +260,37 @@

    S

    T

    -
  • steps (HasSteps attribute) -
  • stop() (Server method)
  • -
    - -

    W

    - - -
    - -

    Y

    - -
    @@ -597,17 +302,16 @@

    Y

    \ No newline at end of file diff --git a/docs/index.html b/docs/index.html index 43cc222d..40ebf619 100644 --- a/docs/index.html +++ b/docs/index.html @@ -4,28 +4,19 @@ + - Welcome to TensorFlowOnSpark’s documentation! — TensorFlowOnSpark 1.2.0 documentation + Welcome to TensorFlowOnSpark’s documentation! — TensorFlowOnSpark 1.2.1 documentation - + - - + @@ -105,12 +96,14 @@

    This Page

    @@ -129,12 +122,12 @@

    Navigation

  • next |
  • - + \ No newline at end of file diff --git a/docs/objects.inv b/docs/objects.inv index d8a0b89e..9ca43666 100644 Binary files a/docs/objects.inv and b/docs/objects.inv differ diff --git a/docs/py-modindex.html b/docs/py-modindex.html index b7d6f4e6..59cf7791 100644 --- a/docs/py-modindex.html +++ b/docs/py-modindex.html @@ -4,20 +4,12 @@ + - Python Module Index — TensorFlowOnSpark 1.2.0 documentation + Python Module Index — TensorFlowOnSpark 1.2.1 documentation - + @@ -26,8 +18,7 @@ - - + @@ -63,11 +54,6 @@

    Python Module Index

    tensorflowonspark - - -     - tensorflowonspark.dfutil -     @@ -78,26 +64,11 @@

    Python Module Index

        tensorflowonspark.marker - - -     - tensorflowonspark.pipeline -     tensorflowonspark.reservation - - -     - tensorflowonspark.reservation_client - - - -     - tensorflowonspark.TFCluster -     @@ -128,12 +99,14 @@

    Python Module Index

    @@ -149,12 +122,12 @@

    Navigation

  • modules |
  • - + \ No newline at end of file diff --git a/docs/search.html b/docs/search.html index e4bbade3..e3658248 100644 --- a/docs/search.html +++ b/docs/search.html @@ -4,20 +4,12 @@ + - Search — TensorFlowOnSpark 1.2.0 documentation + Search — TensorFlowOnSpark 1.2.1 documentation - + @@ -31,8 +23,7 @@ - - + @@ -93,12 +84,12 @@

    Navigation

  • modules |
  • - + \ No newline at end of file diff --git a/docs/searchindex.js b/docs/searchindex.js index fb08e76b..95e724f5 100644 --- a/docs/searchindex.js +++ b/docs/searchindex.js @@ -1 +1 @@ -Search.setIndex({docnames:["index","tensorflowonspark","tensorflowonspark.TFCluster","tensorflowonspark.TFManager","tensorflowonspark.TFNode","tensorflowonspark.TFSparkNode","tensorflowonspark.dfutil","tensorflowonspark.gpu_info","tensorflowonspark.marker","tensorflowonspark.pipeline","tensorflowonspark.reservation","tensorflowonspark.reservation_client","tensorflowonspark.util"],envversion:52,filenames:["index.rst","tensorflowonspark.rst","tensorflowonspark.TFCluster.rst","tensorflowonspark.TFManager.rst","tensorflowonspark.TFNode.rst","tensorflowonspark.TFSparkNode.rst","tensorflowonspark.dfutil.rst","tensorflowonspark.gpu_info.rst","tensorflowonspark.marker.rst","tensorflowonspark.pipeline.rst","tensorflowonspark.reservation.rst","tensorflowonspark.reservation_client.rst","tensorflowonspark.util.rst"],objects:{"":{tensorflowonspark:[1,0,0,"-"]},"tensorflowonspark.TFCluster":{InputMode:[2,1,1,""],TFCluster:[2,1,1,""],run:[2,4,1,""]},"tensorflowonspark.TFCluster.InputMode":{SPARK:[2,2,1,""],TENSORFLOW:[2,2,1,""]},"tensorflowonspark.TFCluster.TFCluster":{cluster_id:[2,2,1,""],cluster_info:[2,2,1,""],cluster_meta:[2,2,1,""],defaultFS:[2,2,1,""],inference:[2,3,1,""],input_mode:[2,2,1,""],nodeRDD:[2,2,1,""],num_executors:[2,2,1,""],queues:[2,2,1,""],sc:[2,2,1,""],server:[2,2,1,""],shutdown:[2,3,1,""],tensorboard_url:[2,3,1,""],train:[2,3,1,""],working_dir:[2,2,1,""]},"tensorflowonspark.TFManager":{TFManager:[3,1,1,""],connect:[3,4,1,""],start:[3,4,1,""]},"tensorflowonspark.TFNode":{DataFeed:[4,1,1,""],batch_results:[4,4,1,""],export_saved_model:[4,4,1,""],hdfs_path:[4,4,1,""],next_batch:[4,4,1,""],start_cluster_server:[4,4,1,""],terminate:[4,4,1,""]},"tensorflowonspark.TFNode.DataFeed":{batch_results:[4,3,1,""],next_batch:[4,3,1,""],should_stop:[4,3,1,""],terminate:[4,3,1,""]},"tensorflowonspark.TFSparkNode":{TFNodeContext:[5,1,1,""],TFSparkNode:[5,1,1,""],inference:[5,4,1,""],run:[5,4,1,""],shutdown:[5,4,1,""],train:[5,4,1,""]},"tensorflowonspark.TFSparkNode.TFNodeContext":{absolute_path:[5,3,1,""],export_saved_model:[5,3,1,""],get_data_feed:[5,3,1,""],start_cluster_server:[5,3,1,""]},"tensorflowonspark.TFSparkNode.TFSparkNode":{cluster_id:[5,2,1,""],mgr:[5,2,1,""]},"tensorflowonspark.dfutil":{fromTFExample:[6,4,1,""],infer_schema:[6,4,1,""],isLoadedDF:[6,4,1,""],loadTFRecords:[6,4,1,""],saveAsTFRecords:[6,4,1,""],toTFExample:[6,4,1,""]},"tensorflowonspark.gpu_info":{MAX_RETRIES:[7,5,1,""],get_gpus:[7,4,1,""]},"tensorflowonspark.marker":{EndPartition:[8,1,1,""],Marker:[8,1,1,""]},"tensorflowonspark.pipeline":{HasBatchSize:[9,1,1,""],HasClusterSize:[9,1,1,""],HasEpochs:[9,1,1,""],HasExportDir:[9,1,1,""],HasInputMapping:[9,1,1,""],HasInputMode:[9,1,1,""],HasModelDir:[9,1,1,""],HasNumPS:[9,1,1,""],HasOutputMapping:[9,1,1,""],HasProtocol:[9,1,1,""],HasReaders:[9,1,1,""],HasSignatureDefKey:[9,1,1,""],HasSteps:[9,1,1,""],HasTFRecordDir:[9,1,1,""],HasTagSet:[9,1,1,""],HasTensorboard:[9,1,1,""],Namespace:[9,1,1,""],TFEstimator:[9,1,1,""],TFModel:[9,1,1,""],TFParams:[9,1,1,""],TFTypeConverters:[9,1,1,""],get_meta_graph_def:[9,4,1,""],single_node_env:[9,4,1,""],yield_batch:[9,4,1,""]},"tensorflowonspark.pipeline.HasBatchSize":{batch_size:[9,2,1,""],getBatchSize:[9,3,1,""],setBatchSize:[9,3,1,""]},"tensorflowonspark.pipeline.HasClusterSize":{cluster_size:[9,2,1,""],getClusterSize:[9,3,1,""],setClusterSize:[9,3,1,""]},"
tensorflowonspark.pipeline.HasEpochs":{epochs:[9,2,1,""],getEpochs:[9,3,1,""],setEpochs:[9,3,1,""]},"tensorflowonspark.pipeline.HasExportDir":{export_dir:[9,2,1,""],getExportDir:[9,3,1,""],setExportDir:[9,3,1,""]},"tensorflowonspark.pipeline.HasInputMapping":{getInputMapping:[9,3,1,""],input_mapping:[9,2,1,""],setInputMapping:[9,3,1,""]},"tensorflowonspark.pipeline.HasInputMode":{getInputMode:[9,3,1,""],input_mode:[9,2,1,""],setInputMode:[9,3,1,""]},"tensorflowonspark.pipeline.HasModelDir":{getModelDir:[9,3,1,""],model_dir:[9,2,1,""],setModelDir:[9,3,1,""]},"tensorflowonspark.pipeline.HasNumPS":{driver_ps_nodes:[9,2,1,""],getDriverPSNodes:[9,3,1,""],getNumPS:[9,3,1,""],num_ps:[9,2,1,""],setDriverPSNodes:[9,3,1,""],setNumPS:[9,3,1,""]},"tensorflowonspark.pipeline.HasOutputMapping":{getOutputMapping:[9,3,1,""],output_mapping:[9,2,1,""],setOutputMapping:[9,3,1,""]},"tensorflowonspark.pipeline.HasProtocol":{getProtocol:[9,3,1,""],protocol:[9,2,1,""],setProtocol:[9,3,1,""]},"tensorflowonspark.pipeline.HasReaders":{getReaders:[9,3,1,""],readers:[9,2,1,""],setReaders:[9,3,1,""]},"tensorflowonspark.pipeline.HasSignatureDefKey":{getSignatureDefKey:[9,3,1,""],setSignatureDefKey:[9,3,1,""],signature_def_key:[9,2,1,""]},"tensorflowonspark.pipeline.HasSteps":{getSteps:[9,3,1,""],setSteps:[9,3,1,""],steps:[9,2,1,""]},"tensorflowonspark.pipeline.HasTFRecordDir":{getTFRecordDir:[9,3,1,""],setTFRecordDir:[9,3,1,""],tfrecord_dir:[9,2,1,""]},"tensorflowonspark.pipeline.HasTagSet":{getTagSet:[9,3,1,""],setTagSet:[9,3,1,""],tag_set:[9,2,1,""]},"tensorflowonspark.pipeline.HasTensorboard":{getTensorboard:[9,3,1,""],setTensorboard:[9,3,1,""],tensorboard:[9,2,1,""]},"tensorflowonspark.pipeline.Namespace":{argv:[9,2,1,""]},"tensorflowonspark.pipeline.TFEstimator":{export_fn:[9,2,1,""],train_fn:[9,2,1,""]},"tensorflowonspark.pipeline.TFParams":{args:[9,2,1,""],merge_args_params:[9,3,1,""]},"tensorflowonspark.pipeline.TFTypeConverters":{toDict:[9,6,1,""]},"tensorflowonspark.reservation":{Client:[10,1,1,""],MessageSocket:[10,1,1,""],Reservations:[10,1,1,""],Server:[10,1,1,""]},"tensorflowonspark.reservation.Client":{await_reservations:[10,3,1,""],close:[10,3,1,""],get_reservations:[10,3,1,""],register:[10,3,1,""],request_stop:[10,3,1,""],server_addr:[10,2,1,""],sock:[10,2,1,""]},"tensorflowonspark.reservation.MessageSocket":{receive:[10,3,1,""],send:[10,3,1,""]},"tensorflowonspark.reservation.Reservations":{add:[10,3,1,""],done:[10,3,1,""],get:[10,3,1,""],remaining:[10,3,1,""]},"tensorflowonspark.reservation.Server":{await_reservations:[10,3,1,""],done:[10,2,1,""],reservations:[10,2,1,""],start:[10,3,1,""],stop:[10,3,1,""]},"tensorflowonspark.util":{find_in_path:[12,4,1,""],get_ip_address:[12,4,1,""]},tensorflowonspark:{TFCluster:[2,0,0,"-"],TFManager:[3,0,0,"-"],TFNode:[4,0,0,"-"],TFSparkNode:[5,0,0,"-"],dfutil:[6,0,0,"-"],gpu_info:[7,0,0,"-"],marker:[8,0,0,"-"],pipeline:[9,0,0,"-"],reservation:[10,0,0,"-"],reservation_client:[11,0,0,"-"],util:[12,0,0,"-"]}},objnames:{"0":["py","module","Python module"],"1":["py","class","Python class"],"2":["py","attribute","Python attribute"],"3":["py","method","Python method"],"4":["py","function","Python function"],"5":["py","data","Python data"],"6":["py","staticmethod","Python static 
method"]},objtypes:{"0":"py:module","1":"py:class","2":"py:attribute","3":"py:method","4":"py:function","5":"py:data","6":"py:staticmethod"},terms:{"abstract":10,"boolean":[2,4,5,10],"class":[2,3,4,5,8,9,10],"default":[2,5],"enum":2,"export":[4,9],"function":[2,4,5,6,9,10],"new":[2,3,5],"return":[2,3,4,5,6,7,9,10],"static":9,"true":[4,5,6,10],And:[4,9],For:[2,4,5,9],The:[4,9],There:[2,9],These:[2,5],USE:2,Use:4,about:10,absolut:4,absolute_path:5,access:[3,5],accord:[7,9],accordingli:4,accuraci:[2,4],action:2,actual:9,add:10,addit:9,address:[2,3,5,10,11,12],after:9,all:[2,5,10],alloc:[4,7],allow:9,also:[3,5,9],ani:[2,4],api:[2,5,6,9],app:9,applic:[2,4,9],architectur:9,arg:[2,3,4,5,6,7,9,10],argpars:[2,5,9],argument:[4,5,6,9],argv:[2,5,9],arrai:[4,6,9],associ:4,attempt:6,authkei:3,author:3,automat:6,avail:7,avoid:[4,9],await:10,await_reserv:10,back:9,background:[5,10],base:[2,3,4,5,8,9,10],basemanag:3,batch:[4,9],batch_result:4,batch_siz:[4,9],becaus:4,been:10,binari:6,binary_featur:6,binarytyp:6,block:10,both:6,bytearrai:6,byteslist:6,bytestr:6,cach:[3,9],call:4,caller:[4,6],can:[4,5],capabl:2,check:4,checkpoint:9,chief:[2,5],client:10,close:[2,10],cluster:[2,4,5,9,10],cluster_id:[2,5],cluster_info:[2,5,10],cluster_meta:[2,5],cluster_s:[2,9],cluster_spec:[4,5],clusterspec:[4,5],code:4,coerc:6,collect:6,column:[4,6,9],comma:[7,9],command:[2,5,9],commun:[3,4],compat:4,complet:[9,10],comput:2,condit:[2,4],conduct:9,connect:[3,10],construct:5,constructor:[4,9],contain:[4,5,10],content:0,control:2,conveni:[4,5],convert:[4,6,9],core:9,correct:4,could:7,count:10,creat:[3,4],creation:4,ctx:[4,5],current:[2,4,5,10],custom:9,data:[2,4,5,8,9],datafe:[2,4,5],datafram:[4,6,9],datardd:[2,5],dataset:2,datatyp:6,defaultf:[2,5],delimit:[7,9],deprec:4,design:2,desir:[4,7],destin:10,detail:2,determin:4,dev:9,dfutil:[0,1,9],dictionari:[2,4,5,9],dictonari:10,differ:9,directli:[5,9],directori:[2,5,9],disambigu:6,disk:[4,6,9],distribut:[2,3,4,9],doc:9,doe:4,done:10,driver:[2,9,11],driver_ps_nod:[2,9],dtype:6,due:9,dure:[2,8,9,10],each:[2,9],earli:4,either:3,encapsul:5,end:[2,4,5,8],endpartit:8,enqueu:9,entir:2,environ:9,epoch:[2,9],equival:[4,6],esp:2,especi:9,estim:9,etc:[2,4,5],event:[2,5],exampl:6,except:[7,9],excess:4,execut:[2,9],executor:[2,4,5,9],exist:3,expect:[4,5,6,9,10],explicit:4,export_dir:[4,5,9],export_fn:9,export_saved_model:[4,5],extend:9,extra:4,fairli:6,fals:[2,4,5,10],featur:6,feed:[2,4,5,8,9],fewer:4,file:[2,5,6,12],filesystem:[2,4,5],find:12,find_in_path:12,fit:9,fix:[2,5],flat:6,floatlist:6,follow:4,form:[4,5],found:7,free:7,from:[3,4,5,6,9],fromtfexampl:6,fulfil:10,full:2,further:4,gener:[2,4,9],get:[2,4,7,10,12],get_data_fe:5,get_gpu:7,get_ip_address:12,get_meta_graph_def:9,get_reserv:10,getbatchs:9,getclusters:9,getdriverpsnod:9,getepoch:9,getexportdir:9,getinputmap:9,getinputmod:9,getmodeldir:9,getnump:9,getoutputmap:9,getprotocol:9,getread:9,getsignaturedefkei:9,getstep:9,gettagset:9,gettensorboard:9,gettfrecorddir:9,given:[5,6,12],gpu:[2,4,7],gpu_info:[0,1],graph:[4,9],grpc:9,has:9,hasbatchs:9,hasclusters:9,hasepoch:9,hasexportdir:9,hasinputmap:9,hasinputmod:9,hasmodeldir:9,hasnump:9,hasoutputmap:9,hasprotocol:9,hasread:9,hassignaturedefkei:9,hasstep:9,hastagset:9,hastensorboard:9,hastfrecorddir:9,have:[4,10],hdf:[2,4,5],hdfs_path:[4,5],help:[2,4],helper:4,high:[2,5],hint:6,host:[3,5,10,11,12],howev:9,html:9,http:9,identifi:[4,5,9],ids:7,ignor:4,immedi:9,implement:2,incom:4,independ:9,index:0,indic:[2,3,4,5,10],infer:[2,4,5,6],infer_schema:6,inferenc:[2,4,9],inform:5,initi:9,input:[2,4
,5,6,9],input_dir:6,input_map:[4,5,9],input_mod:[2,9],input_tensor_alia:4,input_tensor_nam:4,inputmod:[2,4,9],instanc:[3,4,5,9],instanti:4,instead:[2,4],int64list:6,integ:5,intend:[4,5],interact:5,intern:2,internal_us:[2,3,4,5],interpret:2,invalid:[2,5],invoc:9,invok:[2,4],isloadeddf:6,item:[4,9],iter:[6,9],itself:4,iverb:4,job:[2,5],job_nam:5,just:9,kei:[3,4],launch:[2,9],lazili:2,length:[4,9,10],level:[2,5],librari:9,like:9,limit:[6,9],line:[2,5,9],list:[6,7,9,10],listen:[2,10],load:[6,9],loadtfrecord:[6,9],local:[2,3,5,9],locat:[6,9],log:[2,5,11],log_dir:[2,5],logic:5,low:5,mai:4,main:[2,5,9],manag:[2,3,4,5,10],map:[4,9],map_fun:[2,4],mappartit:[5,6],mark:8,marker:[0,1,2],match:[2,4],max_retri:7,maxim:2,maximum:[7,9],mechan:4,memori:[3,9],merg:9,merge_args_param:9,messag:[2,10],messagesocket:10,meta:10,meta_graph_def:9,metadata:[2,4,5,10],metagraph:9,method:[4,5,6,9,10],method_nam:4,mgr:[4,5],mirror:[5,6],mix:9,mode:[2,3,9],model:9,model_dir:9,modul:[0,1],more:[2,5],most:6,msg:10,multi:3,multipl:4,multiprocess:[2,3,5],must:[2,9],name:[4,5,9],namenod:[2,5],namespac:9,need:[2,5,6,9],network:9,next_batch:4,node:[2,4,5,9,10],noderdd:[2,5],none:[2,3,4,5,9,10],note:[4,6,11],now:5,num_epoch:2,num_executor:[2,5],num_gpu:[4,5,7],num_p:[2,9],num_tensor:9,number:[2,4,7,9,10],nvidia:7,object:[2,4,5,8,9,10],often:4,onc:9,one:[3,4,5,6],onli:[2,3,4,9],oper:[2,4],option:9,org:9,origin:9,other:2,otherwis:[3,9],output:[2,4,5,9],output_dir:6,output_map:9,output_tensor_alia:4,output_tensor_nam:4,overhead:4,packag:0,page:0,parallel:[2,5,9],param:9,parent:9,partit:[2,4,5,6,8,9],pass:[2,5,9],path:[2,4,5,6,9,12],per:[2,4,5,9],perspect:4,phase:2,pickl:[3,10],pid:5,pipelin:[0,1,4,6],placehold:9,point:10,poll:10,port:[2,3,5,10,11],ppid:5,prefix:[4,10],previous:4,primarili:[4,6],prior:6,process:[2,3,4,5,9],produc:[6,9],protocol:9,provid:[2,4,5,9],push:[2,4],pyspark:9,python:[3,4,5,9],qname:[2,4,5],qname_in:[4,5],qname_out:[4,5],queue:[2,3,5,8],queue_runn:9,queuerunn:9,rais:[7,9],rang:[2,5],rank:5,rdd:[2,4,5,6,8],rdma:[4,5,9],reach:4,read:[2,9],reader:9,receiv:[2,10],recommend:2,reconnect:5,record:9,refer:5,referenc:3,regist:10,rel:4,remain:10,remot:3,repeat:2,replac:[4,9],report:11,repres:[2,6],represent:[4,5],request:[7,10],request_stop:10,requir:[6,10],reserv:[0,1,2,5,11],reservation_cli:[0,1],resourc:2,respons:[2,4],result:[2,4,5],retri:7,retriev:[4,9],role:4,row:[2,4,6],run:[2,5,9],safe:10,same:3,save:[2,4,5,6,9],saveastfrecord:6,saved_model:[4,9],saved_model_cli:9,saved_model_dir:9,schema:6,scheme:4,search:0,see:[2,5],send:[2,10],sent:4,separ:[4,9],serial:[3,6],serv:9,server:[2,4,5,10,11],server_addr:10,sess:[4,5],session:[4,9],set:[2,9],setbatchs:9,setclusters:9,setdriverpsnod:9,setepoch:9,setexportdir:9,setinputmap:9,setinputmod:9,setmodeldir:9,setnump:9,setoutputmap:9,setprotocol:9,setread:9,setsignaturedefkei:9,setstep:9,settagset:9,settensorboard:9,settfrecorddir:9,share:5,should:[2,4,5,10],should_stop:4,shutdown:[2,5,10,11],signal:[4,11],signatur:[4,5,9],signature_def_kei:[4,9],signature_def_map:4,simpl:[10,11,12],simplenamespac:9,simpli:5,simplifi:4,sinc:[2,4,5,9],singl:9,single_node_env:9,singleton:5,size:4,smi:7,sock:10,socket:10,sourc:[2,3,4,5,6,7,8,9,10,12],spark:[2,4,5,6,9,11],sparkcontext:[2,6],sparkml:9,spawn:[2,5,9],special:8,specif:[4,9],specifi:[4,9],ssc:2,standard:5,start:[2,3,4,10],start_cluster_serv:[4,5],startup:[2,10],state:[2,5],step:[2,4,9],still:4,stop:[2,4,5,10],store:[9,10],stream:2,streamingcontext:[2,11],string:[2,3,4,5,6,7,9,12],stringtyp:6,struct:[],structfield:6,structtyp:6,stru
ctur:6,style:9,submodul:0,subsequ:[2,9],suppli:[2,3],support:9,tag:9,tag_set:[4,5,9],task:[2,9],task_index:5,tcp:10,temporari:9,temporarili:9,tensor1:4,tensor2:4,tensor:[4,9],tensorboard:[2,5,9],tensorboard_url:2,tensorflow:[2,4,5,6,9],tensorn:4,termin:[2,4],tf_arg:[2,5,9],tf_argv:[],tfcluster:[0,1,4,5,10],tfestim:9,tfmanag:[0,1,4,5],tfmodel:9,tfnode:[0,1,2,5],tfnodecontext:[4,5],tfparam:9,tfreader:9,tfrecord:[6,9],tfrecord_dir:9,tfsparknod:[0,1],tftypeconvert:9,than:[4,5],thei:2,therefor:2,thi:[2,4,5,6,7,9,10],thread:[9,10],three:2,tie:2,time:[2,7],todict:9,told:4,totfexampl:6,train:[2,4,5,6,9],train_fn:9,train_mod:[4,5],transform:9,tupl:[3,4,10],type:[6,9],typeconvert:9,undefin:9,unfulfil:10,union:2,uniqu:[2,3,5],unrel:4,until:[7,10],url:2,use:[2,11],used:[2,4,5,6],useful:9,user:[2,5,9],uses:9,using:[4,9],util:[0,1,2,6,9,11],valid:2,valu:[4,9],via:[2,5],when:[2,9],where:5,which:[2,3,6,9],within:[4,9],won:2,work:[2,5],worker:[2,3,4,5,9],worker_num:5,working_dir:[2,5],wrap:[4,5],yarn:5,yield:9,yield_batch:9,you:[2,4],your:[2,4]},titles:["Welcome to TensorFlowOnSpark\u2019s documentation!","tensorflowonspark package","tensorflowonspark.TFCluster module","tensorflowonspark.TFManager module","tensorflowonspark.TFNode module","tensorflowonspark.TFSparkNode module","tensorflowonspark.dfutil module","tensorflowonspark.gpu_info module","tensorflowonspark.marker module","tensorflowonspark.pipeline module","tensorflowonspark.reservation module","tensorflowonspark.reservation_client module","tensorflowonspark.util module"],titleterms:{dfutil:6,document:0,gpu_info:7,indic:0,marker:8,modul:[2,3,4,5,6,7,8,9,10,11,12],packag:1,pipelin:9,reserv:10,reservation_cli:11,submodul:1,tabl:0,tensorflowonspark:[0,1,2,3,4,5,6,7,8,9,10,11,12],tfcluster:2,tfmanag:3,tfnode:4,tfsparknod:5,util:12,welcom:0}}) \ No newline at end of file 
+Search.setIndex({docnames:["index","tensorflowonspark","tensorflowonspark.TFCluster","tensorflowonspark.TFManager","tensorflowonspark.TFNode","tensorflowonspark.TFSparkNode","tensorflowonspark.dfutil","tensorflowonspark.gpu_info","tensorflowonspark.marker","tensorflowonspark.pipeline","tensorflowonspark.reservation","tensorflowonspark.reservation_client","tensorflowonspark.util"],envversion:53,filenames:["index.rst","tensorflowonspark.rst","tensorflowonspark.TFCluster.rst","tensorflowonspark.TFManager.rst","tensorflowonspark.TFNode.rst","tensorflowonspark.TFSparkNode.rst","tensorflowonspark.dfutil.rst","tensorflowonspark.gpu_info.rst","tensorflowonspark.marker.rst","tensorflowonspark.pipeline.rst","tensorflowonspark.reservation.rst","tensorflowonspark.reservation_client.rst","tensorflowonspark.util.rst"],objects:{"":{tensorflowonspark:[1,0,0,"-"]},"tensorflowonspark.TFManager":{TFManager:[3,1,1,""],connect:[3,2,1,""],start:[3,2,1,""]},"tensorflowonspark.TFNode":{DataFeed:[4,1,1,""],batch_results:[4,2,1,""],export_saved_model:[4,2,1,""],hdfs_path:[4,2,1,""],next_batch:[4,2,1,""],start_cluster_server:[4,2,1,""],terminate:[4,2,1,""]},"tensorflowonspark.TFNode.DataFeed":{batch_results:[4,3,1,""],next_batch:[4,3,1,""],should_stop:[4,3,1,""],terminate:[4,3,1,""]},"tensorflowonspark.TFSparkNode":{TFNodeContext:[5,1,1,""],TFSparkNode:[5,1,1,""],inference:[5,2,1,""],run:[5,2,1,""],shutdown:[5,2,1,""],train:[5,2,1,""]},"tensorflowonspark.TFSparkNode.TFNodeContext":{absolute_path:[5,3,1,""],export_saved_model:[5,3,1,""],get_data_feed:[5,3,1,""],start_cluster_server:[5,3,1,""]},"tensorflowonspark.TFSparkNode.TFSparkNode":{cluster_id:[5,4,1,""],mgr:[5,4,1,""]},"tensorflowonspark.gpu_info":{MAX_RETRIES:[7,5,1,""],get_gpus:[7,2,1,""]},"tensorflowonspark.marker":{EndPartition:[8,1,1,""],Marker:[8,1,1,""]},"tensorflowonspark.reservation":{Client:[10,1,1,""],MessageSocket:[10,1,1,""],Reservations:[10,1,1,""],Server:[10,1,1,""]},"tensorflowonspark.reservation.Client":{await_reservations:[10,3,1,""],close:[10,3,1,""],get_reservations:[10,3,1,""],register:[10,3,1,""],request_stop:[10,3,1,""],server_addr:[10,4,1,""],sock:[10,4,1,""]},"tensorflowonspark.reservation.MessageSocket":{receive:[10,3,1,""],send:[10,3,1,""]},"tensorflowonspark.reservation.Reservations":{add:[10,3,1,""],done:[10,3,1,""],get:[10,3,1,""],remaining:[10,3,1,""]},"tensorflowonspark.reservation.Server":{await_reservations:[10,3,1,""],done:[10,4,1,""],reservations:[10,4,1,""],start:[10,3,1,""],stop:[10,3,1,""]},"tensorflowonspark.util":{find_in_path:[12,2,1,""],get_ip_address:[12,2,1,""]},tensorflowonspark:{TFManager:[3,0,0,"-"],TFNode:[4,0,0,"-"],TFSparkNode:[5,0,0,"-"],gpu_info:[7,0,0,"-"],marker:[8,0,0,"-"],reservation:[10,0,0,"-"],util:[12,0,0,"-"]}},objnames:{"0":["py","module","Python module"],"1":["py","class","Python class"],"2":["py","function","Python function"],"3":["py","method","Python method"],"4":["py","attribute","Python attribute"],"5":["py","data","Python 
data"]},objtypes:{"0":"py:module","1":"py:class","2":"py:function","3":"py:method","4":"py:attribute","5":"py:data"},terms:{"abstract":10,"boolean":[4,5,10],"class":[3,4,5,8,10],"default":5,"export":4,"function":[4,5,10],"new":[3,5],"return":[3,4,5,7,10],"true":[4,5,10],And:4,For:[4,5],The:4,These:5,Use:4,about:10,absolut:4,absolute_path:5,access:[3,5],accord:7,accordingli:4,accuraci:4,add:10,address:[3,5,10,12],all:[5,10],alloc:[4,7],also:[3,5],ani:4,api:5,applic:4,arg:[3,4,5,7,10],argpars:5,argument:[4,5],argv:5,arrai:4,associ:4,authkei:3,author:3,avail:7,avoid:4,await:10,await_reserv:10,background:[5,10],base:[3,4,5,8,10],basemanag:3,batch:4,batch_result:4,batch_siz:4,becaus:4,been:10,block:10,cach:3,call:4,caller:4,can:[4,5],check:4,chief:5,client:10,close:10,cluster:[4,5,10],cluster_id:5,cluster_info:[5,10],cluster_meta:5,cluster_spec:[4,5],clusterspec:[4,5],code:4,column:4,comma:7,command:5,commun:[3,4],compat:4,complet:10,condit:4,connect:[3,10],construct:5,constructor:4,contain:[4,5,10],content:0,conveni:[4,5],convert:4,correct:4,could:7,count:10,creat:[3,4],creation:4,ctx:[3,4,5],current:[4,5,10],data:[4,5,8],datafe:[4,5],datafram:4,datardd:5,defaultf:5,delimit:7,deprec:4,desir:[4,7],destin:10,determin:4,dfutil:[0,1],dictionari:[4,5],dictonari:10,directli:5,directori:5,disk:4,distribut:[3,4],doe:4,done:10,dure:[8,10],earli:4,either:3,encapsul:5,end:[4,5,8],endpartit:8,equival:4,etc:[4,5],event:5,except:7,excess:4,executor:[4,5],exist:3,expect:[4,5,10],explicit:4,export_dir:[4,5],export_saved_model:[4,5],extra:4,fals:[4,5,10],feed:[4,5,8],fewer:4,file:[5,12],filesystem:[4,5],find:12,find_in_path:12,fix:5,follow:4,form:[4,5],found:7,free:7,from:[3,4,5],fulfil:10,further:4,gener:4,get:[4,7,10,12],get_data_fe:5,get_gpu:7,get_ip_address:12,get_reserv:10,given:[5,12],gpu:[4,7],gpu_info:[0,1],graph:4,have:[4,10],hdf:[4,5],hdfs_path:[4,5],help:4,helper:4,high:5,host:[3,5,10,12],identifi:[4,5],ids:7,ignor:4,incom:4,index:0,indic:[3,4,5,10],infer:[4,5],inferenc:4,inform:5,input:[4,5],input_map:[4,5],input_tensor_alia:4,input_tensor_nam:4,inputmod:4,instanc:[3,4,5],instanti:4,instead:4,integ:5,intend:[4,5],interact:5,internal_us:[3,4,5],invalid:5,invok:4,item:4,itself:4,iverb:4,job:5,job_nam:5,kei:[3,4],length:[4,10],level:5,line:5,list:[7,10],listen:10,local:[3,5],log:5,log_dir:5,logic:5,low:5,mai:4,main:5,manag:[3,4,5,10],map:4,map_fun:4,mappartit:5,mark:8,marker:[0,1],match:4,max_retri:7,maximum:7,mechan:4,memori:3,messag:10,messagesocket:10,meta:10,metadata:[4,5,10],method:[4,5,10],method_nam:4,mgr:[4,5],mirror:5,mode:3,modul:[0,1],more:5,msg:10,multi:3,multipl:4,multiprocess:[3,5],name:[4,5],namenod:5,need:5,next_batch:4,node:[4,5,10],noderdd:5,none:[3,4,5,10],note:4,now:5,num_executor:5,num_gpu:[4,5,7],number:[4,7,10],nvidia:7,object:[4,5,8,10],often:4,one:[3,4,5],onli:[3,4],oper:4,otherwis:3,output:[4,5],output_tensor_alia:4,output_tensor_nam:4,overhead:4,packag:0,page:0,parallel:5,partit:[4,5,8],pass:5,path:[4,5,12],per:[4,5],perspect:4,pickl:[3,10],pid:5,pipelin:[0,1,4],point:10,poll:10,port:[3,5,10],ppid:5,prefix:[4,10],previous:4,primarili:4,process:[3,4,5],provid:[4,5],push:4,python:[3,4,5],qname:[4,5],qname_in:[4,5],qname_out:[4,5],queue:[3,5,8],rais:7,rang:5,rank:5,rdd:[4,5,8],rdma:[4,5],reach:4,receiv:10,reconnect:5,refer:5,referenc:3,regist:10,rel:4,remain:10,remot:3,replac:4,represent:[4,5],request:[7,10],request_stop:10,requir:10,reserv:[0,1,5],reservation_cli:[0,1],respons:4,result:[4,5],retri:7,retriev:4,role:4,row:4,run:5,safe:10,same:3,save:[4,5],saved_model:4,scheme:4,s
earch:0,see:5,send:10,sent:4,separ:4,serial:3,server:[4,5,10],server_addr:10,sess:[4,5],session:4,share:5,should:[4,5,10],should_stop:4,shutdown:[5,10],signal:4,signatur:[4,5],signature_def_kei:4,signature_def_map:4,simpl:[10,12],simpli:5,simplifi:4,sinc:[4,5],singleton:5,size:4,smi:7,sock:10,socket:10,sourc:[3,4,5,7,8,10,12],spark:[4,5],spawn:5,special:8,specif:4,specifi:4,standard:5,start:[3,4,10],start_cluster_serv:[4,5],startup:10,state:5,statu:10,step:4,still:4,stop:[4,5,10],store:10,string:[3,4,5,7,12],submodul:0,suppli:3,tag_set:[4,5],task_index:5,tcp:10,tensor1:4,tensor2:4,tensor:4,tensorboard:5,tensorflow:[4,5],tensorn:4,termin:4,tf_arg:5,tfcluster:[0,1,4,5,10],tfmanag:[0,1,4,5],tfnode:[0,1,5],tfnodecontext:[4,5],tfsparknod:[0,1],than:[4,5],thi:[4,5,7,10],thread:10,time:7,timeout:10,told:4,train:[4,5],train_mod:[4,5],tupl:[3,4,10],unfulfil:10,uniqu:[3,5],unrel:4,until:[7,10],used:[4,5],user:5,using:4,util:[0,1],valu:4,via:5,where:5,which:3,within:4,work:5,worker:[3,4,5],worker_num:5,working_dir:5,wrap:[4,5],yarn:5,you:4,your:4},titles:["Welcome to TensorFlowOnSpark\u2019s documentation!","tensorflowonspark package","tensorflowonspark.TFCluster module","tensorflowonspark.TFManager module","tensorflowonspark.TFNode module","tensorflowonspark.TFSparkNode module","tensorflowonspark.dfutil module","tensorflowonspark.gpu_info module","tensorflowonspark.marker module","tensorflowonspark.pipeline module","tensorflowonspark.reservation module","tensorflowonspark.reservation_client module","tensorflowonspark.util module"],titleterms:{dfutil:6,document:0,gpu_info:7,indic:0,marker:8,modul:[2,3,4,5,6,7,8,9,10,11,12],packag:1,pipelin:9,reserv:10,reservation_cli:11,submodul:1,tabl:0,tensorflowonspark:[0,1,2,3,4,5,6,7,8,9,10,11,12],tfcluster:2,tfmanag:3,tfnode:4,tfsparknod:5,util:12,welcom:0}}) \ No newline at end of file diff --git a/docs/tensorflowonspark.TFCluster.html b/docs/tensorflowonspark.TFCluster.html index 02bf55f7..ab2ad4e5 100644 --- a/docs/tensorflowonspark.TFCluster.html +++ b/docs/tensorflowonspark.TFCluster.html @@ -4,20 +4,12 @@ + - tensorflowonspark.TFCluster module — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.TFCluster module — TensorFlowOnSpark 1.2.1 documentation - + @@ -25,8 +17,7 @@ - - + @@ -52,223 +43,8 @@

    Navigation

    -
    -

    tensorflowonspark.TFCluster module

    -

    This module provides a high-level API to manage the TensorFlowOnSpark cluster.

    -

    There are three main phases of operation:

    -
      -
1. Reservation/Startup - reserves a port for the TensorFlow process on each executor, starts a multiprocessing.Manager to listen for data/control messages, and then launches the TensorFlow main function on the executors.
2. Data feeding - For InputMode.SPARK only. Sends RDD data to the TensorFlow nodes via each executor’s multiprocessing.Manager. PS nodes will tie up their executors, so they won’t receive any subsequent data feeding tasks.
3. Shutdown - sends a shutdown control message to the multiprocessing.Managers of the PS nodes and pushes end-of-feed markers into the data queues of the worker nodes.
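A hedged driver-side sketch of these phases using the run/train/shutdown API documented below; map_fun, args and dataRDD are placeholders for a real application:

from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from tensorflowonspark import TFCluster

sc = SparkContext(conf=SparkConf().setAppName("tfos_example"))
num_executors = 4

cluster = TFCluster.run(sc, map_fun, args, num_executors, num_ps=1,
                        tensorboard=False,
                        input_mode=TFCluster.InputMode.SPARK)   # 1. reservation/startup
cluster.train(dataRDD, num_epochs=1)                            # 2. data feeding (InputMode.SPARK only)
cluster.shutdown()                                              # 3. shutdown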
    -
    -
    -class InputMode[source]
    -

    Bases: object

    -

    Enum for the input modes of data feeding.

    -
    -
    -SPARK = 1
    -

    Spark is responsible for feeding data to the TensorFlow application via an RDD.

    -
    - -
    -
    -TENSORFLOW = 0
    -

    TensorFlow application is responsible for reading any data.

    -
    - -
    - -
    -
    -class TFCluster[source]
    -

    Bases: object

    -
    -
    -cluster_id = None
    -

    Unique ID for this cluster, used to invalidate state for new clusters.

    -
    - -
    -
    -cluster_info = None
    -

    Cluster node reservations

    -
    - -
    -
    -cluster_meta = None
    -

    Cluster metadata dictionary, e.g. cluster_id, defaultFS, reservation.Server address, etc.

    -
    - -
    -
    -defaultFS = None
    -

    Default FileSystem string, e.g. file:// or hdfs://<namenode>/

    -
    - -
    -
    -inference(dataRDD, qname='input')[source]
    -

    For InputMode.SPARK only: Feeds Spark RDD partitions into the TensorFlow worker nodes and returns an RDD of results

    -

    It is the responsibility of the TensorFlow “main” function to interpret the rows of the RDD and provide valid data for the output RDD.

    -

This will use the distributed TensorFlow cluster for inferencing, so the TensorFlow “main” function should be capable of inferencing. Per Spark design, the output RDD will be lazily-executed only when a Spark action is invoked on the RDD.

    -
    -
    Args:
    -
    --- - - - - - -
    dataRDD:input data as a Spark RDD
    qname:INTERNAL_USE
    -
    -
    Returns:
    -
    A Spark RDD representing the output of the TensorFlow inferencing
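For example (a hedged sketch, assuming a cluster started via run() and a purely hypothetical output path), a Spark action is required to actually trigger the inferencing:

    resultRDD = cluster.inference(dataRDD)
    resultRDD.saveAsTextFile("hdfs:///user/me/predictions")  # the action forces the lazy output RDD to execute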
    -
    -
    - -
    -
    -input_mode = None
    -

    TFCluster.InputMode for this cluster

    -
    - -
    -
    -nodeRDD = None
    -

    RDD representing the nodes of the cluster, i.e. sc.parallelize(range(num_executors), num_executors)

    -
    - -
    -
    -num_executors = None
    -

    Number of executors in the Spark job (and therefore, the number of nodes in the TensorFlow cluster).

    -
    - -
    -
    -queues = None
    -

    INTERNAL_USE

    -
    - -
    -
    -sc = None
    -

    SparkContext

    -
    - -
    -
    -server = None
    -

    reservation.Server for this cluster

    -
    - -
    -
    -shutdown(ssc=None)[source]
    -

    Stops the distributed TensorFlow cluster.

    -
    -
    Args:
    -
    --- - - - -
    ssc:For Streaming applications only. Spark StreamingContext
    -
    -
    -
    - -
    -
    -tensorboard_url()[source]
    -

    Utility function to get the Tensorboard URL

    -
    - -
    -
    -train(dataRDD, num_epochs=0, qname='input')[source]
    -

    For InputMode.SPARK only. Feeds Spark RDD partitions into the TensorFlow worker nodes

    -

    It is the responsibility of the TensorFlow “main” function to interpret the rows of the RDD.

    -

Since epochs are implemented via RDD.union() and the entire RDD must generally be processed in full, it is recommended to set num_epochs to closely match your training termination condition (e.g. steps or accuracy). See TFNode.DataFeed for more details.

    -
    -
    Args:
    -
    --- - - - - - - - -
    dataRDD:input data as a Spark RDD.
    num_epochs:number of times to repeat the dataset during training.
    qname:INTERNAL USE.
    -
    -
    -
    - -
    -
    -working_dir = None
    -

    Current working directory

    -
    - -
    - -
    -
    -run(sc, map_fun, tf_args, num_executors, num_ps, tensorboard=False, input_mode=0, log_dir=None, driver_ps_nodes=False, queues=['input', 'output'])[source]
    -

Starts the TensorFlowOnSpark cluster and runs the TensorFlow “main” function on the Spark executors

    -
    -
    Args:
    -
    --- - - - - - - - - - - - - - - - - - - - - - - -
    sc:SparkContext
    map_fun:user-supplied TensorFlow “main” function
    tf_args:argparse args, or command-line ARGV. These will be passed to the map_fun.
    num_executors:number of Spark executors. This should match your Spark job’s --num_executors.
    num_ps:number of Spark executors which are reserved for TensorFlow PS nodes. All other executors will be used as TensorFlow worker nodes.
    tensorboard:boolean indicating if the chief worker should spawn a Tensorboard server.
    input_mode:TFCluster.InputMode
    log_dir:directory to save tensorboard event logs. If None, defaults to a fixed path on local filesystem.
    driver_ps_nodes:
run the PS nodes on the driver locally instead of on the Spark executors; this helps maximize computing resources (esp. GPU). You will need to set cluster_size = num_executors + num_ps
    queues:INTERNAL_USE
    -
    -
    Returns:
    -
    A TFCluster object representing the started cluster.
    -
    -
    - +
    +

    tensorflowonspark.TFCluster module

    @@ -292,12 +68,14 @@

    This Page

    @@ -319,13 +97,13 @@

    Navigation

  • previous |
  • - +
    \ No newline at end of file diff --git a/docs/tensorflowonspark.TFManager.html b/docs/tensorflowonspark.TFManager.html index 682e6ec9..bc1348fc 100644 --- a/docs/tensorflowonspark.TFManager.html +++ b/docs/tensorflowonspark.TFManager.html @@ -4,20 +4,12 @@ + - tensorflowonspark.TFManager module — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.TFManager module — TensorFlowOnSpark 1.2.1 documentation - + @@ -25,8 +17,7 @@ - - + @@ -56,8 +47,8 @@

    Navigation

    tensorflowonspark.TFManager module

    -class TFManager(address=None, authkey=None, serializer='pickle')[source]
    -

    Bases: multiprocessing.managers.BaseManager

    +class TFManager(address=None, authkey=None, serializer='pickle', ctx=None)[source] +

    Bases: multiprocessing.managers.BaseManager

    Python multiprocessing.Manager for distributed, multi-process communication.
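A minimal sketch of reconnecting to an executor’s TFManager from another Python process on the same node (or from the Spark driver, for managers started in 'remote' mode); the addr and authkey values are assumed to come from that executor’s reservation info:

    from tensorflowonspark import TFManager

    mgr = TFManager.connect(addr, authkey)    # addr/authkey as published in cluster_info
    state = str(mgr.get('state'))             # shared key/value store
    queue = mgr.get_queue('control')          # named JoinableQueue, e.g. 'input', 'output', 'control'
    queue.put(None)                           # e.g. signal a PS node to stop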

    @@ -130,12 +121,14 @@

    This Page

    @@ -157,13 +150,13 @@

    Navigation

  • previous |
  • - + \ No newline at end of file diff --git a/docs/tensorflowonspark.TFNode.html b/docs/tensorflowonspark.TFNode.html index 3295a4b7..2258e898 100644 --- a/docs/tensorflowonspark.TFNode.html +++ b/docs/tensorflowonspark.TFNode.html @@ -4,20 +4,12 @@ + - tensorflowonspark.TFNode module — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.TFNode module — TensorFlowOnSpark 1.2.1 documentation - + @@ -25,8 +17,7 @@ - - + @@ -57,13 +48,13 @@

    Navigation

    This module provides helper functions for the TensorFlow application.

    Primarily, these functions help with:

    class DataFeed(mgr, train_mode=True, qname_in='input', qname_out='output', input_mapping=None)[source]
    -

    Bases: object

    +

    Bases: object

    This class manages the InputMode.SPARK data feeding process from the perspective of the TensorFlow application.

    Args:
    @@ -88,8 +79,8 @@

    Navigation

    batch_results(results)[source]
    -

    Push a batch of output results to the Spark output RDD of TFCluster.inference().

    -

    Note: this currently expects a one-to-one mapping of input to output data, so the length of the results array should match the length of +

    Push a batch of output results to the Spark output RDD of TFCluster.inference().

    +

    Note: this currently expects a one-to-one mapping of input to output data, so the length of the results array should match the length of the previously retrieved batch of input data.

    Args:
    @@ -111,12 +102,12 @@

    Navigation

    Gets a batch of items from the input RDD.

If multiple tensors are provided per row in the input RDD, e.g. tuple of (tensor1, tensor2, …, tensorN) and:

    • no input_mapping was provided to the DataFeed constructor, this will return an array of batch_size tuples, and the caller is responsible for separating the tensors.
    • an input_mapping was provided to the DataFeed constructor, this will return a dictionary of N tensors, with tensor names as keys and arrays of length batch_size as values.
    -

    Note: if the end of the data is reached, this may return with fewer than batch_size items.

    +

    Note: if the end of the data is reached, this may return with fewer than batch_size items.

    Args:
    @@ -136,7 +127,7 @@

    Navigation

    should_stop()[source]
    -

    Check if the feed process was told to stop (by a call to terminate).

    +

    Check if the feed process was told to stop (by a call to terminate).

    @@ -162,7 +153,7 @@

    Navigation

    export_saved_model(sess, export_dir, tag_set, signatures)[source]

    Convenience function to export a saved_model using provided arguments

    The caller specifies the saved_model signatures in a simplified python dictionary form, as follows:

    -
    signatures = {
    +
    signatures = {
       'signature_def_key': {
         'inputs': { 'input_tensor_alias': input_tensor_name },
         'outputs': { 'output_tensor_alias': output_tensor_name },
    @@ -190,7 +181,7 @@ 

    Navigation

    Returns:
    -
    A saved_model exported to disk at export_dir.
    +
    A saved_model exported to disk at export_dir.
    @@ -225,9 +216,9 @@

    Navigation

    start_cluster_server(ctx, num_gpus=1, rdma=False)[source]
    -

    Function that wraps the creation of TensorFlow tf.train.Server for a node in a distributed TensorFlow cluster.

    -

    This is intended to be invoked from within the TF map_fun, replacing explicit code to instantiate tf.train.ClusterSpec -and tf.train.Server objects.

    +

    Function that wraps the creation of TensorFlow tf.train.Server for a node in a distributed TensorFlow cluster.

    +

    This is intended to be invoked from within the TF map_fun, replacing explicit code to instantiate tf.train.ClusterSpec +and tf.train.Server objects.

    Args:
    @@ -277,12 +268,14 @@

    This Page

    @@ -304,13 +297,13 @@

    Navigation

  • previous |
  • - + \ No newline at end of file diff --git a/docs/tensorflowonspark.TFSparkNode.html b/docs/tensorflowonspark.TFSparkNode.html index 10341f6d..c494aa55 100644 --- a/docs/tensorflowonspark.TFSparkNode.html +++ b/docs/tensorflowonspark.TFSparkNode.html @@ -4,20 +4,12 @@ + - tensorflowonspark.TFSparkNode module — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.TFSparkNode module — TensorFlowOnSpark 1.2.1 documentation - + @@ -25,8 +17,7 @@ - - + @@ -58,7 +49,8 @@

    Navigation

    class TFNodeContext(worker_num, job_name, task_index, cluster_spec, defaultFS, working_dir, mgr)[source]
    -

    Encapsulates unique metadata for a TensorFlowOnSpark node/executor and provides methods to interact with Spark and HDFS.

    +

    Bases: object

    +

    Encapsulates unique metadata for a TensorFlowOnSpark node/executor and provides methods to interact with Spark and HDFS.

An instance of this object will be passed to the TensorFlow “main” function via the ctx argument. To simplify the end-user API, this class now mirrors the functions of the TFNode module.
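A minimal sketch of a TensorFlow “main” function using this context object (assuming InputMode.SPARK feeding; the batch size and the next_batch method name are used here as illustrative assumptions):

    def map_fun(args, ctx):
      cluster, server = ctx.start_cluster_server(num_gpus=1)
      if ctx.job_name == "ps":
        server.join()                            # PS nodes just serve variables until shutdown
      else:
        tf_feed = ctx.get_data_feed(train_mode=True)
        while not tf_feed.should_stop():
          batch = tf_feed.next_batch(100)        # pull up to 100 rows fed from the Spark RDD
          if not batch:
            break
          # ... run one training step on `batch` ...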

    @@ -67,7 +59,7 @@

    Navigation

    - + @@ -75,7 +67,7 @@

    Navigation

    - + @@ -88,25 +80,25 @@

    Navigation

    absolute_path(path)[source]
    -

    Convenience function to access TFNode.hdfs_path directly from this object instance.

    +

    Convenience function to access TFNode.hdfs_path directly from this object instance.

    export_saved_model(sess, export_dir, tag_set, signatures)[source]
    -

    Convenience function to access TFNode.export_saved_model directly from this object instance.

    +

    Convenience function to access TFNode.export_saved_model directly from this object instance.

    get_data_feed(train_mode=True, qname_in='input', qname_out='output', input_mapping=None)[source]
    -

    Convenience function to access TFNode.DataFeed directly from this object instance.

    +

    Convenience function to access TFNode.DataFeed directly from this object instance.

    start_cluster_server(num_gpus=1, rdma=False)[source]
    -

    Convenience function to access TFNode.start_cluster_server directly from this object instance.

    +

    Convenience function to access TFNode.start_cluster_server directly from this object instance.

    @@ -114,11 +106,11 @@

    Navigation

    class TFSparkNode[source]
    -

    Bases: object

    +

    Bases: object

    Low-level functions used by the high-level TFCluster APIs to manage cluster state.

    This class is not intended for end-users (see TFNode for end-user APIs).

For cluster management, this wraps the per-node cluster logic as Spark RDD mapPartitions functions, where the RDD is expected to be a “nodeRDD” of the form: nodeRDD = sc.parallelize(range(num_executors), num_executors).

    For data feeding, this wraps the feeding logic as Spark RDD mapPartitions functions on a standard “dataRDD”.

    This also manages a reference to the TFManager “singleton” per executor. Since Spark can spawn more than one python-worker per executor, this will reconnect to the “singleton” instance as needed.

    @@ -170,7 +162,7 @@

    Navigation

    - + @@ -193,7 +185,7 @@

    Navigation

    shutdown(cluster_info, queues=['input'])[source]
    -

    Stops all TensorFlow nodes by feeding None into the multiprocessing.Queues.

    +

    Stops all TensorFlow nodes by feeding None into the multiprocessing.Queues.

    Args:
    worker_num:integer identifier for this executor, per nodeRDD = sc.parallelize(range(num_executors), num_executors).
    worker_num:integer identifier for this executor, per nodeRDD = sc.parallelize(range(num_executors), num_executors).
    job_name:TensorFlow job name (e.g. ‘ps’ or ‘worker’) of this TF node, per cluster_spec.
    cluster_spec:dictionary for constructing a tf.train.ClusterSpec.
    defaultFS:string representation of default FileSystem, e.g. file:// or hdfs://<namenode>:8020/.
    defaultFS:string representation of default FileSystem, e.g. file:// or hdfs://<namenode>:8020/.
    working_dir:the current working directory for local filesystems, or YARN containers.
    fn:TensorFlow “main” function provided by the user.
    tf_args:argparse args, or command line ARGV. These will be passed to the fn.
    tf_args:argparse args, or command line ARGV. These will be passed to the fn.
    cluster_meta:dictionary of cluster metadata (e.g. cluster_id, reservation.Server address, etc).
    @@ -259,12 +251,14 @@

    This Page

    @@ -286,13 +280,13 @@

    Navigation

  • previous |
  • - + \ No newline at end of file diff --git a/docs/tensorflowonspark.dfutil.html b/docs/tensorflowonspark.dfutil.html index 42bec6e4..22da42d6 100644 --- a/docs/tensorflowonspark.dfutil.html +++ b/docs/tensorflowonspark.dfutil.html @@ -4,20 +4,12 @@ + - tensorflowonspark.dfutil module — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.dfutil module — TensorFlowOnSpark 1.2.1 documentation - + @@ -25,8 +17,7 @@ - - + @@ -52,152 +43,8 @@

    Navigation

    -
    -

    tensorflowonspark.dfutil module

    -

    A collection of utility functions for loading/saving TensorFlow TFRecords files as Spark DataFrames.
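A minimal sketch of round-tripping TFRecords through a DataFrame (the paths and the binary image_raw feature are hypothetical; binary_features is the “hint” described below):

    from tensorflowonspark import dfutil

    df = dfutil.loadTFRecords(sc, "hdfs:///user/me/tfrecords", binary_features=['image_raw'])
    dfutil.saveAsTFRecords(df, "hdfs:///user/me/tfrecords_copy")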

    -
    -
    -fromTFExample(iter, binary_features=[])[source]
    -

    mapPartition function to convert an RDD of serialized tf.train.Example bytestring into an RDD of Row.

    -

Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a “hint” from the caller in the binary_features argument.

    -
    -
    Args:
    -
    --- - - - - - - -
    iter:the RDD partition iterator
    binary_features:
     a list of tf.train.Example features which are expected to be binary/bytearrays.
    -
    -
    Returns:
    -
    An array/iterator of DataFrame Row with features converted into columns.
    -
    -
    - -
    -
    -infer_schema(example, binary_features=[])[source]
    -

    Given a tf.train.Example, infer the Spark DataFrame schema (StructFields).

    -

Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a “hint” from the caller in the binary_features argument.

    -
    -
    Args:
    -
    --- - - - - - - -
    example:a tf.train.Example
    binary_features:
     a list of tf.train.Example features which are expected to be binary/bytearrays.
    -
    -
    Returns:
    -
    A DataFrame StructType schema
    -
    -
    - -
    -
    -isLoadedDF(df)[source]
    -

    Returns True if the input DataFrame was produced by the loadTFRecords() method.

    -

    This is primarily used by the Spark ML Pipelines APIs.

    -
    -
    Args:
    -
    --- - - - -
    df:Spark Dataframe
    -
    -
    -
    - -
    -
    -loadTFRecords(sc, input_dir, binary_features=[])[source]
    -

    Load TFRecords from disk into a Spark DataFrame.

    -

    This will attempt to automatically convert the tf.train.Example features into Spark DataFrame columns of equivalent types.

    -

Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a “hint” from the caller in the binary_features argument.

    -
    -
    Args:
    -
    --- - - - - - - - - -
    sc:SparkContext
    input_dir:location of TFRecords on disk.
    binary_features:
     a list of tf.train.Example features which are expected to be binary/bytearrays.
    -
    -
    Returns:
    -
    A Spark DataFrame mirroring the tf.train.Example schema.
    -
    -
    - -
    -
    -saveAsTFRecords(df, output_dir)[source]
    -

    Save a Spark DataFrame as TFRecords.

    -

    This will convert the DataFrame rows to TFRecords prior to saving.

    -
    -
    Args:
    -
    --- - - - - - -
    df:Spark DataFrame
    output_dir:Path to save TFRecords
    -
    -
    -
    - -
    -
    -toTFExample(dtypes)[source]
    -

    mapPartition function to convert a Spark RDD of Row into an RDD of serialized tf.train.Example bytestring.

    -

Note that tf.train.Example is a fairly flat structure with limited datatypes, e.g. tf.train.FloatList, tf.train.Int64List, and tf.train.BytesList, so most DataFrame types will be coerced into one of these types.

    -
    -
    Args:
    -
    --- - - - -
    dtypes:the DataFrame.dtypes of the source DataFrame.
    -
    -
    Returns:
    -
    A mapPartition function which converts the source DataFrame into tf.train.Example bytestrings.
    -
    -
    - +
    +

    tensorflowonspark.dfutil module

    @@ -221,12 +68,14 @@

    This Page

    @@ -248,13 +97,13 @@

    Navigation

  • previous |
  • - + \ No newline at end of file diff --git a/docs/tensorflowonspark.gpu_info.html b/docs/tensorflowonspark.gpu_info.html index 5def8bad..9601ec04 100644 --- a/docs/tensorflowonspark.gpu_info.html +++ b/docs/tensorflowonspark.gpu_info.html @@ -4,20 +4,12 @@ + - tensorflowonspark.gpu_info module — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.gpu_info module — TensorFlowOnSpark 1.2.1 documentation - + @@ -25,8 +17,7 @@ - - + @@ -64,7 +55,7 @@

    Navigation

    get_gpus(num_gpu=1)[source]

    Get list of free GPUs according to nvidia-smi.

    -

    This will retry for MAX_RETRIES times until the requested number of GPUs are available.

    +

    This will retry for MAX_RETRIES times until the requested number of GPUs are available.

    Args:
    @@ -104,12 +95,14 @@

    This Page

    @@ -131,13 +124,13 @@

    Navigation

  • previous |
  • - + \ No newline at end of file diff --git a/docs/tensorflowonspark.html b/docs/tensorflowonspark.html index 5b8e2e3b..794d2eda 100644 --- a/docs/tensorflowonspark.html +++ b/docs/tensorflowonspark.html @@ -4,20 +4,12 @@ + - tensorflowonspark package — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark package — TensorFlowOnSpark 1.2.1 documentation - + @@ -25,8 +17,7 @@ - - + @@ -102,12 +93,14 @@

    This Page

    @@ -129,12 +122,12 @@

    Navigation

  • previous |
  • - + \ No newline at end of file diff --git a/docs/tensorflowonspark.marker.html b/docs/tensorflowonspark.marker.html index fb13ebf3..effae43a 100644 --- a/docs/tensorflowonspark.marker.html +++ b/docs/tensorflowonspark.marker.html @@ -4,20 +4,12 @@ + - tensorflowonspark.marker module — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.marker module — TensorFlowOnSpark 1.2.1 documentation - + @@ -25,8 +17,7 @@ - - + @@ -57,14 +48,14 @@

    Navigation

    class EndPartition[source]
    -

    Bases: tensorflowonspark.marker.Marker

    +

    Bases: tensorflowonspark.marker.Marker

    Marks the end of an RDD Partition during data feeding

    class Marker[source]
    -

    Bases: object

    +

    Bases: object

    Base class for special marker objects in the data queue

    @@ -91,12 +82,14 @@

    This Page

    @@ -118,13 +111,13 @@

    Navigation

  • previous |
  • - + \ No newline at end of file diff --git a/docs/tensorflowonspark.pipeline.html b/docs/tensorflowonspark.pipeline.html index 06dc963f..12dcdbbb 100644 --- a/docs/tensorflowonspark.pipeline.html +++ b/docs/tensorflowonspark.pipeline.html @@ -4,20 +4,12 @@ + - tensorflowonspark.pipeline module — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.pipeline module — TensorFlowOnSpark 1.2.1 documentation - + @@ -25,8 +17,7 @@ - - + @@ -52,540 +43,8 @@

    Navigation

    -
    -

    tensorflowonspark.pipeline module

    -

    This module extends the TensorFlowOnSpark API to support Spark ML Pipelines.

    -

It provides a TFEstimator class to fit a TFModel using TensorFlow. The TFEstimator will actually spawn a TensorFlowOnSpark cluster to conduct distributed training, but due to architectural limitations, the TFModel will only run single-node TensorFlow instances when inferencing on the executors. The executors will run in parallel, but the TensorFlow model must fit in the memory of each executor.

    -

There is also an option to provide a separate “export” function, which allows users to export a different graph for inferencing vs. training. This is useful when the training graph uses InputMode.TENSORFLOW with queue_runners, but the inferencing graph needs placeholders. And this is especially useful for exporting saved_models for TensorFlow Serving.
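A minimal sketch of the training side of this API (assuming a user-provided main_fun(args, ctx), argparse-style args, and a training DataFrame train_df; column names, tensor names, and paths are hypothetical):

    from tensorflowonspark.pipeline import TFEstimator

    estimator = TFEstimator(main_fun, args)
    estimator.setInputMapping({'image': 'x', 'label': 'y_'})   # DataFrame column -> input tensor
    estimator.setClusterSize(4)
    estimator.setNumPS(1)
    estimator.setEpochs(1)
    estimator.setModelDir("hdfs:///user/me/model")
    estimator.setExportDir("hdfs:///user/me/export")
    model = estimator.fit(train_df)   # spawns a TensorFlowOnSpark cluster for distributed training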

    -
    -
    -class HasBatchSize[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -batch_size = Param(parent='undefined', name='batch_size', doc='Number of records per batch')
    -
    - -
    -
    -getBatchSize()[source]
    -
    - -
    -
    -setBatchSize(value)[source]
    -
    - -
    - -
    -
    -class HasClusterSize[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -cluster_size = Param(parent='undefined', name='cluster_size', doc='Number of nodes in the cluster')
    -
    - -
    -
    -getClusterSize()[source]
    -
    - -
    -
    -setClusterSize(value)[source]
    -
    - -
    - -
    -
    -class HasEpochs[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -epochs = Param(parent='undefined', name='epochs', doc='Number of epochs to train')
    -
    - -
    -
    -getEpochs()[source]
    -
    - -
    -
    -setEpochs(value)[source]
    -
    - -
    - -
    -
    -class HasExportDir[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -export_dir = Param(parent='undefined', name='export_dir', doc='Directory to export saved_model')
    -
    - -
    -
    -getExportDir()[source]
    -
    - -
    -
    -setExportDir(value)[source]
    -
    - -
    - -
    -
    -class HasInputMapping[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -getInputMapping()[source]
    -
    - -
    -
    -input_mapping = Param(parent='undefined', name='input_mapping', doc='Mapping of input DataFrame column to input tensor')
    -
    - -
    -
    -setInputMapping(value)[source]
    -
    - -
    - -
    -
    -class HasInputMode[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -getInputMode()[source]
    -
    - -
    -
    -input_mode = Param(parent='undefined', name='input_mode', doc='Input data feeding mode (0=TENSORFLOW, 1=SPARK)')
    -
    - -
    -
    -setInputMode(value)[source]
    -
    - -
    - -
    -
    -class HasModelDir[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -getModelDir()[source]
    -
    - -
    -
    -model_dir = Param(parent='undefined', name='model_dir', doc='Path to save/load model checkpoints')
    -
    - -
    -
    -setModelDir(value)[source]
    -
    - -
    - -
    -
    -class HasNumPS[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -driver_ps_nodes = Param(parent='undefined', name='driver_ps_nodes', doc='Run PS nodes on driver locally')
    -
    - -
    -
    -getDriverPSNodes()[source]
    -
    - -
    -
    -getNumPS()[source]
    -
    - -
    -
    -num_ps = Param(parent='undefined', name='num_ps', doc='Number of PS nodes in cluster')
    -
    - -
    -
    -setDriverPSNodes(value)[source]
    -
    - -
    -
    -setNumPS(value)[source]
    -
    - -
    - -
    -
    -class HasOutputMapping[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -getOutputMapping()[source]
    -
    - -
    -
    -output_mapping = Param(parent='undefined', name='output_mapping', doc='Mapping of output tensor to output DataFrame column')
    -
    - -
    -
    -setOutputMapping(value)[source]
    -
    - -
    - -
    -
    -class HasProtocol[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -getProtocol()[source]
    -
    - -
    -
    -protocol = Param(parent='undefined', name='protocol', doc='Network protocol for Tensorflow (grpc|rdma)')
    -
    - -
    -
    -setProtocol(value)[source]
    -
    - -
    - -
    -
    -class HasReaders[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -getReaders()[source]
    -
    - -
    -
    -readers = Param(parent='undefined', name='readers', doc='number of reader/enqueue threads')
    -
    - -
    -
    -setReaders(value)[source]
    -
    - -
    - -
    -
    -class HasSignatureDefKey[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -getSignatureDefKey()[source]
    -
    - -
    -
    -setSignatureDefKey(value)[source]
    -
    - -
    -
    -signature_def_key = Param(parent='undefined', name='signature_def_key', doc='Identifier for a specific saved_model signature')
    -
    - -
    - -
    -
    -class HasSteps[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -getSteps()[source]
    -
    - -
    -
    -setSteps(value)[source]
    -
    - -
    -
    -steps = Param(parent='undefined', name='steps', doc='Maximum number of steps to train')
    -
    - -
    - -
    -
    -class HasTFRecordDir[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -getTFRecordDir()[source]
    -
    - -
    -
    -setTFRecordDir(value)[source]
    -
    - -
    -
    -tfrecord_dir = Param(parent='undefined', name='tfrecord_dir', doc='Path to temporarily export a DataFrame as TFRecords (for InputMode.TENSORFLOW apps)')
    -
    - -
    - -
    -
    -class HasTagSet[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -getTagSet()[source]
    -
    - -
    -
    -setTagSet(value)[source]
    -
    - -
    -
    -tag_set = Param(parent='undefined', name='tag_set', doc='Comma-delimited list of tags identifying a saved_model metagraph')
    -
    - -
    - -
    -
    -class HasTensorboard[source]
    -

    Bases: pyspark.ml.param.Params

    -
    -
    -getTensorboard()[source]
    -
    - -
    -
    -setTensorboard(value)[source]
    -
    - -
    -
    -tensorboard = Param(parent='undefined', name='tensorboard', doc='Launch tensorboard process')
    -
    - -
    - -
    -
    -class Namespace(d)[source]
    -

    Bases: object

    -

    Utility class to convert dictionaries to Namespace-like objects.

    -

    Based on https://docs.python.org/dev/library/types.html#types.SimpleNamespace

    -
    -
    -argv = None
    -
    - -
    - -
    -
    -class TFEstimator(train_fn, tf_args, export_fn=None)[source]
    -

    Bases: pyspark.ml.base.Estimator, tensorflowonspark.pipeline.TFParams, tensorflowonspark.pipeline.HasInputMapping, tensorflowonspark.pipeline.HasClusterSize, tensorflowonspark.pipeline.HasNumPS, tensorflowonspark.pipeline.HasInputMode, tensorflowonspark.pipeline.HasProtocol, tensorflowonspark.pipeline.HasTensorboard, tensorflowonspark.pipeline.HasModelDir, tensorflowonspark.pipeline.HasExportDir, tensorflowonspark.pipeline.HasTFRecordDir, tensorflowonspark.pipeline.HasBatchSize, tensorflowonspark.pipeline.HasEpochs, tensorflowonspark.pipeline.HasReaders, tensorflowonspark.pipeline.HasSteps

    -

    Spark ML Estimator which launches a TensorFlowOnSpark cluster for distributed training.

    -

    The columns of the DataFrame passed to the fit() method will be mapped to TensorFlow tensors according to the setInputMapping() method.

    -

If an export_fn was provided to the constructor, it will be run on a single executor immediately after the distributed training has completed. This allows users to export a TensorFlow saved_model with a different execution graph for inferencing, e.g. replacing an input graph of TFReaders and QueueRunners with Placeholders.

    -

For InputMode.TENSORFLOW, the input DataFrame will be exported as TFRecords to a temporary location specified by the tfrecord_dir. The TensorFlow application will then be expected to read directly from this location during training. However, if the input DataFrame was produced by the dfutil.loadTFRecords() method, i.e. originated from TFRecords on disk, then the tfrecord_dir will be set to the original source location of the TFRecords, skipping the additional export step.

    -
    -
    Args:
    -
    --- - - - - - - - -
    train_fn:TensorFlow “main” function for training.
    tf_args:Arguments specific to the TensorFlow “main” function.
    export_fn:TensorFlow function for exporting a saved_model.
    -
    -
    -
    -
    -export_fn = None
    -
    - -
    -
    -train_fn = None
    -
    - -
    - -
    -
    -class TFModel(tf_args)[source]
    -

    Bases: pyspark.ml.base.Model, tensorflowonspark.pipeline.TFParams, tensorflowonspark.pipeline.HasInputMapping, tensorflowonspark.pipeline.HasOutputMapping, tensorflowonspark.pipeline.HasBatchSize, tensorflowonspark.pipeline.HasModelDir, tensorflowonspark.pipeline.HasExportDir, tensorflowonspark.pipeline.HasSignatureDefKey, tensorflowonspark.pipeline.HasTagSet

    -

    Spark ML Model backed by a TensorFlow model checkpoint/saved_model on disk.

    -

During transform(), each executor will run an independent, single-node instance of TensorFlow in parallel, so the model must fit in memory. The model/session will be loaded/initialized just once for each Spark Python worker, and the session will be cached for subsequent tasks/partitions to avoid re-loading the model for each partition.

    -
    -
    Args:
    -
    --- - - - -
    tf_args:Dictionary of arguments specific to TensorFlow “main” function.
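A hedged sketch of configuring the fitted model for inferencing (tensor aliases, column names, tag/signature values, and paths are hypothetical):

    model.setInputMapping({'image': 'x'})                   # DataFrame column -> input tensor
    model.setOutputMapping({'prediction': 'predictions'})   # output tensor -> DataFrame column
    model.setExportDir("hdfs:///user/me/export")
    model.setTagSet("serve")
    model.setSignatureDefKey("serving_default")
    model.setBatchSize(100)
    preds_df = model.transform(test_df)                     # one single-node TF instance per executor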
    -
    -
    -
    - -
    -
    -class TFParams[source]
    -

    Bases: pyspark.ml.param.Params

    -

    Mix-in class to store namespace-style args and merge w/ SparkML-style params.

    -
    -
    -args = None
    -
    - -
    -
    -merge_args_params()[source]
    -
    - -
    - -
    -
    -class TFTypeConverters[source]
    -

    Bases: object

    -

    Custom DataFrame TypeConverter for dictionary types (since this is not provided by Spark core).

    -
    -
    -static toDict(value)[source]
    -
    - -
    - -
    -
    -get_meta_graph_def(saved_model_dir, tag_set)[source]
    -

    Utility function to read a meta_graph_def from disk.

    -

    From saved_model_cli.py

    -
    -
    Args:
    -
    --- - - - - - - -
    saved_model_dir:
     path to saved_model.
    tag_set:list of string tags identifying the TensorFlow graph within the saved_model.
    -
    -
    Returns:
    -
    A TensorFlow meta_graph_def, or raises an Exception otherwise.
    -
    -
    - -
    -
    -single_node_env(args)[source]
    -

    Sets up environment for a single-node TF session.

    -
    -
    Args:
    -
    --- - - - - - -
    args:command line arguments as argparse args.
    argv:command line arguments as ARGV (array of string).
    -
    -
    -
    - -
    -
    -yield_batch(iterable, batch_size, num_tensors=1)[source]
    -

    Generator that yields batches of a DataFrame iterator.

    -
    -
    Args:
    -
    --- - - - - - - - -
    iterable:Spark partition iterator.
    batch_size:number of items to retrieve per invocation.
    num_tensors:number of tensors (columns) expected in each item.
    -
    -
    Returns:
    -
    An array of num_tensors arrays, each of length batch_size
    -
    -
    - +
    +

    tensorflowonspark.pipeline module

    @@ -609,12 +68,14 @@

    This Page

    @@ -636,13 +97,13 @@

    Navigation

  • previous |
  • - + \ No newline at end of file diff --git a/docs/tensorflowonspark.reservation.html b/docs/tensorflowonspark.reservation.html index 80d8e145..6e28df54 100644 --- a/docs/tensorflowonspark.reservation.html +++ b/docs/tensorflowonspark.reservation.html @@ -4,20 +4,12 @@ + - tensorflowonspark.reservation module — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.reservation module — TensorFlowOnSpark 1.2.1 documentation - + @@ -25,8 +17,7 @@ - - + @@ -58,7 +49,7 @@

    Navigation

    class Client(server_addr)[source]
    -

    Bases: tensorflowonspark.reservation.MessageSocket

    +

    Bases: tensorflowonspark.reservation.MessageSocket

    Client to register and await node reservations.

    Args:
    @@ -93,7 +84,7 @@

    Navigation

    register(reservation)[source]
    -

    Register reservation with server.

    +

    Register reservation with server.

    @@ -119,18 +110,18 @@

    Navigation

    class MessageSocket[source]
    -

    Bases: object

    +

    Bases: object

    Abstract class w/ length-prefixed socket send/receive functions.

    receive(sock)[source]
    -

    Receive a message on sock.

    +

    Receive a message on sock.

    send(sock, msg)[source]
    -

    Send msg to destination sock.

    +

    Send msg to destination sock.

    @@ -138,7 +129,8 @@

    Navigation

    class Reservations(required)[source]
    -

    Thread-safe store for node reservations.

    +

    Bases: object

    +

    Thread-safe store for node reservations.

    Args:
    @@ -172,7 +164,7 @@

    Navigation

    done()[source]
    -

    Returns True if the required number of reservations have been fulfilled.

    +

    Returns True if the required number of reservations have been fulfilled.

    @@ -192,7 +184,7 @@

    Navigation

    class Server(count)[source]
    -

    Bases: tensorflowonspark.reservation.MessageSocket

    +

    Bases: tensorflowonspark.reservation.MessageSocket

    Simple socket server with length-prefixed pickle messages.

    Args:
    @@ -208,7 +200,7 @@

    Navigation

    -await_reservations()[source]
    +await_reservations(sc, status={}, timeout=600)[source]

    Block until all reservations are received.
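A hedged sketch of the reservation handshake as driven from the Spark driver (num_executors and tf_status mirror how TFCluster uses this API; the assumption that start() returns the server’s listening address is not confirmed by this page):

    from tensorflowonspark import reservation

    tf_status = {}                                   # background TF threads may record errors here
    server = reservation.Server(num_executors)
    server_addr = server.start()                     # assumed: returns the (host, port) the server listens on
    # each executor registers via: reservation.Client(server_addr).register({...})
    cluster_info = server.await_reservations(sc, tf_status, timeout=600)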

    @@ -265,12 +257,14 @@

    This Page

    @@ -292,13 +286,13 @@

    Navigation

  • previous |
  • - + \ No newline at end of file diff --git a/docs/tensorflowonspark.reservation_client.html b/docs/tensorflowonspark.reservation_client.html index c8aebdef..d9d87f6c 100644 --- a/docs/tensorflowonspark.reservation_client.html +++ b/docs/tensorflowonspark.reservation_client.html @@ -4,20 +4,12 @@ + - tensorflowonspark.reservation_client module — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.reservation_client module — TensorFlowOnSpark 1.2.1 documentation - + @@ -25,8 +17,7 @@ - - + @@ -52,10 +43,8 @@

    Navigation

    -
    -

    tensorflowonspark.reservation_client module

    -

Simple utility to shut down a Spark StreamingContext by signaling the reservation Server. Note: use the reservation server address (host, port) reported in the driver logs.

    +
    +

    tensorflowonspark.reservation_client module

    @@ -79,12 +68,14 @@

    This Page

    @@ -106,13 +97,13 @@

    Navigation

  • previous |
  • - +
    \ No newline at end of file diff --git a/docs/tensorflowonspark.util.html b/docs/tensorflowonspark.util.html index 0b4d0d28..fb2895af 100644 --- a/docs/tensorflowonspark.util.html +++ b/docs/tensorflowonspark.util.html @@ -4,28 +4,19 @@ + - tensorflowonspark.util module — TensorFlowOnSpark 1.2.0 documentation + tensorflowonspark.util module — TensorFlowOnSpark 1.2.1 documentation - + - - + @@ -82,12 +73,14 @@

    This Page

    @@ -106,13 +99,13 @@

    Navigation

  • previous |
  • - + \ No newline at end of file diff --git a/examples/cifar10/README.md b/examples/cifar10/README.md index 01bff20d..370f5848 100644 --- a/examples/cifar10/README.md +++ b/examples/cifar10/README.md @@ -18,7 +18,6 @@ Also, you will need to download the CIFAR-10 dataset per the [original example]( # set environment variables (if not already done) export PYTHON_ROOT=~/Python export PYSPARK_PYTHON=${PYTHON_ROOT}/bin/python - export SPARK_YARN_USER_ENV="PYSPARK_PYTHON=Python/bin/python" export PATH=${PYTHON_ROOT}/bin/:$PATH export QUEUE=gpu export CIFAR10_DATA= diff --git a/examples/imagenet/README.md b/examples/imagenet/README.md index d82943ae..58acf72d 100644 --- a/examples/imagenet/README.md +++ b/examples/imagenet/README.md @@ -19,7 +19,6 @@ Also, you will need to [download the Imagenet dataset per the original example]( # set environment variables (if not already done) export PYTHON_ROOT=~/Python export PYSPARK_PYTHON=${PYTHON_ROOT}/bin/python - export SPARK_YARN_USER_ENV="PYSPARK_PYTHON=Python/bin/python" export PATH=${PYTHON_ROOT}/bin/:$PATH export QUEUE=gpu export IMAGENET_DATA= diff --git a/examples/imagenet/inception/imagenet_distributed_train_pipeline.py b/examples/imagenet/inception/imagenet_distributed_train_pipeline.py index 324f11fb..9f0adb9b 100644 --- a/examples/imagenet/inception/imagenet_distributed_train_pipeline.py +++ b/examples/imagenet/inception/imagenet_distributed_train_pipeline.py @@ -13,7 +13,7 @@ from tensorflowonspark.pipeline import TFEstimator from datetime import datetime -import inception_export +from inception import inception_export import sys import tensorflow as tf diff --git a/examples/imagenet/inception/inception_export.py b/examples/imagenet/inception/inception_export.py index 3b038bf8..fcca80b2 100644 --- a/examples/imagenet/inception/inception_export.py +++ b/examples/imagenet/inception/inception_export.py @@ -27,7 +27,7 @@ tf.app.flags.DEFINE_string('subset', 'validation', """Either 'validation' or 'train'.""") -def export(args): +def export(_): FLAGS = tf.app.flags.FLAGS """Evaluate model on Dataset for a number of steps.""" @@ -99,7 +99,7 @@ def preprocess_image(image_buffer): print('Successfully loaded model from %s at step=%s.' 
% (ckpt.model_checkpoint_path, global_step)) - print("Exporting saved_model to: {}".format(args.export_dir)) + print("Exporting saved_model to: {}".format(FLAGS.export_dir)) # exported signatures defined in code signatures = { tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: { @@ -109,7 +109,7 @@ def preprocess_image(image_buffer): } } TFNode.export_saved_model(sess, - args.export_dir, + FLAGS.export_dir, tf.saved_model.tag_constants.SERVING, signatures) print("Exported saved_model") diff --git a/examples/slim/README.md b/examples/slim/README.md index b700e8ae..ec4815d0 100644 --- a/examples/slim/README.md +++ b/examples/slim/README.md @@ -17,7 +17,6 @@ And, you will need to [download an image dataset](https://github.com/tensorflow/ # set environment variables (if not already done) export PYTHON_ROOT=~/Python export PYSPARK_PYTHON=${PYTHON_ROOT}/bin/python - export SPARK_YARN_USER_ENV="PYSPARK_PYTHON=Python/bin/python" export PATH=${PYTHON_ROOT}/bin/:$PATH export QUEUE=gpu export DATASET_DIR= @@ -63,7 +62,6 @@ And, you will need to [download an image dataset](https://github.com/tensorflow/ --conf spark.dynamicAllocation.enabled=false \ --conf spark.yarn.maxAppAttempts=1 \ --conf spark.ui.view.acls=* \ - --conf spark.task.maxFailures=1 \ --archives hdfs:///user/${USER}/Python.zip#Python \ --conf spark.executorEnv.LD_LIBRARY_PATH="/usr/local/cuda-7.5/lib64:$JAVA_HOME/jre/lib/amd64/server" \ --driver-library-path="/usr/local/cuda-7.5/lib64" \ diff --git a/setup.py b/setup.py index 3ce5a86b..e373784e 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name = 'tensorflowonspark', packages = ['tensorflowonspark'], - version = '1.2.0', + version = '1.2.1', description = 'Deep learning with TensorFlow on Apache Spark clusters', author = 'Yahoo, Inc.', url = 'https://github.com/yahoo/TensorFlowOnSpark', diff --git a/tensorflowonspark/TFCluster.py b/tensorflowonspark/TFCluster.py index 382da4c6..102fa96f 100644 --- a/tensorflowonspark/TFCluster.py +++ b/tensorflowonspark/TFCluster.py @@ -25,6 +25,7 @@ import logging import os import random +import sys import threading import time from pyspark.streaming import DStream @@ -32,6 +33,9 @@ from . import TFManager from . import TFSparkNode +# status of TF background job +tf_status = {} + class InputMode(object): """Enum for the input modes of data feeding.""" TENSORFLOW = 0 #: TensorFlow application is responsible for reading any data. @@ -158,8 +162,15 @@ def shutdown(self, ssc=None): workerRDD = self.sc.parallelize(range(workers), workers) workerRDD.foreachPartition(TFSparkNode.shutdown(self.cluster_info, self.queues)) + # exit Spark application w/ err status if TF job had any errors + if 'error' in tf_status: + logging.error("Exiting Spark application with error status.") + self.sc.cancelAllJobs() + self.sc.stop() + sys.exit(1) + logging.info("Shutting down cluster") - # shutdown queues and manageres for "PS" executors. + # shutdown queues and managers for "PS" executors. # note: we have to connect/shutdown from the spark driver, because these executors are "busy" and won't accept any other tasks. 
for node in ps_list: addr = node['addr'] @@ -187,7 +198,7 @@ def tensorboard_url(self): return tb_url def run(sc, map_fun, tf_args, num_executors, num_ps, tensorboard=False, input_mode=InputMode.TENSORFLOW, - log_dir=None, driver_ps_nodes=False, queues=['input', 'output']): + log_dir=None, driver_ps_nodes=False, reservation_timeout=600, queues=['input', 'output', 'error']): """Starts the TensorFlowOnSpark cluster and Runs the TensorFlow "main" function on the Spark executors Args: @@ -200,6 +211,7 @@ def run(sc, map_fun, tf_args, num_executors, num_ps, tensorboard=False, input_mo :input_mode: TFCluster.InputMode :log_dir: directory to save tensorboard event logs. If None, defaults to a fixed path on local filesystem. :driver_ps_nodes: run the PS nodes on the driver locally instead of on the spark executors; this help maximizing computing resources (esp. GPU). You will need to set cluster_size = num_executors + num_ps + :reservation_timeout: number of seconds after which cluster reservation times out (600 sec default) :queues: *INTERNAL_USE* Returns: @@ -261,20 +273,28 @@ def _start_ps(node_index): ps_thread.start() # start TF on a background thread (on Spark driver) to allow for feeding job - def _start(): - nodeRDD.foreachPartition(TFSparkNode.run(map_fun, - tf_args, - cluster_meta, - tensorboard, - log_dir, - queues, - background=(input_mode == InputMode.SPARK))) - t = threading.Thread(target=_start) + def _start(status): + try: + nodeRDD.foreachPartition(TFSparkNode.run(map_fun, + tf_args, + cluster_meta, + tensorboard, + log_dir, + queues, + background=(input_mode == InputMode.SPARK))) + except Exception as e: + logging.error("Exception in TF background thread") + status['error'] = str(e) + + t = threading.Thread(target=_start, args=(tf_status,)) + # run as daemon thread so that in spark mode main thread can exit + # if feeder spark stage fails and main thread can't do explicit shutdown + t.daemon = True t.start() # wait for executors to register and start TFNodes before continuing logging.info("Waiting for TFSparkNodes to start") - cluster_info = server.await_reservations() + cluster_info = server.await_reservations(sc, tf_status, reservation_timeout) logging.info("All TFSparkNodes started") # print cluster_info and extract TensorBoard URL diff --git a/tensorflowonspark/TFManager.py b/tensorflowonspark/TFManager.py index 0a3277f3..6cc21154 100644 --- a/tensorflowonspark/TFManager.py +++ b/tensorflowonspark/TFManager.py @@ -26,6 +26,12 @@ def _get(key): def _set(key, value): kdict[key] = value +def _get_queue(qname): + try: + return qdict[qname] + except KeyError: + return None + def start(authkey, queues, mode='local'): """Create a new multiprocess.Manager (or return existing one). 
@@ -42,7 +48,8 @@ def start(authkey, queues, mode='local'): kdict.clear() for q in queues: qdict[q] = JoinableQueue() - TFManager.register('get_queue', callable=lambda qname: qdict[qname]) + + TFManager.register('get_queue', callable=lambda qname: _get_queue(qname)) TFManager.register('get', callable=lambda key: _get(key)) TFManager.register('set', callable=lambda key, value: _set(key, value)) if mode == 'remote': diff --git a/tensorflowonspark/TFSparkNode.py b/tensorflowonspark/TFSparkNode.py index f0d3239d..b79316dc 100644 --- a/tensorflowonspark/TFSparkNode.py +++ b/tensorflowonspark/TFSparkNode.py @@ -9,18 +9,21 @@ from __future__ import print_function import logging +import multiprocessing import os -import sys import platform import socket import subprocess -import multiprocessing +import sys import uuid +import time +import traceback +from threading import Thread from . import TFManager from . import TFNode -from . import reservation from . import marker +from . import reservation from . import util class TFNodeContext: @@ -97,6 +100,14 @@ def _get_manager(cluster_info, host, ppid): authkey = node['authkey'] TFSparkNode.mgr = TFManager.connect(addr,authkey) break + + if TFSparkNode.mgr is None: + msg = "No TFManager found on this node, please ensure that:\n" + \ + "1. Spark num_executors matches TensorFlow cluster_size\n" + \ + "2. Spark cores/tasks per executor is 1.\n" + \ + "3. Spark dynamic allocation is disabled." + raise Exception(msg) + logging.info("Connected to TFSparkNode.mgr on {0}, ppid={1}, state={2}".format(host, ppid, str(TFSparkNode.mgr.get('state')))) return TFSparkNode.mgr @@ -152,7 +163,7 @@ def _mapfn(iter): addr = None if job_name == 'ps': # PS nodes must be remotely accessible in order to shutdown from Spark driver. - TFSparkNode.mgr = TFManager.start(authkey, ['control'], 'remote') + TFSparkNode.mgr = TFManager.start(authkey, ['control', 'error'], 'remote') addr = (host, TFSparkNode.mgr.address[1]) else: # worker nodes only need to be locally accessible within the executor for data feeding @@ -238,7 +249,11 @@ def _mapfn(iter): # construct a TensorFlow clusterspec from cluster_info sorted_cluster_info = sorted(cluster_info, key=lambda k: k['worker_num']) spec = {} + last_worker_num = -1 for node in sorted_cluster_info: + if (node['worker_num'] == last_worker_num): + raise Exception("Duplicate worker/task in cluster_info") + last_worker_num = node['worker_num'] logging.info("node: {0}".format(node)) (njob, nhost, nport) = (node['job_name'], node['host'], node['port']) hosts = [] if njob not in spec else spec[njob] @@ -268,11 +283,21 @@ def wrapper_fn(args, context): sys.argv = args fn(args, context) + def wrapper_fn_background(args, context): + """Wrapper function that signals exceptions to foreground process.""" + errq = TFSparkNode.mgr.get_queue('error') + try: + wrapper_fn(args, context) + except Exception: + errq.put(traceback.format_exc()) + errq.join() + if job_name == 'ps' or background: # invoke the TensorFlow main function in a background thread logging.info("Starting TensorFlow {0}:{1} as {2} on cluster node {3} on background process".format( job_name, task_index, job_name, worker_num)) - p = multiprocessing.Process(target=wrapper_fn, args=(tf_args, ctx)) + + p = multiprocessing.Process(target=wrapper_fn_background, args=(tf_args, ctx)) if job_name == 'ps': p.daemon = True p.start() @@ -280,8 +305,15 @@ def wrapper_fn(args, context): # for ps nodes only, wait indefinitely in foreground thread for a "control" event (None == "stop") if job_name == 'ps': 
queue = TFSparkNode.mgr.get_queue('control') + equeue = TFSparkNode.mgr.get_queue('error') done = False while not done: + while (queue.empty() and equeue.empty()): + time.sleep(1) + if (not equeue.empty()): + e_str = equeue.get() + equeue.task_done() + raise Exception("exception in ps:\n" + e_str) msg = queue.get(block=True) logging.info("Got msg: {0}".format(msg)) if msg is None: @@ -311,7 +343,13 @@ def train(cluster_info, cluster_meta, qname='input'): def _train(iter): # get shared queue, reconnecting if necessary mgr = _get_manager(cluster_info, util.get_ip_address(), os.getppid()) - queue = mgr.get_queue(qname) + try: + queue = mgr.get_queue(qname) + equeue = mgr.get_queue('error') + except (AttributeError, KeyError): + msg = "Queue '{}' not found on this node, check for exceptions on other nodes.".format(qname) + raise Exception(msg) + state = str(mgr.get('state')) logging.info("mgr.state={0}".format(state)) terminating = state == "'terminating'" @@ -321,15 +359,23 @@ def _train(iter): for item in iter: count += 1 logging.info("Skipped {0} items from partition".format(count)) - else: logging.info("Feeding partition {0} into {1} queue {2}".format(iter, qname, queue)) count = 0 for item in iter: count += 1 queue.put(item, block=True) + # wait for consumers to finish processing all items in queue before "finishing" this iterator - queue.join() + joinThr = Thread(target=queue.join) + joinThr.start() + while (joinThr.isAlive()): + if (not equeue.empty()): + e_str = equeue.get() + equeue.task_done() + raise Exception("exception in worker:\n" + e_str) + time.sleep(1) +# queue.join() logging.info("Processed {0} items in partition".format(count)) # check if TF is terminating feed after this partition @@ -361,7 +407,12 @@ def inference(cluster_info, qname='input'): def _inference(iter): # get shared queue, reconnecting if necessary mgr = _get_manager(cluster_info, util.get_ip_address(), os.getppid()) - queue_in = mgr.get_queue(qname) + try: + queue_in = mgr.get_queue(qname) + equeue = mgr.get_queue('error') + except (AttributeError, KeyError): + msg = "Queue '{}' not found on this node, check for exceptions on other nodes.".format(qname) + raise Exception(msg) logging.info("Feeding partition {0} into {1} queue {2}".format(iter, qname, queue_in)) count = 0 @@ -377,7 +428,15 @@ def _inference(iter): return [] # wait for consumers to finish processing all items in queue before "finishing" this iterator - queue_in.join() + joinThr = Thread(target=queue_in.join) + joinThr.start() + while (joinThr.isAlive()): + if (not equeue.empty()): + e_str = equeue.get() + equeue.task_done() + raise Exception("exception in worker:\n" + e_str) + time.sleep(1) + logging.info("Processed {0} items in partition".format(count)) # read result queue @@ -422,9 +481,13 @@ def _shutdown(iter): # terminate any listening queues logging.info("Stopping all queues") for q in queues: - queue = mgr.get_queue(q) - logging.info("Feeding None into {0} queue".format(q)) - queue.put(None, block=True) + try: + queue = mgr.get_queue(q) + logging.info("Feeding None into {0} queue".format(q)) + queue.put(None, block=True) + except (AttributeError, KeyError): + msg = "Queue '{}' not found on this node, check for exceptions on other nodes.".format(q) + raise Exception(msg) logging.info("Setting mgr.state to 'stopped'") mgr.set('state', 'stopped') diff --git a/tensorflowonspark/pipeline.py b/tensorflowonspark/pipeline.py index 7846ec6a..1a4f5c9c 100755 --- a/tensorflowonspark/pipeline.py +++ b/tensorflowonspark/pipeline.py @@ -364,7 +364,9 
@@ class TFModel(Model, TFParams, def __init__(self, tf_args): super(TFModel, self).__init__() self.args = Namespace(tf_args) - self._setDefault(batch_size=100, + self._setDefault(input_mapping={}, + output_mapping={}, + batch_size=100, model_dir=None, export_dir=None, signature_def_key=None, @@ -378,11 +380,6 @@ def _transform(self, dataset): """ spark = SparkSession.builder.getOrCreate() - logging.info("===== 1. inference args: {0}".format(self.args)) - logging.info("===== 2. inference params: {0}".format(self._paramMap)) - local_args = self.merge_args_params() - logging.info("===== 3. inference args + params: {0}".format(local_args)) - # set a deterministic order for input/output columns (lexicographic by key) input_cols = [ col for col, tensor in sorted(self.getInputMapping().items()) ] # input col => input tensor output_cols = [ col for tensor, col in sorted(self.getOutputMapping().items()) ] # output tensor => output col @@ -391,8 +388,14 @@ def _transform(self, dataset): logging.info("input_cols: {}".format(input_cols)) logging.info("output_cols: {}".format(output_cols)) + # merge args + params + logging.info("===== 1. inference args: {0}".format(self.args)) + logging.info("===== 2. inference params: {0}".format(self._paramMap)) + local_args = self.merge_args_params() + logging.info("===== 3. inference args + params: {0}".format(local_args)) + tf_args = self.args.argv if self.args.argv else local_args - rdd_out = dataset.select(input_cols).rdd.mapPartitions(lambda it: _run_model(it, tf_args)) + rdd_out = dataset.select(input_cols).rdd.mapPartitions(lambda it: _run_model(it, local_args, tf_args)) # convert to a DataFrame-friendly format rows_out = rdd_out.map(lambda x: Row(*x)) @@ -403,17 +406,18 @@ def _transform(self, dataset): global_sess = None # tf.Session cache global_args = None # args provided to the _run_model() method. Any change will invalidate the global_sess cache. -def _run_model(iterator, args): +def _run_model(iterator, args, tf_args): """mapPartitions function to run single-node inferencing from a checkpoint/saved_model, using the model's input/output mappings. Args: :iterator: input RDD partition iterator. - :args: a merged view of command-line args and ML Params. + :args: arguments for TFModel, in argparse format + :tf_args: arguments for TensorFlow inferencing code, in argparse or ARGV format. Returns: An iterator of result data. """ - single_node_env(args) + single_node_env(tf_args) logging.info("===== input_mapping: {}".format(args.input_mapping)) logging.info("===== output_mapping: {}".format(args.output_mapping)) @@ -426,11 +430,11 @@ def _run_model(iterator, args): logging.info("===== loading meta_graph_def for tag_set ({0}) from saved_model: {1}".format(args.tag_set, args.export_dir)) meta_graph_def = get_meta_graph_def(args.export_dir, args.tag_set) signature = signature_def_utils.get_signature_def_by_key(meta_graph_def, args.signature_def_key) - logging.info("signature: {}".format(signature)) + logging.debug("signature: {}".format(signature)) inputs_tensor_info = signature.inputs - logging.info("inputs_tensor_info: {0}".format(inputs_tensor_info)) + logging.debug("inputs_tensor_info: {0}".format(inputs_tensor_info)) outputs_tensor_info = signature.outputs - logging.info("outputs_tensor_info: {0}".format(outputs_tensor_info)) + logging.debug("outputs_tensor_info: {0}".format(outputs_tensor_info)) result = [] @@ -489,10 +493,11 @@ def single_node_env(args): """Sets up environment for a single-node TF session. 
Args: - :args: command line arguments as argparse args. - :argv: command line arguments as ARGV (array of string). + :args: command line arguments as either argparse args or argv list """ - if args.argv: + if isinstance(args, list): + sys.argv = args + elif args.argv: sys.argv = args.argv # ensure expanded CLASSPATH w/o glob characters (required for Spark 2.1 + JNI) diff --git a/tensorflowonspark/reservation.py b/tensorflowonspark/reservation.py index b66e2ae8..78e4cc5e 100644 --- a/tensorflowonspark/reservation.py +++ b/tensorflowonspark/reservation.py @@ -13,6 +13,7 @@ import select import socket import struct +import sys import threading import time @@ -102,11 +103,20 @@ def __init__(self, count): assert count > 0 self.reservations = Reservations(count) - def await_reservations(self): + def await_reservations(self, sc, status={}, timeout=600): """Block until all reservations are received.""" + timespent = 0 while not self.reservations.done(): logging.info("waiting for {0} reservations".format(self.reservations.remaining())) + # check status flags for any errors + if 'error' in status: + sc.cancelAllJobs() + sc.stop() + sys.exit(1) time.sleep(1) + timespent += 1 + if (timespent > timeout): + raise Exception("timed out waiting for reservations to complete") logging.info("all reservations completed") return self.reservations.get()