From b3f0c8396c74146644e5fdcc445652f99ddbe5bf Mon Sep 17 00:00:00 2001
From: Yilei Cai
Date: Tue, 3 Jul 2018 15:07:00 -0700
Subject: [PATCH 1/2] add README for running the Dataset and Pipeline examples

---
 examples/mnist/tf/README.md | 106 ++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 examples/mnist/tf/README.md

diff --git a/examples/mnist/tf/README.md b/examples/mnist/tf/README.md
new file mode 100644
index 00000000..354848e3
--- /dev/null
+++ b/examples/mnist/tf/README.md
@@ -0,0 +1,106 @@
+## Running distributed MNIST training / inference
+
+### _using Dataset_
+```bash
+# for CPU mode:
+# export QUEUE=default
+# remove references to $LIB_CUDA
+
+# hdfs dfs -rm -r mnist_model
+# hdfs dfs -rm -r predictions
+
+${SPARK_HOME}/bin/spark-submit \
+--master yarn \
+--deploy-mode cluster \
+--queue ${QUEUE} \
+--num-executors 4 \
+--executor-memory 27G \
+--py-files TensorFlowOnSpark/tfspark.zip,TensorFlowOnSpark/examples/mnist/tf/mnist_dist_dataset.py \
+--conf spark.dynamicAllocation.enabled=false \
+--conf spark.yarn.maxAppAttempts=1 \
+--archives hdfs:///user/${USER}/Python.zip#Python \
+--conf spark.executorEnv.LD_LIBRARY_PATH=$LIB_CUDA:$LIB_JVM:$LIB_HDFS \
+--driver-library-path=$LIB_CUDA \
+TensorFlowOnSpark/examples/mnist/tf/mnist_spark_dataset.py \
+--images_labels mnist/csv2/train \
+--format csv2 \
+--mode train \
+--model mnist_model
+
+# to use inference mode, change `--mode train` to `--mode inference` and add `--output predictions`
+# one item in csv2 format is `image | label`; to use input data in TFRecord format, change `--format csv2` to `--format tfr`
+# to use infiniband, add `--rdma`
+```
+
+### _using QueueRunners_
+```bash
+# for CPU mode:
+# export QUEUE=default
+# remove references to $LIB_CUDA
+
+# hdfs dfs -rm -r mnist_model
+# hdfs dfs -rm -r predictions
+
+${SPARK_HOME}/bin/spark-submit \
+--master yarn \
+--deploy-mode cluster \
+--queue ${QUEUE} \
+--num-executors 4 \
+--executor-memory 27G \
+--py-files TensorFlowOnSpark/tfspark.zip,TensorFlowOnSpark/examples/mnist/tf/mnist_dist.py \
+--conf spark.dynamicAllocation.enabled=false \
+--conf spark.yarn.maxAppAttempts=1 \
+--archives hdfs:///user/${USER}/Python.zip#Python \
+--conf spark.executorEnv.LD_LIBRARY_PATH=$LIB_CUDA:$LIB_JVM:$LIB_HDFS \
+--driver-library-path=$LIB_CUDA \
+TensorFlowOnSpark/examples/mnist/tf/mnist_spark.py \
+--images mnist/csv/train/images \
+--labels mnist/csv/train/labels \
+--format csv \
+--mode train \
+--model mnist_model
+
+# to use inference mode, change `--mode train` to `--mode inference` and add `--output predictions`
+# to use input data in TFRecord format, change `--format csv` to `--format tfr`
+# to use infiniband, add `--rdma`
+```
+
+## Running distributed MNIST training and inference utilizing Spark ML Pipeline
+```bash
+# for CPU mode:
+# export QUEUE=default
+# remove references to $LIB_CUDA
+
+# hdfs dfs -rm -r mnist_model
+# hdfs dfs -rm -r mnist_export
+# hdfs dfs -rm -r tfrecords
+# hdfs dfs -rm -r predictions
+
+${SPARK_HOME}/bin/spark-submit \
+--master yarn \
+--deploy-mode cluster \
+--queue ${QUEUE} \
+--num-executors 4 \
+--executor-memory 27G \
+--jars hdfs:///user/${USER}/tensorflow-hadoop-1.0-SNAPSHOT.jar \
+--py-files TensorFlowOnSpark/tfspark.zip,TensorFlowOnSpark/examples/mnist/tf/mnist_dist_pipeline.py \
+--conf spark.dynamicAllocation.enabled=false \
+--conf spark.yarn.maxAppAttempts=1 \
+--archives hdfs:///user/${USER}/Python.zip#Python \
+--conf spark.executorEnv.LD_LIBRARY_PATH=$LIB_CUDA:$LIB_JVM:$LIB_HDFS \
+--driver-library-path=$LIB_CUDA \
+TensorFlowOnSpark/examples/mnist/tf/mnist_spark_pipeline.py \
+--images mnist/csv/train/images \
+--labels mnist/csv/train/labels \
+--tfrecord_dir tfrecords \
+--format csv \
+--model_dir mnist_model \
+--export_dir mnist_export \
+--train \
+--inference_mode signature \
+--inference_output predictions
+
+# to use input data in TFRecord format, change `--format csv` to `--format tfr`
+# tensorflow-hadoop-1.0-SNAPSHOT.jar is needed to convert the csv input into TFRecords
+# `--tfrecord_dir` is needed to temporarily save the DataFrame as TFRecords on HDFS
+```

From ce9b3ea2de2e1b5f22b5f544208f208d6f189ec3 Mon Sep 17 00:00:00 2001
From: yileic
Date: Thu, 5 Jul 2018 13:48:51 -0700
Subject: [PATCH 2/2] update heading level

---
 examples/mnist/tf/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/mnist/tf/README.md b/examples/mnist/tf/README.md
index 354848e3..39833696 100644
--- a/examples/mnist/tf/README.md
+++ b/examples/mnist/tf/README.md
@@ -65,7 +65,7 @@ TensorFlowOnSpark/examples/mnist/tf/mnist_spark.py \
 # to use infiniband, add `--rdma`
 ```
 
-## Running distributed MNIST training and inference utilizing Spark ML Pipeline
+### _using Spark ML Pipeline_
 ```bash
 # for CPU mode:
 # export QUEUE=default
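All of the `spark-submit` commands in the README above assume that `SPARK_HOME`, `QUEUE`, `LIB_CUDA`, `LIB_JVM`, and `LIB_HDFS` are already exported, as in the TensorFlowOnSpark YARN setup instructions. A minimal sketch of that setup follows; every path shown is an example and will differ per cluster.

```bash
# example environment for the commands above; all paths are illustrative and cluster-specific
export SPARK_HOME=/opt/spark                       # Spark installation directory
export QUEUE=gpu                                   # YARN queue (use "default" for CPU mode)
export LIB_CUDA=/usr/local/cuda/lib64              # CUDA libraries, GPU mode only
export LIB_JVM=${JAVA_HOME}/jre/lib/amd64/server   # directory containing libjvm.so
export LIB_HDFS=/opt/cloudera/parcels/CDH/lib64    # directory containing libhdfs.so
```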