From f6253da13ef42e5c8395c6bdb05f7b1f3f9efc3f Mon Sep 17 00:00:00 2001
From: Lee Yang
Date: Thu, 20 Sep 2018 13:43:44 -0700
Subject: [PATCH] matrix builds for python + scala; bintray integration; fix mnist/spark example

---
 .travis.settings.xml                |  9 +++
 .travis.yml                         | 98 ++++++++++++++++++-----------
 examples/mnist/spark/mnist_dist.py  |  1 +
 examples/mnist/spark/mnist_spark.py | 22 +++----
 pom.xml                             | 20 ++++--
 5 files changed, 99 insertions(+), 51 deletions(-)
 create mode 100644 .travis.settings.xml

diff --git a/.travis.settings.xml b/.travis.settings.xml
new file mode 100644
index 00000000..eccc2b50
--- /dev/null
+++ b/.travis.settings.xml
@@ -0,0 +1,9 @@
+<settings>
+  <servers>
+    <server>
+      <id>bintray-tensorflowonspark-repo</id>
+      <username>${env.BINTRAY_USER}</username>
+      <password>${env.BINTRAY_API_KEY}</password>
+    </server>
+  </servers>
+</settings>
diff --git a/.travis.yml b/.travis.yml
index 54969c89..b7248a54 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,39 +1,65 @@
-language: python
-python:
-  - 2.7
-  - 3.6
-cache: pip
-before_install:
-  - curl -LO http://www-us.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
-  - export SPARK_HOME=./spark
-  - mkdir $SPARK_HOME
-  - tar -xf spark-2.3.1-bin-hadoop2.7.tgz -C $SPARK_HOME --strip-components=1
-  - export PATH=$SPARK_HOME/bin:$PATH
-  - export SPARK_LOCAL_IP=127.0.0.1
-  - export SPARK_CLASSPATH=./lib/tensorflow-hadoop-1.0-SNAPSHOT.jar
-  - export PYTHONPATH=$(pwd)
-install:
-  - pip install -r requirements.txt
-script:
-  - sphinx-build -b html docs/source docs/build/html
-  - test/run_tests.sh
+matrix:
+  include:
+    - language: python
+      python: 2.7
+      before_install:
+        - curl -LO http://www-us.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
+        - export SPARK_HOME=./spark
+        - mkdir $SPARK_HOME
+        - tar -xf spark-2.3.1-bin-hadoop2.7.tgz -C $SPARK_HOME --strip-components=1
+        - export PATH=$SPARK_HOME/bin:$PATH
+        - export SPARK_LOCAL_IP=127.0.0.1
+        - export SPARK_CLASSPATH=./lib/tensorflow-hadoop-1.0-SNAPSHOT.jar
+        - export PYTHONPATH=$(pwd)
+      install:
+        - pip install -r requirements.txt
+      script:
+        - test/run_tests.sh
+    - language: python
+      python: 3.6
+      before_install:
+        - curl -LO http://www-us.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
+        - export SPARK_HOME=./spark
+        - mkdir $SPARK_HOME
+        - tar -xf spark-2.3.1-bin-hadoop2.7.tgz -C $SPARK_HOME --strip-components=1
+        - export PATH=$SPARK_HOME/bin:$PATH
+        - export SPARK_LOCAL_IP=127.0.0.1
+        - export SPARK_CLASSPATH=./lib/tensorflow-hadoop-1.0-SNAPSHOT.jar
+        - export PYTHONPATH=$(pwd)
+      install:
+        - pip install -r requirements.txt
+      script:
+        - sphinx-build -b html docs/source docs/build/html
+        - test/run_tests.sh
+    - language: java
+      jdk: oraclejdk8
 notifications:
   email: false
 deploy:
-  - provider: pages
-    skip_cleanup: true
-    github_token: $GITHUB_TOKEN
-    local_dir: docs/build/html
-    on:
-      branch: master
-      python: 3.6
-      tags: true
-  - provider: pypi
-    user: leewyang
-    password:
-      secure: T2Q8VM6SgcMtJDO2kJbaELE/5ICR5mx8pkM6TyNAJZ2Mr3fLIy6iDfPKunBAYVljl+SDEWmuoPTWqJdqMyo47LBKPKtBHbGzATqGSRTLvxLOYNSXUX+uCpPtr7CMp1eP3xpZ3YbAJZvoEFlWnBQKeBtX/PjNCpmKdp7ir+46CvR/pR1tcM5cFnSgU+uCPAMUt8KTZIxeRo+oJtaE0DM2RxLJ9nGnaRNz9fdXxwhViNj/bMnDRUI0G6k+Iy4sO2669si8nhTDr+Oq66ONUcJtAQymNUM/hzBTCkrJvuIq1TqTlKkA39UrtD5/wCkCqPUbCLVuIfNwkYfW2C8AlXcbphBKN4PhwaoL5XECr3/AOsgNpnPWhCF1Z1uLi58FhIlSyp+5c/x2wVJLZi2IE+c996An7UO3t16ZFpFEgzS6m9PVbi6Qil6Tl4AhV5QLKb0Qn0hLe2l0WixzK9KLMHfkqX8h5ZGC7i0TvCNcU2uIFjY8we91GORZKZhwUVDKbPqiUZIKn64Qq8EwJIsk/S344OrUTzm7z0lFCqtPphg1duU42QOFmaYWi6hgsbtDxN6+CubLw23G3PtKjOpNt8hHnrjZsz9H1MKbSAoYQ4fo+Iwb3owTjXnSTBr94StW7qysggWH6xQimFDh/SKOE9MfroMGt5YTXfduTbqyeameYqE=
-    distributions: sdist bdist_wheel
-    on:
-      branch: master
-      python: 3.6
-      tags: true
+- provider: pages
+  skip_cleanup: true
+  github_token: "$GITHUB_TOKEN"
+  local_dir: docs/build/html
+  on:
+    branch: master
+    python: 3.6
+    tags: true
+    condition: "$TRAVIS_TAG =~ ^v.*$"
+- provider: pypi
+  user: leewyang
+  password:
+    secure: T2Q8VM6SgcMtJDO2kJbaELE/5ICR5mx8pkM6TyNAJZ2Mr3fLIy6iDfPKunBAYVljl+SDEWmuoPTWqJdqMyo47LBKPKtBHbGzATqGSRTLvxLOYNSXUX+uCpPtr7CMp1eP3xpZ3YbAJZvoEFlWnBQKeBtX/PjNCpmKdp7ir+46CvR/pR1tcM5cFnSgU+uCPAMUt8KTZIxeRo+oJtaE0DM2RxLJ9nGnaRNz9fdXxwhViNj/bMnDRUI0G6k+Iy4sO2669si8nhTDr+Oq66ONUcJtAQymNUM/hzBTCkrJvuIq1TqTlKkA39UrtD5/wCkCqPUbCLVuIfNwkYfW2C8AlXcbphBKN4PhwaoL5XECr3/AOsgNpnPWhCF1Z1uLi58FhIlSyp+5c/x2wVJLZi2IE+c996An7UO3t16ZFpFEgzS6m9PVbi6Qil6Tl4AhV5QLKb0Qn0hLe2l0WixzK9KLMHfkqX8h5ZGC7i0TvCNcU2uIFjY8we91GORZKZhwUVDKbPqiUZIKn64Qq8EwJIsk/S344OrUTzm7z0lFCqtPphg1duU42QOFmaYWi6hgsbtDxN6+CubLw23G3PtKjOpNt8hHnrjZsz9H1MKbSAoYQ4fo+Iwb3owTjXnSTBr94StW7qysggWH6xQimFDh/SKOE9MfroMGt5YTXfduTbqyeameYqE=
+  distributions: sdist bdist_wheel
+  on:
+    branch: master
+    python: 3.6
+    tags: true
+    condition: "$TRAVIS_TAG =~ ^v.*$"
+- provider: script
+  script: mvn deploy -DskipTests --settings .travis.settings.xml
+  skip_cleanup: true
+  on:
+    branch: master
+    jdk: oraclejdk8
+    tags: true
+    condition: "$TRAVIS_TAG =~ ^scala_.*$"
diff --git a/examples/mnist/spark/mnist_dist.py b/examples/mnist/spark/mnist_dist.py
index 85dd51cf..d94930f2 100755
--- a/examples/mnist/spark/mnist_dist.py
+++ b/examples/mnist/spark/mnist_dist.py
@@ -138,6 +138,7 @@ def feed_dict(batch):
     with tf.train.MonitoredTrainingSession(master=server.target,
                                            is_chief=(task_index == 0),
                                            checkpoint_dir=logdir,
+                                           save_checkpoint_secs=10,
                                            hooks=[tf.train.StopAtStepHook(last_step=args.steps)],
                                            chief_only_hooks=[ExportHook(ctx.absolute_path(args.export_dir), x, prediction)]) as mon_sess:
       step = 0
diff --git a/examples/mnist/spark/mnist_spark.py b/examples/mnist/spark/mnist_spark.py
index 9c6a4415..c2757e1f 100755
--- a/examples/mnist/spark/mnist_spark.py
+++ b/examples/mnist/spark/mnist_spark.py
@@ -26,7 +26,7 @@
 parser.add_argument("--batch_size", help="number of records per batch", type=int, default=100)
 parser.add_argument("--epochs", help="number of epochs", type=int, default=1)
 parser.add_argument("--export_dir", help="HDFS path to export saved_model", default="mnist_export")
-parser.add_argument("--format", help="example format: (csv|pickle|tfr)", choices=["csv", "pickle", "tfr"], default="csv")
+parser.add_argument("--format", help="example format: (csv|tfr)", choices=["csv", "tfr"], default="csv")
 parser.add_argument("--images", help="HDFS path to MNIST images in parallelized format")
 parser.add_argument("--labels", help="HDFS path to MNIST labels in parallelized format")
 parser.add_argument("--model", help="HDFS path to save/load model during train/inference", default="mnist_model")
@@ -56,22 +56,22 @@ def toNumpy(bytestr):
     return (image, label)
 
   dataRDD = images.map(lambda x: toNumpy(bytes(x[0])))
-else:
-  if args.format == "csv":
-    images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])
-    labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
-  else:  # args.format == "pickle":
-    images = sc.pickleFile(args.images)
-    labels = sc.pickleFile(args.labels)
+else:  # "csv"
   print("zipping images and labels")
+  # If partitions of images/labels don't match, you can use the following code:
+  # images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')]).zipWithIndex().map(lambda x: (x[1], x[0]))
+  # labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')]).zipWithIndex().map(lambda x: (x[1], x[0]))
+  # dataRDD = images.join(labels).map(lambda x: (x[1][0], x[1][1]))
+  images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])
+  labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
   dataRDD = images.zip(labels)
 
 cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK, log_dir=args.model)
 
 if args.mode == "train":
   cluster.train(dataRDD, args.epochs)
-else:
-  labelRDD = cluster.inference(dataRDD)
-  labelRDD.saveAsTextFile(args.output)
+else:  # inference
+  predRDD = cluster.inference(dataRDD)
+  predRDD.saveAsTextFile(args.output)
 
 cluster.shutdown(grace_secs=30)
diff --git a/pom.xml b/pom.xml
index fb291d55..5bcf35e8 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,11 +5,18 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>com.yahoo.ml</groupId>
   <artifactId>tensorflowonspark</artifactId>
-  <version>1.0-SNAPSHOT</version>
+  <version>1.0</version>
   <packaging>jar</packaging>
   <name>tensorflowonspark</name>
   <description>Spark Scala inferencing for TensorFlowOnSpark</description>
+
+  <distributionManagement>
+    <repository>
+      <id>bintray-tensorflowonspark-repo</id>
+      <url>https://api.bintray.com/maven/yahoo/maven/tensorflowonspark</url>
+    </repository>
+  </distributionManagement>
 
   <properties>
     <maven.compiler.source>1.8</maven.compiler.source>
     <maven.compiler.target>1.8</maven.compiler.target>
@@ -22,11 +29,11 @@
     <scala.version>2.11.8</scala.version>
     <scala-maven-plugin.version>3.2.1</scala-maven-plugin.version>
     <scala-xml.version>1.1.0</scala-xml.version>
-    <scalatest.version>3.0.3</scalatest.version>
+    <scalatest.version>3.0.5</scalatest.version>
     <scalatest-maven-plugin.version>1.0</scalatest-maven-plugin.version>
     <maven-compiler-plugin.version>3.7.0</maven-compiler-plugin.version>
-    <tensorflow.version>1.8.0</tensorflow.version>
+    <tensorflow.version>1.9.0</tensorflow.version>
-    <tensorflow-hadoop.version>1.0-SNAPSHOT</tensorflow-hadoop.version>
+    <tensorflow-hadoop.version>1.9.0</tensorflow-hadoop.version>
   </properties>
 
   <dependencies>
@@ -67,6 +74,11 @@
       <artifactId>hadoop</artifactId>
       <version>${tensorflow-hadoop.version}</version>
     </dependency>
+    <dependency>
+      <groupId>com.google.protobuf</groupId>
+      <artifactId>protobuf-java</artifactId>
+      <version>3.5.1</version>
+    </dependency>
     <dependency>
       <groupId>org.scalatest</groupId>
      <artifactId>scalatest_2.11</artifactId>
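
For reference, the commented-out fallback in the mnist_spark.py hunk above exists because RDD.zip requires both RDDs to have the same number of partitions and the same number of elements per partition, which holds when the MNIST images and labels are written out side by side but not in general. The standalone PySpark sketch below is not part of the patch; the synthetic data and local master URL are illustrative only. It demonstrates the zipWithIndex/join technique on deliberately mismatched RDDs:

# zip_vs_join_demo.py -- standalone sketch, not part of the patch above.
# Shows why mnist_spark.py can use images.zip(labels) only when both RDDs
# are partitioned identically, and how the zipWithIndex/join fallback
# (the commented-out lines in the hunk) aligns records otherwise.
from pyspark import SparkContext

sc = SparkContext("local[2]", "zip_vs_join_demo")

# Two RDDs with deliberately mismatched partition counts.
images = sc.parallelize([[i] * 4 for i in range(10)], 2)   # fake 4-pixel images
labels = sc.parallelize([[float(i)] for i in range(10)], 5)

# images.zip(labels) would raise a ValueError here, since PySpark only
# zips RDDs that have the same number of partitions.

# Keying both RDDs by record index makes the pairing independent of
# partitioning, at the cost of an extra shuffle for the join.
indexed_images = images.zipWithIndex().map(lambda x: (x[1], x[0]))
indexed_labels = labels.zipWithIndex().map(lambda x: (x[1], x[0]))
dataRDD = indexed_images.join(indexed_labels).map(lambda x: (x[1][0], x[1][1]))

print(dataRDD.take(2))
sc.stop()

The example keeps plain zip as its default path because it avoids that shuffle; the join variant is only needed when the images and labels files were written with different partitioning.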