In [None]:
# Display the SparkContext. `sc` is not created anywhere in this notebook;
# presumably it is injected by the PySpark kernel/shell — TODO confirm.
sc

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from pyspark.context import SparkContext
from pyspark.conf import SparkConf

import argparse
import os
import numpy
import sys
import tensorflow as tf
import threading
from datetime import datetime

from com.yahoo.ml.tf import TFCluster
import mnist_dist

In [2]:
# Job options are declared as command-line flags, then parsed from a fixed
# argument list (instead of sys.argv) so the notebook can run interactively.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-i", "--images",
    help="HDFS path to MNIST images in parallelized format",
)
parser.add_argument(
    "-f", "--format",
    help="example format: (csv|pickle|tfr)",
    choices=["csv","pickle","tfr"],
    default="csv",
)
parser.add_argument(
    "-l", "--labels",
    help="HDFS path to MNIST labels in parallelized format",
)
parser.add_argument(
    "-m", "--model",
    help="HDFS path to save/load model during train/test",
    default="mnist_model",
)
parser.add_argument(
    "-o", "--output",
    help="HDFS path to save test/inference output",
    default="predictions",
)
parser.add_argument(
    "-r", "--readers",
    help="number of reader/enqueue threads",
    type=int,
    default=1,
)
parser.add_argument(
    "-s", "--steps",
    help="maximum number of steps",
    type=int,
    default=1000,
)
parser.add_argument(
    "-X", "--mode",
    help="train|test",
    default="train",
)
parser.add_argument(
    "-tb", "--tensorboard",
    help="launch tensorboard process",
    action="store_true",
)

# Training configuration: CSV input, default model path, one reader thread.
args = parser.parse_args([
    '-f', 'csv',
    '-m', 'mnist_model',
    '-r', '1',
    '-i', 'mnist/csv/train/images',
    '-l', 'mnist/csv/train/labels',
])

In [3]:
# Show the parsed training options, including defaults that were not passed explicitly.
print(args)

Namespace(format='csv', images='mnist/csv/train/images', labels='mnist/csv/train/labels', mode='train', model='mnist_model', output='predictions', readers=1, steps=1000, tensorboard=False)


In [4]:
# Each comma-separated line of the images file becomes a list of ints,
# and each line of the labels file becomes a list of floats.
images = sc.textFile(args.images).map(lambda row: list(map(int, row.split(','))))
labels = sc.textFile(args.labels).map(lambda row: list(map(float, row.split(','))))

# Pair the two RDDs element-by-element into (image, label) records.
dataRDD = images.zip(labels)

# Counting forces evaluation and displays the number of paired records.
dataRDD.count()

60000

In [6]:
# Size the TensorFlow cluster from the Spark configuration.
# SparkConf.get returns None when the key is absent, and int(None) would fail with
# an unhelpful TypeError, so check for a missing setting explicitly.
executor_instances = sc._conf.get("spark.executor.instances")
if executor_instances is None:
    raise ValueError(
        "spark.executor.instances is not set; launch Spark with a fixed number of "
        "executors (e.g. --num-executors) before reserving a TFCluster"
    )
num_executors = int(executor_instances)

# Presumably the number of parameter-server nodes among the executors
# (the node listing printed afterwards shows one 'ps' job) — TODO confirm.
num_ps = 1

# Reserve the cluster; args.tensorboard is forwarded as the TensorBoard flag and
# InputMode.SPARK is the input mode used when data is supplied as a Spark RDD.
cluster = TFCluster.reserve(sc, num_executors, num_ps, args.tensorboard, TFCluster.InputMode.SPARK)

In [7]:
# Print the metadata recorded for each reserved node, one entry per line.
# NOTE(review): every entry includes an 'authkey' and a host name — consider
# clearing this cell's output before sharing the notebook.
for node_info in cluster.cluster_info:
    print(node_info)

{'addr': ('gpbl191n06.blue.ygrid.yahoo.com', 47563), 'task_index': 0, 'port': 54726, 'authkey': UUID('4986b679-ac7f-4a27-ad6b-df1b106441ed'), 'worker_num': 0, 'host': 'gpbl191n06.blue.ygrid.yahoo.com', 'ppid': 151824, 'job_name': 'ps', 'tb_port': 0}
{'addr': '/tmp/pymp-N_3hkm/listener-S8jNxu', 'task_index': 0, 'port': 50619, 'authkey': UUID('388be510-d867-4ef7-a964-671dfab18587'), 'worker_num': 1, 'host': 'gpbl191n06.blue.ygrid.yahoo.com', 'ppid': 151826, 'job_name': 'worker', 'tb_port': 0}
{'addr': '/tmp/pymp-OpjB49/listener-1yoZQ3', 'task_index': 1, 'port': 53088, 'authkey': UUID('ba3ecf8b-aeca-4a54-8d6a-12ef70b85d00'), 'worker_num': 2, 'host': 'gpbl191n06.blue.ygrid.yahoo.com', 'ppid': 151830, 'job_name': 'worker', 'tb_port': 0}
{'addr': '/tmp/pymp-S3gBHS/listener-1Gv00y', 'task_index': 2, 'port': 51228, 'authkey': UUID('ec3b6b8d-7223-41ca-9f60-3c72a38366e6'), 'worker_num': 3, 'host': 'gpbl191n18.blue.ygrid.yahoo.com', 'ppid': 156919, 'job_name': 'worker', 'tb_port': 0}


In [8]:
# Start the reserved cluster with mnist_dist.map_fun and the parsed training options.
cluster.start(mnist_dist.map_fun, args)

In [None]:
# Train on the zipped (image, label) RDD built earlier in the notebook.
cluster.train(dataRDD)

In [9]:
# Shut down the training cluster; a separate cluster is reserved later for the test run.
cluster.shutdown()

connecting to ('gpbl191n06.blue.ygrid.yahoo.com', 47563), 4986b679-ac7f-4a27-ad6b-df1b106441ed


In [10]:
# Build the options for the test run with the same parser: test-set inputs and mode 'test'.
# NOTE(review): the model path here ('mnist_test_model') differs from the
# 'mnist_model' path used for training, and --model is documented as the path to
# save/load the model — confirm a trained model actually exists at this location.
test_args = parser.parse_args([
    '-f', 'csv',
    '-m', 'mnist_test_model',
    '-r', '1',
    '-i', 'mnist/csv/test/images',
    '-l', 'mnist/csv/test/labels',
    '-X', 'test',
])
print(test_args)

Namespace(format='csv', images='mnist/csv/test/images', labels='mnist/csv/test/labels', mode='test', model='mnist_test_model', output='predictions', readers=1, steps=1000, tensorboard=False)


In [11]:
# Each comma-separated line of the test images file becomes a list of ints,
# and each line of the test labels file becomes a list of floats.
test_images = sc.textFile(test_args.images).map(lambda row: list(map(int, row.split(','))))
test_labels = sc.textFile(test_args.labels).map(lambda row: list(map(float, row.split(','))))

# Pair the two RDDs element-by-element into (image, label) records.
testRDD = test_images.zip(test_labels)

# Counting forces evaluation and displays the number of paired test records.
testRDD.count()

10000

In [12]:
# Reserve a second cluster for the test run, reusing num_executors and num_ps
# from the training setup and taking the TensorBoard flag from test_args.
test_cluster = TFCluster.reserve(sc, num_executors, num_ps, test_args.tensorboard, TFCluster.InputMode.SPARK)

In [13]:
# Print the metadata recorded for each node of the test cluster, one entry per line.
# NOTE(review): every entry includes an 'authkey' and a host name — consider
# clearing this cell's output before sharing the notebook.
for node_info in test_cluster.cluster_info:
    print(node_info)

{'addr': ('gpbl191n18.blue.ygrid.yahoo.com', 46479), 'task_index': 0, 'port': 53676, 'authkey': UUID('d10b0f61-de3a-4c00-b3d0-4b66fdac268b'), 'worker_num': 0, 'host': 'gpbl191n18.blue.ygrid.yahoo.com', 'ppid': 156919, 'job_name': 'ps', 'tb_port': 0}
{'addr': '/tmp/pymp-HP47Ov/listener-pQ5ITM', 'task_index': 0, 'port': 49333, 'authkey': UUID('a449e74e-32bd-4b80-b43c-50fa58ed1b0c'), 'worker_num': 1, 'host': 'gpbl191n06.blue.ygrid.yahoo.com', 'ppid': 151824, 'job_name': 'worker', 'tb_port': 0}
{'addr': '/tmp/pymp-b0u3Qk/listener-eroxhe', 'task_index': 1, 'port': 50778, 'authkey': UUID('2b438707-0a9b-4299-857b-e3618e291af4'), 'worker_num': 2, 'host': 'gpbl191n06.blue.ygrid.yahoo.com', 'ppid': 151826, 'job_name': 'worker', 'tb_port': 0}
{'addr': '/tmp/pymp-tYa212/listener-FvctRV', 'task_index': 2, 'port': 45742, 'authkey': UUID('47673d85-7a68-4ec2-a3bc-900ca4a2801b'), 'worker_num': 3, 'host': 'gpbl191n06.blue.ygrid.yahoo.com', 'ppid': 151830, 'job_name': 'worker', 'tb_port': 0}


In [14]:
# Start the test cluster with the same mnist_dist.map_fun, this time with the test options.
test_cluster.start(mnist_dist.map_fun, test_args)


In [None]:
# Run the test data through the cluster that was reserved and started with test_args.
# The training `cluster` has already been shut down by this point, so the test
# must go through `test_cluster`.
test_resultRDD = test_cluster.test(testRDD)

In [None]:
# Display only the first 25 results rather than collecting the whole RDD.
test_resultRDD.take(25)


In [None]:
# Shut down the test cluster once the results have been inspected.
test_cluster.shutdown()

In [None]:

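# Disabled alternative, kept for reference: reserve a cluster with the TensorBoard
# flag set to True and run test mode against the training dataRDD.
# cluster = TFCluster.reserve(sc, num_executors, num_ps, True, TFCluster.InputMode.SPARK)
# args.mode = "test"
# cluster.start(mnist_dist.map_fun, args)
# resultRDD = cluster.test(dataRDD)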