### Run multiple models on different GPUs with the same data
This is very useful when you want to compare hyper-parameters / models without setting a random seed.
The set-seed approach has several drawbacks:  

1) Time-consuming <br>
2) RAM-consuming <br>
3) When data augmentation is involved, you will run into trouble... fixing the augmentation is very annoying <br>

In [1]:
import os
import argparse

# Notebook configuration expressed as argparse flags. An empty argument list is parsed,
# so the defaults declared here are the values that end up in FLAGS.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--gpu_id',
    default="1,2",
    type=str,
    help="depends on how many GPUs on your machine and which GPU you want to get",
)
parser.add_argument(
    '--frame_work',
    default="Keras",
    type=str,
    help="TF / Keras",
)

FLAGS = parser.parse_args(args=[])
print(FLAGS)

# Restrict the GPUs visible to this process; this is set before TensorFlow is imported further down.
os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_id
import threading
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

import keras.backend as K
from keras.models import Model, load_model, save_model
from keras.layers import Dense, Activation, Input
from keras.optimizers import SGD
# Modified from source: https://stackoverflow.com/questions/46712272/run-hyperparameter-optimization-on-parallel-gpus-using-tensorflow
# The original version IS NOT CORRECT: you have to add `with tf.device(...)` inside the graph

Namespace(frame_work='Keras', gpu_id='1,2')


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
## Define a callback for Keras model
from keras.callbacks import Callback
class Logger(Callback):
    """Keras callback that prints training/validation metrics every ``n`` epochs.

    Parameters
    ----------
    n : int
        Reporting interval; a line is printed whenever ``epoch % n == 0``.
    gpu_id : int or str, optional
        Label identifying the device/run, printed at the start of each line.
    """

    def __init__(self, n, gpu_id = 0):
        # Let the base Callback set up its own attributes before adding ours.
        super(Logger, self).__init__()
        self.n = n   # print loss & acc every n epochs
        self.gpu_id = gpu_id

    def on_epoch_end(self, epoch, logs=None):
        """Print loss/acc and val_loss/val_acc for ``epoch`` when it falls on the interval.

        Metrics that are absent from ``logs`` (for example the ``val_*`` entries when
        no validation data is supplied) are reported as ``nan`` instead of raising
        a ``TypeError`` in the float formatting.
        """
        # Avoid a shared mutable default argument.
        logs = {} if logs is None else logs
        if epoch % self.n == 0:
            def metric(key):
                # Missing (or None) metrics become nan so '%f' formatting still works.
                value = logs.get(key)
                return float('nan') if value is None else value

            # add what you need here
            train_loss = metric('loss')
            train_acc = metric('acc')
            valid_loss = metric('val_loss')
            valid_acc = metric('val_acc')
            print("GPU_ID: %s, epoch: %4d, loss: %0.5f, acc: %0.3f, val_loss: %0.5f, val_acc: %0.3f" \
                  % (self.gpu_id, epoch,
                     train_loss, train_acc,
                     valid_loss, valid_acc))

### Check which GPU IDs are available

In [3]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of all local devices whose ``device_type`` is ``'GPU'``.

    The devices are obtained from ``device_lib.list_local_devices()`` and the
    result preserves the order in which that call reports them.
    """
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# List the visible GPUs. No matter which physical GPU you pick, the reported device order
# always starts from 0 (e.g. CUDA_VISIBLE_DEVICES=7 ==> /device:GPU:0)
get_available_gpus()

['/device:GPU:0', '/device:GPU:1']

In [4]:
# Load MNIST (one-hot labels) from data/mnist and keep the train / test arrays
# in module-level names that the training code reads later.
mnist = input_data.read_data_sets("data/mnist", one_hot=True)
train_x_all, train_y_all = mnist.train.images, mnist.train.labels
test_x, test_y = mnist.test.images, mnist.test.labels

Extracting data/mnist/train-images-idx3-ubyte.gz
Extracting data/mnist/train-labels-idx1-ubyte.gz
Extracting data/mnist/t10k-images-idx3-ubyte.gz
Extracting data/mnist/t10k-labels-idx1-ubyte.gz


In [5]:
# Define the graphs per device: one independent tf.Graph per (device, learning rate) pair.
# For "TF" each job is the graph itself; for "Keras" each job is [graph, compiled model].
learning_rates = [0.01, 0.03]
jobs = []
devices = ['/device:GPU:0', '/device:GPU:1'] # depends on which GPUs you want to put them in

# Compare strings with ==, not `is`: identity only matches when the interpreter happens to
# intern both strings, which is not guaranteed (e.g. for values parsed at runtime).
if FLAGS.frame_work == "TF":
    for device, learning_rate in zip(devices, learning_rates):
        with tf.Graph().as_default() as graph:
            with tf.device(device):
                # Softmax regression; the names 'x', 'y', 'cost' and 'optimize' are
                # looked up by name from the graph when training.
                x = tf.placeholder(tf.float32, [None, 784], name='x')
                y = tf.placeholder(tf.float32, [None, 10], name='y')
                W = tf.Variable(tf.zeros([784, 10]))
                b = tf.Variable(tf.zeros([10]))
                pred = tf.nn.softmax(tf.matmul(x, W) + b)
                accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1)), tf.float32))
                # NOTE(review): log of a softmax output is undefined if a probability reaches exactly 0.
                cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1), name='cost')
                optimize = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost, name='optimize')
        jobs.append(graph)

elif FLAGS.frame_work == "Keras":
    for device, learning_rate in zip(devices, learning_rates):
        with tf.Graph().as_default() as graph:
            with tf.device(device):
                x = Input(shape= (784,), name = 'Input_layer')
                y = Dense(units=10, activation='softmax', name = 'output_layer')(x)
            # optimizer and model are created outside the tf.device scope (original note: "to cpu")
            optim = SGD(lr=learning_rate)
            model = Model(inputs=[x], outputs=[y])
            model.compile(loss = 'categorical_crossentropy', metrics= ['acc'], optimizer=optim)
        jobs.append([graph, model])

else:
    # Fail loudly instead of silently building no jobs at all.
    raise ValueError("Unsupported frame_work %r; expected 'TF' or 'Keras'" % (FLAGS.frame_work,))

print(jobs)

[[<tensorflow.python.framework.ops.Graph object at 0x7fa304ec59e8>, <keras.engine.training.Model object at 0x7fa304ec5b00>], [<tensorflow.python.framework.ops.Graph object at 0x7fa304ec5a20>, <keras.engine.training.Model object at 0x7fa2fc662908>]]


In [7]:
# Train a graph on a device. `train` is defined according to the selected framework and is
# meant to be used as a thread target, one call per graph.
# Strings are compared with == (not `is`), since identity comparison depends on interning.
if FLAGS.frame_work == "TF":
    def train(device, graph):
        """Run 100 epochs of mini-batch gradient descent on ``graph`` in its own session.

        ``device`` is only used to label the printed progress lines; the tensors and
        the training op are fetched from ``graph`` by name.
        """
        print("Start training on %s" % device)
        with tf.Session(graph=graph) as session:
            x = graph.get_tensor_by_name('x:0')
            y = graph.get_tensor_by_name('y:0')
            cost = graph.get_tensor_by_name('cost:0')
            optimize = graph.get_operation_by_name('optimize')

            session.run(tf.global_variables_initializer())
            batch_size = 500
            # Number of full batches per epoch; loop-invariant, so computed once.
            total_batch = train_x_all.shape[0] // batch_size
            for epoch in range(100):
                for i in range(total_batch):
                    batch_x = train_x_all[i * batch_size:(i + 1) * batch_size]
                    batch_y = train_y_all[i * batch_size:(i + 1) * batch_size]
                    _, c = session.run([optimize, cost], feed_dict={x: batch_x, y: batch_y})
                    if i % 20 == 0:
                        print("Device %s: epoch #%d step=%d cost=%f" % (device, epoch, i, c))

elif FLAGS.frame_work == "Keras":
    def train(device, graph, model):
        """Fit ``model`` for 50 epochs inside a session bound to ``graph``.

        ``device`` is only used as the label passed to the ``Logger`` callback, which
        prints metrics every 5 epochs (Keras' own progress output is disabled).
        """
        print("Start training on %s" % device)
        logger = Logger(n = 5, gpu_id = device)
        with tf.Session(graph=graph) as session:
            # Point the Keras backend at this session before fitting.
            K.set_session(session=session)
            model.fit(x = train_x_all,
                      y = train_y_all,
                      batch_size=256,
                      epochs=50,
                      verbose=0,
                      validation_data=(test_x, test_y), callbacks=[logger])

# Start threads in parallel: one thread per job, each paired with the device at the same position.
train_threads = []
for device, item in zip(devices, jobs):
    # Compare strings with == (not `is`); identity comparison depends on string interning.
    if FLAGS.frame_work == "TF":
        # TF jobs are bare graphs
        this_graph = item
        train_threads.append(threading.Thread(target=train, args=(device, this_graph)))
    elif FLAGS.frame_work == "Keras":
        # Keras jobs are [graph, model] pairs
        this_graph = item[0]
        this_model = item[1]
        train_threads.append(threading.Thread(target=train, args=(device, this_graph, this_model)))
# Launch every thread first, then wait for all of them, so the trainings overlap.
for t in train_threads:
    t.start()
for t in train_threads:
    t.join()

Start training on /device:GPU:0Start training on /device:GPU:1

GPU_ID: /device:GPU:0, epoch:    0, loss: 1.68094, acc: 0.566, val_loss: 1.17871, val_acc: 0.792
GPU_ID: /device:GPU:1, epoch:    0, loss: 1.13659, acc: 0.734, val_loss: 0.70459, val_acc: 0.844
GPU_ID: /device:GPU:0, epoch:    5, loss: 0.58351, acc: 0.861, val_loss: 0.53803, val_acc: 0.874
GPU_ID: /device:GPU:1, epoch:    5, loss: 0.42936, acc: 0.886, val_loss: 0.39668, val_acc: 0.895
GPU_ID: /device:GPU:0, epoch:   10, loss: 0.47851, acc: 0.878, val_loss: 0.44641, val_acc: 0.887
GPU_ID: /device:GPU:1, epoch:   10, loss: 0.37559, acc: 0.897, val_loss: 0.35147, val_acc: 0.906
GPU_ID: /device:GPU:0, epoch:   15, loss: 0.43363, acc: 0.886, val_loss: 0.40605, val_acc: 0.893
GPU_ID: /device:GPU:1, epoch:   15, loss: 0.35115, acc: 0.903, val_loss: 0.33077, val_acc: 0.909
GPU_ID: /device:GPU:0, epoch:   20, loss: 0.40728, acc: 0.890, val_loss: 0.38220, val_acc: 0.899
GPU_ID: /device:GPU:1, epoch:   20, loss: 0.33623, acc: 0.906, 