# Exercise04 : Train on Remote GPU Virtual Machine

Now we run our previous sample (see "[Exercise03 : Just Train in Your Working Machine](https://github.com/tsmatz/azure-ml-tensorflow-complete-sample/blob/master/notebooks/exercise03_train_simple.ipynb)") on remote virtual machine with GPU utilized.    
Here we use remote virtual machine and conda virtual environment, but you can also use Batch AI pool sharing in your team, or run on your favorite docker images.

*back to [index](https://github.com/tsmatz/azure-ml-tensorflow-complete-sample/)*

## Save your training script as file (train.py)

Create ```scirpt``` directory.

In [1]:
import os
script_folder = './script'
os.makedirs(script_folder, exist_ok=True)

Please add the following ```%%writefile``` at the beginning of the source code in "[Exercise03 : Just Train in Your Working Machine](https://github.com/tsmatz/azure-ml-tensorflow-complete-sample/blob/master/notebooks/exercise03_train_simple.ipynb)", and run this cell.    
Then this source code is saved as ```./script/train.py```.

In [2]:
%%writefile script/train.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os
import shutil
import argparse
import math

import tensorflow as tf

FLAGS = None
batch_size = 100

#
# define functions for Estimator
#

def _my_input_fn(filepath, num_epochs):
    # image - 784 (=28 x 28) elements of grey-scaled integer value [0, 1]
    # label - digit (0, 1, ..., 9)
    data_queue = tf.train.string_input_producer(
        [filepath],
        num_epochs = num_epochs) # data is repeated and it raises OutOfRange when data is over
    data_reader = tf.TFRecordReader()
    _, serialized_exam = data_reader.read(data_queue)
    data_exam = tf.parse_single_example(
        serialized_exam,
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.int64)
        })
    data_image = tf.decode_raw(data_exam['image_raw'], tf.uint8)
    data_image.set_shape([784])
    data_image = tf.cast(data_image, tf.float32) * (1. / 255)
    data_label = tf.cast(data_exam['label'], tf.int32)
    data_batch_image, data_batch_label = tf.train.batch(
        [data_image, data_label],
        batch_size=batch_size)
    return {'inputs': data_batch_image}, data_batch_label

def _get_input_fn(filepath, num_epochs):
    return lambda: _my_input_fn(filepath, num_epochs)

def _my_model_fn(features, labels, mode):
    # with tf.device(...): # You can set device if using GPUs

    # define network and inference
    # (simple 2 fully connected hidden layer : 784->128->64->10)
    with tf.name_scope('hidden1'):
        weights = tf.Variable(
            tf.truncated_normal(
                [784, FLAGS.first_layer],
                stddev=1.0 / math.sqrt(float(784))),
            name='weights')
        biases = tf.Variable(
            tf.zeros([FLAGS.first_layer]),
            name='biases')
        hidden1 = tf.nn.relu(tf.matmul(features['inputs'], weights) + biases)
    with tf.name_scope('hidden2'):
        weights = tf.Variable(
            tf.truncated_normal(
                [FLAGS.first_layer, FLAGS.second_layer],
                stddev=1.0 / math.sqrt(float(FLAGS.first_layer))),
            name='weights')
        biases = tf.Variable(
            tf.zeros([FLAGS.second_layer]),
            name='biases')
        hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
    with tf.name_scope('softmax_linear'):
        weights = tf.Variable(
            tf.truncated_normal(
                [FLAGS.second_layer, 10],
                stddev=1.0 / math.sqrt(float(FLAGS.second_layer))),
        name='weights')
        biases = tf.Variable(
            tf.zeros([10]),
            name='biases')
        logits = tf.matmul(hidden2, weights) + biases
 
    # compute evaluation matrix
    predicted_indices = tf.argmax(input=logits, axis=1)
    if mode != tf.estimator.ModeKeys.PREDICT:
        label_indices = tf.cast(labels, tf.int32)
        accuracy = tf.metrics.accuracy(label_indices, predicted_indices)
        tf.summary.scalar('accuracy', accuracy[1]) # output to TensorBoard
 
        loss = tf.losses.sparse_softmax_cross_entropy(
            labels=labels,
            logits=logits)
 
    # define operations
    if mode == tf.estimator.ModeKeys.TRAIN:
        #global_step = tf.train.create_global_step()
        #global_step = tf.contrib.framework.get_or_create_global_step()
        global_step = tf.train.get_or_create_global_step()        
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=FLAGS.learning_rate)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=global_step)
        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            train_op=train_op)
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {
            'accuracy': accuracy
        }
        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            eval_metric_ops=eval_metric_ops)
    if mode == tf.estimator.ModeKeys.PREDICT:
        probabilities = tf.nn.softmax(logits, name='softmax_tensor')
        predictions = {
            'classes': predicted_indices,
            'probabilities': probabilities
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(
            mode,
            predictions=predictions,
            export_outputs=export_outputs)

def _my_serving_input_fn():
    inputs = {'inputs': tf.placeholder(tf.float32, [None, 784])}
    return tf.estimator.export.ServingInputReceiver(inputs, inputs)

#
# Main
#

parser = argparse.ArgumentParser()
parser.add_argument(
    '--data_folder',
    type=str,
    default='./data',
    help='Folder path for input data')
parser.add_argument(
    '--chkpoint_folder',
    type=str,
    default='./logs',  # AML experiments logs folder
    help='Folder path for checkpoint files')
parser.add_argument(
    '--model_folder',
    type=str,
    default='./outputs',  # AML experiments outputs folder
    help='Folder path for model output')
parser.add_argument(
    '--learning_rate',
    type=float,
    default='0.07',
    help='Learning Rate')
parser.add_argument(
    '--first_layer',
    type=int,
    default='128',
    help='Neuron number for the first hidden layer')
parser.add_argument(
    '--second_layer',
    type=int,
    default='64',
    help='Neuron number for the second hidden layer')
FLAGS, unparsed = parser.parse_known_args()

# clean checkpoint and model folder if exists
if os.path.exists(FLAGS.chkpoint_folder) :
    for file_name in os.listdir(FLAGS.chkpoint_folder):
        file_path = os.path.join(FLAGS.chkpoint_folder, file_name)
        if os.path.isfile(file_path):
            os.remove(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)
if os.path.exists(FLAGS.model_folder) :
    for file_name in os.listdir(FLAGS.model_folder):
        file_path = os.path.join(FLAGS.model_folder, file_name)
        if os.path.isfile(file_path):
            os.remove(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)

# read TF_CONFIG
run_config = tf.contrib.learn.RunConfig()

# create Estimator
mnist_fullyconnected_classifier = tf.estimator.Estimator(
    model_fn=_my_model_fn,
    model_dir=FLAGS.chkpoint_folder,
    config=run_config)
train_spec = tf.estimator.TrainSpec(
    input_fn=_get_input_fn(os.path.join(FLAGS.data_folder, 'train.tfrecords'), 2),
    max_steps=60000 * 2 / batch_size)
eval_spec = tf.estimator.EvalSpec(
    input_fn=_get_input_fn(os.path.join(FLAGS.data_folder, 'test.tfrecords'), 1),
    steps=10000 * 1 / batch_size,
    start_delay_secs=0)

# run !
tf.estimator.train_and_evaluate(
    mnist_fullyconnected_classifier,
    train_spec,
    eval_spec
)

# save model and variables
model_dir = mnist_fullyconnected_classifier.export_savedmodel(
    export_dir_base = FLAGS.model_folder,
    serving_input_receiver_fn = _my_serving_input_fn)
print('current working directory is ', os.getcwd())
print('model is saved ', model_dir)

Overwriting script/train.py


## Train on remote VM

Now let's start to integrate with AML services and run training on remote virtual machine.

### Step 1 : Get workspace setting

Before starting, you must read your configuration settings. (See "[Exercise01 : Prepare Config Settings](https://github.com/tsmatz/azure-ml-tensorflow-complete-sample/blob/master/notebooks/exercise01_prepare_config.ipynb)")

In [3]:
from azureml.core import Workspace
import azureml.core

ws = Workspace.from_config()

Found the config file in: /data/home/tsmatsuz/Tutorial/aml_config/config.json


### Step 2 : Create new remote virtual machine

Create your new Data Science Virtual Machine (which is pre-configured for data science) with **GPU** (NC6).    
If you chose AML workspace location (region) which doesn't support GPU VM instance, please provide ```location``` argument in ```provisioning_configuration```.

e.g, ```dsvm_config = DsvmCompute.provisioning_configuration(vm_size='STANDARD_NC6', location='east-us')```

If already exists, this script will get the existing one.

In [4]:
from azureml.core.compute import DsvmCompute
from azureml.core.compute_target import ComputeTargetException

try:
    dsvm_compute = DsvmCompute(workspace=ws, name='mydsvm01')
    print('found existing:', dsvm_compute.name)
except ComputeTargetException:
    print('creating new.')
    dsvm_config = DsvmCompute.provisioning_configuration(vm_size='STANDARD_NC6')
    dsvm_compute = DsvmCompute.create(ws, name='mydsvm01', provisioning_configuration=dsvm_config)
    dsvm_compute.wait_for_completion(show_output=True)

creating new.
Creating.............................................................................
SucceededProvisioning operation finished, operation "Succeeded"


### Step 3 : Generate data reference config

You can configure to automatically download your dataset (including train.tfrecords, test.tfrecords) from your ```Datastore``` in your compute target.    
See "[Exercise02 : Prepare Datastore](https://github.com/tsmatz/azure-ml-tensorflow-complete-sample/blob/master/notebooks/exercise02_prepare_datastore.ipynb)".

In [5]:
from azureml.core import Datastore
from azureml.core.runconfig import DataReferenceConfiguration

# get your datastore (See "Exercise 02 : Prepare Datastore")
ds = Datastore.get(ws, datastore_name="myblob01")

# generate data reference configuration
dr_conf = DataReferenceConfiguration(
    datastore_name=ds.name,
    path_on_datastore='tfdata',
    mode='download', # set 'mount' if you mount folder instead of downloading (but it's not supported in remote VM)
    overwrite=True)

### Step 4 : Generate VM config

Here we build a conda environment and set dependencies. In this configuration, we use previous data reference configuration. (The data is copied automatically.)

In [13]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

run_config = RunConfiguration(framework="python")
run_config.target = dsvm_compute.name
run_config.data_references = {ds.name: dr_conf}
run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['tensorflow-gpu'])

### Step 5 : Run script and wait for completion

In [14]:
from azureml.core import Experiment
from azureml.core import Run
from azureml.core import ScriptRunConfig

src = ScriptRunConfig(
    source_directory='./script',
    script='train.py',
    run_config=run_config,
    arguments=['--data_folder', str(ds.as_download())]
)
exp = Experiment(workspace=ws, name='tf_remote_experiment')
run = exp.submit(config=src)
run.wait_for_completion(show_output=True)

RunId: matsu_test1116_02_1542332931636

Streaming azureml-logs/60_control_log.txt

Streaming log file azureml-logs/60_control_log.txt
Running ['conda', '--version']
Creating Conda environment...
Logging experiment preparation status in history service.
Solving environment: ...working... done

Downloading and Extracting Packages
numpy-base-1.15.4    | 4.2 MB    | ########## | 100% 
_tflow_select-2.1.0  | 2 KB      | ########## | 100% 
astor-0.7.1          | 43 KB     | ########## | 100% 
cupti-9.2.148        | 1.7 MB    | ########## | 100% 
tensorflow-1.12.0    | 3 KB      | ########## | 100% 
cudatoolkit-9.2      | 351.0 MB  | ########## | 100% 
gast-0.2.0           | 15 KB     | ########## | 100% 
cudnn-7.2.1          | 322.8 MB  | ########## | 100% 
numpy-1.15.4         | 35 KB     | ########## | 100% 
libprotobuf-3.6.1    | 4.1 MB    | ########## | 100% 
grpcio-1.14.1        | 1.0 MB    | ########## | 100% 
h5py-2.8.0           | 1.1 MB    | ########## | 100% 
mkl_fft-1.0.6        |

  Using cached https://files.pythonhosted.org/packages/26/2d/f749a5c82f6192d77ed061a38e02001afcba55fe8477336d26a950ab17ce/websocket_client-0.54.0-py2.py3-none-any.whl
Collecting docker-pycreds>=0.3.0 (from docker->azureml-core==0.1.74.*->azureml-defaults==0.1.74->-r /tmp/azureml_runs/matsu_test1116_02_1542332931636/azureml-setup/condaenv.v6lpJe.requirements.txt (line 1))
  Using cached https://files.pythonhosted.org/packages/ea/bf/7e70aeebc40407fbdb96fa9f79fc8e4722ea889a99378303e3bcc73f4ab5/docker_pycreds-0.3.0-py2.py3-none-any.whl
Collecting pyasn1>=0.1.1 (from ndg-httpsclient->azureml-core==0.1.74.*->azureml-defaults==0.1.74->-r /tmp/azureml_runs/matsu_test1116_02_1542332931636/azureml-setup/condaenv.v6lpJe.requirements.txt (line 1))
  Using cached https://files.pythonhosted.org/packages/d1/a1/7790cc85db38daa874f6a2e6308131b9953feb1367f2ae2d1123bb93a9f5/pyasn1-0.4.4-py2.py3-none-any.whl
Collecting idna>=2.1 (from cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*->azureml-core==0.1.74.*->azur


Streaming azureml-logs/80_driver_log.txt

Instructions for updating:
When switching to tf.estimator.Estimator, use tf.estimator.RunConfig instead.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensor_slices(string_tensor).shuffle(tf.shape(input_tensor, out_type=tf.int64)[0]).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensor_slices(input_tensor).shuffle(tf.shape(input_tensor, out_type=tf.int64)[0]).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensors(tensor).repeat(num_epochs)`.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instruct

{'runId': 'matsu_test1116_02_1542332931636',
 'target': 'mydsvm01',
 'status': 'Finalizing',
 'startTimeUtc': '2018-11-16T01:54:06.743491Z',
 'properties': {'azureml.runsource': 'experiment',
  'ContentSnapshotId': 'dcb2178c-8836-4c1d-b0d4-480e97b9c5f5'},
 'runDefinition': {'Script': 'train.py',
  'Arguments': ['--data_folder', '$AZUREML_DATAREFERENCE_myblob01'],
  'SourceDirectoryDataStore': None,
  'Framework': 0,
  'Communicator': 0,
  'Target': 'mydsvm01',
  'DataReferences': {'myblob01': {'DataStoreName': 'myblob01',
    'Mode': 'Download',
    'PathOnDataStore': 'tfdata',
    'PathOnCompute': None,
    'Overwrite': True}},
  'JobName': None,
  'AutoPrepareEnvironment': True,
  'MaxRunDurationSeconds': None,
  'NodeCount': 1,
  'Environment': {'Python': {'InterpreterPath': 'python',
    'UserManagedDependencies': False,
    'CondaDependencies': {'name': 'project_environment',
     'dependencies': ['python=3.6.2',
      {'pip': ['azureml-defaults==0.1.74']},
      'tensorflow-gpu']

### Step 6 : Download results and check

Check generated files.

In [15]:
run.get_file_names()

['azureml-logs/60_control_log.txt',
 'azureml-logs/80_driver_log.txt',
 'logs/events.out.tfevents.1542333253.mydsvm01462390892e8d',
 'logs/model.ckpt-0.index',
 'logs/model.ckpt-0.meta',
 'logs/checkpoint',
 'logs/graph.pbtxt',
 'logs/model.ckpt-0.data-00000-of-00001',
 'logs/model.ckpt-1100.meta',
 'logs/model.ckpt-1100.data-00000-of-00001',
 'logs/model.ckpt-1100.index',
 'logs/eval/events.out.tfevents.1542333279.mydsvm01462390892e8d',
 'outputs/1542333279/saved_model.pb',
 'outputs/1542333279/variables/variables.data-00000-of-00001',
 'outputs/1542333279/variables/variables.index',
 'driver_log',
 'azureml-logs/azureml.log']

Download model into your local machine.    
**Please change ```1542333279``` to meet previous results.**

In [20]:
run.download_file(
    name='outputs/1542333279/saved_model.pb',
    output_file_path='remote_model/saved_model.pb')
run.download_file(
    name='outputs/1542333279/variables/variables.data-00000-of-00001',
    output_file_path='remote_model/variables/variables.data-00000-of-00001')
run.download_file(
    name='outputs/1542333279/variables/variables.index',
    output_file_path='remote_model/variables/variables.index')

Predict your test data using downloaded model.

In [23]:
import tensorflow as tf

# Read data by tensor
dataset = tf.data.TFRecordDataset('./data/test.tfrecords')
iterator = dataset.make_one_shot_iterator()
data_org = iterator.get_next()
data_exam = tf.parse_single_example(
    data_org,
    features={
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64)
    })
data_image = tf.decode_raw(data_exam['image_raw'], tf.uint8)
data_image.set_shape([784])
data_image = tf.cast(data_image, tf.float32) * (1. / 255)
data_label = tf.cast(data_exam['label'], tf.int32)

# Run tensor and generate data
with tf.Session() as sess:
    image_arr = []
    label_arr = []
    for i in range(3):
        image, label = sess.run([data_image, data_label])
        image_arr.append(image)
        label_arr.append(label)

# Predict
pred_fn = tf.contrib.predictor.from_saved_model('./remote_model')
pred = pred_fn({'inputs': image_arr})

print('Predicted: ', pred['classes'].tolist())
print('Actual   : ', label_arr)

INFO:tensorflow:Restoring parameters from ./remote_model/variables/variables
Predicted:  [7, 2, 1]
Actual   :  [7, 2, 1]


### Step 6 : Remove VM

In [24]:
dsvm_compute.delete()