# Exercise04 : Train on Remote GPU Virtual Machine

Now we run our previous sample (see "[Exercise03 : Just Train in Your Working Machine](./exercise03_train_simple.ipynb)") on a remote virtual machine that uses a GPU.<br>
Here we use a remote virtual machine and a conda virtual environment, but you can also use a Batch AI pool shared with your team, or run on your favorite docker images.

*back to [index](https://github.com/tsmatz/azureml-tutorial/)*

## Save your training script as file (train.py)

Create ```script``` directory.

In [1]:
import os
# Local folder that will hold the training script (train.py).
script_folder = './script'
# exist_ok=True: do not fail when the folder is already there.
os.makedirs(script_folder, exist_ok=True)

By adding the following ```%%writefile``` at the beginning of the source code in "[Exercise03 : Just Train in Your Working Machine](./exercise03_train_simple.ipynb)", this source code is saved as ```./script/train.py```.

In [2]:
%%writefile script/train.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os
import shutil
import argparse
import math

import tensorflow as tf

# Parsed command-line options; assigned by parser.parse_known_args() in the
# "Main" section and read by _my_model_fn.
FLAGS = None
# Batch size used by the input function and by the step-count computations.
batch_size = 100

#
# define functions for Estimator
#

def _my_input_fn(filepath, num_epochs):
    """Build batched (features, labels) input tensors from a TFRecord file.

    Args:
        filepath: Path of the TFRecord file to read.
        num_epochs: Number of times the data is repeated; OutOfRange is
            raised once the data is over.

    Returns:
        A tuple ``({'inputs': images}, labels)`` of tensors batched with
        the module-level ``batch_size``.
    """
    # data is repeated and it raises OutOfRange when data is over
    filename_queue = tf.train.string_input_producer(
        [filepath], num_epochs=num_epochs)
    _, record = tf.TFRecordReader().read(filename_queue)

    # every record carries the raw image bytes and the digit label (0, 1, ..., 9)
    feature_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64)
    }
    example = tf.parse_single_example(record, features=feature_spec)

    # image - 784 (=28 x 28) uint8 elements, rescaled to float32 in [0, 1]
    pixels = tf.decode_raw(example['image_raw'], tf.uint8)
    pixels.set_shape([784])
    pixels = tf.cast(pixels, tf.float32) * (1. / 255)
    digit = tf.cast(example['label'], tf.int32)

    images, labels = tf.train.batch([pixels, digit], batch_size=batch_size)
    return {'inputs': images}, labels

def _get_input_fn(filepath, num_epochs):
    """Return a zero-argument input function bound to the given file and epochs."""
    def input_fn():
        return _my_input_fn(filepath, num_epochs)

    return input_fn

def _fully_connected(inputs, in_units, out_units, scope, activation=None):
    """Build one fully connected layer inside the name scope ``scope``.

    Weights are initialised from a truncated normal with standard deviation
    1/sqrt(in_units); biases are initialised to zero.

    Args:
        inputs: Input tensor to multiply with the [in_units, out_units] weights.
        in_units: Number of input units.
        out_units: Number of output units.
        scope: Name scope in which the layer's variables and ops are created.
        activation: Optional callable applied to ``inputs x weights + biases``
            (e.g. ``tf.nn.relu``). ``None`` returns the linear output.

    Returns:
        The output tensor of the layer.
    """
    with tf.name_scope(scope):
        weights = tf.Variable(
            tf.truncated_normal(
                [in_units, out_units],
                stddev=1.0 / math.sqrt(float(in_units))),
            name='weights')
        biases = tf.Variable(
            tf.zeros([out_units]),
            name='biases')
        outputs = tf.matmul(inputs, weights) + biases
        if activation is not None:
            outputs = activation(outputs)
    return outputs


def _my_model_fn(features, labels, mode):
    """Model function for the Estimator.

    The network is two fully connected hidden layers followed by a linear
    output layer: 784 -> FLAGS.first_layer -> FLAGS.second_layer -> 10.

    Args:
        features: Dict whose 'inputs' entry is the batch of 784-element images.
        labels: Batch of digit labels; only read when mode is not PREDICT.
        mode: One of tf.estimator.ModeKeys (TRAIN, EVAL or PREDICT).

    Returns:
        The tf.estimator.EstimatorSpec for the given mode.
    """
    # with tf.device(...): # You can set device if using GPUs

    # define network and inference
    hidden1 = _fully_connected(
        features['inputs'], 784, FLAGS.first_layer,
        scope='hidden1', activation=tf.nn.relu)
    hidden2 = _fully_connected(
        hidden1, FLAGS.first_layer, FLAGS.second_layer,
        scope='hidden2', activation=tf.nn.relu)
    logits = _fully_connected(
        hidden2, FLAGS.second_layer, 10,
        scope='softmax_linear')

    # compute evaluation metrics (labels are not read in PREDICT mode)
    predicted_indices = tf.argmax(input=logits, axis=1)
    if mode != tf.estimator.ModeKeys.PREDICT:
        label_indices = tf.cast(labels, tf.int32)
        accuracy = tf.metrics.accuracy(label_indices, predicted_indices)
        tf.summary.scalar('accuracy', accuracy[1])  # output to TensorBoard

        loss = tf.losses.sparse_softmax_cross_entropy(
            labels=labels,
            logits=logits)

    # define operations
    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_or_create_global_step()
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=FLAGS.learning_rate)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=global_step)
        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            train_op=train_op)
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {
            'accuracy': accuracy
        }
        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            eval_metric_ops=eval_metric_ops)
    if mode == tf.estimator.ModeKeys.PREDICT:
        probabilities = tf.nn.softmax(logits, name='softmax_tensor')
        predictions = {
            'classes': predicted_indices,
            'probabilities': probabilities
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(
            mode,
            predictions=predictions,
            export_outputs=export_outputs)

def _my_serving_input_fn():
    """Describe the serving input: a float32 placeholder of shape [None, 784].

    The same 'inputs' dict is used both as the features passed to the model
    and as the receiver tensors fed by the caller.
    """
    image_batch = tf.placeholder(tf.float32, [None, 784])
    receiver = {'inputs': image_batch}
    return tf.estimator.export.ServingInputReceiver(receiver, receiver)

#
# Main
#

# Command-line options of the training script. Defaults are written in the
# declared type of each option, so parser.get_default() and the parsed value
# agree without relying on argparse converting a string default.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--data_folder',
    type=str,
    default='./data',
    help='Folder path for input data')
parser.add_argument(
    '--chkpoint_folder',
    type=str,
    default='./logs',  # AML experiments logs folder
    help='Folder path for checkpoint files')
parser.add_argument(
    '--model_folder',
    type=str,
    default='./outputs',  # AML experiments outputs folder
    help='Folder path for model output')
parser.add_argument(
    '--learning_rate',
    type=float,
    default=0.07,
    help='Learning Rate')
parser.add_argument(
    '--first_layer',
    type=int,
    default=128,
    help='Neuron number for the first hidden layer')
parser.add_argument(
    '--second_layer',
    type=int,
    default=64,
    help='Neuron number for the second hidden layer')
# parse_known_args (instead of parse_args) does not fail on options this
# script does not define; those are returned in `unparsed` and not used.
FLAGS, unparsed = parser.parse_known_args()

def _clean_folder(folder):
    """Delete every file and sub-directory inside ``folder``.

    The folder itself is kept. Nothing happens when ``folder`` does not exist.

    Args:
        folder: Path of the folder to empty.
    """
    if not os.path.exists(folder):
        return
    for entry in os.listdir(folder):
        entry_path = os.path.join(folder, entry)
        if os.path.isfile(entry_path):
            os.remove(entry_path)
        elif os.path.isdir(entry_path):
            shutil.rmtree(entry_path)


# clean checkpoint and model folder if exists
_clean_folder(FLAGS.chkpoint_folder)
_clean_folder(FLAGS.model_folder)

# read TF_CONFIG
run_config = tf.estimator.RunConfig()

# create Estimator (checkpoints are written to the checkpoint folder)
mnist_fullyconnected_classifier = tf.estimator.Estimator(
    model_fn=_my_model_fn,
    model_dir=FLAGS.chkpoint_folder,
    config=run_config)

# Step counts use floor division: "from __future__ import division" makes "/"
# return a float, while max_steps / steps are step counts and should be ints.
# NOTE(review): 60000 and 10000 are presumably the number of examples in
# train.tfrecords / test.tfrecords, and the factors 2 and 1 mirror the
# num_epochs passed to the input functions - confirm against the data.
train_spec = tf.estimator.TrainSpec(
    input_fn=_get_input_fn(os.path.join(FLAGS.data_folder, 'train.tfrecords'), 2),
    max_steps=60000 * 2 // batch_size)
eval_spec = tf.estimator.EvalSpec(
    input_fn=_get_input_fn(os.path.join(FLAGS.data_folder, 'test.tfrecords'), 1),
    steps=10000 * 1 // batch_size,
    start_delay_secs=0)

# run !
tf.estimator.train_and_evaluate(
    mnist_fullyconnected_classifier,
    train_spec,
    eval_spec
)

# export the trained model (graph and variables) below the model folder
export_path = mnist_fullyconnected_classifier.export_savedmodel(
    export_dir_base=FLAGS.model_folder,
    serving_input_receiver_fn=_my_serving_input_fn)

# report where the script ran and where the export was written
print('current working directory is ', os.getcwd())
print('model is saved ', export_path)

Writing script/train.py


## Train on remote VM

Now let's start to integrate with AML and automate training on remote virtual machine.

### Step 1 : Get workspace setting

Before starting, you must read your configuration settings. (See "[Exercise01 : Prepare Config Settings](./exercise01_prepare_config.ipynb)")

In [3]:
from azureml.core import Workspace
import azureml.core

# Load the workspace from the saved configuration settings (see Exercise01).
ws = Workspace.from_config()

### Step 2 : Create new remote virtual machine

Create your new remote virtual machine with GPU.<br>
Before starting, **please check as follows**.

- You should have quota for ML GPU VM in your Azure subscription. If you don't have, please request quota in Azure Portal.
- Please fill the following ```vm_size``` and ```location``` for GPU cluster which you can use.

**If you don't have any quota for GPU, please change VM size (such as, Standard_D2_v2).**

> Note : It's better to use the same location for AML workspace, since data in AML workspace will be mounted on this virtual machine.

By enabling auto-scaling (from 0 to 1), the node will be terminated if it's inactive. (You can save money.)    
If VM already exists, this script will get the existing one.

> Note : You can also attach an existing virtual machine (bring your own compute resource) as a compute target.

In [4]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# Name of the compute target to look up, or to create when it is missing.
cluster_name = 'myvm01'

try:
    # Raises ComputeTargetException when no such compute target exists.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('creating new.')
    # Scale between 0 and 1 node, so the node is released while inactive.
    provisioning = AmlCompute.provisioning_configuration(
        vm_size='Standard_NC4as_T4_v3', # change such as Standard_NC6 or Standard_D2_v2 if needed
        min_nodes=0,
        max_nodes=1,
        location="eastus")
    compute_target = ComputeTarget.create(ws, cluster_name, provisioning)
    compute_target.wait_for_completion(show_output=True)
else:
    print('found existing:', compute_target.name)

creating new.
InProgress......
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


### Step 3 : Get dataset reference for files

You can use registered dataset (train.tfrecords, test.tfrecords) to mount in your compute target.    
See "[Exercise02 : Prepare Data](./exercise02_prepare_data.ipynb)" for data preparation.

> Note : Dataset registration is not mandatory. (You can mount any data (as dataset) in AML datastore.)

In [5]:
from azureml.core import Dataset

# Get the latest version of the dataset registered as 'mnist_tfrecords_dataset'.
dataset = Dataset.get_by_name(ws, 'mnist_tfrecords_dataset', version='latest')

# # To use unregistered data instead, build a file dataset from a path
# # in the default datastore as follows
# from azureml.core import Datastore
# from azureml.core import Dataset
# ds = ws.get_default_datastore()
# ds_paths = [(ds, 'tfdata/')]
# dataset = Dataset.File.from_files(path = ds_paths)

### Step 4 : Create environment

Here we create a new docker environment for running scripts. The first time, it will generate our own container image with the following settings. (The experiment will therefore take a long time to complete.)
However, you can speed up the next run by reusing the generated environment, once you have registered it.

In this example, we create our own environment manually, but **you can also use existing environments (called, curated environments) for a variety of purposes**. (In Exercise 05, we will use a curated environment, which includes TensorFlow 1.x.)

In [6]:
from azureml.core.runconfig import DEFAULT_GPU_IMAGE
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.environment import Environment

# conda packages for the image: Python 3.6 and the GPU build of TensorFlow 1.15
conda_deps = CondaDependencies.create(
    python_version="3.6",
    conda_packages=['tensorflow-gpu==1.15'])

# create environment on top of the default GPU base image
env = Environment('test-remote-gpu-env')
env.docker.base_image = DEFAULT_GPU_IMAGE
env.python.conda_dependencies = conda_deps

# register environment to re-use later
env.register(workspace=ws)
## # speed up by using the existing environment
## env = Environment.get(ws, name='test-remote-gpu-env')

### Step 5 : Run script and wait for completion

> Note : It will take a long time (over 30 minutes) for the first time run, because it'll pull base image, generate new image (custom environment), start nodes in cluster, and run scripts.<br>
> By using built-in ```AzureML-TensorFlow-1.13-GPU``` environment, it will speed up. (See Exercise 05 for using AML built-in environments, called curated environments.)

In [7]:
from azureml.core import Experiment
from azureml.core import Run
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import DockerConfiguration

# the dataset is mounted on the compute target and handed to train.py
# as its --data_folder argument
script_args = ['--data_folder', dataset.as_mount()]
docker_config = DockerConfiguration(use_docker=True)

# create script run config
src = ScriptRunConfig(
    source_directory='./script',
    script='train.py',
    arguments=script_args,
    compute_target=compute_target,
    environment=env,
    docker_runtime_config=docker_config)

# submit and run ! (blocks until the run has finished, streaming its logs)
exp = Experiment(workspace=ws, name='tf_remote_experiment')
run = exp.submit(config=src)
run.wait_for_completion(show_output=True)

RunId: tf_remote_experiment_1630385291_61b6af45
Web View: https://ml.azure.com/runs/tf_remote_experiment_1630385291_61b6af45?wsid=/subscriptions/b3ae1c15-4fef-4362-8c3a-5d804cdeb18d/resourcegroups/TESTML-rg/workspaces/ws01&tid=72f988bf-86f1-41af-91ab-2d7cd011db47

Streaming azureml-logs/55_azureml-execution-tvmps_068e554c6f1a43ff964b2ce12414d42529985df21a7f868dc4c015ca790b955e_d.txt

2021-08-31T04:52:35Z Successfully mounted a/an Blobfuse File System at /mnt/batch/tasks/shared/LS_root/jobs/ws01/azureml/tf_remote_experiment_1630385291_61b6af45/mounts/workspaceblobstore
2021-08-31T04:52:35Z Failed to start nvidia-fabricmanager due to exit status 5 with output Failed to start nvidia-fabricmanager.service: Unit nvidia-fabricmanager.service not found.
. Please ignore this if the GPUs don't utilize NVIDIA® NVLink® switches.
2021-08-31T04:52:35Z Starting output-watcher...
2021-08-31T04:52:35Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
2021-08-31T04:52:36Z Executing 'Copy AC

{'runId': 'tf_remote_experiment_1630385291_61b6af45',
 'target': 'mydsvm01',
 'status': 'Completed',
 'startTimeUtc': '2021-08-31T04:52:33.500864Z',
 'endTimeUtc': '2021-08-31T04:55:31.773239Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'd884c7ef-54ae-4b5a-b3ec-eec1a5fc58f9',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json',
  'azureml.git.repository_uri': 'https://github.com/tsmatz/azureml-tutorial-tensorflow-v1.git',
  'mlflow.source.git.repoURL': 'https://github.com/tsmatz/azureml-tutorial-tensorflow-v1.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': '9a48ad294217dbd8be2f52f124ecc0d9d68bad6c',
  'mlflow.source.git.commit': '9a48ad294217dbd8be2f52f124ecc0d9d68bad6c',
  'azureml.git.dirty': 'True'},
 'inputDatasets': [{'dataset': {'id': '7c3a3d10-fdfe-4246-97fb-f9a7c499983d'}, 'consumptionDetails': {'type': 'RunInput', 'inputN

### Step 6 : Download results and evaluate

Now let's check the generated model in local computer.

First, check generated files and logs.

In [8]:
# List the files (logs and outputs) stored for this run.
run.get_file_names()

['azureml-logs/55_azureml-execution-tvmps_068e554c6f1a43ff964b2ce12414d42529985df21a7f868dc4c015ca790b955e_d.txt',
 'azureml-logs/65_job_prep-tvmps_068e554c6f1a43ff964b2ce12414d42529985df21a7f868dc4c015ca790b955e_d.txt',
 'azureml-logs/70_driver_log.txt',
 'azureml-logs/75_job_post-tvmps_068e554c6f1a43ff964b2ce12414d42529985df21a7f868dc4c015ca790b955e_d.txt',
 'azureml-logs/process_info.json',
 'azureml-logs/process_status.json',
 'logs/azureml/dataprep/backgroundProcess.log',
 'logs/azureml/dataprep/backgroundProcess_Telemetry.log',
 'logs/azureml/job_release_azureml.log',
 'logs/azureml/sidecar/tvmps_068e554c6f1a43ff964b2ce12414d42529985df21a7f868dc4c015ca790b955e_d/all.log',
 'logs/azureml/sidecar/tvmps_068e554c6f1a43ff964b2ce12414d42529985df21a7f868dc4c015ca790b955e_d/task.exit_contexts.log',
 'logs/checkpoint',
 'logs/eval/events.out.tfevents.1630385719.50eec89ecf714ceb9fd036b65baa0762000001',
 'logs/events.out.tfevents.1630385704.50eec89ecf714ceb9fd036b65baa0762000001',
 'logs/gr

Download model into your local machine.    
**Please change ```1630385719``` to match the folder name shown in the previous results.**

In [9]:
# Folder of the exported model inside the run's outputs; the numeric name
# differs for every run.
remote_model_dir = 'outputs/1630385719'

# Download the SavedModel graph and its variable files into ./remote_model,
# keeping the same relative layout.
for relative_path in (
        'saved_model.pb',
        'variables/variables.data-00000-of-00001',
        'variables/variables.index'):
    run.download_file(
        name=remote_model_dir + '/' + relative_path,
        output_file_path='remote_model/' + relative_path)

Predict your test data using downloaded model.

In [10]:
import tensorflow as tf

# Build tensors that yield one decoded (image, label) pair per evaluation
records = tf.data.TFRecordDataset('./data/test.tfrecords')
next_record = tf.compat.v1.data.make_one_shot_iterator(records).get_next()
feature_spec = {
    'image_raw': tf.FixedLenFeature([], tf.string),
    'label': tf.FixedLenFeature([], tf.int64)
}
parsed = tf.parse_single_example(next_record, features=feature_spec)
data_image = tf.decode_raw(parsed['image_raw'], tf.uint8)
data_image.set_shape([784])
# rescale the uint8 pixels to float32 in [0, 1], as done for training
data_image = tf.cast(data_image, tf.float32) * (1. / 255)
data_label = tf.cast(parsed['label'], tf.int32)

# Evaluate the tensors three times to collect the first three test examples
image_arr, label_arr = [], []
with tf.Session() as sess:
    for _ in range(3):
        image, label = sess.run([data_image, data_label])
        image_arr.append(image)
        label_arr.append(label)

# Load the downloaded SavedModel and predict the collected images
pred_fn = tf.contrib.predictor.from_saved_model('./remote_model')
pred = pred_fn({'inputs': image_arr})

print('Predicted: ', pred['classes'].tolist())
print('Actual   : ', label_arr)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.loader.load or tf.compat.v1.saved_model.load. There will be a new function for importing SavedModels in Tensorflow 2.0.
INFO:tensorflow:Restoring parameters from ./remote_model/variables/variables
Predicted:  [7, 2, 1]
Actual   :  [7, 2, 1]


### Step 7 : Register Model with Dataset reference

By registering model with dataset reference, you can trace the model with the corresponding dataset version.<br>
(**Please change ```1630385719``` to match the previous results.**)

In [11]:
# Register the run's 'outputs/1630385719' folder as model 'mnist_model_test'
# and attach the dataset to it under the name 'training data'.
model = run.register_model(
    model_name='mnist_model_test',
    model_path='outputs/1630385719',
    datasets =[('training data',dataset)])

In order to track data used in this model, see this model in [Azure Machine Learning Studio](https://ml.azure.com/) and select "Datasets" tab. (See the following screenshot.)

![data tracking](https://tsmatz.files.wordpress.com/2021/08/20210823_track_data.jpg)

### Step 8 : Remove AML compute

**You don't need to remove your AML compute** to save money, because the nodes are automatically terminated when the cluster is inactive.    
But if you want to clean up, please run the following.

In [12]:
# Delete cluster (nodes) and remove from AML workspace
mycompute = AmlCompute(workspace=ws, name='myvm01')
mycompute.delete()

In [13]:
# Print the current status of the cluster in serialized form.
print(mycompute.status.serialize())

{'currentNodeCount': 1, 'targetNodeCount': 1, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 1, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-08-31T04:52:16.858000+00:00', 'errors': None, 'creationTime': '2021-08-31T04:45:45.747268+00:00', 'modifiedTime': '2021-08-31T04:46:11.291359+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 1, 'nodeIdleTimeBeforeScaleDown': 'PT1800S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC4AS_T4_V3'}
