# Exercise04 : Train on Remote GPU Virtual Machine

Now we run our previous sample (see "[Exercise03 : Just Train in Your Working Machine](./exercise03_train_simple.ipynb)") on a remote virtual machine with a GPU.<br>
You can also run remote training on your favorite docker images.

*back to [index](https://github.com/tsmatz/azureml-tutorial/)*

## Initialize MLClient

Replace the placeholder strings in braces below with your subscription ID, resource group name, and AML workspace name.<br>
(I note that creating ```MLClient``` will not connect to AML workspace, and the client initialization is lazy.)

In [1]:
from azure.ai.ml import MLClient
from azure.identity import DeviceCodeCredential

# Sign in with a device code. Use this credential when you run this notebook
# remotely (i.e. not on an Azure ML Notebook).
cred = DeviceCodeCredential()

# # When you run on Azure ML Notebook, use the default credential instead:
# from azure.identity import DefaultAzureCredential
# cred = DefaultAzureCredential()

# Coordinates of the workspace; replace each placeholder with your own value.
workspace_settings = {
    "subscription_id": "{SUBSCRIPTION ID}",
    "resource_group_name": "{RESOURCE GROUP NAME}",
    "workspace_name": "{AML WORKSPACE NAME}",
}

# Get a handle to the workspace; every later cell goes through this client.
ml_client = MLClient(credential=cred, **workspace_settings)

  from cryptography import x509


## Save your training script as file (train.py)

Create the ```script``` directory and save the Python script as ```./script/train.py```.

In [2]:
import os
# Folder that will hold train.py; nothing happens when it already exists.
script_folder = './script'
os.makedirs(name=script_folder, exist_ok=True)

In [3]:
%%writefile script/train.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os
import shutil
import argparse
import math

import tensorflow as tf

# Parsed command-line arguments; stays None until parse_known_args() runs in the main section.
FLAGS = None
# Number of examples per batch; also used to derive the train/eval step counts.
batch_size = 100

#
# define functions for Estimator
#

def _my_input_fn(filepath, num_epochs):
    """Build the batched input tensors read from one TFRecord file.

    Args:
        filepath: Path of the TFRecord file to read.
        num_epochs: Number of passes over the file; it is handed to
            ``tf.train.string_input_producer``.

    Returns:
        A ``(features, labels)`` pair. ``features`` is ``{'inputs': images}``
        where every image is 784 float32 values (uint8 pixels scaled by
        1/255), and ``labels`` holds the labels cast to int32. Both are
        batched with the module-level ``batch_size``.
    """
    # data is repeated and it raises OutOfRange when data is over
    filename_queue = tf.train.string_input_producer(
        [filepath], num_epochs=num_epochs)
    _, serialized = tf.TFRecordReader().read(filename_queue)

    feature_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    example = tf.parse_single_example(serialized, features=feature_spec)

    # image - 784 (= 28 x 28) grey-scale values, rescaled from bytes to [0, 1]
    pixels = tf.decode_raw(example['image_raw'], tf.uint8)
    pixels.set_shape([784])
    pixels = tf.cast(pixels, tf.float32) * (1. / 255)

    # label - digit (0, 1, ..., 9)
    digit = tf.cast(example['label'], tf.int32)

    images, labels = tf.train.batch([pixels, digit], batch_size=batch_size)
    return {'inputs': images}, labels

def _get_input_fn(filepath, num_epochs):
    """Return a zero-argument input function bound to the given file and epochs.

    The returned callable defers to ``_my_input_fn(filepath, num_epochs)``;
    nothing is read or built until it is called.
    """
    def _input_fn():
        return _my_input_fn(filepath, num_epochs)

    return _input_fn

def _my_model_fn(features, labels, mode):
    """Model function for the Estimator: a fully connected classifier.

    The network has two hidden layers followed by a 10-way linear output:
    784 -> FLAGS.first_layer -> FLAGS.second_layer -> 10
    (784 -> 128 -> 64 -> 10 with the default arguments).

    Args:
        features: Dict of input tensors; images are read from
            ``features['inputs']``.
        labels: Label tensor; only used when ``mode`` is not PREDICT.
        mode: One of the ``tf.estimator.ModeKeys`` values.

    Returns:
        The ``tf.estimator.EstimatorSpec`` for TRAIN, EVAL or PREDICT.
        Any other mode falls through and returns None.
    """
    # with tf.device(...): # You can set device if using GPUs

    def _dense(scope, inputs, in_units, out_units, activation=None):
        """Create one fully connected layer inside name scope ``scope``."""
        with tf.name_scope(scope):
            weights = tf.Variable(
                tf.truncated_normal(
                    [in_units, out_units],
                    stddev=1.0 / math.sqrt(float(in_units))),
                name='weights')
            biases = tf.Variable(tf.zeros([out_units]), name='biases')
            outputs = tf.matmul(inputs, weights) + biases
            return outputs if activation is None else activation(outputs)

    # define network and inference
    hidden1 = _dense(
        'hidden1', features['inputs'], 784, FLAGS.first_layer, tf.nn.relu)
    hidden2 = _dense(
        'hidden2', hidden1, FLAGS.first_layer, FLAGS.second_layer, tf.nn.relu)
    logits = _dense('softmax_linear', hidden2, FLAGS.second_layer, 10)

    # compute evaluation matrix
    predicted_indices = tf.argmax(input=logits, axis=1)

    # prediction needs neither labels nor loss, so it is handled first
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': predicted_indices,
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor'),
        }
        return tf.estimator.EstimatorSpec(
            mode,
            predictions=predictions,
            export_outputs={
                'prediction': tf.estimator.export.PredictOutput(predictions)
            })

    label_indices = tf.cast(labels, tf.int32)
    accuracy = tf.metrics.accuracy(label_indices, predicted_indices)
    tf.summary.scalar('accuracy', accuracy[1])  # output to TensorBoard
    loss = tf.losses.sparse_softmax_cross_entropy(
        labels=labels,
        logits=logits)

    # define operations
    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_or_create_global_step()
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=FLAGS.learning_rate)
        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            train_op=optimizer.minimize(loss=loss, global_step=global_step))
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            eval_metric_ops={'accuracy': accuracy})

def _my_serving_input_fn():
    """Describe the serving input: one float32 placeholder of shape [None, 784].

    The same dict is used both as the model features and as the receiver
    tensors, under the key ``'inputs'``.
    """
    serving_tensors = {'inputs': tf.placeholder(tf.float32, [None, 784])}
    return tf.estimator.export.ServingInputReceiver(
        features=serving_tensors,
        receiver_tensors=serving_tensors)

#
# Main
#

# Command-line options as (flag, type, default, help) rows.
# Every option has a default, so the script also runs without arguments.
_ARGUMENTS = (
    ('--data_folder', str, './data',
     'Folder path for input data'),
    ('--chkpoint_folder', str, './logs',  # AML experiments logs folder
     'Folder path for checkpoint files'),
    ('--model_folder', str, './outputs',  # AML experiments outputs folder
     'Folder path for model output'),
    ('--learning_rate', float, '0.07',
     'Learning Rate'),
    ('--first_layer', int, '128',
     'Neuron number for the first hidden layer'),
    ('--second_layer', int, '64',
     'Neuron number for the second hidden layer'),
)

parser = argparse.ArgumentParser()
for flag, arg_type, default, help_text in _ARGUMENTS:
    parser.add_argument(flag, type=arg_type, default=default, help=help_text)

# Unknown arguments are tolerated and collected in `unparsed`.
FLAGS, unparsed = parser.parse_known_args()

# clean checkpoint and model folder if exists
# (the folders themselves are kept; only their files and sub-folders go)
for stale_folder in (FLAGS.chkpoint_folder, FLAGS.model_folder):
    if not os.path.exists(stale_folder):
        continue
    for entry in os.listdir(stale_folder):
        entry_path = os.path.join(stale_folder, entry)
        if os.path.isfile(entry_path):
            os.remove(entry_path)
        elif os.path.isdir(entry_path):
            shutil.rmtree(entry_path)

# read TF_CONFIG
run_config = tf.estimator.RunConfig()

# create Estimator; checkpoints are written to the checkpoint folder
mnist_fullyconnected_classifier = tf.estimator.Estimator(
    model_fn=_my_model_fn,
    model_dir=FLAGS.chkpoint_folder,
    config=run_config)

# Step counts must be integers, so use floor division: with
# `from __future__ import division` (and on Python 3) `/` yields a float.
# The multipliers 2 and 1 match the num_epochs given to each input function.
train_spec = tf.estimator.TrainSpec(
    input_fn=_get_input_fn(os.path.join(FLAGS.data_folder, 'train.tfrecords'), 2),
    max_steps=60000 * 2 // batch_size)
eval_spec = tf.estimator.EvalSpec(
    input_fn=_get_input_fn(os.path.join(FLAGS.data_folder, 'test.tfrecords'), 1),
    steps=10000 * 1 // batch_size,
    start_delay_secs=0)

# run !
tf.estimator.train_and_evaluate(
    mnist_fullyconnected_classifier,
    train_spec,
    eval_spec
)

# save model and variables under the model folder
model_dir = mnist_fullyconnected_classifier.export_savedmodel(
    export_dir_base = FLAGS.model_folder,
    serving_input_receiver_fn = _my_serving_input_fn)
print('current working directory is ', os.getcwd())
print('model is saved ', model_dir)

Writing script/train.py


## Train on remote VM

Now let's start to integrate with AML and automate training on remote virtual machine.

### Step 1 : Create new remote virtual machine

Create your new remote virtual machine with GPU.<br>
Before starting, **please check the following**.

- Make sure that the following size (in the following script, ```Standard_NC4as_T4_v3```) is supported in the location (in which AML workspace resides).
- You should have quota for ML GPU VM in your Azure subscription. If you don't have, please request quota in Azure Portal.

**If you don't have any quota for GPU, please change VM size (such as, Standard_D2_v2).**

By setting 0 in ```min_instances```, the node will be terminated if it's inactive. (You can save money.)

In [4]:
from azure.ai.ml.entities import AmlCompute

compute_name = "myvm01"

try:
    # Reuse the compute target when the lookup succeeds.
    compute_target = ml_client.compute.get(compute_name)
    print("found existing: ", compute_target.name)
except Exception:
    # Any failure above is treated as "does not exist yet".
    print("creating new.")
    new_compute = AmlCompute(
        name=compute_name,
        type="amlcompute",
        size="Standard_NC4as_T4_v3", # change such as Standard_NC6 or Standard_D2_v2 if needed
        min_instances=0,  # scale down to zero nodes when idle
        max_instances=1,
        tier="Dedicated",
    )
    # NOTE(review): depending on the SDK version this may return a poller
    # rather than the compute itself - confirm before using compute_target further.
    compute_target = ml_client.begin_create_or_update(new_compute)

To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code R3JH9DQK7 to authenticate.
creating new.


### Step 2 : Create environment

Here we create a new Docker environment for running scripts. The first time, AML will build our own container image with the following settings. (So the first experiment will take a long time to complete.)
However, you can speed up by reusing the generated environment in the next run, once you have registered the generated environment.

In this example, I create my own environment manually, but **you can also use existing environments (called, curated environments) for a variety of purposes**. (In Exercise 05, we will use a curated environment, which includes TensorFlow 1.x.)

First I create a conda dependencies YAML file and save it as ```04_conda_pydata.yml```.<br>
To run TensorFlow 1.x, here I use Python version 3.6.

In [5]:
%%writefile 04_conda_pydata.yml
# Conda specification used to build the custom AML environment.
name: project_environment
dependencies:
# Python 3.6 is used so that TensorFlow 1.x can run.
- python=3.6
- pip:
  # tensorflow-gpu 1.15 is installed with pip.
  - tensorflow-gpu==1.15
channels:
- anaconda
- conda-forge

Writing 04_conda_pydata.yml


Register custom environment (named ```test-remote-gpu-env```) in AML with previous conda configuration.

In [6]:
from azure.ai.ml.entities import Environment

# Describe the environment: the conda file above on top of a base Docker image.
env_definition = Environment(
    image="mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04:latest",
    conda_file="04_conda_pydata.yml",
    name="test-remote-gpu-env",
    description="This is example",
)

# Register it in the workspace and keep the returned environment.
myenv = ml_client.environments.create_or_update(env_definition)

### Step 3 : Submit training job

Submit a training job with above compute and environment.

In this example, I use the registered data asset  (train.tfrecords, test.tfrecords) named ```mnist_tfrecords_data``` to mount in your compute target. (Run "[Exercise02 : Prepare Data](./exercise02_prepare_data.ipynb)" for data preparation.)
In order to use data asset in AML, set ```{DATA_NAME}:{DATA_VERSION}``` or ```{DATA_NAME}@latest``` for the latest version of assets as follows.

See the progress and results in [AML Studio](https://ml.azure.com/) experiments.

> Note : It will take a long time (over 30 minutes) for the first time run, because it'll pull base image, generate new image (custom environment), start nodes in cluster, and run scripts.<br>
> By using built-in ```AzureML-TensorFlow-1.13-GPU``` environment, it will speed up. (See Exercise 05 for using AML built-in environments, called curated environments.)

In [8]:
from azure.ai.ml import command, Input

# create the command
# Latest version of the registered data asset, passed to the job as a folder.
mnist_tf_input = Input(
    type="uri_folder",
    path="mnist_tfrecords_data@latest",
)

# create the command
# (${{inputs.mnist_tf}} in the command line refers to the "mnist_tf" input)
job = command(
    experiment_name="tf_remote_experiment",
    display_name="tf_remote_experiment",
    description="This is example",
    compute="myvm01",
    environment="test-remote-gpu-env@latest",
    code="./script",
    inputs={"mnist_tf": mnist_tf_input},
    command="python train.py --data_folder ${{inputs.mnist_tf}}",
)

# submit the command
returned_job = ml_client.create_or_update(job)

You can get job name as follows.<br>
Job name is always used to get detailed information about job.

In [9]:
# Display the name of the submitted job.
returned_job.name

'olden_dolphin_tt4yfrl08d'

Please wait until the job is completed.

You can see the current status (progress) in the [AML studio UI](https://ml.azure.com/) (see the "Jobs" pane) or with the following SDK call.

In [10]:
# Fetch the job by its name to show its current status.
ml_client.jobs.get(returned_job.name)

Experiment,Name,Type,Status,Details Page
tf_remote_experiment,olden_dolphin_tt4yfrl08d,command,Completed,Link to Azure Machine Learning studio


### Step 4 : Download results and evaluate

After the training has completed, go to [Azure ML studio UI](https://ml.azure.com/).<br>
You can then see the saved model in outputs directory.

![Saved Outputs](https://tsmatz.github.io/images/github/azure-ml-tensorflow-complete-sample/20220225_Experiment_Outputs.jpg)

Now let's check the generated model in local computer.<br>
Download artifacts (including the generated model in outputs) with SDK as follows.

In [11]:
# Download the job's artifacts (including the generated model in outputs).
ml_client.jobs.download(returned_job.name)

Downloading artifact azureml://datastores/workspaceartifactstore/ExperimentRun/dcid.olden_dolphin_tt4yfrl08d to /home/tsmatsuz/azureml-tutorial/python_sdk2/artifacts


Now check the downloaded result.<br>
Before running this script, **replace ```1654653771``` with your generated model name**.

In [27]:
import tensorflow as tf

# Name of the exported model folder under ./artifacts/outputs (replace with yours).
MODEL_NAME = "1654653771"

# Read data by tensor: each evaluation yields the next decoded example
records = tf.data.TFRecordDataset('./data/test.tfrecords')
serialized_example = tf.compat.v1.data.make_one_shot_iterator(records).get_next()
feature_spec = {
    'image_raw': tf.FixedLenFeature([], tf.string),
    'label': tf.FixedLenFeature([], tf.int64),
}
parsed_example = tf.parse_single_example(serialized_example, features=feature_spec)
image_tensor = tf.decode_raw(parsed_example['image_raw'], tf.uint8)
image_tensor.set_shape([784])
image_tensor = tf.cast(image_tensor, tf.float32) * (1. / 255)
label_tensor = tf.cast(parsed_example['label'], tf.int32)

# Run tensor and generate data (the first 3 examples)
with tf.Session() as sess:
    samples = [sess.run([image_tensor, label_tensor]) for _ in range(3)]
image_arr = [image for image, _ in samples]
label_arr = [label for _, label in samples]

# Predict
pred_fn = tf.contrib.predictor.from_saved_model('./artifacts/outputs/{}'.format(MODEL_NAME))
pred = pred_fn({'inputs': image_arr})

print('Predicted: ', pred['classes'].tolist())
print('Actual   : ', label_arr)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.loader.load or tf.compat.v1.saved_model.load. There will be a new function for importing SavedModels in Tensorflow 2.0.
INFO:tensorflow:Restoring parameters from ./artifacts/outputs/1654653771/variables/variables
Predicted:  [7, 2, 1]
Actual   :  [7, 2, 1]


### Step 5 : Register Model

Now upload (register) the downloaded model into AML model management.<br>
(**Replace the following ```1654653771``` with your generated model name**.)

In [28]:
# Move the downloaded model folder to ./generated_model.
!mv ./artifacts/outputs/1654653771 ./generated_model

In [30]:
from azure.ai.ml.entities import Model
#from azure.ai.ml.constants import ModelType

# Describe the local model folder that will be registered.
file_model = Model(
    name="mnist_model_test",
    type="custom_model",
    path="./generated_model",
)

# Register the model; as the last expression, the result is displayed.
ml_client.models.create_or_update(file_model)

Model({'job_name': None, 'is_anonymous': False, 'auto_increment_version': False, 'name': 'mnist_model_test', 'description': None, 'tags': {}, 'properties': {}, 'id': '/subscriptions/b3ae1c15-4fef-4362-8c3a-5d804cdeb18d/resourceGroups/AzureML-rg/providers/Microsoft.MachineLearningServices/workspaces/ws01/models/mnist_model_test/versions/1', 'base_path': './', 'creation_context': <azure.ai.ml._restclient.v2022_05_01.models._models_py3.SystemData object at 0x7f072b90d5c0>, 'serialize': <msrest.serialization.Serializer object at 0x7f072b8b0f60>, 'version': '1', 'latest_version': None, 'path': 'azureml://subscriptions/b3ae1c15-4fef-4362-8c3a-5d804cdeb18d/resourceGroups/AzureML-rg/workspaces/ws01/datastores/workspaceblobstore/paths/LocalUpload/f0ac381722553230c14d00946884f18a/generated_model', 'utc_time_created': None, 'flavors': None, 'arm_type': 'model_version', 'type': 'custom_model'})

In [None]:
""" Note : You can also register model using the run object in job."""
# run_model = Model(
#     path="azureml://subscriptions/XXXXXXXXXXXXXXXXXXXXX/resourceGroups/XXXXXXXXX/workspaces/XXXXXXXXXXXXX/jobs/XXXXXXXXXXXX/outputs/artifacts/paths/model/",
#     name="mnist_model_test",
#     type="custom_model",
# )
# ml_client.models.create_or_update(run_model)

### Step 6 : Remove AML compute

**You don't need to remove your AML compute** for saving money, because the nodes will be automatically terminated, when it's inactive.<br>
But if you want to clean up, please run as follows.

In [None]:
# Start deleting the AML compute named "myvm01" (the one set up in Step 1).
ml_client.compute.begin_delete("myvm01")

Deleting compute myvm01 


......