# Exercise04 : Train on Remote GPU Virtual Machine

Now we run our previous sample (see "[Exercise03 : Just Train in Your Working Machine](https://github.com/tsmatz/azure-ml-tensorflow-complete-sample/blob/master/notebooks/exercise03_train_simple.ipynb)") on a remote virtual machine with a GPU.    
Here we use a remote virtual machine and a conda virtual environment, but you can also use a Batch AI pool shared within your team, or run on your favorite Docker images.

*back to [index](https://github.com/tsmatz/azure-ml-tensorflow-complete-sample/)*

## Save your training script as file (train.py)

Create the ```script``` directory.

In [1]:
import os
# Local folder that will receive train.py; nothing happens if it already exists.
script_folder = './script'
os.makedirs(name=script_folder, exist_ok=True)

Please add the following ```%%writefile``` at the beginning of the source code in "[Exercise03 : Just Train in Your Working Machine](https://github.com/tsmatz/azure-ml-tensorflow-complete-sample/blob/master/notebooks/exercise03_train_simple.ipynb)", and run this cell.    
Then this source code is saved as ```./script/train.py```.

In [3]:
%%writefile script/train.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os
import shutil
import argparse
import math

import tensorflow as tf

FLAGS = None
batch_size = 100

#
# define functions for Estimator
#

def _my_input_fn(filepath, num_epochs):
    """Build the batched input tensors read from one TFRecord file.

    Each record is parsed into a 784-element image (28 x 28 pixels, uint8
    values scaled to float32 in [0, 1]) and a digit label cast to int32.

    Args:
        filepath: path of the TFRecord file to read.
        num_epochs: forwarded to ``tf.train.string_input_producer`` as the
            number of passes over the file.

    Returns:
        A ``(features, labels)`` pair. ``features`` is a dict whose
        ``'inputs'`` entry is the image batch; ``labels`` is the label batch.
        Batches are built with the module-level ``batch_size``.
    """
    record_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64)
    }

    # the file is repeated num_epochs times; OutOfRange is raised when data is over
    filename_queue = tf.train.string_input_producer(
        [filepath],
        num_epochs=num_epochs)
    record_reader = tf.TFRecordReader()
    _, serialized_record = record_reader.read(filename_queue)
    parsed_record = tf.parse_single_example(
        serialized_record,
        features=record_spec)

    # raw bytes -> 784 pixels scaled into [0, 1]
    pixels = tf.decode_raw(parsed_record['image_raw'], tf.uint8)
    pixels.set_shape([784])
    image = tf.cast(pixels, tf.float32) * (1. / 255)
    label = tf.cast(parsed_record['label'], tf.int32)

    image_batch, label_batch = tf.train.batch(
        [image, label],
        batch_size=batch_size)
    return {'inputs': image_batch}, label_batch

def _get_input_fn(filepath, num_epochs):
    return lambda: _my_input_fn(filepath, num_epochs)

def _my_model_fn(features, labels, mode):
    """Model function for the Estimator: a fully connected digit classifier.

    Builds two ReLU hidden layers followed by a linear output layer
    (784 -> FLAGS.first_layer -> FLAGS.second_layer -> 10) and returns the
    EstimatorSpec that matches ``mode``.

    Args:
        features: dict whose ``'inputs'`` entry is multiplied with the
            784-row weights of the first hidden layer.
        labels: class indices for the batch; only read when ``mode`` is not
            PREDICT.
        mode: one of ``tf.estimator.ModeKeys`` (TRAIN, EVAL or PREDICT).

    Returns:
        A ``tf.estimator.EstimatorSpec`` carrying the loss and train_op for
        TRAIN, the loss and accuracy metric for EVAL, or the class and
        probability predictions plus export outputs for PREDICT.
    """
    # with tf.device(...): # You can set device if using GPUs

    # define network and inference
    # (simple 2 fully connected hidden layer : 784->128->64->10 with the
    #  default --first_layer / --second_layer values)
    with tf.name_scope('hidden1'):
        weights = tf.Variable(
            tf.truncated_normal(
                [784, FLAGS.first_layer],
                stddev=1.0 / math.sqrt(float(784))),
            name='weights')
        biases = tf.Variable(
            tf.zeros([FLAGS.first_layer]),
            name='biases')
        hidden1 = tf.nn.relu(tf.matmul(features['inputs'], weights) + biases)
    with tf.name_scope('hidden2'):
        weights = tf.Variable(
            tf.truncated_normal(
                [FLAGS.first_layer, FLAGS.second_layer],
                stddev=1.0 / math.sqrt(float(FLAGS.first_layer))),
            name='weights')
        biases = tf.Variable(
            tf.zeros([FLAGS.second_layer]),
            name='biases')
        hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
    with tf.name_scope('softmax_linear'):
        weights = tf.Variable(
            tf.truncated_normal(
                [FLAGS.second_layer, 10],
                stddev=1.0 / math.sqrt(float(FLAGS.second_layer))),
        name='weights')
        biases = tf.Variable(
            tf.zeros([10]),
            name='biases')
        logits = tf.matmul(hidden2, weights) + biases
 
    # compute evaluation metrics
    # (accuracy and loss need labels, so they are skipped in PREDICT mode)
    predicted_indices = tf.argmax(input=logits, axis=1)
    if mode != tf.estimator.ModeKeys.PREDICT:
        label_indices = tf.cast(labels, tf.int32)
        accuracy = tf.metrics.accuracy(label_indices, predicted_indices)
        tf.summary.scalar('accuracy', accuracy[1]) # output to TensorBoard
 
        loss = tf.losses.sparse_softmax_cross_entropy(
            labels=labels,
            logits=logits)
 
    # define operations
    if mode == tf.estimator.ModeKeys.TRAIN:
        #global_step = tf.train.create_global_step()
        #global_step = tf.contrib.framework.get_or_create_global_step()
        global_step = tf.train.get_or_create_global_step()        
        # plain gradient descent with the rate given by --learning_rate
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=FLAGS.learning_rate)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=global_step)
        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            train_op=train_op)
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {
            'accuracy': accuracy
        }
        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            eval_metric_ops=eval_metric_ops)
    if mode == tf.estimator.ModeKeys.PREDICT:
        probabilities = tf.nn.softmax(logits, name='softmax_tensor')
        predictions = {
            'classes': predicted_indices,
            'probabilities': probabilities
        }
        # the same predictions are exposed under the 'prediction' export key
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(
            mode,
            predictions=predictions,
            export_outputs=export_outputs)

def _my_serving_input_fn():
    """Describe the input accepted by the exported model.

    Returns:
        A ``ServingInputReceiver`` whose features and receiver tensors are
        the same dict: one float32 placeholder of shape ``[None, 784]``
        under the key ``'inputs'``.
    """
    image_placeholder = tf.placeholder(tf.float32, [None, 784])
    receiver_tensors = {'inputs': image_placeholder}
    return tf.estimator.export.ServingInputReceiver(
        receiver_tensors,
        receiver_tensors)

#
# Main
#

# Command-line options. Every option has a default, so the script also runs
# without any arguments.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--data_folder',
    type=str,
    default='./data',
    help='Folder path for input data')
parser.add_argument(
    '--chkpoint_folder',
    type=str,
    default='./logs',  # AML experiments logs folder
    help='Folder path for checkpoint files')
parser.add_argument(
    '--model_folder',
    type=str,
    default='./outputs',  # AML experiments outputs folder
    help='Folder path for model output')
# Numeric defaults are real numbers rather than strings, so they already have
# the declared type and do not depend on argparse converting them.
parser.add_argument(
    '--learning_rate',
    type=float,
    default=0.07,
    help='Learning Rate')
parser.add_argument(
    '--first_layer',
    type=int,
    default=128,
    help='Neuron number for the first hidden layer')
parser.add_argument(
    '--second_layer',
    type=int,
    default=64,
    help='Neuron number for the second hidden layer')
# parse_known_args() collects arguments this script does not define in
# `unparsed` instead of failing on them.
FLAGS, unparsed = parser.parse_known_args()

def _clean_folder(folder):
    """Delete everything inside ``folder`` while keeping the folder itself.

    Does nothing when ``folder`` does not exist or is not a directory.
    Real sub-directories are removed recursively; every other entry
    (regular files and symbolic links, including links that point to
    directories) is unlinked, because ``shutil.rmtree`` refuses to operate
    on a symbolic link.
    """
    if not os.path.isdir(folder):
        return
    for entry_name in os.listdir(folder):
        entry_path = os.path.join(folder, entry_name)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)

# clean checkpoint and model folder if exists, so that results of an earlier
# run are not mixed with this one
_clean_folder(FLAGS.chkpoint_folder)
_clean_folder(FLAGS.model_folder)

# read TF_CONFIG
run_config = tf.estimator.RunConfig()

# create Estimator (checkpoints and summaries go to the checkpoint folder)
mnist_fullyconnected_classifier = tf.estimator.Estimator(
    model_fn=_my_model_fn,
    model_dir=FLAGS.chkpoint_folder,
    config=run_config)

# Step counts use floor division: this file enables true division
# (from __future__ import division), so "/" would pass floats such as 1200.0
# where an integer number of steps is expected.
train_spec = tf.estimator.TrainSpec(
    input_fn=_get_input_fn(os.path.join(FLAGS.data_folder, 'train.tfrecords'), 2),
    max_steps=60000 * 2 // batch_size)
eval_spec = tf.estimator.EvalSpec(
    input_fn=_get_input_fn(os.path.join(FLAGS.data_folder, 'test.tfrecords'), 1),
    steps=10000 * 1 // batch_size,
    start_delay_secs=0)

# run !
tf.estimator.train_and_evaluate(
    mnist_fullyconnected_classifier,
    train_spec,
    eval_spec
)

# export the trained model (graph and variables) under the model folder
model_dir = mnist_fullyconnected_classifier.export_savedmodel(
    FLAGS.model_folder,
    _my_serving_input_fn)

# report where the script ran and where the export was written
print('current working directory is ', os.getcwd())
print('model is saved ', model_dir)

Writing script/train.py


## Train on remote VM

Now let's start to integrate with AML services and run training on remote virtual machine.

### Step 1 : Get workspace setting

Before starting, you must read your configuration settings. (See "[Exercise01 : Prepare Config Settings](https://github.com/tsmatz/azure-ml-tensorflow-complete-sample/blob/master/notebooks/exercise01_prepare_config.ipynb)")

In [4]:
from azureml.core import Workspace
import azureml.core

# Load the workspace described by the saved config file
# (see "Exercise01 : Prepare Config Settings").
ws = Workspace.from_config()

Found the config file in: /data/home/tsmatsuz/azure-ml-tensorflow-complete-sample/notebooks/aml_config/config.json


### Step 2 : Create new remote virtual machine

Create your new Data Science Virtual Machine (which is pre-configured for data science) with **GPU** (NC6). Before starting, please make sure that your workspace is located in a region that supports NC6. With auto-scaling enabled (from 0 to 1 node), you save money because the node is terminated when it is inactive.    
If the compute target already exists, this script retrieves the existing one.

You can also attach an existing virtual machine (bring your own compute resource) as a compute target.

In [19]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# Reuse the compute target when the workspace already has one with this
# name; otherwise provision a new GPU target that scales between 0 and 1 node.
compute_name = 'mydsvm01'
try:
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
    print('found existing:', compute_target.name)
except ComputeTargetException:
    print('creating new.')
    compute_target = ComputeTarget.create(
        ws,
        compute_name,
        AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6',
            min_nodes=0,
            max_nodes=1))
    # block until provisioning has finished
    compute_target.wait_for_completion(show_output=True)

creating new.
Creating
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


### Step 3 : Generate data reference config

You can configure to mount your preconfigured dataset (including train.tfrecords, test.tfrecords) from your ```Datastore``` in your compute target.    
See "[Exercise02 : Prepare Datastore](https://github.com/tsmatz/azure-ml-tensorflow-complete-sample/blob/master/notebooks/exercise02_prepare_datastore.ipynb)".

In [20]:
from azureml.core import Datastore
from azureml.core.runconfig import DataReferenceConfiguration
# from azureml.data.data_reference import DataReference

# look up the datastore registered in "Exercise 02 : Prepare Datastore"
ds = Datastore.get(workspace=ws, datastore_name="myblob01")

# describe how the 'tfdata' folder of that datastore is exposed on the
# compute target ('mount' here; use 'download' to copy all files instead)
dr_conf = DataReferenceConfiguration(
    datastore_name=ds.name,
    path_on_datastore='tfdata',
    mode='mount')

### Step 4 : Generate config

Here we set docker environments for running scripts. We want to use ```Datastore``` as input data, so we set previous data reference configuration in this configuration.

In [21]:
from azureml.core.runconfig import RunConfiguration, DEFAULT_GPU_IMAGE
from azureml.core.conda_dependencies import CondaDependencies

# conda packages to install in the run environment
conda_deps = CondaDependencies.create(conda_packages=['tensorflow-gpu==1.15'])
run_config = RunConfiguration(
    framework="python",
    conda_dependencies=conda_deps)

# run inside a docker container based on the default GPU image
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE

# run on the compute target, with the datastore reference attached
run_config.target = compute_target.name
run_config.data_references = {ds.name: dr_conf}

### Step 5 : Run script and wait for completion

In [22]:
from azureml.core import Experiment
from azureml.core import Run
from azureml.core import ScriptRunConfig

# the experiment under which this run is recorded
exp = Experiment(workspace=ws, name='tf_remote_experiment')

# train.py receives the datastore reference as its --data_folder argument
script_args = ['--data_folder', str(ds.as_mount())]
src = ScriptRunConfig(
    source_directory='./script',
    script='train.py',
    run_config=run_config,
    arguments=script_args)

# submit and stream the logs until the run has completed
run = exp.submit(config=src)
run.wait_for_completion(show_output=True)

RunId: tf_remote_experiment_1544490241565

Streaming azureml-logs/20_image_build_log.txt

2018/12/11 01:04:08 Using acb_vol_01cc2f8e-5b96-45bc-a849-583fda075ca5 as the home volume
2018/12/11 01:04:08 Creating Docker network: acb_default_network, driver: 'bridge'
2018/12/11 01:04:09 Successfully set up Docker network: acb_default_network
2018/12/11 01:04:09 Setting up Docker configuration...
2018/12/11 01:04:09 Successfully set up Docker configuration
2018/12/11 01:04:09 Logging in to registry: ws016106599079.azurecr.io
2018/12/11 01:04:16 Successfully logged in
2018/12/11 01:04:16 Executing step ID: acb_step_0. Working directory: '', Network: 'acb_default_network'
2018/12/11 01:04:16 Obtaining source code and scanning for dependencies...
2018/12/11 01:04:17 Successfully obtained source code and scanned for dependencies
Sending build context to Docker daemon  159.2kB

Step 1/13 : FROM mcr.microsoft.com/azureml/base-gpu:0.2.0
0.2.0: Pulling from azureml/base-gpu
3b37166ec614: Pulling fs 


tk-8.6.8             | 3.1 MB    |            |   0% [0m[91m
tk-8.6.8             | 3.1 MB    | #######6   |  77% [0m[91m
tk-8.6.8             | 3.1 MB    | ########7  |  88% [0m[91m
tk-8.6.8             | 3.1 MB    | #########6 |  96% [0m[91m
tk-8.6.8             | 3.1 MB    | ########## | 100% [0m[91m

keras-applications-1 | 49 KB     |            |   0% [0m[91m
keras-applications-1 | 49 KB     | ########## | 100% [0m[91m

python-3.6.2         | 27.0 MB   |            |   0% [0m[91m
python-3.6.2         | 27.0 MB   | #1         |  11% [0m[91m
python-3.6.2         | 27.0 MB   | ###8       |  39% [0m[91m
python-3.6.2         | 27.0 MB   | #######    |  70% [0m[91m
python-3.6.2         | 27.0 MB   | ########7  |  87% [0m[91m
python-3.6.2         | 27.0 MB   | #########8 |  99% [0m[91m
python-3.6.2         | 27.0 MB   | ########## | 100% [0m[91m

tensorboard-1.12.0   | 3.1 MB    |            |   0% [0m[91m
tensorboard-1.12.0   | 3.1 MB    | #######6   |  76


ncurses-6.0          | 920 KB    |            |   0% [0m[91m
ncurses-6.0          | 920 KB    | #######9   |  79% [0m[91m
ncurses-6.0          | 920 KB    | #########  |  91% [0m[91m
ncurses-6.0          | 920 KB    | ########## | 100% [0m[91m

mkl_fft-1.0.6        | 150 KB    |            |   0% [0m[91m
mkl_fft-1.0.6        | 150 KB    | ########## | 100% [0m[91m

zlib-1.2.11          | 120 KB    |            |   0% [0m[91m
zlib-1.2.11          | 120 KB    | ########## | 100% [0m[91m

tensorflow-base-1.12 | 216.9 MB  |            |   0% [0m[91m
tensorflow-base-1.12 | 216.9 MB  | 2          |   2% [0m[91m
tensorflow-base-1.12 | 216.9 MB  | 5          |   5% [0m[91m
tensorflow-base-1.12 | 216.9 MB  | 6          |   7% [0m[91m
tensorflow-base-1.12 | 216.9 MB  | #          |  10% [0m[91m
tensorflow-base-1.12 | 216.9 MB  | #4         |  14% [0m[91m
tensorflow-base-1.12 | 216.9 MB  | #7         |  18% [0m[91m
tensorflow-base-1.12 | 216.9 MB  | ##1        |  22


absl-py-0.6.1        | 152 KB    |            |   0% [0m[91m
absl-py-0.6.1        | 152 KB    | ########## | 100% [0m[91m

certifi-2018.11.29   | 146 KB    |            |   0% [0m[91m
certifi-2018.11.29   | 146 KB    | ########## | 100% [0m[91m

gast-0.2.0           | 15 KB     |            |   0% [0m[91m
gast-0.2.0           | 15 KB     | ########## | 100% [0m[91m

tensorflow-gpu-1.12. | 2 KB      |            |   0% [0m[91m
tensorflow-gpu-1.12. | 2 KB      | ########## | 100% [0m[91m

readline-7.0         | 1.1 MB    |            |   0% [0m[91m
readline-7.0         | 1.1 MB    | ########2  |  82% [0m[91m
readline-7.0         | 1.1 MB    | ########## | 100% [0m[91m

libffi-3.2.1         | 43 KB     |            |   0% [0m[91m
libffi-3.2.1         | 43 KB     | ########## | 100% [0m[91m

sqlite-3.23.1        | 1.5 MB    |            |   0% [0m[91m
sqlite-3.23.1        | 1.5 MB    | ########2  |  82% [0m[91m
sqlite-3.23.1        | 1.5 MB    | ########## | 


_tflow_select-2.1.0  | 2 KB      |            |   0% [0m[91m
_tflow_select-2.1.0  | 2 KB      | ########## | 100% [0m[91m

blas-1.0             | 6 KB      |            |   0% [0m[91m
blas-1.0             | 6 KB      | ########## | 100% [0m[91m

cupti-9.2.148        | 1.7 MB    |            |   0% [0m[91m
cupti-9.2.148        | 1.7 MB    | #######7   |  78% [0m[91m
cupti-9.2.148        | 1.7 MB    | ########## | 100% [0m[91m

six-1.11.0           | 21 KB     |            |   0% [0m[91m
six-1.11.0           | 21 KB     | ########## | 100% [0m[91m

markdown-3.0.1       | 107 KB    |            |   0% [0m[91m
markdown-3.0.1       | 107 KB    | ########## | 100% [0m[91m

scipy-1.1.0          | 18.0 MB   |            |   0% [0m[91m
scipy-1.1.0          | 18.0 MB   | ###2       |  32% [0m[91m
scipy-1.1.0          | 18.0 MB   | #######5   |  75% [0m[91m
scipy-1.1.0          | 18.0 MB   | #########4 |  95% [0m[91m
scipy-1.1.0          | 18.0 MB   | ########## | 1

Collecting asn1crypto>=0.21.0 (from cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*->azureml-core==1.0.2.*->azureml-defaults==1.0.2->-r /azureml-setup/condaenv.ic7xkqt9.requirements.txt (line 1))
  Downloading https://files.pythonhosted.org/packages/ea/cd/35485615f45f30a510576f1a56d1e0a7ad7bd8ab5ed7cdc600ef7cd06222/asn1crypto-0.24.0-py2.py3-none-any.whl (101kB)
Collecting idna>=2.1 (from cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*->azureml-core==1.0.2.*->azureml-defaults==1.0.2->-r /azureml-setup/condaenv.ic7xkqt9.requirements.txt (line 1))
  Downloading https://files.pythonhosted.org/packages/14/2c/cd551d81dbe15200be1cf41cd03869a46fe7226e7450af7a6545bfc474c9/idna-2.8-py2.py3-none-any.whl (58kB)
Collecting azure-cli-command-modules-nspkg>=2.0.0 (from azure-cli-profile>=2.0.26->azureml-core==1.0.2.*->azureml-defaults==1.0.2->-r /azureml-setup/condaenv.ic7xkqt9.requirements.txt (line 1))
  Downloading https://files.pythonhosted.org/packages/e6/c9/cdeeeabc550848e2a07caa66cba28aa057d23b6feaa824cea

  Stored in directory: /root/.cache/pip/wheels/ad/da/0c/74eb680767247273e2cf2723482cb9c924fe70af57c334513f
  Running setup.py bdist_wheel for antlr4-python3-runtime: started
  Running setup.py bdist_wheel for antlr4-python3-runtime: finished with status 'done'
  Stored in directory: /root/.cache/pip/wheels/ef/f6/18/ad300e691236a3408a99edc750484b56e8d6b11b2c38eacb10
  Running setup.py bdist_wheel for tabulate: started
  Running setup.py bdist_wheel for tabulate: finished with status 'done'
  Stored in directory: /root/.cache/pip/wheels/2a/85/33/2f6da85d5f10614cbe5a625eab3b3aebfdf43e7b857f25f829
  Running setup.py bdist_wheel for pycparser: started
  Running setup.py bdist_wheel for pycparser: finished with status 'done'
  Stored in directory: /root/.cache/pip/wheels/f2/9a/90/de94f8556265ddc9d9c8b271b0f63e57b26fb1d67a45564511
Successfully built SecretStorage pathspec pyyaml antlr4-python3-runtime tabulate pycparser
Installing collected packages: applicationinsights, pycparser, cffi, asn1

2018-12-11 01:26:36.478663: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0
2018-12-11 01:26:36.478728: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2018-12-11 01:26:36.478739: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988]      0 
2018-12-11 01:26:36.478747: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0:   N 
2018-12-11 01:26:36.478860: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10757 MB memory) -> physical GPU (device: 0, name: Tesla K80, pci bus id: c37f:00:00.0, compute capability: 3.7)
2018-12-11 01:26:38.314655: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0
2018-12-11 01:26:38.314709: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2018-12-11 01:

{'runId': 'tf_remote_experiment_1544490241565',
 'target': 'mydsvm01',
 'status': 'Finalizing',
 'startTimeUtc': '2018-12-11T01:23:31.171165Z',
 'properties': {'azureml.runsource': 'experiment',
  'ContentSnapshotId': '763fe65f-a25a-4766-8b87-5d294ed217f2'},
 'runDefinition': {'Script': 'train.py',
  'Arguments': ['--data_folder', '$AZUREML_DATAREFERENCE_myblob01'],
  'SourceDirectoryDataStore': None,
  'Framework': 0,
  'Communicator': 0,
  'Target': 'mydsvm01',
  'DataReferences': {'myblob01': {'DataStoreName': 'myblob01',
    'Mode': 'Mount',
    'PathOnDataStore': 'tfdata',
    'PathOnCompute': None,
    'Overwrite': False}},
  'JobName': None,
  'AutoPrepareEnvironment': True,
  'MaxRunDurationSeconds': None,
  'NodeCount': 1,
  'Environment': {'Python': {'InterpreterPath': 'python',
    'UserManagedDependencies': False,
    'CondaDependencies': {'name': 'project_environment',
     'dependencies': ['python=3.6.2',
      {'pip': ['azureml-defaults==1.0.2']},
      'tensorflow-gpu']

### Step 6 : Download results and check

Check generated files.

In [23]:
# List the files recorded for the run (logs, checkpoints and the exported model).
run.get_file_names()

['azureml-logs/20_image_build_log.txt',
 'azureml-logs/60_control_log.txt',
 'azureml-logs/80_driver_log.txt',
 'logs/checkpoint',
 'logs/events.out.tfevents.1544491581.91c673dc040b4f40932d50874d998f48000000',
 'logs/model.ckpt-0.meta',
 'logs/graph.pbtxt',
 'logs/model.ckpt-0.index',
 'logs/model.ckpt-0.data-00000-of-00001',
 'logs/model.ckpt-1100.meta',
 'logs/model.ckpt-1100.index',
 'logs/model.ckpt-1100.data-00000-of-00001',
 'logs/eval/events.out.tfevents.1544491598.91c673dc040b4f40932d50874d998f48000000',
 'outputs/1544491598/saved_model.pb',
 'outputs/1544491598/variables/variables.index',
 'outputs/1544491598/variables/variables.data-00000-of-00001',
 'driver_log',
 'azureml-logs/azureml.log',
 'azureml-logs/55_batchai_execution.txt']

Download model into your local machine.    
**Please change ```1544491598``` to match the directory name shown under ```outputs/``` in the previous output.**

In [13]:
# Name of the export directory under outputs/ as listed by
# run.get_file_names(); change this single value to match your own run.
export_id = '1544491598'

# Files of the exported model, relative to the export directory. Each one is
# downloaded to the same relative path under ./remote_model.
saved_model_files = [
    'saved_model.pb',
    'variables/variables.data-00000-of-00001',
    'variables/variables.index',
]
for relative_path in saved_model_files:
    run.download_file(
        name='outputs/{}/{}'.format(export_id, relative_path),
        output_file_path='remote_model/{}'.format(relative_path))

Predict your test data using downloaded model.

In [14]:
import tensorflow as tf

# Build tensors that yield one parsed (image, label) example per evaluation
record_spec = {
    'image_raw': tf.FixedLenFeature([], tf.string),
    'label': tf.FixedLenFeature([], tf.int64)
}
test_records = tf.data.TFRecordDataset('./data/test.tfrecords')
record_iterator = tf.compat.v1.data.make_one_shot_iterator(test_records)
serialized_record = record_iterator.get_next()
parsed_record = tf.parse_single_example(
    serialized_record,
    features=record_spec)
raw_pixels = tf.decode_raw(parsed_record['image_raw'], tf.uint8)
raw_pixels.set_shape([784])
data_image = tf.cast(raw_pixels, tf.float32) * (1. / 255)
data_label = tf.cast(parsed_record['label'], tf.int32)

# Evaluate the tensors three times to collect three test examples
with tf.Session() as sess:
    samples = [sess.run([data_image, data_label]) for _ in range(3)]
image_arr = [image for image, _ in samples]
label_arr = [label for _, label in samples]

# Predict with the downloaded model
pred_fn = tf.contrib.predictor.from_saved_model('./remote_model')
pred = pred_fn({'inputs': image_arr})

print('Predicted: ', pred['classes'].tolist())
print('Actual   : ', label_arr)

INFO:tensorflow:Restoring parameters from ./remote_model/variables/variables
Predicted:  [7, 2, 1]
Actual   :  [7, 2, 1]


### Step 7 : Remove AML compute

**You don't need to remove your AML compute** to save money, because the nodes are automatically terminated when the compute is inactive.    
But if you want to clean up, please run the following.

In [15]:
# Delete cluster (nodes) and remove from AML workspace
mycompute = AmlCompute(workspace=ws, name='mydsvm01')
mycompute.delete()

In [18]:
# Print the current status of the cluster in serialized form.
print(mycompute.status.serialize())

{'allocationState': 'Resizing', 'allocationStateTransitionTime': '2018-12-11T00:03:46.368000+00:00', 'creationTime': '2018-12-10T23:45:21.604017+00:00', 'currentNodeCount': 1, 'errors': None, 'modifiedTime': '2018-12-10T23:46:00.505831+00:00', 'nodeStateCounts': {'idleNodeCount': 0, 'leavingNodeCount': 1, 'preemptedNodeCount': 0, 'preparingNodeCount': 0, 'runningNodeCount': 0, 'unusableNodeCount': 0}, 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 1, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'targetNodeCount': 0, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}
