# MNIST distributed training with tensorflow

### Set up the environment

In [1]:
import os
import sagemaker
from sagemaker import get_execution_role

sagemaker_session = sagemaker.Session()

role = get_execution_role()

### Download the MNIST dataset

In [2]:
import utils
from tensorflow.contrib.learn.python.learn.datasets import mnist
import tensorflow as tf

data_sets = mnist.read_data_sets('data', dtype=tf.uint8, reshape=False, validation_size=5000)

utils.convert_to(data_sets.train, 'train', 'data')
utils.convert_to(data_sets.validation, 'validation', 'data')
utils.convert_to(data_sets.test, 'test', 'data')

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.


Extracting data/train-images-idx3-ubyte.gz


Instructions for updating:
Please use tf.data to implement this functionality.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.


Extracting data/train-labels-idx1-ubyte.gz
Extracting data/t10k-images-idx3-ubyte.gz
Extracting data/t10k-labels-idx1-ubyte.gz
('Writing', 'data/train.tfrecords')
('Writing', 'data/validation.tfrecords')
('Writing', 'data/test.tfrecords')


### Upload the data
We use the ```sagemaker.Session.upload_data``` function to upload our datasets to an S3 location. The return value inputs identifies the location -- we will use this later when we start the training job.

In [3]:
inputs = sagemaker_session.upload_data(path='data', key_prefix='data/DEMO-mnist')

INFO:sagemaker:Created S3 bucket: sagemaker-us-west-2-766924284651


# Construct a script for distributed training 

In [4]:
!cat 'mnist.py'

import os
import tensorflow as tf
from tensorflow.python.estimator.model_fn import ModeKeys as Modes

INPUT_TENSOR_NAME = 'inputs'
SIGNATURE_NAME = 'predictions'

LEARNING_RATE = 0.001


def model_fn(features, labels, mode, params):
    # Input Layer
    input_layer = tf.reshape(features[INPUT_TENSOR_NAME], [-1, 28, 28, 1])

    # Convolutional Layer #1
    conv1 = tf.layers.conv2d(
        inputs=input_layer,
        filters=32,
        kernel_size=[5, 5],
        padding='same',
        activation=tf.nn.relu)

    # Pooling Layer #1
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

    # Convolutional Layer #2 and Pooling Layer #2
    conv2 = tf.layers.conv2d(
        inputs=pool1,
        filters=64,
        kernel_size=[5, 5],
        padding='same',
        activation=tf.nn.relu)
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

    # Dense Layer
    pool2_flat = tf.reshape(pool2, [-1, 7

## Create a training job using the sagemaker.TensorFlow estimator

In [5]:
from sagemaker.tensorflow import TensorFlow

mnist_estimator = TensorFlow(entry_point='mnist.py',
                             role=role,
                             training_steps=1000, 
                             evaluation_steps=100,
                             train_instance_count=2,
                             train_instance_type='ml.c4.xlarge')

mnist_estimator.fit(inputs)

INFO:sagemaker:Creating training-job with name: sagemaker-tensorflow-2018-10-31-21-47-50-309


2018-10-31 21:47:51 Starting - Starting the training job...
2018-10-31 21:47:52 Starting - Launching requested ML instances......
2018-10-31 21:49:02 Starting - Preparing the instances for training......
2018-10-31 21:50:19 Downloading - Downloading input data..
[31m2018-10-31 21:50:30,174 INFO - root - running container entrypoint[0m
[31m2018-10-31 21:50:30,174 INFO - root - starting train task[0m
[31m2018-10-31 21:50:30,179 INFO - container_support.training - Training starting[0m
[32m2018-10-31 21:50:31,697 INFO - root - running container entrypoint[0m
[32m2018-10-31 21:50:31,698 INFO - root - starting train task[0m
[32m2018-10-31 21:50:31,703 INFO - container_support.training - Training starting[0m
[31mDownloading s3://sagemaker-us-west-2-766924284651/sagemaker-tensorflow-2018-10-31-21-47-50-309/source/sourcedir.tar.gz to /tmp/script.tar.gz[0m
[31m2018-10-31 21:50:33,219 INFO - tf_container - ----------------------TF_CONFIG--------------------------[0m
[31m2018-10-3


[32mInstructions for updating:[0m
[32mTo construct input pipelines, use the `tf.data` module.[0m
[32mInstructions for updating:[0m
[32mTo construct input pipelines, use the `tf.data` module.[0m
[32m2018-10-31 21:50:40,037 INFO - tensorflow - Calling model_fn.[0m
[32m2018-10-31 21:50:40,521 INFO - tensorflow - Done calling model_fn.[0m
[32m2018-10-31 21:50:40,523 INFO - tensorflow - Create CheckpointSaverHook.[0m
[32m2018-10-31 21:50:40,666 INFO - tensorflow - Graph was finalized.[0m
[32m2018-10-31 21:50:40,742 INFO - tensorflow - Running local_init_op.[0m
[32m2018-10-31 21:50:40,752 INFO - tensorflow - Done running local_init_op.[0m
[32mInstructions for updating:[0m
[32mTo construct input pipelines, use the `tf.data` module.[0m
[32m2018-10-31 21:50:41,410 INFO - tensorflow - loss = 1.4381021, step = 5[0m
[31m2018-10-31 21:50:55,090 INFO - tensorflow - global_step/sec: 6.54008[0m
[32m2018-10-31 21:51:08,294 INFO - tensorflow - loss = 0.07250598, step = 190 


2018-10-31 21:53:34 Uploading - Uploading generated training model
2018-10-31 21:53:34 Completed - Training job completed
Billable seconds: 392


The **```fit```** method will create a training job in two **ml.c4.xlarge** instances. The logs above will show the instances doing training, evaluation, and incrementing the number of **training steps**. 

In the end of the training, the training job will generate a saved model for TF serving.

# Deploy the trained model to prepare for predictions

The deploy() method creates an endpoint which serves prediction requests in real-time.

In [None]:
mnist_predictor = mnist_estimator.deploy(initial_instance_count=1,
                                             instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: sagemaker-tensorflow-2018-10-31-21-47-50-309
INFO:sagemaker:Creating endpoint with name sagemaker-tensorflow-2018-10-31-21-47-50-309


----------------------------------------------------------

# Invoking the endpoint

In [7]:
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

for i in range(10):
    data = mnist.test.images[i].tolist()
    tensor_proto = tf.make_tensor_proto(values=np.asarray(data), shape=[1, len(data)], dtype=tf.float32)
    predict_response = mnist_predictor.predict(tensor_proto)
    
    print("========================================")
    label = np.argmax(mnist.test.labels[i])
    print("label is {}".format(label))
    prediction = predict_response['outputs']['classes']['int64Val'][0]
    print("prediction is {}".format(prediction))

Instructions for updating:
Please use urllib or similar directly.


Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting /tmp/data/train-images-idx3-ubyte.gz


Instructions for updating:
Please use tf.one_hot on tensors.


Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
label is 7


KeyError: 'int64Val'