## Build a container

In [1]:
# Log in to the SageMaker ECR registry that hosts the Deep Learning Containers;
# the base image used by the container build is pulled from this registry.
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-2.amazonaws.com
# Log in to your private ECR registry, where the custom training image lives
# (replace the account id and region with your own).
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 553020858742.dkr.ecr.us-east-2.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [8]:
# Run the helper script that builds (and, per its name, pushes) the training image.
# The two arguments are presumably <image name> <tag> — confirm in build_and_push.sh.
! ./build_and_push.sh mxnet-distributed latest

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
Sending build context to Docker daemon    254kB
Step 1/9 : FROM 763104351884.dkr.ecr.us-east-2.amazonaws.com/mxnet-training:1.6.0-gpu-py27-cu101-ubuntu16.04
 ---> b71ed7961e4b
Step 2/9 : LABEL author="vadimd@amazon.com"
 ---> Using cache
 ---> ae83d91b7381
Step 3/9 : COPY container_training /opt/ml/code
 ---> bd0197fdcdee
Step 4/9 : WORKDIR /opt/ml/code
 ---> Running in 1fa1b3e4da76
Removing intermediate container 1fa1b3e4da76
 ---> 44d8c31384bf
Step 5/9 : ENV SAGEMAKER_SUBMIT_DIRECTORY /opt/ml/code
 ---> Running in c8ddcc557e87
Removing intermediate container c8ddcc557e87
 ---> f8bbc4afa450
Step 6/9 : ENV SAGEMAKER_PROGRAM hvd_launcher.py
 ---> Running in 97807f679de1
Removing intermediate container 97807f679de1
 ---> 900e99d7e572
Step 7/9 : RUN pip install gluoncv
 ---> Running in 53fabf57352e
[91mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade y

## Define common parameters

In [3]:
# Standard library
import os
import re

# Third-party
import boto3
import numpy as np
import pandas as pd

# SageMaker SDK
import sagemaker
from sagemaker import get_execution_role
from sagemaker.mxnet import MXNet

# IAM execution role resolved by the SageMaker SDK; it is passed to the
# Estimator as `role` when the training job is configured.
role = get_execution_role()

In [4]:
import sagemaker
from time import gmtime, strftime

# Session object used in this cell to look up the default bucket and the
# caller's account id. LocalSession() is the variant meant for running the
# container locally.
sess = sagemaker.LocalSession()

# Default bucket of the session.
bucket = sess.default_bucket()

# Region and account id are the building blocks of the ECR image URI.
region = "us-east-2"
sts_client = sess.boto_session.client("sts")
account = sts_client.get_caller_identity()["Account"]

# Key prefixes, presumably for job input and output data — confirm where they are used.
prefix_input = "mxnet-distr-input"
# NOTE(review): "ouput" looks like a typo for "output"; left unchanged because
# data may already exist under this prefix.
prefix_output = "mxnet-distr-ouput"

In [5]:
# Repository name and tag of the custom training image.
container = "mxnet-distributed"
tag = "latest"

# Full ECR image URI: <account>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>
image_uri_template = '{}.dkr.ecr.{}.amazonaws.com/{}:{}'
image = image_uri_template.format(account, region, container, tag)

print("Following Sagemaker container will be used for training: ", image)

Following Sagemaker container will be used for training:  553020858742.dkr.ecr.us-east-2.amazonaws.com/mxnet-distributed:latest


## Review training script

In [None]:
# Print the launcher script with syntax highlighting.
# NOTE(review): the image build copies `container_training/` into /opt/ml/code,
# while this cell reads from `training_sources/` — confirm the directory name.
! pygmentize training_sources/hvd_launcher.py

In [None]:
# Print the training script with syntax highlighting; its file name matches the
# target of the "train-script" hyperparameter (/opt/ml/code/distributed_mnist.py).
! pygmentize training_sources/distributed_mnist.py

## Local training

In [6]:
# Hyperparameters handed to the Estimator when the training job is configured.
hyperparameters = {
    # Path of the training script inside the container.
    "train-script": "/opt/ml/code/distributed_mnist.py",
    # Presumably tells the launcher whether it runs in local mode — TODO confirm
    # against hvd_launcher.py.
    "local": "false",
    # Add further entries below; they will be passed directly to the training script.
}

In [None]:
# Configure a training job that runs the custom container image.
est = sagemaker.estimator.Estimator(
    image,
    role=role,
    train_instance_count=2,
    # Two ml.p3.16xlarge hosts are requested. To try the container on this
    # machine instead, use train_instance_type='local_gpu'.
    # NOTE(review): local mode likely also needs a LocalSession and a single
    # instance — confirm against the SageMaker SDK version in use.
    train_instance_type="ml.p3.16xlarge",
    sagemaker_session=sagemaker.Session(),
    hyperparameters=hyperparameters,
)

# Start the job without input channels; wait=True keeps the cell running
# until the job has finished.
est.fit(wait=True)

2020-05-30 16:29:22 Starting - Starting the training job...
2020-05-30 16:29:24 Starting - Launching requested ML instances.........
2020-05-30 16:30:59 Starting - Preparing the instances for training....