## Build a container

In [28]:
# Log in to the SageMaker ECR registry that hosts the AWS Deep Learning Containers
# (the base image of the Dockerfile is pulled from this registry)
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-2.amazonaws.com
# Log in to your private ECR registry (the custom training image is pushed here);
# substitute your own account ID and region
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 553020858742.dkr.ecr.us-east-2.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [126]:
# Build the training image and push it to the private ECR repository.
# Arguments: image name ("mxnet-distributed") and tag ("latest").
! ./build_and_push.sh mxnet-distributed latest

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
Sending build context to Docker daemon  249.3kB
Step 1/7 : FROM 763104351884.dkr.ecr.us-east-2.amazonaws.com/mxnet-training:1.6.0-gpu-py27-cu101-ubuntu16.04
 ---> b71ed7961e4b
Step 2/7 : LABEL author="vadimd@amazon.com"
 ---> Using cache
 ---> ae83d91b7381
Step 3/7 : COPY container_training /opt/ml/code
 ---> Using cache
 ---> 83c94ac22a5b
Step 4/7 : WORKDIR /opt/ml/code
 ---> Using cache
 ---> 87d21b5f1e7e
Step 5/7 : ENV SAGEMAKER_SUBMIT_DIRECTORY /opt/ml/code
 ---> Using cache
 ---> 0d36c86fbff9
Step 6/7 : ENV SAGEMAKER_PROGRAM distr_launcher.py
 ---> Using cache
 ---> dd2c8672f072
Step 7/7 : WORKDIR /
 ---> Using cache
 ---> 14c67e777b1d
Successfully built 14c67e777b1d
Successfully tagged mxnet-distributed:latest
The push refers to repository [553020858742.dkr.ecr.us-east-2.amazonaws.com/mxnet-distributed]

[1B7c16290e: Preparing 
[1B64d8d29c: Preparing 
[1B48f3c3c3: Preparing 
[1Bba41

## Define common parameters

In [17]:
# Imports for the notebook
import boto3
import re

import os
import numpy as np
import pandas as pd
from sagemaker import get_execution_role
from sagemaker.mxnet import MXNet
import sagemaker

# IAM role returned by get_execution_role(); it is passed to the Estimator as `role`
role = get_execution_role()

In [12]:
import sagemaker
from time import gmtime, strftime

# LocalSession is the session type for running the container locally.
# NOTE(review): the Estimator cell further down passes a fresh sagemaker.Session()
# instead of `sess`, so this object is used here for the bucket and account lookups.
sess = sagemaker.LocalSession()

bucket = sess.default_bucket()  # default bucket of the session
region = "us-east-2"  # hard-coded; same region as the docker login commands
# AWS account ID of the current caller, looked up through STS
account = sess.boto_session.client('sts').get_caller_identity()['Account']
# Presumably S3 key prefixes for training input and output — TODO confirm usage
prefix_input = 'mxnet-distr-input'
# NOTE(review): 'ouput' looks like a typo for 'output'; confirm nothing depends on
# this prefix before renaming it
prefix_output = 'mxnet-distr-ouput'

In [75]:
# Name and tag of the custom training image (change to match your own container)
container = "mxnet-distributed"
tag = "latest"

# Full ECR image URI: <account>.dkr.ecr.<region>.amazonaws.com/<container>:<tag>
uri_parts = (account, region, container, tag)
image = '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format(*uri_parts)

print("Following Sagemaker container will be used for training: ", image)

Following Sagemaker container will be used for training:  553020858742.dkr.ecr.us-east-2.amazonaws.com/mxnet-distributed:latest


## Review training script

In [None]:
# Print the launcher script with syntax highlighting. distr_launcher.py is the
# SAGEMAKER_PROGRAM set in the container build.
# NOTE(review): the build output copies `container_training` into the image, while
# this cell reads from `training_sources` — confirm both hold the same files.
! pygmentize training_sources/distr_launcher.py

In [None]:
# Print train_maskrcnn.py with syntax highlighting
! pygmentize training_sources/train_maskrcnn.py

## Local training

In [112]:
# Hyperparameters handed to the training container.
# "train-script" presumably selects the script that the launcher runs
# (alternative value tried earlier: "train_image_classification.py").
hyperparameters = {}
hyperparameters["train-script"] = "print.py"

# Everything below is passed directly to the training script as arguments
hyperparameters["dataset"] = "coco"

In [130]:
# Settings for the generic Estimator that runs the custom training image.
estimator_kwargs = dict(
    role=role,
    train_instance_count=2,
    # Alternative instance type kept from the original cell: 'local_gpu'
    train_instance_type='ml.p3.16xlarge',
    sagemaker_session=sagemaker.Session(),
    hyperparameters=hyperparameters,
)

est = sagemaker.estimator.Estimator(image, **estimator_kwargs)

# Start the training job and block until it finishes, streaming its logs
est.fit(wait=True)

2020-05-29 19:36:02 Starting - Starting the training job...
2020-05-29 19:36:04 Starting - Launching requested ML instances............
2020-05-29 19:38:08 Starting - Preparing the instances for training......
2020-05-29 19:39:25 Downloading - Downloading input data
2020-05-29 19:39:25 Training - Downloading the training image.........
2020-05-29 19:41:00 Uploading - Uploading generated training model.[35m2020-05-29 19:40:55,702 sagemaker-containers INFO     Imported framework sagemaker_mxnet_container.training[0m
[35m2020-05-29 19:40:55,703 sagemaker-containers INFO     Failed to parse hyperparameter train-script value print.py to Json.[0m
[35mReturning the value itself[0m
[35m2020-05-29 19:40:55,703 sagemaker-containers INFO     Failed to parse hyperparameter dataset value coco to Json.[0m
[35mReturning the value itself[0m
[35m2020-05-29 19:40:55,782 sagemaker_mxnet_container.training INFO     MXNet training environment: {'SM_INPUT_DIR': '/opt/ml/input', 'SM_HP_TRAIN-SCRIP