In [1]:
# Define IAM role
import boto3
import re

import os
import numpy as np
import pandas as pd
from sagemaker import get_execution_role

# IAM role that the estimators further down receive through their `role=` argument.
role = get_execution_role()

In [16]:
import sagemaker
from time import gmtime, strftime

# Session used for S3 access and handed to the estimators as `sagemaker_session`.
# A LocalSession makes it possible to run the training container on this machine.
# NOTE(review): the spot-training estimator further down requests 'ml.p3.16xlarge'
# but is given this same session - confirm a LocalSession is intended there.
sess = sagemaker.LocalSession()

bucket = sess.default_bucket()
region = "us-east-2"  # same region as the base image referenced in the Dockerfile
account = sess.boto_session.client('sts').get_caller_identity()['Account']
prefix_input = 'detectron2-input'
# S3 key prefix for training output (previously misspelled as 'detectron2-ouput').
prefix_output = 'detectron2-output'

# Upload data for training

In [None]:
# Template command: replace <your_bucket> and <your_s3_path> with real values before running.
# Left as-is, the shell would treat the angle brackets as redirections rather than arguments.
! ./upload_coco2017_to_s3.sh <your_bucket> <your_s3_path>

## Push Docker image to registry

For this training, we'll extend the [SageMaker PyTorch container](https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/deep-learning-containers-images.html) with Detectron2 dependencies, using the official [D2 Dockerfile](https://github.com/facebookresearch/detectron2/blob/master/docker/Dockerfile) as a baseline. See the Dockerfile below.

In [4]:
# Display the Dockerfile with syntax highlighting (pygmentize is the Pygments command-line tool).
!pygmentize Dockerfile

[37m# Build an image of Detectron2 that can do [39;49;00m
[37m# distributing training and inference in Amazon Sagemaker[39;49;00m

[37m# using Sagemaker PyTorch container as base image[39;49;00m
[34mFROM[39;49;00m[33m 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-training:1.4.0-gpu-py36-cu101-ubuntu16.04[39;49;00m
LABEL [31mauthor[39;49;00m=[33m"vadimd@amazon.com"[39;49;00m

[37m############# Installing latest builds ############[39;49;00m

[37m# This is to fix issue: https://github.com/pytorch/vision/issues/1489[39;49;00m
[34mRUN[39;49;00m pip install --upgrade --force-reinstall torch torchvision cython

[37m############# D2 section ##############[39;49;00m

[37m# installing dependecies for D2 https://github.com/facebookresearch/detectron2/blob/master/docker/Dockerfile[39;49;00m
[34mRUN[39;49;00m pip install [33m'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'[39;49;00m
[34mRUN[39;49;00m pip install [33m'git+https://github.c

You'll need to build a container image from this Dockerfile and push it to Amazon Elastic Container Registry (ECR) using the `build_and_push.sh` script. Before doing so, you need to log in to both the SageMaker ECR registry and your private ECR registry.

In [1]:
# loging to Sagemaker ECR with Deep Learning Containers
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-2.amazonaws.com
# loging to your private ECR
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 553020858742.dkr.ecr.us-east-2.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


Now you are ready to push your D2 container to your private ECR.

In [None]:
# Build the image from the Dockerfile and push it to ECR; the argument is presumably the
# image/repository name (it matches `container` in the first training cell).
# NOTE(review): the first training cell pulls tag "debug" - confirm the script pushes that tag.
! ./build_and_push.sh d2-sm-coco

# Train your model

Define the algorithm metrics that SageMaker will scrape, persist, and render in the training job console.

In [18]:
# Full ECR URI of the training image:
#   <account>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>
container = "d2-sm-coco"  # your container name
tag = "debug"
image = '{account}.dkr.ecr.{region}.amazonaws.com/{repository}:{tag}'.format(
    account=account,
    region=region,
    repository=container,
    tag=tag,
)

In [9]:
# Metrics extracted from the training log. Each pair is
# (metric name reported to SageMaker, label that precedes the value in the log),
# e.g. the log fragment "total_loss: 0.731" yields total_loss = 0.731.
_METRIC_LOG_LABELS = [
    ("total_loss", "total_loss"),
    ("loss_cls", "loss_cls"),
    ("loss_box_reg", "loss_box_reg"),
    ("loss_mask", "loss_mask"),
    ("loss_rpn_cls", "loss_rpn_cls"),
    ("loss_rpn_loc", "loss_rpn_loc"),
    ("overall_training_speed", "Overall training speed"),
    ("lr", "lr"),
    ("iter", "iter"),
]

# Raw string so that `\s` and `\.` reach the regex engine as written; in a regular
# string literal `\s` is an invalid escape sequence and triggers a warning.
# The single capture group is the numeric value that follows "<label>: ".
_METRIC_REGEX_TEMPLATE = r".*{label}:\s([0-9\.]+)\s*"

metric_definitions = [
    {"Name": name, "Regex": _METRIC_REGEX_TEMPLATE.format(label=label)}
    for name, label in _METRIC_LOG_LABELS
]


In [None]:
# Hyperparameters forwarded to the training container.
hyperparameters = {
    "config-file": "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml",
    # To supply a custom config file, place it in the container_training folder
    # and pass its file name here:
    # "local-config-file": "config.yaml",
    "resume": "True",  # whether to re-use weights from pre-trained model
    "eval-only": "True",  # whether to perform only D2 model evaluation
    # "opts" overrides individual parameters of the D2 configuration from the SageMaker API;
    # the available keys are listed at
    # https://detectron2.readthedocs.io/modules/config.html#config-references
    "opts": "DATALOADER.NUM_WORKERS 8",
}

d2 = sagemaker.estimator.Estimator(
    image,
    role=role,
    train_instance_count=1,
    # "local_gpu" is meant for quick troubleshooting; the alternative kept from the
    # original cell is train_instance_type='ml.p3.16xlarge'.
    train_instance_type="local_gpu",
    train_volume_size=100,
    output_path="s3://{bucket}/{prefix}".format(bucket=sess.default_bucket(), prefix=prefix_output),
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
    tags=[{"Key": "Desc", "Value": "test Cristian config"}],
    sagemaker_session=sess,
)

# No input channels are passed here. Variant with an S3 training channel:
# d2.fit({'training':"s3://coco2017-2a27f"}, wait=False)
d2.fit()

## Training with Spot Instance

In [78]:
import uuid

# Spot-training settings; the estimator in the next cell reads all of these.
train_use_spot_instances = True
train_max_run = 21600  # presumably seconds (6 h) - confirm against the Estimator docs

# Short random suffix so that each run gets its own checkpoint location.
checkpoint_suffix = uuid.uuid4().hex[:8]

# The wait limit and the checkpoint location only apply when spot instances are used.
# NOTE(review): the key prefix says "mxnet" although this notebook trains Detectron2 -
# presumably copied from another example; confirm before renaming.
if train_use_spot_instances:
    train_max_wait = 30000
    checkpoint_s3_uri = 's3://{}/artifacts/mxnet-checkpoint-{}/'.format(bucket, checkpoint_suffix)
else:
    train_max_wait = None
    checkpoint_s3_uri = None

In [None]:
container = "d2-sm-coco-custom" # your container name
image = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account, region, container)

# Hyperparameters forwarded to the training container.
hyperparameters = {
    "config-file": "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml",
    "resume": "False",  # whether to re-use weights from pre-trained model
    "eval-only": "False",  # whether to perform only D2 model evaluation
    # "opts" overrides the default D2 configuration from the SageMaker API; keys are listed at
    # https://detectron2.readthedocs.io/modules/config.html#config-references
    # The key/value pairs are joined with single spaces. A backslash continuation inside
    # one string literal would embed the source indentation in the value.
    "opts": " ".join([
        "SOLVER.BASE_LR 0.00025",
        "MODEL.ROI_HEADS.NUM_CLASSES 80",
        "DATALOADER.NUM_WORKERS 4",
    ]),
}

d2 = sagemaker.estimator.Estimator(
    image,
    role=role,
    train_instance_count=2,
    # The alternative kept from the original cell is train_instance_type="local_gpu"
    # (for quick troubleshooting).
    train_instance_type='ml.p3.16xlarge',
    train_volume_size=100,
    output_path="s3://{}/{}".format(sess.default_bucket(), prefix_output),
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
    tags=[{"Key": "Desc", "Value": "spot training"}],
    sagemaker_session=sess,
    # Spot settings come from the preceding configuration cell.
    train_use_spot_instances=train_use_spot_instances,
    train_max_run=train_max_run,
    train_max_wait=train_max_wait,
    checkpoint_s3_uri=checkpoint_s3_uri,
)

# wait=False is passed so the call does not wait for the job to finish.
d2.fit({'training':"s3://coco2017-2a27f"}, wait=False)