In [1]:
# Imports for the notebook, plus the IAM role used for training.
# `role` is passed to the SageMaker Estimator (`role=role`) in the training cell below.
import boto3
import re

import os
import numpy as np
import pandas as pd
from sagemaker import get_execution_role

# IAM role returned by the SageMaker SDK helper for the current execution environment.
role = get_execution_role()

In [2]:
import sagemaker
from time import gmtime, strftime

# Shared SageMaker session; every later cell (STS lookup, Estimator) goes through it.
sess = sagemaker.Session() # can use LocalSession() to run container locally

# Default S3 bucket of this session; training output is written under it.
bucket = sess.default_bucket()
# Must match the region hard-coded in the Dockerfile's base image URI (us-east-2).
region = "us-east-2"
# S3 key prefixes for training input and output.
prefix_input = 'detectron2-input'
# Fixed spelling: was 'detectron2-ouput', which sent artifacts to a misspelled S3 prefix.
prefix_output = 'detectron2-output'

# Upload data for training

In [None]:
# Run the helper script that uploads the training data (COCO 2017, per the script name) to S3.
# Replace <your_bucket> and <your_s3_path> with real values before running.
# NOTE(review): the training cell reads from a hard-coded S3 URI; confirm it matches this upload target.
! ./upload_coco2017_to_s3.sh <your_bucket> <your_s3_path>

## Push Docker image to registry

For this training, we'll extend the [SageMaker PyTorch Container](https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/deep-learning-containers-images.html) with Detectron2 dependencies, using the official [D2 Dockerfile](https://github.com/facebookresearch/detectron2/blob/master/docker/Dockerfile) as a baseline. See the Dockerfile below.

In [4]:
# Print the Dockerfile (syntax-highlighted) that defines the custom training image.
!pygmentize Dockerfile

# Build an image of Detectron2 that can do 
# distributing training and inference in Amazon Sagemaker

# using Sagemaker PyTorch container as base image
FROM 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-training:1.4.0-gpu-py36-cu101-ubuntu16.04
LABEL author="vadimd@amazon.com"

############# Installing latest builds ############

# This is to fix issue: https://github.com/pytorch/vision/issues/1489
RUN pip install --upgrade --force-reinstall torch torchvision cython

############# D2 section ##############

# installing dependecies for D2 https://github.com/facebookresearch/detectron2/blob/master/docker/Dockerfile
RUN pip install 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
RUN pip install 'git+https://github.c

You'll need to build a container image from this Dockerfile and push it to Amazon Elastic Container Registry (ECR) using the `build_and_push.sh` script. Before doing so, log in to both the SageMaker ECR registry (which hosts the base image) and your private ECR registry.

In [None]:
# Log in to the SageMaker ECR registry that hosts the Deep Learning Containers
# (account 763104351884 is the source of the base image in the Dockerfile).
# Replace <your_region> before running.
!aws ecr get-login-password --region <your_region> | docker login --username AWS --password-stdin 763104351884.dkr.ecr.<your_region>.amazonaws.com
# Log in to your private ECR registry, where the custom image will be pushed.
# Replace <your_region> and <your_aws_account_id> before running.
!aws ecr get-login-password --region <your_region> | docker login --username AWS --password-stdin <your_aws_account_id>.dkr.ecr.<your_region>.amazonaws.com

Now you are ready to push your D2 container to your private ECR.

In [None]:
# Build the image from the Dockerfile and push it to your private ECR.
# Replace <your_container_name> before running.
# NOTE(review): the training cell pulls "<account>.dkr.ecr.<region>.amazonaws.com/d2-sm-coco:latest",
# so the name given here presumably has to be "d2-sm-coco" - confirm against build_and_push.sh.
! ./build_and_push.sh <your_container_name>

# Train your model

In [None]:
# Build the URI of the custom Detectron2 image in the private ECR registry:
# <account>.dkr.ecr.<region>.amazonaws.com/<container>:latest
account = sess.boto_session.client('sts').get_caller_identity()['Account']
region = "us-east-2"
container = "d2-sm-coco"
image = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account, region, container)

# Hyperparameters handed to the Estimator. Empty for now; example value:
#   {"config-file": "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"}
# (Previously assigned to the misspelled name `hyperparameteres` and never passed on.)
hyperparameters = {}

# Estimator for the custom image. To debug the container locally, switch
# train_instance_type to 'local_gpu' (and use a LocalSession for `sess`).
d2 = sagemaker.estimator.Estimator(image,
                                   role=role,
                                   train_instance_count=1,
                                   train_instance_type='ml.p3.16xlarge',
                                   train_volume_size=100,
                                   # Artifacts go under the session's default bucket (`bucket`).
                                   output_path="s3://{}/{}".format(bucket, prefix_output),
                                   hyperparameters=hyperparameters,
                                   sagemaker_session=sess)

# TODO: debugging scripts without data
# NOTE(review): hard-coded data bucket; point the 'training' channel at the location
# populated by the upload step before running in another account.
d2.fit({'training':"s3://coco2017-2a27f"})

2020-04-08 14:04:05 Starting - Starting the training job...
2020-04-08 14:04:06 Starting - Launching requested ML instances......
2020-04-08 14:05:35 Starting - Preparing the instances for training.........
2020-04-08 14:06:37 Downloading - Downloading input data.................................................................................
2020-04-08 14:20:21 Training - Downloading the training image.................
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2020-04-08 14:23:23,002 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training
2020-04-08 14:23:23,082 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2020-04-08 14:23:26,100 sagemaker_pytorch_container.training INFO     Invoking user training script.
[34m2020-04-08 14:23:26,355 sagemaker-containers INFO     Module default_user_module_name does n