In [1]:
# Log in to the ECR registry that hosts the SageMaker Deep Learning Containers;
# Dockerfile.training pulls its PyTorch base image (FROM ...) from this registry.
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-2.amazonaws.com
# Log in to your private ECR registry. The account ID and region are hard-coded:
# replace them with your own before running.
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 553020858742.dkr.ecr.us-east-2.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [2]:

# Print Dockerfile.training with Docker syntax highlighting (uses the `pygmentize` CLI).
! pygmentize -l docker Dockerfile.training

# Use Sagemaker PyTorch container as base image
# https://github.com/aws/sagemaker-pytorch-container/blob/master/docker/1.5.0/py3/Dockerfile.gpu
FROM 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-training:1.5.0-gpu-py36-cu101-ubuntu16.04
LABEL author="vadimd@amazon.com"


############# Installing MMDetection from source ############

WORKDIR /opt/ml/code
RUN pip install pytorch torchvision
RUN pip install mmcv-full==latest+torch1.5.0+cu101 -f https://openmmlab.oss-accelerate.aliyuncs.com/mmcv/dist/index.html

RUN git clone https://github.com/open-mmlab/mmdetection
RUN cd mmdetection/ && \
    pip install -e .

############# Configuring Sagemaker ##############
COPY container_training /opt/ml/code

In [10]:
# Build the training image from Dockerfile.training (the captured output is the docker build log).
# NOTE(review): the arguments look like <image name> <tag> <Dockerfile>, and the script
# presumably also pushes the image to ECR — confirm against build_and_push.sh.
! ./build_and_push.sh mmdetection-training latest Dockerfile.training

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
Sending build context to Docker daemon  551.3MB
Step 1/12 : FROM 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-training:1.5.0-gpu-py36-cu101-ubuntu16.04
 ---> 47cd15520b75
Step 2/12 : LABEL author="vadimd@amazon.com"
 ---> Using cache
 ---> d12d22ee6b31
Step 3/12 : WORKDIR /opt/ml/code
 ---> Using cache
 ---> 3b9b74ed342c
Step 4/12 : RUN pip install --upgrade --force-reinstall  torch torchvision cython
 ---> Using cache
 ---> adf8278203a8
Step 5/12 : RUN pip install mmcv-full==latest+torch1.5.0+cu101 -f https://openmmlab.oss-accelerate.aliyuncs.com/mmcv/dist/index.html
 ---> Using cache
 ---> b10898a0ee26
Step 6/12 : RUN git clone https://github.com/open-mmlab/mmdetection
 ---> Using cache
 ---> 74daa6e3222c
Step 7/12 : RUN cd mmdetection/ &&     pip install -e .
 ---> Using cache
 ---> 19ee5cee793b
Step 8/12 : COPY container_training /opt/ml/code
 ---> 0da9a7f92d1b
Step 9/12 : ENV SAG

In [13]:

# Define the IAM role that is handed to the SageMaker Estimator (`role=role`) later on.
import boto3
import re

import os
import numpy as np
import pandas as pd
from sagemaker import get_execution_role

# Role associated with the current notebook environment, as reported by the SageMaker SDK.
role = get_execution_role()

In [15]:

import sagemaker
from time import gmtime, strftime

# SageMaker session shared by every later cell (S3 bucket lookup, Estimator).
sess = sagemaker.Session()

# `region` is consumed when the ECR image URI is assembled in a later cell.
# It used to be commented out here, so a fresh kernel failed with NameError;
# take it from the session so it always matches the credentials in use.
region = sess.boto_region_name

# AWS account ID of the caller; used as the registry prefix of the image URI.
account = sess.boto_session.client('sts').get_caller_identity()['Account']

# S3 key prefixes for training input and output.
prefix_input = 'mmdetection-input'
# NOTE(review): 'ouput' looks like a typo for 'output'. It is left unchanged
# because it names a live S3 prefix; rename it only if nothing depends on it.
prefix_output = 'mmdetection-ouput'

In [16]:
# Fully qualified ECR URI of the custom training image:
#   <account>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>
container = "mmdetection-training"  # your container name
tag = "latest"
image = f"{account}.dkr.ecr.{region}.amazonaws.com/{container}:{tag}"

In [17]:
# Hyperparameters forwarded to the Estimator; none are set yet.
hyperparameters = {}

In [18]:
# Passed to the Estimator as `metric_definitions`; no metrics are defined yet.
metrics = []

## Training in the SageMaker container (`ml.p3.16xlarge`; switch to `local_gpu` for local troubleshooting)

In [19]:
# Configure a single-instance training job that runs the custom MMDetection image.
#
# Alternatives that were left disabled in this cell:
#   - train_instance_type="local_gpu" together with
#     sagemaker_session=sagemaker.LocalSession() for quick troubleshooting
#   - train_volume_size=100
#
# NOTE(review): the saved output of this cell is a ClientError
# (SignatureDoesNotMatch on GetCallerIdentity). The cause is not visible in
# this notebook — confirm the region / STS configuration before relying on it.
local_est = sagemaker.estimator.Estimator(
    image,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.p3.16xlarge',
    output_path=f"s3://{sess.default_bucket()}/{prefix_output}",
    metric_definitions=metrics,
    hyperparameters=hyperparameters,
    sagemaker_session=sess,
)

# Start training with the given S3 location as the input channel.
local_est.fit(inputs="s3://coco2017-2a27f")

ClientError: An error occurred (SignatureDoesNotMatch) when calling the GetCallerIdentity operation: Credential should be scoped to a valid region, not 'us-east-2'. 

In [9]:
from mmcv import Config
# Load the Faster R-CNN config that the following cell customises.
# NOTE(review): the relative path assumes the working directory is the root of
# an MMDetection checkout (it must contain `configs/`) — confirm.
cfg = Config.fromfile('./configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py')

In [21]:
# Re-point the dataset entries of the loaded config at the COCO copy under
# ~/SageMaker, dump the result to new_config.py and reload it as a check.
#
# "~" is expanded by shells, not by Python's file APIs, and the ann_file /
# img_prefix values in the loaded config are plain strings that do not change
# when cfg.data_root is reassigned. Each path is therefore built explicitly
# from the expanded root, so it no longer depends on the working directory.
# `os` is imported in the notebook's import cell.
data_root = os.path.expanduser("~/SageMaker")
cfg.data_root = data_root

# (split, annotation file, image directory), all relative to data_root.
# NOTE(review): confirm that annotations/instances_test2017.json exists in
# your copy of the dataset.
coco_splits = [
    ("train", "annotations/instances_train2017.json", "train2017"),
    ("val", "annotations/instances_val2017.json", "val2017"),
    ("test", "annotations/instances_test2017.json", "test2017"),
]
for split, ann_file, img_prefix in coco_splits:
    cfg.data[split].ann_file = os.path.join(data_root, ann_file)
    cfg.data[split].img_prefix = os.path.join(data_root, img_prefix)

cfg.dump("new_config.py")

# Reload the dumped file to verify that it parses and carries the new paths.
new_cfg = Config.fromfile("new_config.py")
print(new_cfg.pretty_text)
print(type(new_cfg))

model = dict(
    type='FasterRCNN',
    pretrained='open-mmlab://detectron2/resnet50_caffe',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='caffe'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[0.0, 0.0, 0.0, 0.0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=d