In [2]:
# Log in to the SageMaker ECR registry that hosts the AWS Deep Learning Containers
# (registry account 763104351884, region us-east-2).
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-2.amazonaws.com
# Log in to your private ECR registry. The account id and region below are hard-coded;
# replace them with your own before running.
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 553020858742.dkr.ecr.us-east-2.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [None]:

# Print Dockerfile.training using the Pygments "docker" lexer so the image definition can be reviewed inline.
! pygmentize -l docker Dockerfile.training

In [None]:
# Run the helper script with: repository name, image tag, Dockerfile.
# NOTE(review): presumably builds the image and pushes it to the private ECR registry; confirm in build_and_push.sh.
! ./build_and_push.sh mmdetection-training latest Dockerfile.training

In [4]:

# Imports used by the notebook.
import boto3
import re

import os
import numpy as np
import pandas as pd
from sagemaker import get_execution_role

# Define the IAM role: the execution role returned by the SageMaker SDK,
# passed later to the Estimator as `role`.
role = get_execution_role()

In [5]:

import sagemaker
from time import gmtime, strftime

# SageMaker session and the default S3 bucket associated with it.
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Region is fixed here; the account id is looked up through STS for the current credentials.
region = "us-east-2"
sts_client = sess.boto_session.client("sts")
account = sts_client.get_caller_identity()["Account"]

# S3 key prefixes for job input and output.
# NOTE(review): "ouput" looks like a typo for "output"; kept unchanged because
# renaming it would change where results are written in S3.
prefix_input = "mmdetection-input"
prefix_output = "mmdetection-ouput"

In [6]:
# Repository name and tag of the training image (the same values are passed to build_and_push.sh).
container = "mmdetection-training"
tag = "latest"

# Fully qualified ECR image URI: <account>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>
image = "{account}.dkr.ecr.{region}.amazonaws.com/{repository}:{tag}".format(
    account=account,
    region=region,
    repository=container,
    tag=tag,
)

In [82]:
# Individual config values to override; they are joined into the single
# "; "-separated string carried by the 'options' hyperparameter.
config_overrides = ["total_epochs=1", "optimizer.lr=0.04"]

# Hyperparameters handed to the training container.
hyperparameters = {
    "config-file": "configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py",
    "dataset": "coco",
    "options": "; ".join(config_overrides),
    "auto-scale": "false",
}

In [80]:
# Metric definitions: each entry pairs a metric name with the regex that is
# matched against the training log; capture group 1 is the metric value.
#
# Raw strings are used so `\s`, `\d` and `\.` reach the regex engine verbatim
# instead of relying on Python passing unknown escape sequences through
# (a DeprecationWarning, and a SyntaxWarning from Python 3.12).

# Metrics logged as "<name>: <decimal>"; they all share one pattern.
_DECIMAL_METRIC_NAMES = (
    "loss",
    "loss_rpn_cls",
    "loss_rpn_bbox",
    "loss_cls",
    "acc",
    "loss_bbox",
    "loss_mask",
)

metrics = [
    {"Name": metric_name, "Regex": r".*" + metric_name + r":\s([0-9\.]+)\s*"}
    for metric_name in _DECIMAL_METRIC_NAMES
]

# Learning rate is logged in scientific notation (e.g. "lr: 8.000e-02").
# The decimal point is escaped: a bare `.` matched any character, so
# "lr: 8, eta" captured "8," which is not a number. The exponent sign is
# optional so positive exponents ("1e+01", "1e2") are captured in full.
metrics.append(
    {
        "Name": "lr",
        "Regex": r"lr: (-?\d+\.?\d*(?:[Ee][+-]?\d+)?)",
    }
)

## Distributed Training in SM Container

In [83]:
# S3 location passed to the estimator as `output_path`.
output_location = "s3://{}/{}".format(sess.default_bucket(), prefix_output)

# Estimator for a 4-instance ml.p3.16xlarge training job using the custom image.
# For quick troubleshooting, run in local mode instead:
#   train_instance_type="local_gpu", train_instance_count=1,
#   sagemaker_session=sagemaker.LocalSession()
est = sagemaker.estimator.Estimator(
    image,
    role=role,
    train_instance_count=4,
    train_instance_type="ml.p3.16xlarge",
    train_volume_size=100,
    output_path=output_location,
    metric_definitions=metrics,
    hyperparameters=hyperparameters,
    sagemaker_session=sess,
)

# Start the job without blocking the notebook (wait=False); the "training"
# channel points at the dataset location in S3.
training_channels = {"training": "s3://coco2017-2a27f/coco"}
est.fit(training_channels, wait=False)



ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTrainingJob operation: The account-level service limit 'Number of instances across all training jobs' is 6 Instances, with current utilization of 6 Instances and a request delta of 4 Instances. Please contact AWS support to request an increase for this limit.

TODO:
- test that distributed cluster actually started: requires customization of train.py
- [done, but see issue below] ensure scaling of LR based on number of nodes: currently, it's not scaling at all. Need to add some sort of autoscaling policy. 
- [done] fix workdir (make it a part of config)
- [done] test opts: https://github.com/open-mmlab/mmdetection/issues/2646#issuecomment-626100525
- [done] add metrics

Known issues:
1. If post-training validation is enabled, the following error occurs after training finishes:

    `
    File "/opt/ml/code/mmdetection/tools/train.py", line 153, in <module>
    main()
    File "/opt/ml/code/mmdetection/tools/train.py", line 149, in main
    meta=meta)
    File "/opt/ml/code/mmdetection/mmdet/apis/train.py", line 128, in train_detector
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
    File "/opt/conda/lib/python3.6/site-packages/mmcv/runner/epoch_based_runner.py", line 122, in run
    epoch_runner(data_loaders[i], **kwargs)
    File "/opt/conda/lib/python3.6/site-packages/mmcv/runner/epoch_based_runner.py", line 46, in train
    self.call_hook('after_train_epoch')
    File "/opt/conda/lib/python3.6/site-packages/mmcv/runner/base_runner.py", line 282, in call_hook
    getattr(hook, fn_name)(self)
    File "/opt/ml/code/mmdetection/mmdet/core/evaluation/eval_hooks.py", line 71, in after_train_epoch
    gpu_collect=self.gpu_collect)
    File "/opt/ml/code/mmdetection/mmdet/apis/test.py", line 113, in multi_gpu_test
    results = collect_results_cpu(results, len(dataset), tmpdir)
    File "/opt/ml/code/mmdetection/mmdet/apis/test.py", line 147, in collect_results_cpu
    part_list.append(mmcv.load(part_file))
    File "/opt/conda/lib/python3.6/site-packages/mmcv/fileio/io.py", line 41, in load
    obj = handler.load_from_path(file, **kwargs)
    File "/opt/conda/lib/python3.6/site-packages/mmcv/fileio/handlers/pickle_handler.py", line 14, in load_from_path
    filepath, mode='rb', **kwargs)
    File "/opt/conda/lib/python3.6/site-packages/mmcv/fileio/handlers/base.py", line 20, in load_from_path
    with open(filepath, mode) as f:
    FileNotFoundError: [Errno 2] No such file or directory: '/opt/ml/output/.eval_hook/part_8.pkl'
    Traceback (most recent call last):
    File "/opt/conda/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
    File "/opt/conda/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
    File "/opt/conda/lib/python3.6/site-packages/torch/distributed/launch.py", line 263, in <module>
    main()
    File "/opt/conda/lib/python3.6/site-packages/torch/distributed/launch.py", line 259, in main
    cmd=cmd)
    subprocess.CalledProcessError: Command '['/opt/conda/bin/python', '-u', '/opt/ml/code/mmdetection/tools/train.py', '--local_rank=7', '/opt/ml/code/updated_config.py', '--launcher', 'pytorch', '--work-dir', '/opt/ml/output']' returned non-zero exit status 1.
    ERROR ExecuteUserScriptError:
    Command "/opt/conda/bin/python mmdetection_train.py --config-file configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py --dataset coco --options total_epochs=1"
    Traceback (most recent call last): File "mmdetection_train.py", line 126, in <module> raise subprocess.CalledProcessError(returncode=process.returncode, cmd=joint_cmd)
    subprocess.CalledProcessError: Command 'python -m torch.distributed.launch --nnodes 2 --node_rank 0 --nproc_per_node 8 --master_addr algo-1 --master_port 55555 /opt/ml/code/mmdetection/tools/train.py /opt/ml/code/updated_config.py --launcher pytorch --work-dir /opt/ml/output' returned non-zero exit status 1.
    `    
    
2. When the LR and warmup steps are scaled by the number of training nodes, the loss is not computed correctly (`loss_cls`, `loss_bbox` and the total `loss` become `nan`):
    `2020-07-26 18:23:57,713 - mmdet - INFO - Epoch [1][1800/1833]#011lr: 8.000e-02, eta: 0:00:20, time: 0.606, data_time: 0.054, memory: 4038, loss_rpn_cls: 0.3840, loss_rpn_bbox: 0.0986, loss_cls: nan, acc: 0.4653, loss_bbox: nan, loss_mask: 0.5701, loss: nan`