In [None]:
import sagemaker

# One shared SageMaker session for this notebook.
sagemaker_session = sagemaker.Session()

# Default S3 bucket of that session; the estimator cell below builds its
# checkpoint URI under this bucket.
bucket = sagemaker_session.default_bucket()

# Execution role handed to the estimator. Resolve it through the session created
# above instead of letting get_execution_role() construct a second Session.
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)

In [None]:
# Show the installed SageMaker SDK version (bare last expression -> cell output).
# NOTE(review): the estimator cell below uses the `train_*` keyword names
# (train_instance_count, train_instance_type, ...); confirm they are accepted by
# the version displayed here.
sagemaker.__version__

In [None]:
from sagemaker.pytorch import PyTorch
import os
import uuid

# --- Cluster sizing -----------------------------------------------------------
instances = 4  # number of training instances; also divides the batch size below
processes = 8  # NOTE(review): not referenced anywhere else in this cell

# --- Time limits --------------------------------------------------------------
one_day_in_seconds = 86400
max_run = one_day_in_seconds * 5  # 5 days; used for both train_max_run and train_max_wait

# --- Job identity -------------------------------------------------------------
base_job_name = 'rigl'
uu = uuid.uuid1()  # unique suffix so each launch gets its own checkpoint prefix
print('using uuid:', uu)

estimator = PyTorch(entry_point='train_imagenet_rigl.py',
                    source_dir='..',
                    role=role,
                    framework_version='1.4.0',
                    py_version='py3',
                    # Checkpoints land under s3://<bucket>/checkpoints/<job name>/<uuid>/
                    checkpoint_s3_uri='s3://%s/checkpoints/%s/%s/' % (bucket, base_job_name, uu),
                    base_job_name=base_job_name,

                    # Instances Setup
                    train_instance_count=instances,
                    train_instance_type='ml.p3.16xlarge',
                    train_use_spot_instances=True,
                    # NOTE(review): wait limit equals run limit, i.e. no extra slack
                    # on top of the run time -- confirm this is intended for spot.
                    train_max_wait=max_run,
                    train_max_run=max_run,
                    train_volume_size=300,

                    # Passed through to train_imagenet_rigl.py as command-line options.
                    hyperparameters={
                        'multiprocessing-distributed': 1, # separate process per GPU
                        'data':'/opt/ml/input/data/training',
                        'run-extract-script': 1, # if you are using the 1000 .tar file s3 bucket for imagenet, use this = 1. otherwise, use = 0
                        'output-dir': '/opt/ml/model',
                        'checkpoint-dir': '/opt/ml/checkpoints/',
                        'arch': 'resnet50',
                        'workers': 40,
                        'dense-allocation': 0.1,
                        'static-topo': 0,
                        'alpha': 0.3,
                        'delta': 100,
                        'grad-accumulation-n': 1, # if using a smaller batch size, this may be useful
                        'batch-size': 1024 // instances, # batch size per instance
                        'eval-batch-size': 4096,
                        'lr': 0.1,
                        # 'lr-warmup-end': 5,  # currently disabled
                        'lr-scaling-stop': 91,
                        'epochs': 100,
                    },

                    # Both patterns match a log line of the form
                    #   "* Acc@1 <top1> Acc@5 <top5>"
                    # Raw strings are required: `\*` and `\s` are regex escapes, not
                    # Python string escapes, and a plain literal triggers an
                    # invalid-escape-sequence warning. The resulting pattern text is
                    # unchanged.
                    metric_definitions=[
                        {'Name': 'top1-accuracy', 'Regex': r'\*\sAcc@1\s(.*)\sAcc@5'},
                        {'Name': 'top5-accuracy', 'Regex': r'\*\sAcc@1\s.*\sAcc@5\s(.*)'},
                    ]
                   )

In [None]:
# Launch the training job, using the ImageNet S3 bucket as its input data.
estimator.fit(inputs='s3://imagenet-compressed-oregon')