# Training with HPO using Syne Tune SageMaker Launcher Mode

## 1. Preparation

In [1]:
import sagemaker

# S3 key prefix under which the MNIST files are uploaded later in the notebook.
prefix = 'sagemaker/DEMO-pytorch-mnist'

# SageMaker session, its default S3 bucket, and the execution role that is
# handed to the estimator further down.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

In [2]:
from torchvision.datasets import MNIST
from torchvision import transforms

# Local directory that receives the downloaded MNIST files.
local_dir = 'data'

# Point torchvision at the SageMaker sample-files bucket for the download.
MNIST.mirrors = ["https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/MNIST/"]

# Download the dataset. As the last expression of the cell, the dataset's
# repr is what the notebook displays.
MNIST(
    root=local_dir,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
    ]),
)

Dataset MNIST
    Number of datapoints: 60000
    Root location: data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
           )

In [3]:
# Upload the downloaded MNIST directory to the session bucket under `prefix`.
# The value returned by upload_data is later passed to the backend as the
# 'training' input channel.
# `local_dir` is reused (instead of repeating the 'data' literal) so the upload
# path cannot drift from the directory the dataset was downloaded into.
inputs = sagemaker_session.upload_data(path=local_dir, bucket=bucket, key_prefix=prefix)

In [4]:
from sagemaker.pytorch import PyTorch

# PyTorch estimator describing a single training job; it is handed to the
# Syne Tune SageMaker backend below as `sm_estimator`.
estimator = PyTorch(
    # Training code: entry script and the directory shipped with the job.
    source_dir='source_dir',
    entry_point='mnist.py',
    # Framework / runtime versions.
    framework_version='1.8.0',
    py_version='py3',
    # Permissions and compute.
    role=role,
    instance_type='ml.c5.2xlarge',
    instance_count=1,
    # Fixed (non-tuned) hyperparameters; the tuned ones come from config_space.
    hyperparameters={'epochs': 1, 'backend': 'gloo'},
)

## 2. Training

In [13]:
from syne_tune import search_space
from syne_tune.optimizer.baselines import RandomSearch
from syne_tune.backend.sagemaker_backend.sagemaker_backend import SagemakerBackend
from syne_tune.stopping_criterion import StoppingCriterion
from syne_tune.tuner import Tuner
from syne_tune.remote.remote_launcher import RemoteLauncher
from syne_tune.experiments import load_experiment

In [6]:
# Search space for the tuned hyperparameters: the learning rate is drawn
# log-uniformly from [0.001, 0.1], the batch size is chosen from a fixed list.
config_space = {
    'lr': search_space.loguniform(lower=0.001, upper=0.1),
    'batch-size': search_space.choice(
        categories=[32, 64, 128, 256, 512]
    ),
}


In [7]:
# Random-search scheduler over config_space that minimises the 'test_loss'
# metric; the seed is fixed so the sampled configurations are repeatable.
scheduler = RandomSearch(
    config_space=config_space,
    metric='test_loss',
    mode='min',
    random_seed=31415927,
)

In [8]:
# SageMaker backend: trials are created from `estimator`, read the uploaded
# MNIST data through the 'training' channel, and report 'test_loss'
# (the same metric name the scheduler optimises).
backend = SagemakerBackend(
    sm_estimator=estimator,
    inputs={'training': inputs},
    metrics_names=['test_loss'],
)

In [9]:
# Wall-clock budget for the whole tuning run.
# NOTE(review): the value is presumably in seconds (600 s = 10 min) - confirm
# against the Syne Tune StoppingCriterion documentation.
stop_criterion = StoppingCriterion(max_wallclock_time=10 * 60)

In [10]:
# Wrap the Tuner in a RemoteLauncher so that calling run() launches the tuning
# loop through the launcher instead of running it in this kernel.
# Note that `tuner` is the RemoteLauncher, not the inner Tuner object.
tuner = RemoteLauncher(
    tuner=Tuner(
        scheduler=scheduler,
        backend=backend,
        stop_criterion=stop_criterion,
        # NOTE(review): presumably the number of trials evaluated concurrently
        # and the polling interval in seconds - confirm against Syne Tune docs.
        n_workers=3,
        sleep_time=5.0,
        # Named after the scheduler that is actually configured (RandomSearch);
        # the previous 'hpo-hyperband' label mislabelled the experiment.
        tuner_name='hpo-random-search'
    )
)

In [None]:
# Start the tuning via the RemoteLauncher. The captured output of this cell
# shows a SageMaker training job being started and its logs being streamed.
tuner.run()

2022-02-09 16:52:53 Starting - Starting the training job...
2022-02-09 16:52:56 Starting - Launching requested ML instancesProfilerReport-1644425572: InProgress
.........
2022-02-09 16:54:51 Starting - Preparing the instances for training...
2022-02-09 16:55:17 Downloading - Downloading input data...
2022-02-09 16:55:51 Training - Downloading the training image.....[34m2022-02-09 16:56:33,138 sagemaker-training-toolkit INFO     Imported framework sagemaker_mxnet_container.training[0m
[34m2022-02-09 16:56:33,141 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-02-09 16:56:33,153 sagemaker_mxnet_container.training INFO     MXNet training environment: {'SM_HOSTS': '["algo-1"]', 'SM_NETWORK_INTERFACE_NAME': 'eth0', 'SM_HPS': '{"no_tuner_logging":false,"store_logs":false,"tuner_path":"tuner/"}', 'SM_USER_ENTRY_POINT': 'remote_main.py', 'SM_FRAMEWORK_PARAMS': '{}', 'SM_RESOURCE_CONFIG': '{"current_host":"algo-1","hosts":["algo-1"],"network_i