In [65]:
%reload_ext autoreload
%autoreload 2

import os
import io
import pickle
import sys
import datetime

import boto3
import sagemaker
from sagemaker.tuner import (
    ContinuousParameter,
    IntegerParameter,
    CategoricalParameter,
    HyperparameterTuner,
)

import matplotlib.pyplot as plt

sys.path.append(os.path.dirname(os.getcwd()))
from src.custom_utils import ImageDeserializer

## S3

In [21]:
s3_bucket = "yang-ml-sagemaker"
s3_key = "lesion-segmentation"

# Replicate the entire dataset on each ML compute instance that is launched for model training
distribution = "FullyReplicated"
data_keys = ["train", "validation", "train_annotation", "validation_annotation"]
s3_data_channels = {
    key: sagemaker.inputs.TrainingInput(
        s3_data=f"s3://{s3_bucket}/{s3_key}/input-data/{key}",
        distribution=distribution,
        content_type="image/png",
        input_mode="File",
        s3_data_type="S3Prefix",
    )
    for key in data_keys
}
s3_data_channels["label_map"] = f"s3://{s3_bucket}/{s3_key}/input-data/label_map"

s3_data_channels

{'train': <sagemaker.inputs.TrainingInput at 0x7f8aaab34ac0>,
 'validation': <sagemaker.inputs.TrainingInput at 0x7f8aaa6fe610>,
 'train_annotation': <sagemaker.inputs.TrainingInput at 0x7f8aaa6fedc0>,
 'validation_annotation': <sagemaker.inputs.TrainingInput at 0x7f8aaa6fe8b0>,
 'label_map': 's3://yang-ml-sagemaker/lesion-segmentation/input-data/label_map'}

## Sagemaker

In [3]:
sagemaker_session = sagemaker.Session(default_bucket=s3_bucket)
sagemaker_session
sm_boto3 = boto3.client("sagemaker")
role = sagemaker.get_execution_role()
instance_type = "ml.p3.2xlarge"
instance_count = 1
volume_size = 30
model_dir = "/opt/ml/model"
output_path = f"s3://{s3_bucket}/{s3_key}/models"
code_location = f"s3://{s3_bucket}/{s3_key}/code"

checkpoint_s3_uri = f"s3://{s3_bucket}/{s3_key}/checkpoints"
use_spot_instances = True
max_run = 86400
max_retry_attempts = 2

# Get today's date in YYYYMMDD format
today = datetime.datetime.today().strftime("%Y-%m-%d")

# Define endpoint name with version and date
endpoint_name = f"built-in-lesion-{today}"

Docker image for AWS semantic segmentation algorithm:

In [4]:
training_image = sagemaker.image_uris.retrieve(
    framework="semantic-segmentation", region=sagemaker_session.boto_region_name
)
training_image

'811284229777.dkr.ecr.us-east-1.amazonaws.com/semantic-segmentation:1'

## Estimator

In [53]:
sm_estimator = sagemaker.estimator.Estimator(
    image_uri=training_image,
    role=role,
    instance_count=instance_count,
    instance_type=instance_type,
    volume_size=volume_size,
    sagemaker_session=sagemaker_session,
    output_path=output_path,
    code_location=code_location,
    # Spot training
    max_run=max_run,
    max_wait=max_run,
    max_retry_attempts=max_retry_attempts,
    use_spot_instances=use_spot_instances,
    checkpoint_s3_uri=checkpoint_s3_uri,
    # Hyperparameters
    hyperparameters={
        # Network architecture hyperparameters
        "backbone": "resnet-50",
        "use_pretrained_model": True,
        "algorithm": "fcn",
        # Data hyperparameters
        "num_classes": 22,
        "num_training_samples": len(
            sagemaker.s3.S3Downloader.list(
                f"s3://{s3_bucket}/{s3_key}/input-data/train"
            )
        ),
        # Training Hyperparameters
        "early_stopping": True,
        "early_stopping_min_epochs": 5,
        "early_stopping_patience": 3,
        "epochs": 15,
        "lr_scheduler": "poly",
        "validation_mini_batch_size": 1,  # Score validation on the entire image without cropping
    },
)
sm_estimator

<sagemaker.estimator.Estimator at 0x7f8aa9e7d790>

## Hyperparameter Optimization

In [54]:
search_space = {
    "learning_rate": ContinuousParameter(
        min_value=1e-5, max_value=1e-1, scaling_type="Logarithmic"
    ),
    # Used if 'sgd' is used as the optimizer
    "momentum": ContinuousParameter(min_value=0.9, max_value=0.999),
    "optimizer": CategoricalParameter(values=["adam", "sgd", "rmsprop"]),
    "mini_batch_size": IntegerParameter(min_value=16, max_value=32),
    "weight_decay": ContinuousParameter(
        min_value=1e-5, max_value=1e-3, scaling_type="Logarithmic"
    ),
}

objective_metric_name = "validation:mIOU"
objective_type = "Maximize"
base_tuning_job_name = f"built-in-segment-hpo-job-{today}"
base_tuning_job_name

'built-in-segment-hpo-job-2023-05-27'

In [55]:
hpo_tuner = HyperparameterTuner(
    estimator=sm_estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=search_space,
    strategy="Bayesian",
    objective_type=objective_type,
    max_jobs=20,
    max_parallel_jobs=10,
    base_tuning_job_name=base_tuning_job_name,
)

In [None]:
hpo_tuner.fit(inputs=s3_data_channels)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................!


Check hpo results:

In [58]:
hpo_analytics = sagemaker.HyperparameterTuningJobAnalytics(
    hpo_tuner.latest_tuning_job.name
)

hpo_results = hpo_analytics.dataframe()

hpo_results.sort_values("FinalObjectiveValue", ascending=False)

Unnamed: 0,learning_rate,mini_batch_size,momentum,optimizer,weight_decay,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,0.004413,16.0,0.998535,sgd,1e-05,built-in-segment-hpo-230527-0654-020-5aad825b,Completed,0.045442,2023-05-27 07:33:38+00:00,2023-05-27 08:04:57+00:00,1879.0
11,0.085438,21.0,0.956085,rmsprop,0.000255,built-in-segment-hpo-230527-0654-009-532b369b,Completed,0.045442,2023-05-27 06:56:13+00:00,2023-05-27 07:33:18+00:00,2225.0
18,0.014735,20.0,0.99596,sgd,0.000234,built-in-segment-hpo-230527-0654-002-188b9e2a,Completed,0.045442,2023-05-27 06:55:56+00:00,2023-05-27 07:31:35+00:00,2139.0
17,0.000262,17.0,0.980688,sgd,0.000136,built-in-segment-hpo-230527-0654-003-d4896a13,Completed,0.045442,2023-05-27 06:55:55+00:00,2023-05-27 07:30:13+00:00,2058.0
16,0.003616,19.0,0.991817,adam,0.000156,built-in-segment-hpo-230527-0654-004-f32f8365,Completed,0.045442,2023-05-27 06:56:08+00:00,2023-05-27 07:32:32+00:00,2184.0
15,3.7e-05,24.0,0.996155,rmsprop,0.000107,built-in-segment-hpo-230527-0654-005-c1819399,Completed,0.045442,2023-05-27 06:56:02+00:00,2023-05-27 07:31:01+00:00,2099.0
14,0.005765,26.0,0.986656,adam,8.7e-05,built-in-segment-hpo-230527-0654-006-f8369f06,Completed,0.045442,2023-05-27 06:56:15+00:00,2023-05-27 07:27:53+00:00,1898.0
13,0.000704,28.0,0.984744,sgd,0.000341,built-in-segment-hpo-230527-0654-007-f8d6907c,Completed,0.045442,2023-05-27 06:56:05+00:00,2023-05-27 07:31:29+00:00,2124.0
12,0.076211,29.0,0.99834,rmsprop,0.000148,built-in-segment-hpo-230527-0654-008-c31180d5,Completed,0.045442,2023-05-27 06:56:12+00:00,2023-05-27 07:32:21+00:00,2169.0
10,0.08987,28.0,0.911675,sgd,0.000165,built-in-segment-hpo-230527-0654-010-dd5de575,Completed,0.045442,2023-05-27 06:56:08+00:00,2023-05-27 07:31:34+00:00,2126.0


## Deploy Best Model

In [59]:
# Obtain the name of best training job
best_model_name = sm_boto3.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=hpo_tuner.latest_tuning_job.name
)["BestTrainingJob"]["TrainingJobName"]

# Obtain s3 path to model artifacts
best_model_s3_path = sm_boto3.describe_training_job(TrainingJobName=best_model_name)[
    "ModelArtifacts"
]["S3ModelArtifacts"]

print(f"Best model artifacts persisted at {best_model_s3_path}")

Best model artifacts persisted at s3://yang-ml-sagemaker/lesion-segmentation/model/built-in-segment-hpo-230527-0654-006-f8369f06/output/model.tar.gz


In [60]:
best_model = hpo_tuner.best_estimator()

predictor = best_model.deploy(
    initial_instance_count=1,
    instance_type="ml.c5.xlarge",
    endpoint_name=endpoint_name + "-v1",
)


2023-05-27 07:31:09 Starting - Preparing the instances for training
2023-05-27 07:31:09 Downloading - Downloading input data
2023-05-27 07:31:09 Training - Training image download completed. Training in progress.
2023-05-27 07:31:09 Uploading - Uploading generated training model
2023-05-27 07:31:09 Completed - Resource reused by training job: built-in-segment-hpo-230527-0654-011-cb0ba415
----------------!

## Clean-up

In [70]:
sm_boto3.delete_endpoint(EndpointName=predictor.endpoint_name)
sm_boto3.delete_endpoint_config(EndpointConfigName=predictor.endpoint_name)
sm_boto3.delete_model(ModelName=best_model.name)

{'ResponseMetadata': {'RequestId': 'e178fdde-633e-4e89-9c10-2844d1bf82e0',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e178fdde-633e-4e89-9c10-2844d1bf82e0',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sat, 27 May 2023 08:24:35 GMT'},
  'RetryAttempts': 0}}

In [71]:
sm_boto3.close()