In [None]:
%%sh
# Upgrade the SageMaker Python SDK (quiet output) before running the notebook.
pip install --quiet --upgrade sagemaker

In [None]:
import sagemaker

# Print the installed SDK version so the run can be reproduced.
print(sagemaker.__version__)

# Create the SageMaker session and fetch its default S3 bucket;
# later cells build every S3 path from `bucket`.
session = sagemaker.Session()
bucket = session.default_bucket()

### Define channels

In [None]:
# Input channels: fully replicated, streamed in Pipe mode.

from sagemaker.inputs import ShuffleConfig

# All dataset and output locations live under one prefix in the default bucket.
prefix = 'imagenet-split'
s3_train_path = 's3://{}/{}/input/training/'.format(bucket, prefix)
s3_val_path = 's3://{}/{}/input/validation/'.format(bucket, prefix)
s3_output = 's3://{}/{}/output/'.format(bucket, prefix)

# Training channel: RecordIO data under the training prefix, shuffled with a fixed seed.
train_data = sagemaker.TrainingInput(
    s3_data=s3_train_path,
    distribution='FullyReplicated',
    shuffle_config=ShuffleConfig(seed=59),
    content_type='application/x-recordio',
    s3_data_type='S3Prefix',
    input_mode='Pipe',
)

# Validation channel: same settings, no shuffling.
validation_data = sagemaker.TrainingInput(
    s3_data=s3_val_path,
    distribution='FullyReplicated',
    content_type='application/x-recordio',
    s3_data_type='S3Prefix',
    input_mode='Pipe',
)

In [None]:
# Show the three S3 locations, one per line.
print(s3_train_path, s3_val_path, s3_output, sep='\n')

In [None]:
# Map channel names to the inputs defined above; passed to fit() in later cells.
s3_channels = dict(train=train_data, validation=validation_data)

### Get the name of the image classification algorithm in our region

In [None]:
# Resolve the image classification container image for the session's region.
region = session.boto_session.region_name
container = sagemaker.image_uris.retrieve(
    framework='image-classification',
    region=region,
)

print(container)

### Configure the training job

A quick test shows that a single p3.2xlarge instance with batch size set to 128 will crunch through the dataset at about **335 images/second**. 

As we have about **1,281,167 images**, we can expect one epoch to last about **3824 seconds** (about 1h and 4 minutes)...

An ml.p3.2xlarge costs $3.825 per hour in us-east-1. 

[ https://aws.amazon.com/sagemaker/pricing/ ]

Assuming that we need to train for 150 epochs to get decent accuracy:
- Duration: (3824/3600)*150 = 158 hours (about 6.5 days)
- Cost: 158 * 3.825 = $573

6.5 days is probably not acceptable from a business perspective. 
Let's try to speed our job with a multi-GPU instance.

In [None]:
# Baseline job: one multi-GPU instance, artifacts written to s3_output.
role = sagemaker.get_execution_role()

ic = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type='ml.p3dn.24xlarge',   # 8 GPUs
    instance_count=1,
    volume_size=1,
    output_path=s3_output,
)

### Set algorithm parameters

In [None]:
# Hyperparameters for the single-instance, 8-GPU run.
hyperparameters = {
    'num_layers': 50,                  # Train a Resnet-50 model
    'use_pretrained_model': 0,         # Train from scratch
    'num_classes': 1000,               # ImageNet has 1000 classes
    'num_training_samples': 1281167,   # Number of training samples
    'mini_batch_size': 1024,           # 8 GPUs * 128 = 1024
    'learning_rate': 0.4,
    'epochs': 2,
    'augmentation_type': 'crop',
    'top_k': 3,
    'kv_store': 'dist_sync',           # gradient updates are synchronized after each batch
}

ic.set_hyperparameters(**hyperparameters)



In [None]:
# Launch the training job on the train/validation channels.
ic.fit(inputs=s3_channels)

An ml.p3dn.24xlarge costs $35.894 per hour (us-east-1)

[ https://aws.amazon.com/sagemaker/pricing/ ]

Time per epoch: 727 seconds

For 150 epochs:
- Duration: (727/3600) * 150 = 30.3 hours (1.25 day)
- Cost: 30.3 * 35.894 = $1,087

We go 5x faster, but at almost 2x the cost. Let's start optimizing this.

CloudWatch shows that total GPU memory utilization is only 300%, meaning 300/8=37.5% on each GPU. Let's bump batch size to (1024/0.375)=2730, rounded up to 2736 to be divisible by 8.

A better way to understand how a training job uses the underlying infrastructure is to use the profiling capability in SageMaker Debugger. Let's run the same job and ask for a profiling report.

In [None]:
# Add profiling report

from sagemaker.debugger import rule_configs, Rule, ProfilerRule
from sagemaker.debugger import ProfilerConfig, FrameworkProfile

ic = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.p3dn.24xlarge',
    output_path=s3_output,
    volume_size=1,

    # Built-in rule that generates the profiling report.
    # https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-built-in-rules.html
    rules=[
        ProfilerRule.sagemaker(rule_configs.ProfilerReport())
    ],
    # The estimator argument is `profiler_config`; `framework_profile_params`
    # is an argument of ProfilerConfig itself, not of the estimator.
    profiler_config=ProfilerConfig(
        framework_profile_params=FrameworkProfile()
    )
)

In [None]:
# Same as the first run, except for the larger mini-batch.

ic.set_hyperparameters(
    num_layers=50,                 # Train a Resnet-50 model
    use_pretrained_model=0,        # Train from scratch
    num_classes=1000,              # ImageNet has 1000 classes
    num_training_samples=1281167,  # Number of training samples
    mini_batch_size=2736,          # Raised from 1024 to use more GPU memory
    learning_rate=0.4,
    epochs=2,
    kv_store='dist_sync',
    augmentation_type='crop',
    top_k=3,
)

In [None]:
# Launch the profiled training job on the train/validation channels.
ic.fit(inputs=s3_channels)

Time per epoch: 758 seconds

Maximizing GPU memory usage is good practice, as it keeps GPU cores as busy as possible. However, it didn't make a difference here, perhaps because of the cost of synchronizing gradients across GPUs.

Now, let's add a second instance to scale out the training job. We also introduce Managed Spot Training.

In [None]:
# Scale out to two instances and switch to Managed Spot Training.

ic = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type='ml.p3dn.24xlarge',
    instance_count=2,                 # <-------- distributed training
    volume_size=1,
    output_path=s3_output,

    # Managed Spot Training settings (both limits are in seconds).
    use_spot_instances=True,          # <--------
    max_run=3600,                     # <--------
    max_wait=3600,                    # <--------
)

In [None]:
# Hyperparameters unchanged from the previous run.

ic.set_hyperparameters(
    num_layers=50,                 # Train a Resnet-50 model
    use_pretrained_model=0,        # Train from scratch
    num_classes=1000,              # ImageNet has 1000 classes
    num_training_samples=1281167,  # Number of training samples
    mini_batch_size=2736,
    learning_rate=0.4,
    epochs=2,
    kv_store='dist_sync',
    augmentation_type='crop',
    top_k=3,
)

In [None]:
# Launch the two-instance spot training job on the train/validation channels.
ic.fit(inputs=s3_channels)

Time per epoch: 378 seconds

For 150 epochs:
- Duration: (378/3600) * 150 = 15.75 hours
- Cost: 2 * 15.75 * 38.768 * 0.30 = $366

2x speedup, 3x cheaper :)

In [None]:
# Same spot job, scaled out to 4 instances.

ic = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type='ml.p3dn.24xlarge',
    instance_count=4,                 # <--------
    volume_size=1,
    output_path=s3_output,
    use_spot_instances=True,
    max_run=3600,
    max_wait=3600,
)

# Hyperparameters unchanged from the previous run.

ic.set_hyperparameters(
    num_layers=50,                 # Train a Resnet-50 model
    use_pretrained_model=0,        # Train from scratch
    num_classes=1000,              # ImageNet has 1000 classes
    num_training_samples=1281167,  # Number of training samples
    mini_batch_size=2736,
    learning_rate=0.4,
    epochs=2,
    kv_store='dist_sync',
    augmentation_type='crop',
    top_k=3,
)

# Launch the job on the train/validation channels.
ic.fit(inputs=s3_channels)

Time per epoch: 198 seconds

For 150 epochs:
- Duration: (198/3600) * 150 = 8.25 hours
- Cost: 4 * 8.25 * 38.768 * 0.30 = $382
    
2x speedup, 5% cost increase

In [None]:
# Same spot job, scaled out to 8 instances.

ic = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type='ml.p3dn.24xlarge',
    instance_count=8,                 # <-------- 64 GPUs
                                      # 327K CUDA cores, 2TB of GPU RAM
                                      # 8 Petaflops (!) for Fused Multiply Add matrix operations (A*B + C)
    volume_size=1,
    output_path=s3_output,
    use_spot_instances=True,
    max_run=3600,
    max_wait=3600,
)

# Hyperparameters unchanged from the previous run.

ic.set_hyperparameters(
    num_layers=50,                 # Train a Resnet-50 model
    use_pretrained_model=0,        # Train from scratch
    num_classes=1000,              # ImageNet has 1000 classes
    num_training_samples=1281167,  # Number of training samples
    mini_batch_size=2736,
    learning_rate=0.4,
    epochs=2,
    kv_store='dist_sync',
    augmentation_type='crop',
    top_k=3,
)

# Launch the job on the train/validation channels.
ic.fit(inputs=s3_channels)

Time per epoch: 99 seconds

For 150 epochs:
- Duration: (99/3600) * 150 = 4.12 hours
- Cost: 8 * 4.12 * 38.768 * 0.30 = $383 (!)
    
2x speedup, same cost \m/


We started with 158 hours and a $170 spend (1 p3.2xlarge with 70% spot savings).

For about 2.25x the initial cost, we've accelerated our job about 38x.

We're not quite done with cost optimization yet, but let's train this for a little while.

In [None]:
# Add debugging rules
# Add checkpointing paths

# Re-imported here so this cell does not depend on the profiling cell having run.
from sagemaker.debugger import rule_configs, Rule, ProfilerRule

# S3 location for checkpoints, under the same prefix as the job's inputs and outputs.
checkpoint_s3_uri = 's3://{}/{}/checkpoints/'.format(bucket, prefix)

ic = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=8,
    instance_type='ml.p3dn.24xlarge',
    output_path=s3_output,
    volume_size=1,

    use_spot_instances=True,
    max_run=3600,
    max_wait=3600,

    # Reuse the URI computed above instead of formatting it a second time.
    checkpoint_s3_uri=checkpoint_s3_uri,                                # <-----
    checkpoint_local_path='/opt/ml/checkpoints',                        # <-----

    rules=[
        ProfilerRule.sagemaker(rule_configs.ProfilerReport()),

        # https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-built-in-rules.html
        Rule.sagemaker(rule_configs.overfit()),
        Rule.sagemaker(rule_configs.loss_not_decreasing()),            # <-----
        Rule.sagemaker(rule_configs.vanishing_gradient()),             # <-----
        Rule.sagemaker(rule_configs.exploding_tensor())                # <-----
    ]
)

In [None]:
# Increase epochs.
# Learning-rate scheduling and early stopping are left commented out below;
# uncomment them to enable.

ic.set_hyperparameters(
    num_layers=50,                 # Train a Resnet-50 model
    use_pretrained_model=0,        # Train from scratch
    num_classes=1000,              # ImageNet has 1000 classes
    num_training_samples=1281167,  # Number of training samples
    mini_batch_size=2736,

    optimizer='sgd',
    learning_rate=0.4,

    # lr_scheduler_factor=0.5,
    # lr_scheduler_step='30,60,90,120',
    epochs=10,                     # <-----

    kv_store='dist_sync',
    augmentation_type='crop',
    top_k=3,

    # early_stopping=True,           # <-----
    # early_stopping_patience=10     # <-----
)

In [None]:
# Launch the longer training run on the train/validation channels.
ic.fit(inputs=s3_channels)

### Deploy the model on a GPU instance

**ml.g4dn.xlarge** is the most cost effective GPU instance.

1 NVIDIA T4 GPU, 8 Teraflops FP32
$0.736 / hour (us-east-1)

In [None]:
# Deploy the trained model to a single GPU-backed endpoint.
ic_predictor = ic.deploy(
    endpoint_name='imagenet-endpoint',
    instance_type='ml.g4dn.xlarge',
    initial_instance_count=1,
)

### Deploy the model on a CPU instance accelerated with Amazon Elastic Inference

Alternatively, we can also deploy on an ml.c5.xlarge instance combined with an accelerator.

ml.c5.xlarge: $0.205 / hour (us-east-1)

ml.eia2.medium  : 1 Teraflop FP32, $0.128 / hour (us-east-1)

ml.eia2.large   : 2 Teraflops FP32, $0.240

ml.eia2.xlarge : 4 Teraflops FP32, $0.340

In [None]:
# Deploy the same model to a CPU instance with an Elastic Inference accelerator attached.
ic_predictor_ei = ic.deploy(
    endpoint_name='imagenet-endpoint-ei',
    instance_type='ml.c5.xlarge',
    accelerator_type='ml.eia2.medium',
    initial_instance_count=1,
)

### Delete endpoints

In [None]:
# Tear down both endpoints, in the order they were created.
for endpoint_predictor in (ic_predictor, ic_predictor_ei):
    endpoint_predictor.delete_endpoint()