In [50]:
%reload_ext autoreload
%autoreload 2

import sys
import os
import datetime
import json

import boto3
import sagemaker
from sagemaker.tensorflow.estimator import TensorFlow
from sagemaker.tensorflow import TensorFlowModel
from sagemaker.tuner import ContinuousParameter, IntegerParameter, CategoricalParameter, HyperparameterTuner

import numpy as np
import pandas as pd
from tqdm import tqdm

sys.path.append(os.path.dirname(os.getcwd()))
from src.custom_utils import HoverHelper, DataHandler

from bokeh.plotting import figure, show
import bokeh
bokeh.io.output_notebook()

## S3 & Sagemaker

In [2]:
s3 = boto3.client('s3')

s3_bucket = 'yang-ml-sagemaker'
s3_key = 'mnist'

train_val_path = {'train': f's3://{s3_bucket}/{s3_key}/train-data',
                  'val': f's3://{s3_bucket}/{s3_key}/val-data'}
train_val_path

{'train': 's3://yang-ml-sagemaker/mnist/train-data',
 'val': 's3://yang-ml-sagemaker/mnist/val-data'}

In [3]:
sagemaker_session = sagemaker.Session(default_bucket=s3_bucket)
sagemaker_session
sm_boto3 = boto3.client('sagemaker')
role = sagemaker.get_execution_role()
instance_type = 'ml.c5.2xlarge'
instance_count = 1

src_dir_path = os.path.join(os.path.dirname(os.getcwd()), 'src')
training_script_path = 'train.py'
model_dir = '/opt/ml/model'

PY_VERSION='py37'
FRAMEWORK_VERSION = '2.2'

# Get today's date in YYYYMMDD format
today = datetime.datetime.today().strftime('%Y-%m-%d')

# Define endpoint name with version and date
endpoint_name = f'image-cnn-{today}'

## Tensorflow Estimator

In [4]:
tf_estimator = TensorFlow(
    entry_point=training_script_path,
    source_dir=src_dir_path,
    role=role,
    py_version=PY_VERSION,
    framework_version=FRAMEWORK_VERSION,
    model_dir=model_dir,
    output_path=f's3://{s3_bucket}/{s3_key}/model',
    instance_type=instance_type,
    instance_count=instance_count
)
tf_estimator

<sagemaker.tensorflow.estimator.TensorFlow at 0x7f1f6bf9a8c0>

## Hyperparameter Optimization

In [5]:
search_space = {
    'filter_dim_1': CategoricalParameter(values=[32, 64, 128]),
    'filter_dim_2': CategoricalParameter(values=[32, 64, 128]),
    'filter_dim_3': CategoricalParameter(values=[32, 64, 128]),
    'filter_dim_4': CategoricalParameter(values=[32, 64, 128]),
    'filter_dim_5': CategoricalParameter(values=[32, 64, 128]),
    'conv2d_regularizer_decay': ContinuousParameter(min_value=1e-8, max_value=1e-3, scaling_type='Logarithmic'),
    'dense_units_1': CategoricalParameter(values=[32, 64]),
    'dense_units_2': CategoricalParameter(values=[32, 64]),
    'dense_regularizer_decay': ContinuousParameter(min_value=1e-8, max_value=1e-3, scaling_type='Logarithmic'),
    'kernel_size': IntegerParameter(min_value=3, max_value=7),
    'dropout_rate': ContinuousParameter(min_value=0.0, max_value=0.4),
    'batch_norm_momentum': ContinuousParameter(min_value=0.75, max_value=0.99),
    'learning_rate': ContinuousParameter(min_value=1e-4, max_value=1e-2, scaling_type='Logarithmic'),
    'clipnorm': ContinuousParameter(min_value=0.2, max_value=1.0),
    'batch_size': CategoricalParameter(values=[64, 128, 256]),
    'epochs': IntegerParameter(min_value=10, max_value=20)
}

objective_metric_name = 'accuracy'
objective_type = 'Maximize'
metric_definitions = [
    {
        'Name': objective_metric_name,
        "Regex": 'Best validation accuracy: ([0-9\\.]+)',
    }
]
base_tuning_job_name = f'cnn-hpo-job-{today}'

In [6]:
hpo_tuner = HyperparameterTuner(
    estimator=tf_estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=search_space,
    metric_definitions=metric_definitions,
    strategy='Bayesian',
    objective_type=objective_type,
    max_jobs=50,
    max_parallel_jobs=5,
    base_tuning_job_name=base_tuning_job_name
)

In [7]:
hpo_tuner.fit(train_val_path)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


.......................................................................................................................................................................................................................................................................................!


Check optimization results:

In [8]:
hpo_analytics = sagemaker.HyperparameterTuningJobAnalytics(hpo_tuner.latest_tuning_job.name)

hpo_results = hpo_analytics.dataframe()

hpo_results.sort_values("FinalObjectiveValue", ascending=False).head(10)

Unnamed: 0,batch_norm_momentum,batch_size,clipnorm,conv2d_regularizer_decay,dense_regularizer_decay,dense_units_1,dense_units_2,dropout_rate,epochs,filter_dim_1,...,filter_dim_4,filter_dim_5,kernel_size,learning_rate,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
2,0.848839,"""128""",0.924783,9.246374e-06,1e-08,"""64""","""64""",0.08519,14.0,"""32""",...,"""32""","""128""",7.0,0.000337,cnn-hpo-job-2023-04--230429-2307-048-37aeaea3,Completed,0.8,2023-04-29 23:28:54+00:00,2023-04-29 23:30:21+00:00,87.0
17,0.860214,"""128""",0.469414,0.0002190654,1.033873e-07,"""64""","""64""",0.14299,20.0,"""64""",...,"""32""","""128""",6.0,0.000232,cnn-hpo-job-2023-04--230429-2307-033-c877c889,Completed,0.793333,2023-04-29 23:22:04+00:00,2023-04-29 23:23:46+00:00,102.0
1,0.857955,"""128""",0.876657,1.033552e-07,1.283954e-07,"""64""","""64""",0.139166,18.0,"""32""",...,"""32""","""128""",4.0,0.000222,cnn-hpo-job-2023-04--230429-2307-049-7bf2aa37,Completed,0.78,2023-04-29 23:29:12+00:00,2023-04-29 23:30:59+00:00,107.0
0,0.882589,"""128""",0.535411,8.85617e-05,1.627146e-07,"""64""","""64""",0.186092,17.0,"""32""",...,"""32""","""64""",7.0,0.000232,cnn-hpo-job-2023-04--230429-2307-050-dd7d90f7,Completed,0.773333,2023-04-29 23:29:30+00:00,2023-04-29 23:31:26+00:00,116.0
27,0.865835,"""256""",0.373565,8.195108e-07,0.001,"""64""","""32""",0.196572,13.0,"""64""",...,"""128""","""128""",3.0,0.001082,cnn-hpo-job-2023-04--230429-2307-023-5d8b7676,Completed,0.773333,2023-04-29 23:18:03+00:00,2023-04-29 23:19:30+00:00,87.0
18,0.857444,"""256""",0.704177,9.581983e-08,0.000126298,"""64""","""64""",0.324779,17.0,"""64""",...,"""128""","""128""",7.0,0.001021,cnn-hpo-job-2023-04--230429-2307-032-3e229e0c,Completed,0.773333,2023-04-29 23:21:45+00:00,2023-04-29 23:23:20+00:00,95.0
32,0.819796,"""128""",0.312706,1.110267e-07,0.001,"""64""","""32""",0.274235,15.0,"""64""",...,"""32""","""128""",6.0,0.000835,cnn-hpo-job-2023-04--230429-2307-018-e884392b,Completed,0.766667,2023-04-29 23:15:45+00:00,2023-04-29 23:17:03+00:00,78.0
31,0.767531,"""256""",0.25063,1.181151e-06,1.160712e-08,"""64""","""32""",0.16367,14.0,"""32""",...,"""64""","""64""",7.0,0.000191,cnn-hpo-job-2023-04--230429-2307-019-040975bb,Completed,0.766667,2023-04-29 23:16:05+00:00,2023-04-29 23:18:01+00:00,116.0
30,0.791487,"""256""",0.902611,2.869177e-08,3.871795e-07,"""64""","""64""",0.142139,15.0,"""64""",...,"""128""","""64""",3.0,0.002939,cnn-hpo-job-2023-04--230429-2307-020-8b0f4e29,Completed,0.766667,2023-04-29 23:16:23+00:00,2023-04-29 23:17:55+00:00,92.0
11,0.88672,"""128""",0.69124,0.0001563349,5.38322e-06,"""64""","""64""",0.340181,11.0,"""64""",...,"""32""","""128""",7.0,0.001237,cnn-hpo-job-2023-04--230429-2307-039-efcceea8,Completed,0.763333,2023-04-29 23:24:32+00:00,2023-04-29 23:26:54+00:00,142.0


In [16]:
hover = HoverHelper(hpo_analytics)

p = figure(plot_width=900, plot_height=400, tools=hover.tools(), x_axis_type='datetime')
p.circle(source=hpo_results, x='TrainingStartTime', y='FinalObjectiveValue')
p.xaxis.axis_label = 'Trial'
p.yaxis.axis_label = 'Accuracy' 
p.title = 'HPO Results'
show(p);

### Deploy Best Model

Find the s3 uri for the training artifacts of the best model:

In [17]:
# Obtain the name of best training job
best_model_name = sm_boto3.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=hpo_tuner.latest_tuning_job.name
)['BestTrainingJob']['TrainingJobName']

# Obtain s3 path to model artifacts
best_model_s3_path = sm_boto3.describe_training_job(
    TrainingJobName=best_model_name
)["ModelArtifacts"]["S3ModelArtifacts"]

print(f'Best model artifacts persisted at {best_model_s3_path}')

Best model artifacts persisted at s3://yang-ml-sagemaker/mnist/model/cnn-hpo-job-2023-04--230429-2307-048-37aeaea3/output/model.tar.gz


Deploy directly from artifacts:

In [18]:
best_model = TensorFlowModel(
    model_data=best_model_s3_path,
    role=role,
    framework_version=FRAMEWORK_VERSION
)

predictor = best_model.deploy(
    initial_instance_count=1,
    instance_type='ml.c5.2xlarge',
    endpoint_name=endpoint_name + '-v1'
)

update_endpoint is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


----!

## Inference on Test Set

In [31]:
# Load test data
data_handler = DataHandler(s3_bucket, s3_key)
X_test = data_handler.load_data('test')
X_test.shape

(10000, 28, 28, 1)

Make predictions in batches:

In [52]:
# Define the batch size
batch_size = 1000

# Generate predictions in batches
num_instances = X_test.shape[0]
num_batches = (num_instances + batch_size - 1) // batch_size

predicted_labels = []
for i in tqdm(range(num_batches)):
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_instances)
    batch = X_test[start_idx:end_idx]
    
    # Call the predict method of the TensorFlowPredictor object
    predictions = predictor.predict(batch)
    
    # Extract the predicted labels from the response
    predicted_labels_batch = np.argmax(predictions['predictions'], axis=1) # Use axis=1 to find max probability across 10 columns
    
    # Append the predicted labels to the list
    predicted_labels.append(predicted_labels_batch)

# Concatenate the predicted labels from all batches
predicted_labels = np.concatenate(predicted_labels, axis=0)

predicted_labels

100%|██████████| 10/10 [00:11<00:00,  1.11s/it]


array([6, 5, 4, ..., 2, 2, 6])

In [54]:
np.unique(predicted_labels, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([1154,  990, 1019,  964,  683, 1115, 1149,  970, 1041,  915]))

## Clean-up

In [55]:
sm_boto3.delete_endpoint(EndpointName=predictor.endpoint_name)
sm_boto3.delete_endpoint_config(EndpointConfigName=predictor.endpoint_name)

{'ResponseMetadata': {'RequestId': '859aed69-097c-47b5-af7e-b42f27bd556c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '859aed69-097c-47b5-af7e-b42f27bd556c',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sun, 30 Apr 2023 00:30:42 GMT'},
  'RetryAttempts': 0}}

In [56]:
sm_boto3.close()