In [3]:
%reload_ext autoreload
%autoreload 2

import os
import io
import sys
import datetime

import boto3
import sagemaker
from sagemaker.tensorflow.estimator import TensorFlow
from sagemaker.tensorflow import TensorFlowModel
from sagemaker.tuner import (
    ContinuousParameter,
    IntegerParameter,
    CategoricalParameter,
    HyperparameterTuner,
)
from sklearn.metrics import accuracy_score
from tqdm import tqdm

import numpy as np

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # Nopep8

sys.path.append(os.path.dirname(os.getcwd()))
from src.model_utils import HoverHelper

from bokeh.plotting import figure, show
import bokeh

bokeh.io.output_notebook()

## S3

In [4]:
s3 = boto3.client("s3")

s3_bucket = "yang-ml-sagemaker"
s3_key = "weather-classification"

train_val_path = {
    "train": f"s3://{s3_bucket}/{s3_key}/input-data/train",
    "val": f"s3://{s3_bucket}/{s3_key}/input-data/val",
}
train_val_path

{'train': 's3://yang-ml-sagemaker/weather-classification/input-data/train',
 'val': 's3://yang-ml-sagemaker/weather-classification/input-data/val'}

## Sagemaker

In [12]:
sagemaker_session = sagemaker.Session(default_bucket=s3_bucket)
sagemaker_session
sm_boto3 = boto3.client("sagemaker")
role = sagemaker.get_execution_role()
s3_uploader = sagemaker.s3.S3Uploader()
instance_type = "ml.c5.9xlarge"
instance_count = 1

src_dir_path = os.path.join(os.path.dirname(os.getcwd()), "src")
fine_tune_script_path = "fine_tune_entry.py"
model_dir = "/opt/ml/model"
volume_size = 30
output_path = f"s3://{s3_bucket}/{s3_key}/models"
code_location = f"s3://{s3_bucket}/{s3_key}/code"

checkpoint_s3_uri = f"s3://{s3_bucket}/{s3_key}/checkpoints"
use_spot_instances = True
max_run = 86400
max_retry_attempts = 2

PY_VERSION = "py39"
FRAMEWORK_VERSION = (
    "2.8"  # Avoid bug with regards to data augmentation in tf 2.9 - 2.12
)

# Get today's date in YYYYMMDD format
today = datetime.datetime.today().strftime("%Y-%m-%d")

# Define endpoint name with version and date
endpoint_name = f"fine-tune-cnn-{today}"

## Tensorflow Estimator

In [13]:
tf_estimator = TensorFlow(
    entry_point=fine_tune_script_path,
    source_dir=src_dir_path,
    role=role,
    py_version=PY_VERSION,
    framework_version=FRAMEWORK_VERSION,
    volume_size=volume_size,
    model_dir=model_dir,
    output_path=output_path,
    code_location=output_path,
    instance_type=instance_type,
    instance_count=instance_count,
    # Spot training
    max_run=max_run,
    max_wait=max_run,
    max_retry_attempts=max_retry_attempts,
    use_spot_instances=use_spot_instances,
    checkpoint_s3_uri=checkpoint_s3_uri,
)
tf_estimator

<sagemaker.tensorflow.estimator.TensorFlow at 0x7fcdd9e206a0>

## Hyperpameter Optimization

In [14]:
search_space = {
    "random_contrast_factor": ContinuousParameter(min_value=0.1, max_value=1.0),
    "random_flip_mode": CategoricalParameter(
        values=["horizontal", "vertical", "horizontal_and_vertical"]
    ),
    "random_rotation_factor": ContinuousParameter(min_value=0.1, max_value=0.5),
    "random_zoom_factor": ContinuousParameter(min_value=0.1, max_value=0.5),
    "dense_units": CategoricalParameter(values=[64, 128, 256]),
    "dense_weight_decay": ContinuousParameter(
        min_value=1e-8, max_value=1e-3, scaling_type="Logarithmic"
    ),
    "dropout_rate": ContinuousParameter(min_value=0.0, max_value=0.5),
    "learning_rate": ContinuousParameter(
        min_value=1e-4, max_value=1e-2, scaling_type="Logarithmic"
    ),
    "clipnorm": ContinuousParameter(min_value=0.2, max_value=1.0),
    "batch_size": CategoricalParameter(values=[32, 64, 128]),
    "epochs": IntegerParameter(min_value=10, max_value=20),
}

objective_metric_name = "accuracy"
objective_type = "Maximize"
metric_definitions = [
    {
        "Name": objective_metric_name,
        "Regex": "Best validation accuracy after fine-tuning: ([0-9\\.]+)",
    }
]
base_tuning_job_name = f"fine-tune-hpo-job-{today}"

In [15]:
hpo_tuner = HyperparameterTuner(
    estimator=tf_estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=search_space,
    metric_definitions=metric_definitions,
    strategy="Bayesian",
    objective_type=objective_type,
    max_jobs=25,
    max_parallel_jobs=5,
    base_tuning_job_name=base_tuning_job_name,
)

In [None]:
hpo_tuner.fit(train_val_path)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


...............................................................................................................

Check hpo results:

In [17]:
hpo_analytics = sagemaker.HyperparameterTuningJobAnalytics(
    hpo_tuner.latest_tuning_job.name
)

hpo_results = hpo_analytics.dataframe()

hpo_results.sort_values("FinalObjectiveValue", ascending=False).head(10)

Unnamed: 0,batch_size,clipnorm,dense_units,dense_weight_decay,dropout_rate,epochs,learning_rate,random_contrast_factor,random_flip_mode,random_rotation_factor,random_zoom_factor,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
5,"""128""",0.861308,"""128""",0.0001199154,0.205819,12.0,0.0001,0.782325,"""horizontal_and_vertical""",0.1,0.249429,fine-tune-hpo-job-20-230512-2030-020-6f6654bb,Completed,0.982036,2023-05-12 20:55:37+00:00,2023-05-12 21:01:09+00:00,332.0
12,"""128""",0.43188,"""256""",0.0001045473,0.230433,14.0,0.000649,0.448479,"""vertical""",0.13563,0.463527,fine-tune-hpo-job-20-230512-2030-013-f129f307,Completed,0.976048,2023-05-12 20:46:17+00:00,2023-05-12 20:53:00+00:00,403.0
16,"""64""",1.0,"""64""",0.0005557043,0.337955,10.0,0.000149,0.190498,"""horizontal""",0.320856,0.1,fine-tune-hpo-job-20-230512-2030-009-1a871ba7,Completed,0.976048,2023-05-12 20:40:13+00:00,2023-05-12 20:45:25+00:00,312.0
8,"""32""",0.948598,"""256""",1.828381e-07,0.0,19.0,0.0001,1.0,"""horizontal""",0.226589,0.480112,fine-tune-hpo-job-20-230512-2030-017-4ba886a4,Completed,0.976048,2023-05-12 20:50:13+00:00,2023-05-12 20:55:41+00:00,328.0
19,"""64""",0.537339,"""256""",1.488429e-08,0.352538,10.0,0.000173,0.102225,"""horizontal""",0.25675,0.155183,fine-tune-hpo-job-20-230512-2030-006-d0754436,Completed,0.97006,2023-05-12 20:38:33+00:00,2023-05-12 20:42:46+00:00,253.0
18,"""128""",0.918315,"""128""",2.132268e-06,0.037743,20.0,0.000159,1.0,"""vertical""",0.224701,0.349533,fine-tune-hpo-job-20-230512-2030-007-919ec30a,Completed,0.97006,2023-05-12 20:39:16+00:00,2023-05-12 20:45:54+00:00,398.0
4,"""128""",0.386434,"""256""",6.816605e-06,0.237063,19.0,0.0001,0.397628,"""horizontal_and_vertical""",0.27177,0.188863,fine-tune-hpo-job-20-230512-2030-021-33985d06,Completed,0.97006,2023-05-12 20:55:59+00:00,2023-05-12 21:03:30+00:00,451.0
17,"""128""",0.2,"""64""",1e-08,0.0,10.0,0.0001,0.791549,"""vertical""",0.5,0.369037,fine-tune-hpo-job-20-230512-2030-008-2c331e3f,Completed,0.97006,2023-05-12 20:39:30+00:00,2023-05-12 20:46:08+00:00,398.0
1,"""128""",1.0,"""64""",3.239013e-05,0.0,20.0,0.000178,0.1,"""vertical""",0.1,0.479058,fine-tune-hpo-job-20-230512-2030-024-d7d5a4de,Completed,0.97006,2023-05-12 21:00:39+00:00,2023-05-12 21:08:15+00:00,456.0
10,"""32""",0.325136,"""128""",2.628396e-05,0.166228,13.0,0.001136,0.198546,"""horizontal""",0.492498,0.252394,fine-tune-hpo-job-20-230512-2030-015-fe19df8a,Completed,0.964072,2023-05-12 20:49:58+00:00,2023-05-12 20:57:11+00:00,433.0


In [18]:
hover = HoverHelper(hpo_analytics)

p = figure(plot_width=900, plot_height=400, tools=hover.tools(), x_axis_type="datetime")
p.circle(source=hpo_results, x="TrainingStartTime", y="FinalObjectiveValue")
p.xaxis.axis_label = "Trial"
p.yaxis.axis_label = "Accuracy"
p.title = "HPO Results"
show(p);

## Deploy Best Model

In [19]:
# Obtain the name of best training job
best_model_name = sm_boto3.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=hpo_tuner.latest_tuning_job.name
)["BestTrainingJob"]["TrainingJobName"]

# Obtain s3 path to model artifacts
best_model_s3_path = sm_boto3.describe_training_job(TrainingJobName=best_model_name)[
    "ModelArtifacts"
]["S3ModelArtifacts"]

print(f"Best model artifacts persisted at {best_model_s3_path}")

Best model artifacts persisted at s3://sagemaker-us-east-1-722696965592/fine-tune-hpo-job-20-230512-2030-020-6f6654bb/output/model.tar.gz


In [None]:
best_model = TensorFlowModel(
    model_data=best_model_s3_path, role=role, framework_version=FRAMEWORK_VERSION
)

predictor = best_model.deploy(
    initial_instance_count=1,
    instance_type="ml.c5.4xlarge",
    endpoint_name=endpoint_name + "-v1",
    container_startup_health_check_timeout=5 * 60,
)

update_endpoint is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


---!

## Inference on Test Set

In [21]:
# Load test data
X_test = np.load(
    io.BytesIO(
        s3.get_object(Bucket=s3_bucket, Key=f"{s3_key}/input-data/test/X_test.npy")[
            "Body"
        ].read()
    )
)
y_test = np.load(
    io.BytesIO(
        s3.get_object(Bucket=s3_bucket, Key=f"{s3_key}/input-data/test/y_test.npy")[
            "Body"
        ].read()
    )
)

X_test.shape, y_test.shape

((172, 256, 256, 3), (172,))

Make predictions in batches:

In [22]:
# Define the batch size
batch_size = 5

# Generate predictions in batches
num_instances = X_test.shape[0]
num_batches = (num_instances + batch_size - 1) // batch_size

predicted_labels = []
for i in tqdm(range(num_batches)):
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_instances)
    batch = X_test[start_idx:end_idx]

    # Call the predict method of the TensorFlowPredictor object
    predictions = predictor.predict(batch)

    # Extract the predicted labels from the response
    predicted_labels_batch = np.argmax(
        predictions["predictions"], axis=1
    )  # Use axis=1 to find max probability across 4 columns (classes)

    # Append the predicted labels to the list
    predicted_labels.append(predicted_labels_batch)

# Concatenate the predicted labels from all batches
predicted_labels = np.concatenate(predicted_labels, axis=0)

predicted_labels

100%|██████████| 35/35 [01:01<00:00,  1.77s/it]


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 2, 2, 0, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [23]:
print(
    "Accuracy score on test set:",
    accuracy_score(y_true=y_test, y_pred=predicted_labels),
)

Accuracy score on test set: 0.9709302325581395


## Clean up

In [24]:
sm_boto3.delete_endpoint(EndpointName=predictor.endpoint_name)
sm_boto3.delete_endpoint_config(EndpointConfigName=predictor.endpoint_name)
sm_boto3.delete_model(ModelName=best_model.name)

{'ResponseMetadata': {'RequestId': '7a093c1c-d1da-4151-861d-20dcba7db8b9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '7a093c1c-d1da-4151-861d-20dcba7db8b9',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Fri, 12 May 2023 21:30:49 GMT'},
  'RetryAttempts': 0}}

In [25]:
sm_boto3.close()