# Dog breed classification using AWS SageMaker

In [122]:
!pip install smdebug
!pip install sagemaker -U

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [123]:
import os

import boto3
import IPython
import matplotlib.pyplot as plt
import pandas as pd
import sagemaker
from sagemaker.debugger import (
    Rule,
    DebuggerHookConfig,
    rule_configs,
    ProfilerRule,
    CollectionConfig
)
from sagemaker.pytorch import PyTorch
from sagemaker.pytorch import PyTorchModel
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

## 1. Dataset

Download the data from the Udacity S3 bucket to the local machine, then upload it to my personal S3 bucket.

In [None]:
# Command to download and unzip data
!wget https://s3-us-west-1.amazonaws.com/udacity-aind/dog-project/dogImages.zip
!unzip dogImages.zip

--2023-03-17 00:03:49--  https://s3-us-west-1.amazonaws.com/udacity-aind/dog-project/dogImages.zip
Resolving s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)... 52.219.193.0
Connecting to s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)|52.219.193.0|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1132023110 (1.1G) [application/zip]
Saving to: ‘dogImages.zip.1’


2023-03-17 00:04:28 (28.4 MB/s) - ‘dogImages.zip.1’ saved [1132023110/1132023110]

Archive:  dogImages.zip
replace dogImages/test/001.Affenpinscher/Affenpinscher_00003.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [6]:
# Copy the local dogImages/ directory to the project bucket under the dogImages/ prefix.
# The bucket name is hardcoded here; it must match the BUCKET constant defined further down.
!aws s3 sync dogImages s3://project3-files/dogImages

upload: dogImages/test/001.Affenpinscher/Affenpinscher_00003.jpg to s3://project3-files/dogImages/test/001.Affenpinscher/Affenpinscher_00003.jpg
upload: dogImages/test/001.Affenpinscher/Affenpinscher_00023.jpg to s3://project3-files/dogImages/test/001.Affenpinscher/Affenpinscher_00023.jpg
upload: dogImages/test/001.Affenpinscher/Affenpinscher_00071.jpg to s3://project3-files/dogImages/test/001.Affenpinscher/Affenpinscher_00071.jpg
upload: dogImages/test/002.Afghan_hound/Afghan_hound_00116.jpg to s3://project3-files/dogImages/test/002.Afghan_hound/Afghan_hound_00116.jpg
upload: dogImages/test/001.Affenpinscher/Affenpinscher_00036.jpg to s3://project3-files/dogImages/test/001.Affenpinscher/Affenpinscher_00036.jpg
upload: dogImages/test/002.Afghan_hound/Afghan_hound_00125.jpg to s3://project3-files/dogImages/test/002.Afghan_hound/Afghan_hound_00125.jpg
upload: dogImages/test/002.Afghan_hound/Afghan_hound_00141.jpg to s3://project3-files/dogImages/test/002.Afghan_hound/Afghan_hound_00141.j

Create data channel variable for model and tuner fitting.

In [75]:
# Name of the S3 bucket that holds the dataset and receives the job outputs
BUCKET = "project3-files"

In [76]:
# One input channel per dataset split, each pointing at the matching
# dogImages/<split>/ prefix in the project bucket.
data_channels = {
    split: f"s3://{BUCKET}/dogImages/{split}/"
    for split in ("train", "test", "valid")
}

In [77]:
# Execution role of this notebook; passed as `role=` to the estimators and model below
role = sagemaker.get_execution_role()

## 2. Hyperparameter Tuning

The following hyperparameters were tuned: `Learning Rate`, `Batch Size` and `Number of Epochs`.

In [78]:
# Search space handed to the HyperparameterTuner:
#   lr         - continuous value between 0.001 and 0.01
#   batch-size - one of 32 or 64
#   epochs     - integer between 10 and 20
# NOTE: the name `hyperparameters` is reassigned later in the notebook to a dict
# of fixed values, so this cell must be re-run before re-creating the tuner.
hyperparameters = {"lr": ContinuousParameter(0.001, 0.01),
                   "batch-size": CategoricalParameter([32, 64]),
                   "epochs": IntegerParameter(10, 20)
                  }

The objective of the model training is to `minimize` the loss. In this case, `average test loss` was used.

In [79]:
# Tuning objective.
# The metric scraped from the training log is an accuracy, so the tuner must
# MAXIMIZE it; "Minimize" would make it select the least accurate model.
# NOTE(review): the surrounding text describes minimising the average test
# loss. If that is the intended objective, swap in a loss metric (name + regex
# matching what hpo.py logs) and set objective_type back to "Minimize".
objective_metric_name = "test:accuracy"
objective_type = "Maximize"
# The regex's single capture group is the numeric value logged after "Testing Accuracy: ".
metric_definitions = [{"Name": "test:accuracy", "Regex": "Testing Accuracy: ([0-9\\.]+)"}]

A `PyTorch` estimator is defined and then wrapped in a `HyperparameterTuner`.

In [80]:
# S3 prefix for the training-job outputs. It is defined here because this is
# the first cell that needs it; previously it was only assigned in the
# debugging section further down, so a fresh-kernel run failed with NameError.
output_path = f"s3://{BUCKET}/outputs/"

# Base estimator that the tuner clones for every trial; hpo.py is the training script.
# NOTE(review): instance_count=2 starts two instances per trial — confirm that
# hpo.py actually performs distributed training, otherwise use 1.
estimator = PyTorch(
    entry_point="hpo.py",
    base_job_name="dog-bread-pytorch",
    role=role,
    instance_count=2,
    instance_type="ml.m5.xlarge",
    framework_version="1.8",
    py_version="py36",
    output_path=output_path,
)

# Run at most 4 training jobs in total, 2 at a time. Arguments are passed by
# keyword so the metric name, search space and metric regexes cannot be mixed up.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameters,
    metric_definitions=metric_definitions,
    max_jobs=4,
    max_parallel_jobs=2,
    objective_type=objective_type,
)

Train the `tuner` to tune the hyperparameters.

In [None]:
# Launch the tuning job on the train/test/valid channels and block until it finishes.
tuner.fit(inputs=data_channels, wait=True)

In [41]:
# Get the estimator for the best training job found by the tuner
best_estimator = tuner.best_estimator()

# Show the hyperparameters of that best job; the lr / batch-size / epochs
# values are copied by hand into the fixed `hyperparameters` dict used for the
# debugging run later on.
best_estimator.hyperparameters()


2023-03-06 16:49:12 Starting - Found matching resource for reuse
2023-03-06 16:49:12 Downloading - Downloading input data
2023-03-06 16:49:12 Training - Training image download completed. Training in progress.
2023-03-06 16:49:12 Uploading - Uploading generated training model
2023-03-06 16:49:12 Completed - Resource released due to keep alive period expiry


{'_tuning_objective_metric': '"test:loss"',
 'batch-size': '"64"',
 'epochs': '11',
 'lr': '0.00974064174567434',
 'sagemaker_container_log_level': '20',
 'sagemaker_estimator_class_name': '"PyTorch"',
 'sagemaker_estimator_module': '"sagemaker.pytorch.estimator"',
 'sagemaker_job_name': '"dog-bread-pytorch-2023-03-06-16-04-22-888"',
 'sagemaker_program': '"hpo.py"',
 'sagemaker_region': '"us-east-1"',
 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-655329371831/dog-bread-pytorch-2023-03-06-16-04-22-888/source/sourcedir.tar.gz"'}

In [None]:
# Per-job results of a finished tuning job, looked up by its (hardcoded) name.
tuner_result = sagemaker.HyperparameterTuningJobAnalytics("pytorch-training-230306-1604")

full_df = tuner_result.dataframe()

# Sort best-first: ascending when the objective is minimised, descending when
# it is maximised. (`is_minimize` was previously referenced but never defined.)
is_minimize = objective_type != "Maximize"

# Start from the unfiltered frame so the final display works even when the
# tuning job has produced no rows yet.
df = full_df
if len(full_df) > 0:
    # Keep only jobs whose objective value is greater than -inf.
    df = full_df[full_df["FinalObjectiveValue"] > -float("inf")]
    if len(df) > 0:
        df = df.sort_values("FinalObjectiveValue", ascending=is_minimize)
        print("Number of training jobs with valid objective: %d" % len(df))
        print({"lowest": min(df["FinalObjectiveValue"]), "highest": max(df["FinalObjectiveValue"])})
        pd.set_option("display.max_colwidth", None)  # Don't truncate TrainingJobName
    else:
        print("No training jobs have reported valid results yet.")

df

## 3. Model Profiling and Debugging


In [81]:
# Built-in Debugger rule configurations to attach to the training job.
debugger_rule_factories = (
    rule_configs.vanishing_gradient,
    rule_configs.overfit,
    rule_configs.overtraining,
    rule_configs.poor_weight_initialization,
)
rules = [Rule.sagemaker(make_config()) for make_config in debugger_rule_factories]

# The profiler report is registered through ProfilerRule rather than Rule.
rules.append(ProfilerRule.sagemaker(rule_configs.ProfilerReport()))

In [82]:
# S3 prefix used both as the debugger hook output location and as the estimator output_path
output_path = f"s3://{BUCKET}/outputs/"

In [83]:
# Collection matching tensors named CrossEntropyLoss_output*, with a save
# interval of 100 for the train, eval and predict modes.
loss_collection = CollectionConfig(
    name="CrossEntropyLoss_output",
    parameters={
        "include_regex": "CrossEntropyLoss_output.*",
        "train.save_interval": "100",
        "eval.save_interval": "100",
        "predict.save_interval": "100",
    },
)

# Debugger hook writing the collection above to the shared output prefix.
hook_config = DebuggerHookConfig(
    s3_output_path=output_path,
    collection_configs=[loss_collection],
)

In [84]:
# Fixed hyperparameters for the profiled training run; the values are the ones
# reported for the best tuning job above (lr, batch-size, epochs).
# NOTE: this rebinds `hyperparameters`, which earlier held the tuner search ranges.
hyperparameters = {"lr":"0.00974064174567434",
                   "batch-size":"64",
                   "epochs":"11"
                  }

In [153]:
# Training estimator for the final run: train_model.py with the fixed
# hyperparameters, plus the Debugger/Profiler rules and hook defined above.
# NOTE: this rebinds `estimator`, which earlier held the estimator used by the tuner.
estimator = PyTorch(
    entry_point="train_model.py",
    base_job_name="smdebugger-dog-bread-pytorch",
    role=role,
    hyperparameters=hyperparameters,
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    framework_version="1.8.0",
    py_version="py3",
    output_path=output_path,
    # Debugger rules and tensor-saving hook
    rules=rules,
    debugger_hook_config=hook_config,
    # NOTE(review): MMS_DEFAULT_RESPONSE_TIMEOUT looks like a model-server
    # (inference) setting — confirm that passing it as `env` to a training
    # estimator has any effect on the endpoint deployed from it.
    env={'MMS_DEFAULT_RESPONSE_TIMEOUT': '10800'}
)

In [None]:
# Start the profiled training job on the three data channels and wait for it to complete.
estimator.fit(inputs=data_channels, wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: smdebugger-dog-bread-pytorch-2023-03-22-18-42-23-773


2023-03-22 18:42:26 Starting - Starting the training job...
2023-03-22 18:42:56 Starting - Preparing the instances for trainingVanishingGradient: InProgress
Overfit: InProgress
Overtraining: InProgress
PoorWeightInitialization: InProgress
ProfilerReport: InProgress
...
2023-03-22 18:43:30 Downloading - Downloading input data......
2023-03-22 18:44:30 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-03-22 18:44:36,246 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-03-22 18:44:36,248 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-03-22 18:44:36,258 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-03-22 18:44:36,261 sagemaker_pytorch_container.training INFO     Invoking use

**TODO**: Is there some anomalous behaviour in your debugging output? If so, what is the error and how will you fix it?  
**TODO**: If not, suppose there was an error. What would that error look like and how would you have fixed it?

In [155]:
# S3 location of the rule output for the most recent training job:
# <output_path><job name>/rule-output (output_path already ends with "/").
rule_output_path = f"{estimator.output_path}{estimator.latest_training_job.job_name}/rule-output"

In [156]:
# List every object stored under the rule-output prefix
! aws s3 ls {rule_output_path} --recursive

2023-03-22 19:03:10     329715 outputs/smdebugger-dog-bread-pytorch-2023-03-22-18-42-23-773/rule-output/ProfilerReport/profiler-output/profiler-report.html
2023-03-22 19:03:10     171077 outputs/smdebugger-dog-bread-pytorch-2023-03-22-18-42-23-773/rule-output/ProfilerReport/profiler-output/profiler-report.ipynb
2023-03-22 19:03:06        192 outputs/smdebugger-dog-bread-pytorch-2023-03-22-18-42-23-773/rule-output/ProfilerReport/profiler-output/profiler-reports/BatchSize.json
2023-03-22 19:03:06        200 outputs/smdebugger-dog-bread-pytorch-2023-03-22-18-42-23-773/rule-output/ProfilerReport/profiler-output/profiler-reports/CPUBottleneck.json
2023-03-22 19:03:06        126 outputs/smdebugger-dog-bread-pytorch-2023-03-22-18-42-23-773/rule-output/ProfilerReport/profiler-output/profiler-reports/Dataloader.json
2023-03-22 19:03:06        127 outputs/smdebugger-dog-bread-pytorch-2023-03-22-18-42-23-773/rule-output/ProfilerReport/profiler-output/profiler-reports/GPUMemoryIncrease.json
2023-0

In [143]:
# Download the rule output into the current directory; the profiler report
# is opened from this local copy below.
! aws s3 cp {rule_output_path} ./ --recursive

download: s3://project3-files/outputs/smdebugger-dog-bread-pytorch-2023-03-22-16-39-26-314/rule-output/ProfilerReport/profiler-output/profiler-reports/MaxInitializationTime.json to ProfilerReport/profiler-output/profiler-reports/MaxInitializationTime.json
download: s3://project3-files/outputs/smdebugger-dog-bread-pytorch-2023-03-22-16-39-26-314/rule-output/ProfilerReport/profiler-output/profiler-reports/OverallFrameworkMetrics.json to ProfilerReport/profiler-output/profiler-reports/OverallFrameworkMetrics.json
download: s3://project3-files/outputs/smdebugger-dog-bread-pytorch-2023-03-22-16-39-26-314/rule-output/ProfilerReport/profiler-output/profiler-reports/GPUMemoryIncrease.json to ProfilerReport/profiler-output/profiler-reports/GPUMemoryIncrease.json
download: s3://project3-files/outputs/smdebugger-dog-bread-pytorch-2023-03-22-16-39-26-314/rule-output/ProfilerReport/profiler-output/profiler-reports/Dataloader.json to ProfilerReport/profiler-output/profiler-reports/Dataloader.json
do

In [144]:
# Find the rule whose configuration name contains "Profiler"; that name is
# also the local folder holding the downloaded profiler report.
rule_summaries = estimator.latest_training_job.rule_job_summary()
profiler_rule_names = [
    summary["RuleConfigurationName"]
    for summary in rule_summaries
    if "Profiler" in summary["RuleConfigurationName"]
]
profiler_report_name = profiler_rule_names[0]

In [146]:
# Render the locally downloaded profiler HTML report inline in the notebook
IPython.display.HTML(filename=profiler_report_name + "/profiler-output/profiler-report.html")

Unnamed: 0,Description,Recommendation,Number of times rule triggered,Number of datapoints,Rule parameters
Dataloader,"Checks how many data loaders are running in parallel and whether the total number is equal the number of available CPU cores. The rule triggers if number is much smaller or larger than the number of available cores. If too small, it might lead to low GPU utilization. If too large, it might impact other compute intensive operations on CPU.",Change the number of data loader processes.,0,0,min_threshold:70  max_threshold:200
LoadBalancing,"Detects workload balancing issues across GPUs. Workload imbalance can occur in training jobs with data parallelism. The gradients are accumulated on a primary GPU, and this GPU might be overused with regard to other GPUs, resulting in reducing the efficiency of data parallelization.",Choose a different distributed training strategy or a different distributed training framework.,0,0,threshold:0.2  patience:1000
IOBottleneck,Checks if the data I/O wait time is high and the GPU utilization is low. It might indicate IO bottlenecks where GPU is waiting for data to arrive from storage. The rule evaluates the I/O and GPU utilization rates and triggers the issue if the time spent on the IO bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.,"Pre-fetch data or choose different file formats, such as binary formats that improve I/O performance.",0,11337,threshold:50  io_threshold:50  gpu_threshold:10  patience:1000
MaxInitializationTime,Checks if the time spent on initialization exceeds a threshold percent of the total training time. The rule waits until the first step of training loop starts. The initialization can take longer if downloading the entire dataset from Amazon S3 in File mode. The default threshold is 20 minutes.,"Initialization takes too long. If using File mode, consider switching to Pipe mode in case you are using TensorFlow framework.",0,0,threshold:20
StepOutlier,"Detects outliers in step duration. The step duration for forward and backward pass should be roughly the same throughout the training. If there are significant outliers, it may indicate a system stall or bottleneck issues.","Check if there are any bottlenecks (CPU, I/O) correlated to the step outliers.",0,0,threshold:3  mode:None  n_outliers:10  stddev:3
BatchSize,"Checks if GPUs are underutilized because the batch size is too small. To detect this problem, the rule analyzes the average GPU memory footprint, the CPU and the GPU utilization.","The batch size is too small, and GPUs are underutilized. Consider running on a smaller instance type or increasing the batch size.",0,11334,cpu_threshold_p95:70  gpu_threshold_p95:70  gpu_memory_threshold_p95:70  patience:1000  window:500
GPUMemoryIncrease,Measures the average GPU memory footprint and triggers if there is a large increase.,Choose a larger instance type with more memory if footprint is close to maximum available memory.,0,0,increase:5  patience:1000  window:10
CPUBottleneck,"Checks if the CPU utilization is high and the GPU utilization is low. It might indicate CPU bottlenecks, where the GPUs are waiting for data to arrive from the CPUs. The rule evaluates the CPU and GPU utilization rates, and triggers the issue if the time spent on the CPU bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.",Consider increasing the number of data loaders or applying data pre-fetching.,0,11337,threshold:50  cpu_threshold:90  gpu_threshold:10  patience:1000
LowGPUUtilization,"Checks if the GPU utilization is low or fluctuating. This can happen due to bottlenecks, blocking calls for synchronizations, or a small batch size.","Check if there are bottlenecks, minimize blocking calls, change distributed training strategy, or increase the batch size.",0,0,threshold_p95:70  threshold_p5:10  window:500  patience:1000


## 4. Model Deploying

In [133]:
# Model object built from a stored model.tar.gz, served through inference.py.
# NOTE(review): model_data is pinned to one specific earlier training job
# (…2023-03-22-15-10-50-411), and `pytorch_model` is only referenced by the
# commented-out deploy call below — confirm which deployment path is intended.
pytorch_model = PyTorchModel(model_data=f's3://{BUCKET}/outputs/smdebugger-dog-bread-pytorch-2023-03-22-15-10-50-411/output/model.tar.gz', 
                             role=role,
                             entry_point='inference.py',
                             framework_version="1.8.0",
                             py_version="py3")

In [157]:
# Alternative (disabled): deploy the PyTorchModel defined above, which uses inference.py
# predictor=pytorch_model.deploy(initial_instance_count=1, 
#                            instance_type="ml.m5.xlarge")

# Active path: deploy straight from the trained estimator to a single-instance endpoint.
# NOTE(review): this estimator's entry point is train_model.py, not inference.py — confirm
# that the script provides what the endpoint needs to serve requests.
predictor=estimator.deploy(initial_instance_count=1, 
                           instance_type="ml.m5.2xlarge")

INFO:sagemaker:Creating model with name: smdebugger-dog-bread-pytorch-2023-03-22-19-05-04-735
INFO:sagemaker:Creating endpoint-config with name smdebugger-dog-bread-pytorch-2023-03-22-19-05-04-735
INFO:sagemaker:Creating endpoint with name smdebugger-dog-bread-pytorch-2023-03-22-19-05-04-735


-----!

In [158]:
# Get one test sample using the same loader factory as the training script
from train_model import create_data_loaders

test_loader = create_data_loaders("./dogImages/test/", batch_size=1)

# Take exactly the first (image, labels) batch. next(iter(...)) replaces the
# `for ...: break` loop and fails right here if the loader is empty, instead
# of leaving `image` undefined for the following lines.
image, labels = next(iter(test_loader))

image.shape

Loaded ./dogImages/test/


torch.Size([1, 3, 300, 400])

In [159]:
# Send the single-image batch to the endpoint.
# NOTE(review): the recorded run of this cell ended in an invocation timeout.
# `image` is the tensor produced by the data loader — confirm that the
# predictor's serializer and the endpoint's inference code accept this input.
response = predictor.predict(image)

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (0) from primary with message "Your invocation timed out while waiting for a response from container primary. Review the latency metrics for each container in Amazon CloudWatch, resolve the issue, and try again.". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/smdebugger-dog-bread-pytorch-2023-03-22-19-05-04-735 in account 655329371831 for more information.

In [None]:
# NOTE(review): `response` is whatever the endpoint returned for the image,
# presumably class scores rather than pixel data — confirm that rendering it
# with imshow is intended.
plt.imshow(response)

In [42]:
# Delete the deployed endpoint once predictions are no longer needed
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: smdebugger-dog-bread-pytorch-2023-03-20-20-29-39-157
INFO:sagemaker:Deleting endpoint with name: smdebugger-dog-bread-pytorch-2023-03-20-20-29-39-157


The end of the notebook