## Image Classification - SageMaker JumpStart (PyTorch MobileNet V2)

https://github.com/aws/amazon-sagemaker-examples/blob/main/introduction_to_amazon_algorithms/image_classification_tensorflow/Amazon_TensorFlow_Image_Classification.ipynb


https://github.com/aws/amazon-sagemaker-examples/blob/93fc48d21bf88d07853775f11d6ef7db92110549/introduction_to_amazon_algorithms/jumpstart_image_classification/Amazon_JumpStart_Image_Classification.ipynb


https://aws.amazon.com/blogs/machine-learning/transfer-learning-for-tensorflow-image-classification-models-in-amazon-sagemaker/


https://aws.amazon.com/blogs/machine-learning/run-image-classification-with-amazon-sagemaker-jumpstart/

In [85]:
# Enable the nb_black IPython extension for this kernel session.
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [86]:
import json
import logging
from datetime import datetime
import pandas as pd

<IPython.core.display.Javascript object>

In [87]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker import image_uris,  script_uris, model_uris
from sagemaker import hyperparameters as hyperparameters_module
from sagemaker.estimator import Estimator


<IPython.core.display.Javascript object>

In [88]:
# Attach a stream logger to "botocore.credentials" at WARNING level.
boto3.set_stream_logger(name="botocore.credentials", level=logging.WARNING)

<IPython.core.display.Javascript object>

In [89]:
# Create the SageMaker session and record the region its boto session resolves to.
# `region` is used further down to build the JumpStart manifest bucket name.
sess = sagemaker.Session()
region = sess.boto_region_name
print(region)

us-west-2


<IPython.core.display.Javascript object>

In [90]:
import os
from dotenv import load_dotenv
load_dotenv()

# Read the notebook settings from the environment; os.getenv yields None for any
# variable that is not set.
IMAGE_S3_BUCKET, REGION, SAGE_MAKER_LOCAL_ROLE = (
    os.getenv(env_name)
    for env_name in ("IMAGE_S3_BUCKET", "REGION", "SAGE_MAKER_LOCAL_ROLE")
)

# Echo the resolved settings so a missing variable is visible right away.
for env_name, env_value in (
    ("IMAGE_S3_BUCKET", IMAGE_S3_BUCKET),
    ("REGION", REGION),
    ("SAGE_MAKER_LOCAL_ROLE", SAGE_MAKER_LOCAL_ROLE),
):
    print(f"{env_name}: {env_value}")

IMAGE_S3_BUCKET: sgmkr-images-training
REGION: us-west-2
SAGE_MAKER_LOCAL_ROLE: arn:aws:iam::654654352356:role/service-role/AmazonSageMaker-ExecutionRole-20250111T085887


<IPython.core.display.Javascript object>

## Prepare and upload data to S3
* For PyTorch training, the images must be organized in one subfolder per class, e.g. `train_imgs/rose`, `train_imgs/daisy`, `train_imgs/dandelion`

In [91]:
# Create the S3 bucket for Images, if it not exists
import boto3
from botocore.exceptions import ClientError

# Initialize S3 client with the specified region.
# REGION comes from the environment and is None when the variable is unset.
s3_client = boto3.client('s3', region_name=REGION)

# Function to check if bucket exists
def bucket_exists(bucket_name):
    """Return True if a HEAD request on `bucket_name` succeeds, otherwise False.

    Any botocore ClientError raised by the request is reported as "does not exist";
    the cause of the error is not inspected.
    """
    try:
        s3_client.head_bucket(Bucket=bucket_name)
    except ClientError:
        return False
    else:
        return True

# Function to create bucket if it doesn't exist
def create_bucket(bucket_name, region):
    """Create the S3 bucket `bucket_name` in `region` unless it already exists.

    Args:
        bucket_name: Name of the bucket to create.
        region: AWS region for the bucket. For "us-east-1" (or an empty/None value)
            no location constraint is sent.

    The outcome is printed rather than raised: a botocore ClientError from the
    existence check or the create call is caught and reported.
    """
    try:
        if bucket_exists(bucket_name):
            print(f"Bucket '{bucket_name}' already exists.")
            return

        create_kwargs = {"Bucket": bucket_name}
        # us-east-1 is S3's default location; sending it as an explicit
        # LocationConstraint is rejected, so only constrain other regions.
        if region and region != "us-east-1":
            create_kwargs["CreateBucketConfiguration"] = {"LocationConstraint": region}

        s3_client.create_bucket(**create_kwargs)
        print(f"Bucket '{bucket_name}' created successfully in region '{region}'.")
    except ClientError as e:
        print(f"Error creating bucket: {e}")

# Create the bucket (no-op apart from a message when it already exists)
create_bucket(IMAGE_S3_BUCKET, REGION)

Bucket 'sgmkr-images-training' already exists.


<IPython.core.display.Javascript object>

In [92]:
#!/bin/bash
if not os.path.exists('./images/flowers'):
  !curl -L -o ./images/flowers.zip https://www.kaggle.com/api/v1/datasets/download/alxmamaev/flowers-recognition
  !unzip -o ./images/flowers.zip -d ./images
  !rm ./images/flowers.zip
else:
  print("Dataset already downloaded")

Dataset already downloaded


<IPython.core.display.Javascript object>

## Copy Training Images to S3
* For the training dataset, we will upload the first `CLASS_TRAIN_SAMPLES` (50) images of each flower class
* S3 structures will look like:
```
- train_imgs
--- daisy
--- rose
--- dandelion
--- sunflower
--- tulip
```

In [93]:
import shutil

# Key prefix under which all data for this experiment is stored in the bucket.
DATA_PATH = "pytorch-classification"
# Number of images per class used for training.
CLASS_TRAIN_SAMPLES = 50

# Full S3 URI of the training image folder.
s3_train_imgs_path = f"s3://{IMAGE_S3_BUCKET}/{DATA_PATH}/train_imgs"

# The five flower classes; each one is a subfolder of ./images/flowers.
subfolders = ["daisy", "rose", "dandelion", "sunflower", "tulip"]

def prefix_exists(bucket_name, prefix):
  """Return True when at least one object key in `bucket_name` starts with `prefix`.

  Args:
    bucket_name: Bucket to search.
    prefix: Key prefix to look for (plain string match on the start of the key).
  """
  # One matching key is enough to prove existence, so cap the listing at a single
  # entry instead of fetching a full page of keys.
  response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix, MaxKeys=1)
  return 'Contents' in response

# For every flower class: clear the class prefix in S3, stage the first
# CLASS_TRAIN_SAMPLES local images in a scratch directory, and upload them once.
for subfolder in subfolders:
  s3_subfolder = "{}/{}/{}".format(DATA_PATH, "train_imgs", subfolder)
  if prefix_exists(IMAGE_S3_BUCKET, s3_subfolder):
    # Remove all objects and files inside the S3 prefix path, before uploading the new files
    print(f"There are existing files in the prefix {s3_subfolder}. Removing all files...")
    command = f"aws s3 rm s3://{IMAGE_S3_BUCKET}/{s3_subfolder}/ --recursive --quiet"
    os.system(command)

  # Create a temporary directory to stage the training samples
  temp_dir = f"./temp_{subfolder}"
  os.makedirs(temp_dir, exist_ok=True)

  try:
    # Copy the first CLASS_TRAIN_SAMPLES files to the temporary directory.
    # NOTE: os.listdir() order is arbitrary; the validation cell slices the same
    # listing, so both cells must see the directory in the same state.
    src_dir = f"./images/flowers/{subfolder}"
    files = os.listdir(src_dir)[:CLASS_TRAIN_SAMPLES]
    for file in files:
      shutil.copy(os.path.join(src_dir, file), temp_dir)

    # Upload the staged samples to the S3 bucket (a single time)
    command = f"aws s3 cp --recursive {temp_dir} s3://{IMAGE_S3_BUCKET}/{s3_subfolder}/ --quiet"
    upload_status = os.system(command)
    if upload_status == 0:
      # Report the number of files actually staged (may be below CLASS_TRAIN_SAMPLES)
      print(f"Upload completed {len(files)} images for {s3_subfolder}")
    else:
      print(f"Upload failed for {s3_subfolder} (status {upload_status})")
  finally:
    # Remove the temporary directory even when copying or uploading raised
    shutil.rmtree(temp_dir)

There are existing files in the prefix pytorch-classification/train_imgs/daisy. Removing all files...
Upload completed 50 images for pytorch-classification/train_imgs/daisy
There are existing files in the prefix pytorch-classification/train_imgs/rose. Removing all files...
Upload completed 50 images for pytorch-classification/train_imgs/rose
There are existing files in the prefix pytorch-classification/train_imgs/dandelion. Removing all files...
Upload completed 50 images for pytorch-classification/train_imgs/dandelion
There are existing files in the prefix pytorch-classification/train_imgs/sunflower. Removing all files...
Upload completed 50 images for pytorch-classification/train_imgs/sunflower
There are existing files in the prefix pytorch-classification/train_imgs/tulip. Removing all files...
Upload completed 50 images for pytorch-classification/train_imgs/tulip


<IPython.core.display.Javascript object>

## Copy validation images
* Copy the next `CLASS_VALIDATION_SAMPLES = 15` images from each flower-type subfolder to the corresponding S3 destination
* Skip the first `CLASS_TRAIN_SAMPLES = 50` images of each subfolder, which are already used for training

In [94]:
# Number of images per class used for validation.
CLASS_VALIDATION_SAMPLES = 15

# Full S3 URI of the validation image folder.
s3_valid_imgs_path = "s3://{}/{}/{}".format(IMAGE_S3_BUCKET, DATA_PATH, "valid_imgs")

# For every flower class: clear the class prefix in S3, stage the slice of local
# images that follows the training slice, and upload it.
for subfolder in subfolders:
  s3_subfolder = "{}/{}/{}".format(DATA_PATH, "valid_imgs", subfolder)
  if prefix_exists(IMAGE_S3_BUCKET, s3_subfolder):
    # Remove all objects and files inside the S3 prefix path, before uploading the new files
    print(f"There are existing files in the prefix {s3_subfolder}. Removing all files...")
    command = f"aws s3 rm s3://{IMAGE_S3_BUCKET}/{s3_subfolder}/ --recursive --quiet"
    os.system(command)

  # Create a temporary directory to store the validation samples
  temp_dir = f"./temp_{subfolder}_valid"
  os.makedirs(temp_dir, exist_ok=True)

  try:
    src_dir = f"./images/flowers/{subfolder}"

    # Skip the first CLASS_TRAIN_SAMPLES entries of each subfolder (used for training)
    # and take the next CLASS_VALIDATION_SAMPLES.
    # NOTE: os.listdir() order is arbitrary; this slice is only disjoint from the
    # training slice if both cells see the same listing order.
    files = os.listdir(src_dir)[CLASS_TRAIN_SAMPLES:CLASS_TRAIN_SAMPLES + CLASS_VALIDATION_SAMPLES]
    for file in files:
      shutil.copy(os.path.join(src_dir, file), temp_dir)

    # Upload the validation samples to the S3 bucket
    command = f"aws s3 cp --recursive {temp_dir} s3://{IMAGE_S3_BUCKET}/{s3_subfolder}/ --quiet"
    upload_status = os.system(command)
    if upload_status == 0:
      # Report the number of files actually staged (may be below CLASS_VALIDATION_SAMPLES)
      print(f"Upload completed {len(files)} images for {s3_subfolder}")
    else:
      print(f"Upload failed for {s3_subfolder} (status {upload_status})")
  finally:
    # Remove the temporary directory even when copying or uploading raised
    shutil.rmtree(temp_dir)

There are existing files in the prefix pytorch-classification/valid_imgs/daisy. Removing all files...
Upload completed 15 images for pytorch-classification/valid_imgs/daisy
There are existing files in the prefix pytorch-classification/valid_imgs/rose. Removing all files...
Upload completed 15 images for pytorch-classification/valid_imgs/rose
There are existing files in the prefix pytorch-classification/valid_imgs/dandelion. Removing all files...
Upload completed 15 images for pytorch-classification/valid_imgs/dandelion
There are existing files in the prefix pytorch-classification/valid_imgs/sunflower. Removing all files...
Upload completed 15 images for pytorch-classification/valid_imgs/sunflower
There are existing files in the prefix pytorch-classification/valid_imgs/tulip. Removing all files...
Upload completed 15 images for pytorch-classification/valid_imgs/tulip


<IPython.core.display.Javascript object>

## After preparing data, we can jumpstart the existing model

In [95]:
# List every available JumpStart model by downloading the model manifest
# (https://docs.aws.amazon.com/sagemaker/latest/dg/jumpstart-deploy.html).
manifest_file = "models_manifest.json"
manifest_bucket = f"jumpstart-cache-prod-{region}"
boto3.client("s3").download_file(manifest_bucket, manifest_file, manifest_file)

# Parse the downloaded manifest into a list of model records.
with open(manifest_file, "rb") as manifest_handle:
    model_list = json.load(manifest_handle)

print("number of models: ", len(model_list))

# Tabular view of the manifest; display a random sample of 20 rows.
model_df = pd.DataFrame(model_list)
model_df.sample(20)

number of models:  10433


Unnamed: 0,model_id,version,min_version,spec_key,search_keywords
9795,tensorflow-tc-electra-base-1,1.1.2,2.75.0,community_models/tensorflow-tc-electra-base-1/...,
9751,tensorflow-tc-bert-en-wwm-cased-L-24-H-1024-A-...,2.0.0,2.80.0,community_models/tensorflow-tc-bert-en-wwm-cas...,
10400,xgboost-classification-model,1.2.0,2.75.0,community_models/xgboost-classification-model/...,
1978,huggingface-tc-roberta-base,2.0.5,2.189.0,community_models/huggingface-tc-roberta-base/s...,"[Text, Text Classification]"
1607,huggingface-spc-bert-large-uncased,1.2.0,2.75.0,community_models/huggingface-spc-bert-large-un...,
10044,tensorflow-tc-small-bert-bert-en-uncased-L-8-H...,2.0.1,2.189.0,community_models/tensorflow-tc-small-bert-bert...,
3387,huggingface-txt2img-nitrosocke-nitro-diffusion,2.0.3,2.189.0,community_models/huggingface-txt2img-nitrosock...,
1410,huggingface-sentencesimilarity-bge-small-en-v1-5,1.1.3,2.189.0,community_models/huggingface-sentencesimilarit...,
2414,huggingface-textgeneration-bloomz-1b1,2.0.0,2.189.0,community_models/huggingface-textgeneration-bl...,
325,huggingface-llm-amazon-falconlite2,1.2.3,2.189.0,community_models/huggingface-llm-amazon-falcon...,


<IPython.core.display.Javascript object>

In [96]:
# Keep the ids of all image-classification models from the manifest list.
# dict.fromkeys de-duplicates while preserving first-seen order, and the
# comprehension variables stay local, so re-running this cell cannot overwrite
# the notebook-level `model_id` chosen in a later cell.
ic_models = list(
    dict.fromkeys(
        entry["model_id"]
        for entry in model_list
        if "-ic-" in entry["model_id"] or "-ic1-" in entry["model_id"]
    )
)

print(f"Number of icmodels available for inference: {len(ic_models)}")

# Narrow down to the pytorch-ic-mobilenet models
pytorch_ic_mobinet_models = [
    candidate for candidate in ic_models if "pytorch-ic-mobilenet" in candidate
]
pytorch_ic_mobinet_models

Number of icmodels available for inference: 162


['pytorch-ic-mobilenet-v2']

<IPython.core.display.Javascript object>

In [97]:
# Use the first matching MobileNet model id; raises IndexError if the filter above found none.
model_id = pytorch_ic_mobinet_models[0]
model_version = "3.0.8" # use a fixed version for stable results. May upgrade to the latest version
model_id

'pytorch-ic-mobilenet-v2'

<IPython.core.display.Javascript object>

In [98]:
# Instance type the training job will run on.
train_instance_type = "ml.g4dn.xlarge"

# Training container image for the selected model and instance type.
train_image_uri = image_uris.retrieve(
    model_id=model_id,
    model_version=model_version,
    image_scope="training",
    instance_type=train_instance_type,
    region=None,
    framework=None,
)

# Training script bundle for the selected model.
train_source_uri = script_uris.retrieve(
    script_scope="training", model_id=model_id, model_version=model_version
)

# Model artifact the training job starts from.
train_model_uri = model_uris.retrieve(
    model_scope="training", model_id=model_id, model_version=model_version
)

for resolved_uri in (train_image_uri, train_source_uri, train_model_uri):
    print(resolved_uri)

763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.10.0-gpu-py38
s3://jumpstart-cache-prod-us-west-2/source-directory-tarballs/pytorch/transfer_learning/ic/prepack/v1.1.0/sourcedir.tar.gz
s3://jumpstart-cache-prod-us-west-2/pytorch-training/v2.0.0/train-pytorch-ic-mobilenet-v2.tar.gz


<IPython.core.display.Javascript object>

In [107]:
# Start from the default hyperparameters published for this model version.
hyperparameters = hyperparameters_module.retrieve_default(
    model_version=model_version, model_id=model_id
)

# Apply the overrides in one step; existing keys keep their position and
# "data_augmentation" is appended if it is not part of the defaults.
# NOTE(review): the two flags are Python booleans while the other values are
# strings — confirm the training script accepts both forms.
hyperparameters.update(
    {
        "epochs": "60",  # Increase the number of epochs
        "learning_rate": "0.0003",  # Adjust the learning rate
        "batch_size": "8",  # Adjust the batch size
        "train_only_top_layer": False,  # Fine-tune more layers
        "data_augmentation": True,  # Enable data augmentation if supported
    }
)

print(hyperparameters)

{'train_only_top_layer': False, 'epochs': '100', 'learning_rate': '0.0003', 'batch_size': '8', 'reinitialize_top_layer': 'Auto', 'data_augmentation': True}


<IPython.core.display.Javascript object>

In [108]:
# S3 location where the training job writes its model artifacts.
s3_output_path = f"s3://{IMAGE_S3_BUCKET}/{DATA_PATH}/model_output"

# Clear artifacts left over from a previous run so the prefix only holds the new model.
output_prefix_in_use = prefix_exists(IMAGE_S3_BUCKET, f"{DATA_PATH}/model_output")
if output_prefix_in_use:
  print(f"Removing existing files in the prefix {DATA_PATH}/model_output...")
  command = f"aws s3 rm s3://{IMAGE_S3_BUCKET}/{DATA_PATH}/model_output/ --recursive --quiet"
  os.system(command)

Removing existing files in the prefix pytorch-classification/model_output...


<IPython.core.display.Javascript object>

In [109]:
# Decide where the execution role comes from.
# NOTE(review): SM_CURRENT_HOST is used as the "inside SageMaker" marker — confirm
# it is actually set in the Studio kernel environment.
running_in_studio = "SM_CURRENT_HOST" in os.environ

if not running_in_studio:
  print("Not running in SageMaker Studio. Using custom role for local computer")
  # On a local computer the role ARN is supplied through the environment
  role_arn = SAGE_MAKER_LOCAL_ROLE
else:
  print("Running in SageMaker Studio")
  # Only available inside a SageMaker notebook / Studio
  role_arn = sagemaker.get_execution_role()

print(role_arn)

Not running in SageMaker Studio. Using custom role for local computer
arn:aws:iam::654654352356:role/service-role/AmazonSageMaker-ExecutionRole-20250111T085887


<IPython.core.display.Javascript object>

In [111]:
# Prefix for the training job name; a timestamp is appended in a later cell.
# NOTE(review): the "tf" suffix looks like a leftover — the selected model is a PyTorch one.
job_name_prefix = "flowers-clf-js-tf-"

# Estimator wiring together the training image, script bundle, pretrained model
# artifact, hyperparameters and output location resolved in the cells above.
clf_estimator = Estimator(
    role=role_arn,
    image_uri=train_image_uri,
    source_dir=train_source_uri,
    model_uri=train_model_uri,
    entry_point="transfer_learning.py",
    instance_count=1,
    instance_type=train_instance_type,
    # NOTE(review): presumably a runtime cap in seconds (360000 s = 100 h) — confirm
    # and consider lowering it for this small dataset.
    max_run=360000,
    hyperparameters=hyperparameters,
    output_path=s3_output_path,
)

<IPython.core.display.Javascript object>

In [112]:
# S3 locations of the uploaded training and validation images.
s3_train_imgs = f"s3://{IMAGE_S3_BUCKET}/{DATA_PATH}/train_imgs"
s3_valid_imgs = f"s3://{IMAGE_S3_BUCKET}/{DATA_PATH}/valid_imgs"

# Channel name -> S3 URI mapping handed to the training job.
data_channels = dict(training=s3_train_imgs, validation=s3_valid_imgs)
print(data_channels)

{'training': 's3://sgmkr-images-training/pytorch-classification/train_imgs', 'validation': 's3://sgmkr-images-training/pytorch-classification/valid_imgs'}


<IPython.core.display.Javascript object>

In [113]:
# Local wall-clock time at second resolution, formatted as YYYY-MM-DD-HH-MM-SS.
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# Timestamped training job name.
job_name = f"{job_name_prefix}{timestamp}"
print(job_name)

flowers-clf-js-tf-2025-01-12-21-43-15


<IPython.core.display.Javascript object>

In [114]:
# Launch the training job on the prepared channels and stream its logs into the notebook.
clf_estimator.fit(inputs=data_channels, logs=True, job_name=job_name)

2025-01-13 05:43:18 Starting - Starting the training job...
2025-01-13 05:43:32 Starting - Preparing the instances for training...
2025-01-13 05:44:13 Downloading - Downloading the training image.................bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2025-01-13 05:47:11,555 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2025-01-13 05:47:11,584 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2025-01-13 05:47:11,589 sagemaker_pytorch_container.training INFO     Invoking user training script.
2025-01-13 05:47:11,744 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
/opt/conda/bin/python3.8 -m pip install -r requirements.txt
Processing ./lib/sagemaker_jumpstart_prepack_script_utilities/sagemaker_jumpstart_prepack_script_utilities-1.0.0-py2.py3-none-any.whl
Installing collected packages: sagemaker-jumps

<IPython.core.display.Javascript object>

In [115]:
# Instance type used for the inference endpoint.
infer_instance_type = "ml.t2.medium"

<IPython.core.display.Javascript object>

In [116]:
# Inference container image for the selected model and endpoint instance type.
deploy_image_uri = image_uris.retrieve(
    model_id=model_id,
    model_version=model_version,
    image_scope="inference",
    instance_type=infer_instance_type,
    region=None,
    framework=None,
)

# Inference script bundle for the selected model.
deploy_source_uri = script_uris.retrieve(
    script_scope="inference", model_id=model_id, model_version=model_version
)

<IPython.core.display.Javascript object>

In [117]:
# The model and the endpoint both reuse the training job name.
model_name = endpoint_name = job_name

<IPython.core.display.Javascript object>

In [118]:
# Deploy the trained estimator behind an endpoint, using the inference image and
# script bundle resolved above. The returned predictor is kept for the cleanup cell.
clf_predictor = clf_estimator.deploy(
    initial_instance_count=1,
    instance_type=infer_instance_type,
    entry_point="inference.py",
    image_uri=deploy_image_uri,
    source_dir=deploy_source_uri,
    endpoint_name=endpoint_name,
    model_name=model_name,
)

-------!

<IPython.core.display.Javascript object>

In [119]:
# Runtime client used by the cells below to call invoke_endpoint on the deployed endpoint.
sgmkr_runt = boto3.client("runtime.sagemaker")

<IPython.core.display.Javascript object>

In [121]:
# Read the sample rose photo as raw bytes.
with open("images/rose1.jpg", "rb") as image_file:
    payload = image_file.read()

# Send the image to the endpoint and ask for the verbose JSON response.
invoke_args = {
    "EndpointName": endpoint_name,
    "ContentType": "application/x-image",
    "Accept": "application/json;verbose",
    "Body": payload,
}
response = sgmkr_runt.invoke_endpoint(**invoke_args)

# Decode the streamed body and parse it; the cell displays the predicted label.
response_text = response["Body"].read().decode()
prediction = json.loads(response_text)
print(prediction)
prediction["predicted_label"]

{'probabilities': [0.010725224390625954, 0.00042605094495229423, 0.9787600636482239, 0.0002979158016387373, 0.009790708310902119], 'labels': ['daisy', 'dandelion', 'rose', 'sunflower', 'tulip'], 'predicted_label': 'rose'}


'rose'

<IPython.core.display.Javascript object>

In [122]:
# Read the sample daisy photo as raw bytes.
with open("images/daisy1.jpg", "rb") as image_file:
    payload = image_file.read()

# Send the image to the endpoint and ask for the verbose JSON response.
invoke_args = {
    "EndpointName": endpoint_name,
    "ContentType": "application/x-image",
    "Accept": "application/json;verbose",
    "Body": payload,
}
response = sgmkr_runt.invoke_endpoint(**invoke_args)

# Decode the streamed body and parse it; the cell displays the predicted label.
response_text = response["Body"].read().decode()
prediction = json.loads(response_text)
print(prediction)
prediction["predicted_label"]

{'probabilities': [0.8895570039749146, 0.0037518879398703575, 0.010604382492601871, 0.05153409391641617, 0.044552627950906754], 'labels': ['daisy', 'dandelion', 'rose', 'sunflower', 'tulip'], 'predicted_label': 'daisy'}


'daisy'

<IPython.core.display.Javascript object>

In [123]:
# Tear down the endpoint once the sample predictions are done.
clf_predictor.delete_endpoint()

<IPython.core.display.Javascript object>