In [2]:
import requests

def extract_cifar_data(url, filename="cifar.tar.gz"):
    """A function for extracting the CIFAR-100 dataset and storing it as a gzipped file
    
    Arguments:
    url      -- the URL where the dataset is hosted
    filename -- the full path where the dataset will be written
    
    """
    
    # request the data from the data url
    # Hint: use `requests.get` method
    r = requests.get(url)
    with open(filename, "wb") as file_context:
        file_context.write(r.content)
    return

In [3]:
extract_cifar_data("https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz")     


In [4]:
import tarfile

with tarfile.open("cifar.tar.gz", "r:gz") as tar:
    tar.extractall()

In [5]:
import pickle

with open("./cifar-100-python/meta", "rb") as f:
    dataset_meta = pickle.load(f, encoding='bytes')

with open("./cifar-100-python/test", "rb") as f:
    dataset_test = pickle.load(f, encoding='bytes')

with open("./cifar-100-python/train", "rb") as f:
    dataset_train = pickle.load(f, encoding='bytes')

In [6]:
# Feel free to explore the datasets

dataset_train.keys()


dict_keys([b'filenames', b'batch_label', b'fine_labels', b'coarse_labels', b'data'])

In [7]:
32*32*3


3072

For a simple gut-check, let's transform one of our images. Each 1024 items in a row is a channel (red, green, then blue). Each 32 items in the channel are a row in the 32x32 image. Using python, we can stack these channels into a 32x32x3 array, and save it as a PNG file:



In [8]:
import numpy as np

# Each 1024 in a row is a channel (red, green, then blue)
row = dataset_train[b'data'][0]
red, green, blue = row[0:1024], row[1024:2048], row[2048:]

# Each 32 items in the channel are a row in the 32x32 image
red = red.reshape(32,32)
green = green.reshape(32,32)
blue = blue.reshape(32,32)

# Combine the channels into a 32x32x3 image!
combined = np.dstack((red,green,blue))

In [9]:
# All in one:
test_image = np.dstack((
    row[0:1024].reshape(32,32),
    row[1024:2048].reshape(32,32),
    row[2048:].reshape(32,32)
))

In [10]:
import matplotlib.pyplot as plt
plt.imshow(test_image);

In [11]:
# Label number corresponding to the first image in train dataset
dataset_train[b'fine_labels'][0]


19

In [12]:
print(dataset_meta[b'fine_label_names'][19])


b'cattle'


In [13]:

n=0
print(dataset_meta[b'fine_label_names'][dataset_train[b'fine_labels'][n]])


b'cattle'


In [14]:
print(dataset_train[b'filenames'][0])


b'bos_taurus_s_000507.png'


In [15]:
plt.imsave("file.png", test_image)


Your new PNG file should now appear in the file explorer -- go ahead and pop it open to see!

Now that you know how to reshape the images, save them as files, and capture their filenames and labels, let's just capture all the bicycles and motorcycles and save them. Scones Unlimited can use a model that tells these apart to route delivery drivers automatically.

In the following cell, identify the label numbers for Bicycles and Motorcycles:

In [62]:
import pandas as pd
# Todo: Filter the dataset_train and dataset_meta objects to find the label numbers for Bicycle and Motorcycles
required_label = {i:idx for idx, i in enumerate(dataset_meta[b'fine_label_names']) if i==b'motorcycle' or i==b'bicycle'}
required_label

{b'bicycle': 8, b'motorcycle': 48}

Good job! We only need objects with label 8 and 48 -- this drastically simplifies our handling of the data! Below we construct a dataframe for you, and you can safely drop the rows that don't contain observations about bicycles and motorcycles. Fill in the missing lines below to drop all other rows:



In [63]:
#Construct the dataframe
df_train = pd.DataFrame({
    "filenames": dataset_train[b'filenames'],
    "labels": dataset_train[b'fine_labels'],
    "row": range(len(dataset_train[b'filenames']))
})

# Drop all rows from df_train where label is not 8 or 48
df_train = df_train.loc[df_train["labels"].isin([8,48])]

# Decode df_train.filenames so they are regular strings
df_train["filenames"] = df_train["filenames"].apply(
    lambda x: x.decode("utf-8")
)


df_test = pd.DataFrame({
    "filenames": dataset_test[b'filenames'],
    "labels": dataset_test[b'fine_labels'],
    "row": range(len(dataset_test[b'filenames']))
})

# Drop all rows from df_test where label is not 8 or 48
df_test = df_test.loc[df_test["labels"].isin([8,48])]

# Decode df_test.filenames so they are regular strings
df_test["filenames"] = df_test["filenames"].apply(
    lambda x: x.decode("utf-8")
)
 

Now that the data is filtered for just our classes, we can save all our images.



In [18]:
!mkdir ./train
!mkdir ./test

mkdir: cannot create directory ‘./train’: File exists
mkdir: cannot create directory ‘./test’: File exists


In [None]:
# Import logging / decorating capabilities
from functools import wraps, partial
import logging

# Define logging configuration
logging.basicConfig(
    filename='project_01.log',
    level=logging.WARNING,
    format= '[%(asctime)s] {%(pathname)s:%(lineno)d} %(levelname)s - %(message)s',
    datefmt='%H:%M:%S'
)

# Test logging categories:
logging.error('Testing logging.error')
logging.warning('Testing logging.warning.')

# Helper function that attaches function as attribute of an object
def attach_wrapper(obj, func=None):  
    if func is None:
        return partial(attach_wrapper, obj)
    setattr(obj, func.__name__, func)
    return func

# Actual decorator for logging a function
def log(level, message):
    def decorate(func):
        logger = logging.getLogger(func.__module__)  # Setup logger
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler = logging.StreamHandler()
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        log_message = f"{func.__name__} - {message}"

        @wraps(func)
        def wrapper(*args, **kwargs):  # Logs the message and before executing the decorated function
            logger.log(level, log_message)
            return func(*args, **kwargs)
        @attach_wrapper(wrapper)  # Attaches "set_level" to "wrapper" as attribute
        def set_level(new_level):  # Function that allows us to set log level
            nonlocal level
            level = new_level

        @attach_wrapper(wrapper)  # Attaches "set_message" to "wrapper" as attribute
        def set_message(new_message):  # Function that allows us to set message
            nonlocal log_message
            log_message = f"{func.__name__} - {new_message}"

        return wrapper
    return decorate

In [67]:
def save_images(dataset, path, filename, row):
    for index, row in df.iterrows():
        #Grab the image data in row-major form
        img = dataset[b'data'][row]
        # Consolidated stacking/reshaping from earlier
        target = np.dstack((
            img[2048:].reshape(32,32),
            img[0:1024].reshape(32,32),
            img[1024:2048].reshape(32,32)
        ))
        
           # Save the image
        plt.imsave(path+filename, target)  
        # Return any signal data you want for debugging
    return

## Load Data

In [70]:
import sagemaker

session = sagemaker.Session()

bucket= session.default_bucket()
print("Default Bucket: {}".format(bucket))

region = session.boto_region_name
print("AWS Region: {}".format(region))

role = sagemaker.get_execution_role()
print("RoleArn: {}".format(role))

Default Bucket: sagemaker-us-east-1-184930056456
AWS Region: us-east-1
RoleArn: arn:aws:iam::184930056456:role/service-role/AmazonSageMaker-ExecutionRole-20230201T224510


In [24]:
import os

os.environ["DEFAULT_S3_BUCKET"] = bucket
!aws s3 sync ./train s3://${DEFAULT_S3_BUCKET}/train/
!aws s3 sync ./test s3://${DEFAULT_S3_BUCKET}/test/

upload: train/bicycle_s_000017.png to s3://sagemaker-us-east-1-184930056456/train/bicycle_s_000017.png
upload: train/bicycle_s_000021.png to s3://sagemaker-us-east-1-184930056456/train/bicycle_s_000021.png
upload: train/bicycle_s_000124.png to s3://sagemaker-us-east-1-184930056456/train/bicycle_s_000124.png
upload: train/bicycle_s_000038.png to s3://sagemaker-us-east-1-184930056456/train/bicycle_s_000038.png
upload: train/bicycle_s_000043.png to s3://sagemaker-us-east-1-184930056456/train/bicycle_s_000043.png
upload: train/bicycle_s_000099.png to s3://sagemaker-us-east-1-184930056456/train/bicycle_s_000099.png
upload: train/bicycle_s_000051.png to s3://sagemaker-us-east-1-184930056456/train/bicycle_s_000051.png
upload: train/bicycle_s_000137.png to s3://sagemaker-us-east-1-184930056456/train/bicycle_s_000137.png
upload: train/bicycle_s_000071.png to s3://sagemaker-us-east-1-184930056456/train/bicycle_s_000071.png
upload: train/bicycle_s_000147.png to s3://sagemaker-us-east-1-1849300564

In [25]:
def to_metadata_file(df, prefix):
    df["s3_path"] = df["filenames"]
    df["labels"] = df["labels"].apply(lambda x: 0 if x==8 else 1)
    return df[["row", "labels", "s3_path"]].to_csv(
        f"{prefix}.lst", sep="\t", index=False, header=False
    )
    
to_metadata_file(df_train.copy(), "train")
to_metadata_file(df_test.copy(), "test")

In [26]:
import boto3

# Upload files
boto3.Session().resource('s3').Bucket(
    bucket).Object('train.lst').upload_file('./train.lst')
boto3.Session().resource('s3').Bucket(
    bucket).Object('test.lst').upload_file('./test.lst')

In [72]:
# Use the image_uris function to retrieve the latest 'image-classification' image 

algo_image = sagemaker.image_uris.retrieve(framework="image-classification", region=region)
s3_output_location = f"s3://{bucket}/models/image_model"


In [73]:
img_classifier_model = sagemaker.estimator.Estimator(
    algo_image,
    role=role,
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    volume_size=40,
    output_path=s3_output_location,
    sagemaker_session=session,
)

In [74]:
img_classifier_model.set_hyperparameters(
    image_shape='3,32,32', 
    num_classes=2, 
    num_training_samples=len(df_train)
)


3. Load the data
Now we can load the data into S3.

Using the sagemaker SDK grab the current region, execution role, and bucket.

In [75]:
from sagemaker.debugger import Rule, rule_configs
from sagemaker.session import TrainingInput
model_inputs = {
        "train": sagemaker.inputs.TrainingInput(
            s3_data=f"s3://{bucket}/train/",
            content_type="application/x-image"
        ),
        "validation": sagemaker.inputs.TrainingInput(
            s3_data=f"s3://{bucket}/test/",
            content_type="application/x-image"
        ),
        "train_lst": sagemaker.inputs.TrainingInput(
            s3_data=f"s3://{bucket}/train.lst",
            content_type="application/x-image"
        ),
        "validation_lst": sagemaker.inputs.TrainingInput(
            s3_data=f"s3://{bucket}/test.lst",
            content_type="application/x-image"
        )
}

In [76]:
## train your model
img_classifier_model.fit(model_inputs)

2023-02-04 21:39:08 Starting - Starting the training job...
2023-02-04 21:39:36 Starting - Preparing the instances for trainingProfilerReport-1675546748: InProgress
.........
2023-02-04 21:41:07 Downloading - Downloading input data
2023-02-04 21:41:07 Training - Downloading the training image...............
2023-02-04 21:43:34 Training - Training image download completed. Training in progress....[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34mNvidia gpu devices, drivers and cuda toolkit versions (only available on hosts with GPU):[0m
[34mSat Feb  4 21:43:57 2023       [0m
[34m+-----------------------------------------------------------------------------+[0m
[34m| NVIDIA-SMI 515.65.07    Driver Version: 515.65.07    CUDA Version: 11.7     |[0m
[34m|-------------------------------+----------------------+----------------------+[0m
[34m| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr.

And that's it! You can check the bucket and verify that the items were uploaded.



Model Training
For Image Classification, Sagemaker also expects metadata e.g. in the form of TSV files with labels and filepaths. We can generate these using our Pandas DataFrames from earlier:

## Model Training

In [32]:
from sagemaker.model_monitor import DataCaptureConfig

data_capture_config = DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=100,
    destination_s3_uri=f"s3://{bucket}/data_capture"
)

Note the destination_s3_uri parameter: At the end of the project, we can explore the data_capture directory in S3 to find crucial data about the inputs and outputs Model Monitor has observed on our model endpoint over time.

With that done, deploy your model on a single ml.m5.xlarge instance with the data capture config attached:

We can also upload our manifest files:



In [77]:
deployment = img_classifier_model.deploy(
    initial_instance_count=1, instance_type='ml.m5.xlarge',
    data_capture_config=data_capture_config
    )

endpoint = deployment.endpoint_name
print(endpoint)

--------!image-classification-2023-02-04-21-53-36-205


In [78]:
predictor = deployment


In [79]:
from sagemaker.serializers import IdentitySerializer
import base64

predictor.serializer = IdentitySerializer("image/png")
with open("./test/bicycle_s_001789.png", "rb") as f:
    payload = f.read()

    
inference = predictor.predict(payload)

In [80]:
img_classifier_model.set_hyperparameters(
    image_shape="3,32,32",
    num_classes="2",
    num_training_samples="1400"
)

In [81]:
print(inference)


b'[0.9405655860900879, 0.059434447437524796]'


In [82]:
import random
import boto3
import json


def generate_test_case():
    # Setup s3 in boto3
    s3 = boto3.resource('s3')
    
    # Randomly pick from sfn or test folders in our bucket
    objects = s3.Bucket(bucket).objects.filter(Prefix = "test")
    
    # Grab any random object key from that folder!
    obj = random.choice([x.key for x in objects])
    
    return json.dumps({
        "image_data": "",
        "s3_bucket": bucket,
        "s3_key": obj
    })
generate_test_case()

'{"image_data": "", "s3_bucket": "sagemaker-us-east-1-184930056456", "s3_key": "test/motorcycle_s_002126.png"}'

In [39]:
from sagemaker.s3 import S3Downloader

# In S3 your data will be saved to a datetime-aware path
# Find a path related to a datetime you're interested in
data_path = "s3://sagemaker-us-east-1-184930056456/test/"

S3Downloader.download(data_path, "captured_data")

# Feel free to repeat this multiple times and pull in more data

In [40]:
!pip install jsonlines --upgrade pip


Keyring is skipped due to an exception: 'keyring.backends'
Collecting jsonlines
  Using cached jsonlines-3.1.0-py3-none-any.whl (8.6 kB)
Collecting pip
  Using cached pip-23.0-py3-none-any.whl (2.1 MB)
Installing collected packages: pip, jsonlines
  Attempting uninstall: pip
    Found existing installation: pip 22.3.1
    Uninstalling pip-22.3.1:
      Successfully uninstalled pip-22.3.1
Successfully installed jsonlines-3.1.0 pip-23.0
[0m

In [41]:
import jsonlines

In [42]:
import os

# List the file names we downloaded
file_handles = os.listdir("./captured_data")

In [66]:
file_handles

['moped_s_000007.png',
 'minibike_s_000497.png',
 'bicycle_s_002735.png',
 'minibike_s_000398.png',
 'minibike_s_000947.png',
 'cycle_s_002613.png',
 'motorcycle_s_002126.png',
 'velocipede_s_001335.png',
 'minibike_s_000309.png',
 'motorcycle_s_000739.png',
 'bicycle_s_001789.png',
 'minibike_s_000075.png',
 'minibike_s_001605.png',
 'velocipede_s_001201.png',
 'bicycle_s_000513.png',
 'motorcycle_s_001936.png',
 'bicycle_s_001107.png',
 'motorcycle_s_000685.png',
 'safety_bike_s_001481.png',
 'bike_s_000801.png',
 'velocipede_s_001379.png',
 'cycle_s_001915.png',
 'bicycle_s_000777.png',
 'bike_s_000131.png',
 'velocipede_s_001744.png',
 'safety_bicycle_s_000280.png',
 'safety_bicycle_s_001153.png',
 'trail_bike_s_000073.png',
 'safety_bike_s_000541.png',
 'ordinary_bicycle_s_000284.png',
 'ordinary_bicycle_s_000158.png',
 'motorcycle_s_000825.png',
 'minibike_s_002051.png',
 'ordinary_bicycle_s_000437.png',
 'cycle_s_000970.png',
 'bike_s_000041.png',
 'motorbike_s_000121.png',
 'sa

In [45]:
import os

# List the file names we downloaded
file_handles = os.listdir("./captured_data")

# Dump all the data into an array
json_data = []
for jsonl in file_handles:
    with jsonlines.open(f"./captured_data/{jsonl}") as f:
        json_data.append(f.read())

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte

In [47]:
ignore_encoding = lambda s: s.decode('utf8', 'ignore')


In [48]:
# Populate the data for the x and y axis
x = []
y = []
for obj in json_data:
    inference, timestamp = simple_getter(obj)
    
    y.append(max(inference))
    
    x.append(timestamp)

# Todo: here is an visualization example, take some time to build another visual that helps monitor the result
# Plot the data
plt.scatter(x, y, c=['r' if k<.94 else 'b' for k in y ])
plt.axhline(y=0.94, color='g', linestyle='--')
plt.ylim(bottom=.88)

# Add labels
plt.ylabel("Confidence")
plt.suptitle("Observed Recent Inferences", size=14)
plt.title("Pictured with confidence threshold for production use", size=10)

# Give it some pizzaz!
plt.style.use("Solarize_Light2")
plt.gcf().autofmt_xdate()

In [56]:
# Define how we'll get our data
def simple_getter(obj):
    inferences = obj["captureData"]["endpointOutput"]["data"]
    timestamp = obj["eventMetadata"]["inferenceTime"]
    return json.loads(inferences), timestamp




In [51]:
import base64
import io
from matplotlib import pyplot as plt
import matplotlib.image as mpimg


# Populate the data for the x and y axis
x = []
y = []
images = []

for obj in json_data:
    inference, timestamp, image = my_simple_getter(obj)
    
    #Filter for images I checked multiple times:
    if image not in images:
        y.append(max(inference))
        x.append(timestamp)
        images.append(image)

# Plot the data
fig, ax = plt.subplots(figsize=(15,10))
ax.scatter(x, y, c=['r' if k<.94 else 'b' for k in y ])
ax.axhline(y=0.94, color='g', linestyle='--')
plt.ylim(bottom=.50, top=1.1)

# Add labels
plt.ylabel("Confidence", color='black', size=18)
plt.suptitle("Observed Failed Inferences", color='black', size=24)
plt.title("Pictures inset for reference", color='black', size=20)

# Give it some pizzaz!
plt.style.use("Solarize_Light2")
plt.gcf().autofmt_xdate()


from matplotlib.offsetbox import OffsetImage, AnnotationBbox
# Overlay images instead of datapoints

for x0, y0, image in zip(x, y, images):
    if y0 < 0.94:
        # Convert image to usable format:
        image = base64.b64decode(image)
        image = io.BytesIO(image)
        image = mpimg.imread(image, format='PNG')
        # Load image in OffsetImage for AnnotationBbox
        im = OffsetImage(image, zoom=2.5)
        
        # Display image at x,y coords:
        ab = AnnotationBbox(im, (x0, y0), frameon=False)
        ax.add_artist(ab)

plt.savefig('my-visualization.png')

In [52]:
# My Simple Getter:
def my_simple_getter(obj):
    inferences = obj["captureData"]["endpointOutput"]["data"]
    timestamp = obj["eventMetadata"]["inferenceTime"]
    images = obj["captureData"]["endpointInput"]["data"]
    return json.loads(inferences), timestamp, images



In [54]:
import base64
import io
from matplotlib import pyplot as plt
import matplotlib.image as mpimg


# Populate the data for the x and y axis
x = []
y = []
images = []

for obj in json_data:
    inference, timestamp, image = my_simple_getter(obj)
    
    #Filter for images I checked multiple times:
    if image not in images:
        y.append(max(inference))
        x.append(timestamp)
        images.append(image)

# Plot the data
fig, ax = plt.subplots(figsize=(15,10))
ax.scatter(x, y, c=['r' if k<.94 else 'b' for k in y ])
ax.axhline(y=0.94, color='g', linestyle='--')
plt.ylim(bottom=.50, top=1.1)

# Add labels
plt.ylabel("Confidence", color='black', size=18)
plt.suptitle("Observed Failed Inferences", color='black', size=24)
plt.title("Pictures inset for reference", color='black', size=20)

# Give it some pizzaz!
plt.style.use("Solarize_Light2")
plt.gcf().autofmt_xdate()

In [55]:
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
# Overlay images instead of datapoints

for x0, y0, image in zip(x, y, images):
    if y0 < 0.94:
        # Convert image to usable format:
        image = base64.b64decode(image)
        image = io.BytesIO(image)
        image = mpimg.imread(image, format='PNG')
        
        # Load image in OffsetImage for AnnotationBbox
        im = OffsetImage(image, zoom=2.5)
        
        # Display image at x,y coords:
        ab = AnnotationBbox(im, (x0, y0), frameon=False)
        ax.add_artist(ab)

plt.savefig('my-visualization.png')

In [57]:
pip install pyyaml


[0mNote: you may need to restart the kernel to use updated packages.
