# Prequisites and Preprocessing

### Step IC1: Import necessary libraries and setup session

In [26]:
%%time
import boto3
import re
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import session

project_name="image-classification-fulltraining"

sess = session.Session()
bucket = sess.default_bucket()
s3_project_folder = project_name
role = get_execution_role()

training_image = get_image_uri(boto3.Session().region_name, 'image-classification')

INFO:sagemaker:Created S3 bucket: sagemaker-us-east-1-344028372807


CPU times: user 130 ms, sys: 24.2 ms, total: 154 ms
Wall time: 314 ms


### Step IC2: Upload training data to this SageMaker notebook instance

Create raw input data directory

In [2]:
import os 
path = "{}/data".format(os.getcwd())
if not os.path.exists(path):
    os.makedirs(path)

**IMPORTANT:**  ***Upload the images to /data folder, with a subfolder for each class***    
For example, if you want to classify coffee types that has 3 classes: arabica, robusta, and liberica.  
Your folder structure should look like one below:  
<pre>
/data    
  /arabica  
    /image1.jpg  
    /image2.jpg  
    ...
  /robusta  
    /image1.jpg  
    /image2.jpg  
    ...
  /liberica   
    /image1.jpg  
    /image2.jpg  
    ...
</pre>

### Step IC3: Build train and validation set

***Define the train and validation set, also how many percents of images should be allocated to validation set***

In [3]:
train_set = []
validation_set = []
validation_percentage = 0.20

***Shuffle data and generate train data set and validation data set***

In [4]:
import os
from random import shuffle
            
def move_percentage(list_a, percentage):
    shuffle(list_a)
    count = int(len(list_a) * percentage)
    if not count: return []  # edge case, no elements removed
    list_a[-count:], list_b = [], list_a[-count:]
    return list_b

path = "{}/data".format(os.getcwd())
categories = os.listdir(path)
categories = list(filter(lambda x: (not x.startswith('.') and not x.endswith('.lst')) ,categories))

category_index = 0
for c in categories:
    p = "{}/{}".format(path,c)
    files = os.listdir(p)
    for f in files:
        item = {
            'label': category_index,
            'file_name': f
        }
        train_set.append(item)
    category_index = category_index + 1

validation_set = move_percentage(train_set, validation_percentage)
print("There are {} of images in training data set".format(len(train_set)))
print("There are {} of images in validation data set".format(len(validation_set)))

There are 256 of images in training data set
There are 63 of images in validation data set


### Step IC4: Generate list file to be used by training algorithm 
Each row in list file correspond to 1 image data, with 3 columns: index, label, and file name

In [12]:
import os

train_lst_path = "{}/train.lst".format(path)
if os.path.exists(train_lst_path):
  os.remove(train_lst_path)
f = open("{}/train.lst".format(path), "a")
index = 1
for i in train_set:
    f.write("{}\t{}\t{}-{}".format(index, i['label'],categories[int(i['label'])],i['file_name']))
    index = index + 1
    if index <= len(train_set):
        f.write("\n")

f.close()
print("List file for train_set generated. First 10 lines:")
!head $train_lst_path

validation_lst_path = "{}/validation.lst".format(path)
if os.path.exists(validation_lst_path):
  os.remove(validation_lst_path)
f = open("{}/validation.lst".format(path), "a")
index = 1
for i in validation_set:
    f.write("{}\t{}\t{}-{}".format(index, i['label'],categories[int(i['label'])],i['file_name']))
    index = index + 1
    if index <= len(validation_set):
        f.write("\n")
        
f.close()
print("\n\nList file for validation_set generated. First 10 lines:")
!head $validation_lst_path


List file for train_set generated. First 10 lines:
1	1	airplanes-image_0111.jpg
2	1	airplanes-image_0677.jpg
3	1	airplanes-image_0686.jpg
4	1	airplanes-image_0716.jpg
5	0	camera-image_0028.jpg
6	1	airplanes-image_0062.jpg
7	1	airplanes-image_0052.jpg
8	1	airplanes-image_0066.jpg
9	1	airplanes-image_0129.jpg
10	1	airplanes-image_0694.jpg


List file for validation_set generated. First 10 lines:
1	1	airplanes-image_0752.jpg
2	1	airplanes-image_0667.jpg
3	1	airplanes-image_0053.jpg
4	0	camera-image_0027.jpg
5	1	airplanes-image_0027.jpg
6	1	airplanes-image_0028.jpg
7	1	airplanes-image_0747.jpg
8	1	airplanes-image_0689.jpg
9	1	airplanes-image_0090.jpg
10	0	camera-image_0047.jpg


### Step IC5: Upload to S3

In [14]:
import os 
import boto3
        
def upload_to_s3(channel, file, name):
    s3 = boto3.resource('s3')
    data = open(file, "rb")
    key = channel + '/' + name
    s3.Bucket(bucket).put_object(Key=key, Body=data)

s3_train_path = "{}/train".format(s3_project_folder)
s3_validation_path = "{}/validation".format(s3_project_folder)
s3_lst_path = s3_project_folder

upload_to_s3(s3_lst_path, train_lst_path, 'train.lst')
upload_to_s3(s3_lst_path, validation_lst_path, 'validation.lst')

index=0
modulus=round(0.05*(len(train_set)+len(validation_set)))
for i in train_set:
    upload_to_s3(
        channel=s3_train_path, 
        file="{}/{}/{}".format(path,categories[int(i['label'])],i['file_name']),
        name="{}-{}".format(categories[int(i['label'])], i['file_name'])
    )
    if not index%modulus:
        print("Uploading: {} %".format(round(index/(len(train_set)+len(validation_set))*100)))
    index = index + 1
for i in validation_set:
    upload_to_s3(
        channel=s3_validation_path, 
        file="{}/{}/{}".format(path,categories[int(i['label'])],i['file_name']),
        name="{}-{}".format(categories[int(i['label'])], i['file_name'])
    )
    if not index%modulus:
        print("Uploading: {} %".format(round(index/(len(train_set)+len(validation_set))*100)))
    index = index + 1

print("Finished")

Uploading: 0 %
Uploading: 5 %
Uploading: 10 %
Uploading: 15 %
Uploading: 20 %
Uploading: 25 %
Uploading: 30 %
Uploading: 35 %
Uploading: 40 %
Uploading: 45 %
Uploading: 50 %
Uploading: 55 %
Uploading: 60 %
Uploading: 65 %
Uploading: 70 %
Uploading: 75 %
Uploading: 80 %
Uploading: 85 %
Uploading: 90 %
Uploading: 95 %
Finished


# Training the model

There are two kinds of parameters that need to be set for training. The first one are the parameters for the training job. These include:

* **Input specification**: These are the training and validation channels that specify the path where training data is present. These are specified in the "InputDataConfig" section. The main parameters that need to be set is the "ContentType" which can be set to "rec" or "lst" based on the input data format and the S3Uri which specifies the bucket and the folder where the data is present. 
* **Output specification**: This is specified in the "OutputDataConfig" section. We just need to specify the path where the output can be stored after training
* **Resource config**: This section specifies the type of instance on which to run the training and the number of hosts used for training. If "InstanceCount" is more than 1, then training can be run in a distributed manner. 

Apart from the above set of parameters, there are hyperparameters that are specific to the algorithm. These are:

* **num_layers**: The number of layers (depth) for the network. We use 101 in this samples but other values such as 50, 152 can be used. 
* **num_training_samples**: This is the total number of training samples.
* **num_classes**: This is the number of output classes for the new dataset.
* **epochs**: Number of training epochs
* **learning_rate**: Learning rate for training
* **mini_batch_size**: The number of training samples used for each mini batch. In distributed training, the number of training samples used per batch will be N * mini_batch_size where N is the number of hosts on which training is run

### Step IC6: Setup training parameters
Run the training using Amazon sagemaker CreateTrainingJob API

In [21]:
# The algorithm supports multiple network depth (number of layers). They are 18, 34, 50, 101, 152 and 200
# For this training, we will use 18 layers
num_layers = "18" 
# we need to specify the input image shape for the training data
image_shape = "3,100,100"
# we also need to specify the number of training samples in the training set
num_training_samples = len(train_set)
# specify the number of output classes
num_classes = len(categories)
# batch size for training
mini_batch_size =  "16"
# number of epochs
epochs = "10"
# learning rate
learning_rate = "0.01"

### Step IC7: Run training job
Use SageMaker's Image Classification built-in algorithm to run the training job

In [22]:
%%time
import time
import boto3
from time import gmtime, strftime


s3 = boto3.client('s3')
# create unique job name 
job_name_prefix = s3_project_folder
timestamp = time.strftime('-%Y-%m-%d-%H-%M-%S', time.gmtime())
job_name = job_name_prefix + timestamp
training_params = \
{
    # specify the training docker image
    "AlgorithmSpecification": {
        "TrainingImage": training_image,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": 's3://{}/{}/output'.format(bucket, job_name_prefix)
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.p2.xlarge",
        "VolumeSizeInGB": 50
    },
    "TrainingJobName": job_name,
    "HyperParameters": {
        "image_shape": image_shape,
        "num_layers": str(num_layers),
        "num_training_samples": str(num_training_samples),
        "num_classes": str(num_classes),
        "mini_batch_size": str(mini_batch_size),
        "epochs": str(epochs),
        "learning_rate": str(learning_rate)
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 360000
    },
#Training data should be inside a subdirectory called "train"
#Validation data should be inside a subdirectory called "validation"
#The algorithm currently only supports fullyreplicated model (where data is copied onto each machine)
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}".format(bucket,s3_train_path),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "application/x-image",
            "CompressionType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}".format(bucket,s3_validation_path),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "application/x-image",
            "CompressionType": "None"
        },
          {
            "ChannelName": "train_lst",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train.lst".format(bucket,s3_lst_path),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "application/x-image",
            "CompressionType": "None"
        },
          {
            "ChannelName": "validation_lst",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/validation.lst".format(bucket,s3_lst_path),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "application/x-image",
            "CompressionType": "None"
        }
    ]
}
print('Training job name: {}'.format(job_name))
print('\nInput Data Location: {}'.format(training_params['InputDataConfig'][0]['DataSource']['S3DataSource']))

Training job name: image-classification-fulltraining-2019-03-26-07-53-39

Input Data Location: {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-344028372807/image-classification-fulltraining/train', 'S3DataDistributionType': 'FullyReplicated'}
CPU times: user 4.81 ms, sys: 0 ns, total: 4.81 ms
Wall time: 4.8 ms


***Continuously check training job status wait until the job completes***

In [23]:
# create the Amazon SageMaker training job
sagemaker = boto3.client(service_name='sagemaker')
sagemaker.create_training_job(**training_params)

# confirm that the training job has started
status = sagemaker.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print('Training job current status: {}'.format(status))

try:
    # wait for the job to finish and report the ending status
    sagemaker.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=job_name)
    training_info = sagemaker.describe_training_job(TrainingJobName=job_name)
    status = training_info['TrainingJobStatus']
    print("Training job ended with status: " + status)
except:
    print('Training failed to start')
     # if exception is raised, that means it has failed
    message = sagemaker.describe_training_job(TrainingJobName=job_name)['FailureReason']
    print('Training failed with the following error: {}'.format(message))

Training job current status: InProgress
Training job ended with status: Completed


In [24]:
training_info = sagemaker.describe_training_job(TrainingJobName=job_name)
status = training_info['TrainingJobStatus']
print("Training job ended with status: " + status)

Training job ended with status: Completed


If you see the message,

> `Training job ended with status: Completed`

then that means training successfully completed and the output model was stored in the output path specified by `training_params['OutputDataConfig']`.

You can also view information about and the status of a training job using the AWS SageMaker console. Just click on the "Jobs" tab.

# Deploy The Model

A trained model does nothing on its own. We now want to use the model to perform inference. For this example, that means predicting the topic mixture representing a given document.

## Step IC8: Create model

We now create a SageMaker Model from the training output. Using the model we can create a Batch Transform Job or an Endpoint Configuration.

In [27]:
%%time
import boto3
from time import gmtime, strftime

sage = boto3.Session().client(service_name='sagemaker') 

model_name=project_name
print(model_name)
info = sage.describe_training_job(TrainingJobName=job_name)
model_data = info['ModelArtifacts']['S3ModelArtifacts']
print(model_data)

hosting_image = get_image_uri(boto3.Session().region_name, 'image-classification')

primary_container = {
    'Image': hosting_image,
    'ModelDataUrl': model_data,
}

create_model_response = sage.create_model(
    ModelName = model_name,
    ExecutionRoleArn = role,
    PrimaryContainer = primary_container)

print(create_model_response['ModelArn'])

image-classification-fulltraining
s3://sagemaker-us-east-1-344028372807/image-classification-fulltraining/output/image-classification-fulltraining-2019-03-26-07-53-39/output/model.tar.gz
arn:aws:sagemaker:us-east-1:344028372807:model/image-classification-fulltraining
CPU times: user 56.5 ms, sys: 12.1 ms, total: 68.6 ms
Wall time: 384 ms


# Realtime inference

We now host the model with an endpoint and perform realtime inference.

### Step IC9: Create Endpoint Configuration
At launch, we will support configuring REST endpoints in hosting with multiple models, e.g. for A/B testing purposes. In order to support this, customers create an endpoint configuration, that describes the distribution of traffic across the models, whether split, shadowed, or sampled in some way.

In addition, the endpoint configuration describes the instance type required for model deployment, and at launch will describe the autoscaling configuration.

In [28]:
from time import gmtime, strftime

timestamp = time.strftime('-%Y-%m-%d-%H-%M-%S', time.gmtime())
endpoint_config_name = job_name_prefix + '-epc-' + timestamp
endpoint_config_response = sage.create_endpoint_config(
    EndpointConfigName = endpoint_config_name,
    ProductionVariants=[{
        'InstanceType':'ml.m4.xlarge',
        'InitialInstanceCount':1,
        'ModelName':model_name,
        'VariantName':'AllTraffic'}])

print('Endpoint configuration name: {}'.format(endpoint_config_name))
print('Endpoint configuration arn:  {}'.format(endpoint_config_response['EndpointConfigArn']))

Endpoint configuration name: image-classification-fulltraining-epc--2019-03-26-08-03-01
Endpoint configuration arn:  arn:aws:sagemaker:us-east-1:344028372807:endpoint-config/image-classification-fulltraining-epc--2019-03-26-08-03-01


### Step IC10: Create Endpoint
Creates the endpoint that serves up the model. The end result is an endpoint that can be validated and incorporated into production applications.

In [29]:
%%time
import time

timestamp = time.strftime('-%Y-%m-%d-%H-%M-%S', time.gmtime())
endpoint_name = job_name_prefix + '-ep-' + timestamp
print('Endpoint name: {}'.format(endpoint_name))

endpoint_params = {
    'EndpointName': endpoint_name,
    'EndpointConfigName': endpoint_config_name,
}
endpoint_response = sagemaker.create_endpoint(**endpoint_params)
print('EndpointArn = {}'.format(endpoint_response['EndpointArn']))

Endpoint name: image-classification-fulltraining-ep--2019-03-26-08-03-05
EndpointArn = arn:aws:sagemaker:us-east-1:344028372807:endpoint/image-classification-fulltraining-ep--2019-03-26-08-03-05
CPU times: user 13.9 ms, sys: 65 µs, total: 14 ms
Wall time: 323 ms


***Continuously check whether the endpoint has been successfully created***

In [None]:
# get the status of the endpoint
response = sagemaker.describe_endpoint(EndpointName=endpoint_name)
status = response['EndpointStatus']
print('EndpointStatus = {}'.format(status))


# wait until the status has changed
sagemaker.get_waiter('endpoint_in_service').wait(EndpointName=endpoint_name)


# print the status of the endpoint
endpoint_response = sagemaker.describe_endpoint(EndpointName=endpoint_name)
status = endpoint_response['EndpointStatus']
print('Endpoint creation ended with EndpointStatus = {}'.format(status))

if status != 'InService':
    raise Exception('Endpoint creation failed.')

EndpointStatus = Creating


If you see the message,

> `Endpoint creation ended with EndpointStatus = InService`

then congratulations! You now have a functioning inference endpoint. You can confirm the endpoint configuration and status by navigating to the "Endpoints" tab in the AWS SageMaker console.

### Step IC11: Upload test data


***First upload test images***

In [None]:
import os
path = "{}/test-data".format(os.getcwd())
if not os.path.exists(path):
    os.makedirs(path)

**IMPORTANT:** ***Please upload several (e.g. 10) images to /test-data folder in this Jupyter Notebook***

***Display image***

In [None]:
from IPython.display import Image
import os

test_images = os.listdir(path)
for im in test_images:
    file_name = "{}/{}".format(path,im)
    Image(file_name)  

### Step IC12: Perform evaluation

Evaluate the images through the network for inteference for every images uploaded to /test-data folder

In [None]:
import boto3
import json
import numpy as np

runtime = boto3.Session().client(service_name='runtime.sagemaker')

for im in test_images:
    file_name = "{}/{}".format(path,im)
    with open(file_name, 'rb') as f:
        payload = f.read()
        payload = bytearray(payload)
    response = runtime.invoke_endpoint(EndpointName=endpoint_name, 
                                       ContentType='application/x-image', 
                                       Body=payload)
    result = response['Body'].read()
    # result will be in json format and convert it to ndarray
    result = json.loads(result)
    # the result will output the probabilities for all classes
    # find the class with maximum probability and print the class index
    index = np.argmax(result)
    print("Result: label - " + categories[index] + ", probability - " + str(result[index]))

### Step IC13: Clean up

When we're done with the endpoint, we can just delete it and the backing instances will be released.  Run the following cell to delete the endpoint.

In [None]:
sage.delete_endpoint(EndpointName=endpoint_name)