# Prerequisites and Preprocessing

### Step IC1: Import necessary libraries and setup session

In [None]:
%%time
import boto3
import re
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import session

# Name used for the S3 folder, the model and as the prefix of the tuning job,
# endpoint configuration and endpoint names.
project_name = "my-projects"

# The tuning job is later named project_name + '-YYYYMMDD-HHMMSS' (16 extra
# characters) and SageMaker limits tuning job names to 32 characters, so the
# project name itself may be at most 16 characters long.
MAX_PROJECT_NAME_LENGTH = 32 - len('-YYYYMMDD-HHMMSS')
if len(project_name) > MAX_PROJECT_NAME_LENGTH:
    raise ValueError(
        "project_name '{}' is {} characters long; the maximum is {}".format(
            project_name, len(project_name), MAX_PROJECT_NAME_LENGTH))
# SageMaker resource names may only contain alphanumerics separated by hyphens.
if not re.fullmatch(r'[a-zA-Z0-9](-*[a-zA-Z0-9])*', project_name):
    raise ValueError(
        "project_name '{}' may only contain letters, digits and hyphens, "
        "and must start and end with a letter or digit".format(project_name))

# SageMaker session and the default bucket that stores all data of this project
sess = session.Session()
bucket = sess.default_bucket()
s3_project_folder = project_name

# IAM role of this notebook instance and the container image of the built-in
# image-classification algorithm for the current region
role = get_execution_role()
training_image = get_image_uri(boto3.Session().region_name, 'image-classification')

### Step IC2: Upload training data to this SageMaker notebook instance

**IMPORTANT:**    
***On your local PC/laptop, structure your source images into folders, where each folder contains all images of one category. All folders should be under a single parent folder named 'input-images'.*** 
It should look like one below:
<pre> 
/input-images
  /arabica  
    /image1.jpg  
    /image2.jpg  
    ...
  /robusta  
    /image1.jpg  
    /image2.jpg  
    ...
  /liberica   
    /image1.jpg  
    /image2.jpg  
    ...
</pre>

***Then zip 'input-images' folder and upload to this notebook***    
It should look like this one below:
<pre>
/input-images.zip
</pre>

***Unzip the data***

In [None]:
# Extract the uploaded archive next to this notebook; -o overwrites files from a
# previous extraction without prompting.
!unzip -o ./input-images.zip

### Step IC3: Build train and validation set

***Define the train and validation set, also how many percents of images should be allocated to validation set***

In [None]:
# Both lists hold {'label', 'file_name'} records and start out empty; they are
# populated in the next step.
train_set, validation_set = [], []
# Fraction (between 0 and 1) of all images that is moved into the validation set.
validation_percentage = 0.20

***Shuffle data and generate train data set and validation data set***

In [None]:
import os
from random import shuffle
            
def move_percentage(list_a, percentage):
    """Shuffle ``list_a`` in place and move a share of its items into a new list.

    After shuffling, the last ``int(len(list_a) * percentage)`` items are
    removed from ``list_a`` and returned as a separate list. When that count
    is zero, ``list_a`` keeps all of its (shuffled) items and an empty list
    is returned.
    """
    shuffle(list_a)
    moved_count = int(len(list_a) * percentage)
    if moved_count == 0:
        # nothing to move, list_a stays complete
        return []
    list_b = list_a[-moved_count:]
    del list_a[-moved_count:]
    return list_b

path = "{}/input-images".format(os.getcwd())

# os.listdir() returns entries in arbitrary order, so the category folders are
# sorted: the label index given to each category has to be the same on every run.
# Only non-hidden directories count as categories, which also leaves out the
# .lst files that the next step writes into this folder.
categories = sorted(
    name for name in os.listdir(path)
    if not name.startswith('.')
    and not name.endswith('.lst')
    and os.path.isdir(os.path.join(path, name))
)

# Start from an empty list so that re-running this cell does not append the
# same images a second time.
train_set = []
for category_index, c in enumerate(categories):
    category_dir = os.path.join(path, c)
    for f in sorted(os.listdir(category_dir)):
        # hidden files (e.g. .DS_Store) and sub-folders are not images
        if f.startswith('.') or not os.path.isfile(os.path.join(category_dir, f)):
            continue
        train_set.append({
            'label': category_index,
            'file_name': f
        })

# move_percentage shuffles train_set and moves the validation share out of it
validation_set = move_percentage(train_set, validation_percentage)
print("There are {} of images in training data set".format(len(train_set)))
print("There are {} of images in validation data set".format(len(validation_set)))

### Step IC4: Generate list file to be used by training algorithm 
Each row in the list file corresponds to one image and has 3 tab-separated columns: index, label, and file name.

In [None]:
import os

train_lst_path = "{}/train.lst".format(path)
if os.path.exists(train_lst_path):
  os.remove(train_lst_path)
f = open("{}/train.lst".format(path), "a")
index = 1
for i in train_set:
    f.write("{}\t{}\t{}-{}".format(index, i['label'],categories[int(i['label'])],i['file_name']))
    index = index + 1
    if index <= len(train_set):
        f.write("\n")

f.close()
print("List file for train_set generated. First 10 lines:")
!head $train_lst_path

validation_lst_path = "{}/validation.lst".format(path)
if os.path.exists(validation_lst_path):
  os.remove(validation_lst_path)
f = open("{}/validation.lst".format(path), "a")
index = 1
for i in validation_set:
    f.write("{}\t{}\t{}-{}".format(index, i['label'],categories[int(i['label'])],i['file_name']))
    index = index + 1
    if index <= len(validation_set):
        f.write("\n")
        
f.close()
print("\n\nList file for validation_set generated. First 10 lines:")
!head $validation_lst_path


### Step IC5: Upload to S3

In [None]:
import os 
import boto3
        
def upload_to_s3(channel, file, name, s3=None):
    """Upload the local file ``file`` to the project bucket as '<channel>/<name>'.

    Parameters:
        channel: key prefix (folder) inside the bucket.
        file: path of the local file to upload.
        name: object name that is appended to ``channel``.
        s3: optional S3 service resource to reuse across calls; when omitted
            a new ``boto3.resource('s3')`` is created for this call.
    """
    if s3 is None:
        s3 = boto3.resource('s3')
    key = channel + '/' + name
    # the with-block closes the file handle once the upload has finished or failed
    with open(file, "rb") as data:
        s3.Bucket(bucket).put_object(Key=key, Body=data)

s3_train_path = "{}/train".format(s3_project_folder)
s3_validation_path = "{}/validation".format(s3_project_folder)
s3_lst_path = s3_project_folder

# the two list files go directly into the project folder
upload_to_s3(s3_lst_path, train_lst_path, 'train.lst')
upload_to_s3(s3_lst_path, validation_lst_path, 'validation.lst')

# every image paired with the S3 folder it belongs to
uploads = [(s3_train_path, i) for i in train_set] + \
          [(s3_validation_path, i) for i in validation_set]
total = len(uploads)
# Report progress roughly every 5 %. With 10 images or fewer round() yields 0,
# which would make the modulo below raise ZeroDivisionError, hence max(1, ...).
modulus = max(1, round(0.05 * total))
for index, (channel, i) in enumerate(uploads):
    category = categories[int(i['label'])]
    upload_to_s3(
        channel=channel,
        file="{}/{}/{}".format(path, category, i['file_name']),
        # same '<category>-<file_name>' name that is recorded in the list files
        name="{}-{}".format(category, i['file_name'])
    )
    if not index % modulus:
        print("Uploading: {} %".format(round(index / total * 100)))

print("Finished")

# Training the model

We are going to train the model with SageMaker. However, instead of creating a single training job, we use the Hyperparameter Tuning feature to create multiple training jobs. This lets several training jobs with different hyperparameter values be tried out, in search of the one with the best validation accuracy.

There are two kinds of parameters that need to be set for training. The first one are the parameters for the training job. These include:

* **Input specification**: These are the training and validation channels that specify the path where training data is present. These are specified in the "InputDataConfig" section. The main parameters that need to be set is the "ContentType" which can be set to "rec" or "lst" based on the input data format and the S3Uri which specifies the bucket and the folder where the data is present. 
* **Output specification**: This is specified in the "OutputDataConfig" section. We just need to specify the path where the output can be stored after training
* **Resource config**: This section specifies the type of instance on which to run the training and the number of hosts used for training. If "InstanceCount" is more than 1, then training can be run in a distributed manner. 

Apart from the above set of parameters, there are hyperparameters that are specific to the algorithm. These are:

* **num_layers**: The number of layers (depth) for the network. We use 18 in this sample, but other values such as 34, 50, 101, 152 and 200 can be used. 
* **num_training_samples**: This is the total number of training samples.
* **num_classes**: This is the number of output classes for the new dataset.
* **epochs**: Number of training epochs
* **learning_rate**: Learning rate for training
* **mini_batch_size**: The number of training samples used for each mini batch. In distributed training, the number of training samples used per batch will be N * mini_batch_size where N is the number of hosts on which training is run
* **use_pretrained_model**: Whether to start from a pretrained model that already recognizes some image features, so that the model does not have to learn to understand images from scratch. Note that the code below does not set this hyperparameter explicitly, so the algorithm's default applies.
* **optimizer**: Which optimizer to use in the training

### Step IC6: Setup training parameters
Run the training using Amazon sagemaker CreateHyperParameterTuningJob API

In [None]:
# --- network and data -------------------------------------------------------
# Depth of the network; the algorithm supports 18, 34, 50, 101, 152 and 200 layers.
num_layers = 18
# Shape of the input images passed to the algorithm as the image_shape hyperparameter.
image_shape = "3,224,224"
# Size of the training set and number of output classes, taken from the data prepared above.
num_training_samples = len(train_set)
num_classes = len(categories)
# Number of training epochs per training job.
epochs = 10
top_k = '2'

# --- ranges explored by hyperparameter tuning -------------------------------
# Mini batch size (integer range).
mini_batch_size_min = 16
mini_batch_size_max = 64
# Learning rate (continuous range, given as strings).
learning_rate_min = "0.0001"
learning_rate_max = "1.0"
# Optimizers to choose from.
optimizers = ['sgd', 'adam', 'rmsprop', 'nag']

# --- size of the tuning job -------------------------------------------------
# Upper bound on the total number of training jobs ...
hpo_max_number_of_training_jobs = 20
# ... and on how many of them may run at the same time.
hpo_max_number_of_parallel_jobs = 1

### Step IC7: Run training jobs with automatic hyperparameter tuning
Use SageMaker's Image Classification built-in algorithm to run the training job

In [None]:
%%time
import time
import boto3
from time import gmtime, strftime
from sagemaker import session, estimator
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner 

s3 = boto3.client('s3')
# create unique job name: project folder name plus a UTC timestamp
job_name_prefix = s3_project_folder
timestamp = time.strftime('-%Y%m%d-%H%M%S', time.gmtime())
job_name = job_name_prefix + timestamp

# define the SageMaker training resources
imageclassification = estimator.Estimator(training_image,
                                          role,
                                          train_instance_count=1,
                                          train_instance_type='ml.p2.xlarge',
                                          output_path='s3://{}/{}/output'.format(bucket, job_name_prefix),
                                          sagemaker_session=sess)

def _image_channel(s3_uri):
    """Build an input channel for ``s3_uri`` with the settings shared by all four channels."""
    return session.s3_input(s3_uri,
                            distribution='FullyReplicated',
                            s3_data_type='S3Prefix',
                            input_mode='File',
                            content_type='application/x-image')

# define data input channels in S3: the two image folders and the two list files
train_channel = _image_channel("s3://{}/{}".format(bucket, s3_train_path))
validation_channel = _image_channel("s3://{}/{}".format(bucket, s3_validation_path))
train_lst_channel = _image_channel("s3://{}/{}/train.lst".format(bucket, s3_lst_path))
validation_lst_channel = _image_channel("s3://{}/{}/validation.lst".format(bucket, s3_lst_path))

# set the training static hyperparameters (identical for every training job)
imageclassification.set_hyperparameters(num_layers=num_layers,
                                        image_shape=image_shape,
                                        num_classes=num_classes,
                                        epochs=epochs,
                                        top_k=top_k,
                                        num_training_samples=num_training_samples,
                                        precision_dtype='float32')

# set the parameters for hyperparameter tuning job
hpo_objective_metric_name = 'validation:accuracy'
hpo_hyperparameter_ranges = \
{
    'learning_rate': ContinuousParameter(learning_rate_min, learning_rate_max),
    'mini_batch_size': IntegerParameter(mini_batch_size_min, mini_batch_size_max),
    'optimizer': CategoricalParameter(optimizers)
}

# The job limits come from the values configured in Step IC6 instead of being
# hard-coded here; lower hpo_max_number_of_training_jobs there for a quick trial.
tuner_es = HyperparameterTuner(imageclassification,
                               hpo_objective_metric_name,
                               hpo_hyperparameter_ranges,
                               objective_type='Maximize',
                               max_jobs=hpo_max_number_of_training_jobs,
                               max_parallel_jobs=hpo_max_number_of_parallel_jobs,
                               early_stopping_type='Auto')

# run the hyperparameter tuning job
tuner_es.fit({ #TODO: Fix channels
    'train': train_channel,
    'validation': validation_channel,
    'train_lst': train_lst_channel,
    'validation_lst': validation_lst_channel
}, job_name=job_name, include_cls_metadata=False)

print('Hyperparameter Tuning job name: {}'.format(job_name))

***Continuously check training job status wait until the job completes***

In [None]:
# Block this cell until the hyperparameter tuning job started above has completed.
tuner_es.wait()

### Step IC8: Analyze result and pick best training

***List the top 5 hyperparameters***

In [None]:
from sagemaker import HyperparameterTuningJobAnalytics
# Load the results of the tuning job into a DataFrame and show the five
# training jobs with the highest FinalObjectiveValue.
tuner_metrics_es = HyperparameterTuningJobAnalytics(job_name)
tuning_results = tuner_metrics_es.dataframe()
tuning_results.sort_values(by='FinalObjectiveValue', ascending=False).head(5)

***Pick best training job***

In [None]:
# Name of the training job that the tuner reports as the best one; it is used
# below to look up the model artifacts.
best_training_job_name = tuner_es.best_training_job()

# Deploy The Model

A trained model does nothing on its own. We now want to use the model to perform inference. For this example, that means predicting the category of a given image.

## Step IC9: Create model

We now create a SageMaker Model from the training output. Using the model we can create a Batch Transform Job or an Endpoint Configuration.

In [None]:
%%time
import boto3
from time import gmtime, strftime

sage = boto3.Session().client(service_name='sagemaker')

# A model with a fixed name can only be created once: running this cell again
# (e.g. after another tuning job) would fail because the model already exists.
# The UTC timestamp keeps the name unique, like the endpoint names below.
model_name = project_name + strftime('-%Y-%m-%d-%H-%M-%S', gmtime())
print(model_name)

# locate the model artifacts produced by the best training job
info = sage.describe_training_job(TrainingJobName=best_training_job_name)
model_data = info['ModelArtifacts']['S3ModelArtifacts']
print(model_data)

# container image used to serve the model
hosting_image = get_image_uri(boto3.Session().region_name, 'image-classification')

primary_container = {
    'Image': hosting_image,
    'ModelDataUrl': model_data,
}

create_model_response = sage.create_model(
    ModelName = model_name,
    ExecutionRoleArn = role,
    PrimaryContainer = primary_container)

print(create_model_response['ModelArn'])

# Realtime inference

We now host the model with an endpoint and perform realtime inference.

### Step IC10: Create Endpoint Configuration
At launch, we will support configuring REST endpoints in hosting with multiple models, e.g. for A/B testing purposes. In order to support this, customers create an endpoint configuration, that describes the distribution of traffic across the models, whether split, shadowed, or sampled in some way.

In addition, the endpoint configuration describes the instance type required for model deployment, and at launch will describe the autoscaling configuration.

In [None]:
from time import gmtime, strftime

# Uses the strftime/gmtime names imported in this cell. The timestamp has no
# leading hyphen because the '-epc-' separator already ends with one.
timestamp = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
endpoint_config_name = job_name_prefix + '-epc-' + timestamp

# a single production variant that serves the model created in the previous step
endpoint_config_response = sage.create_endpoint_config(
    EndpointConfigName = endpoint_config_name,
    ProductionVariants=[{
        'InstanceType':'ml.m4.xlarge',
        'InitialInstanceCount':1,
        'ModelName':model_name,
        'VariantName':'AllTraffic'}])

print('Endpoint configuration name: {}'.format(endpoint_config_name))
print('Endpoint configuration arn:  {}'.format(endpoint_config_response['EndpointConfigArn']))

### Step IC11: Create Endpoint
Creates the endpoint that serves up the model. The end result is an endpoint that can be validated and incorporated into production applications.

In [None]:
%%time
import time
import boto3

# boto3 SageMaker client; the name `sagemaker` is kept because the following
# cell refers to it.
sagemaker = boto3.client('sagemaker')

# The timestamp has no leading hyphen because the '-ep-' separator already ends with one.
timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.gmtime())
endpoint_name = job_name_prefix + '-ep-' + timestamp
print('Endpoint name: {}'.format(endpoint_name))

# create the endpoint from the endpoint configuration of the previous step
endpoint_params = {
    'EndpointName': endpoint_name,
    'EndpointConfigName': endpoint_config_name,
}
endpoint_response = sagemaker.create_endpoint(**endpoint_params)
print('EndpointArn = {}'.format(endpoint_response['EndpointArn']))

***Continuously check whether the endpoint has been successfully created***

In [None]:
# Report the status as it is right after endpoint creation was requested.
response = sagemaker.describe_endpoint(EndpointName=endpoint_name)
status = response['EndpointStatus']
print('EndpointStatus = {}'.format(status))

# Block here until the 'endpoint_in_service' waiter returns.
endpoint_waiter = sagemaker.get_waiter('endpoint_in_service')
endpoint_waiter.wait(EndpointName=endpoint_name)

# Query the endpoint once more and report the final status.
endpoint_response = sagemaker.describe_endpoint(EndpointName=endpoint_name)
status = endpoint_response['EndpointStatus']
print('Endpoint creation ended with EndpointStatus = {}'.format(status))

# Anything other than 'InService' at this point is treated as a failure.
if not status == 'InService':
    raise Exception('Endpoint creation failed.')

If you see the message,

> `Endpoint creation ended with EndpointStatus = InService`

then congratulations! You now have a functioning inference endpoint. You can confirm the endpoint configuration and status by navigating to the "Endpoints" tab in the AWS SageMaker console.

### Step IC12: Prepare upload directory


***First make the directory for test data upload***

In [None]:
import os
# Folder next to this notebook that receives the test images.
# NOTE: this rebinds `path`, which pointed to the input-images folder until now.
path = "{}/test-images".format(os.getcwd())
# exist_ok=True creates the folder only when it is missing, without a separate
# existence check beforehand
os.makedirs(path, exist_ok=True)

### Step IC13: Upload test data

**IMPORTANT:**   
***Please upload several (e.g. 10) images to /test-images folder in this Jupyter Notebook***

***Display image***

In [None]:
from html import escape
from IPython.display import HTML
import os

# Only regular, non-hidden files are test images; this leaves out sub-folders
# and hidden entries (e.g. a .ipynb_checkpoints folder). Sorted for a stable order.
test_images = sorted(
    name for name in os.listdir(path)
    if not name.startswith('.') and os.path.isfile(os.path.join(path, name))
)

figures = []
for im in test_images:
    file_name = "{}/{}".format('./test-images', im)
    # file names are escaped so that characters such as quotes or '<' cannot break the markup
    figures.append(
        '<figure style="float:left;"><img src="{}" alt="{}" width="200"/><figcaption ><center>{}</center></figcaption></figure>'.format(
            escape(file_name, quote=True), escape(im, quote=True), escape(im)))
display_string = ''.join(figures)

HTML(data=display_string)
    

### Step IC14: Perform evaluation

Run inference on the endpoint for every image uploaded to the /test-images folder.

In [None]:
import boto3
import json
import os
import numpy as np
from IPython.display import Image

runtime = boto3.Session().client(service_name='runtime.sagemaker')

for im in test_images:
    file_name = "{}/{}".format(path,im)
    # sub-folders (e.g. a hidden checkpoint folder) cannot be opened as images
    if not os.path.isfile(file_name):
        continue
    with open(file_name, 'rb') as f:
        payload = bytearray(f.read())
    response = runtime.invoke_endpoint(EndpointName=endpoint_name,
                                       ContentType='application/x-image',
                                       Body=payload)
    # the response body is JSON; it is indexed by class below
    result = json.loads(response['Body'].read())
    # pick the entry with the highest value and map its position back to the
    # category name
    index = int(np.argmax(result))
    display(Image("./test-images/{}".format(im)))
    print("File name: {} Result: label - {} with confidence {}\n\n".format(im, categories[index], str(result[index])))

### Step IC15: Clean up (Optional) 

If you are using this notebook only for trial/experiment, you might want to delete the Sagemaker Endpoint created to avoid cost.

In [None]:
# Delete the endpoint created in Step IC11. Only the endpoint itself is removed
# by this call; the endpoint configuration and the model are left in place.
sage.delete_endpoint(EndpointName=endpoint_name)

***Delete images source and model output in S3 (Optional)***

In [None]:
import boto3
s3 = boto3.client('s3')

# Everything this notebook wrote lives under "<project_name>/". The trailing
# slash keeps other folders that merely start with the same name untouched.
s3_prefix = project_name + '/'

# S3 returns at most 1000 keys per listing call, so the listing is paginated;
# each page is deleted with one delete_objects call, which also accepts at
# most 1000 keys. No EncodingType or Delimiter is passed, so the keys come
# back exactly as they have to be passed to delete_objects.
paginator = s3.get_paginator('list_objects_v2')
deleted_count = 0
for page in paginator.paginate(Bucket=bucket, Prefix=s3_prefix):
    # 'Contents' is missing when nothing matches the prefix
    to_be_deleted = [{'Key': obj['Key']} for obj in page.get('Contents', [])]
    if not to_be_deleted:
        continue
    delete_response = s3.delete_objects(
        Bucket=bucket,
        Delete={
            'Objects': to_be_deleted,
            'Quiet': True
        }
    )
    failed = delete_response.get('Errors', [])
    for error in failed:
        print("Could not delete {}: {}".format(error.get('Key'), error.get('Message')))
    deleted_count += len(to_be_deleted) - len(failed)

print("Deleted {} objects from s3://{}/{}".format(deleted_count, bucket, s3_prefix))

***Clean up data in this Jupyter Notebook (Optional)***

In [None]:
# Remove the local working data: the test images, the extracted training
# images and the uploaded archive.
!rm -rf ./test-images
!rm -rf ./input-images
!rm -f input-images.zip
# presumably left behind when the archive was created on macOS
!rm -rf __MACOSX