In [1]:
import numpy as np
import pandas as pd

# AWS / SageMaker SDK imports (get_execution_role is used later to look up the IAM role)
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html

## Upload Data to S3

In [2]:
# Bucket and object key of the training data (RecordIO file).
bucket_name = 'weisurya-sagemaker-playground'
training_file_key = 'biketrain/bike_train_numeric_columns.recordio'

# Full S3 URIs: where model artifacts are written and where the training file lives.
s3_model_output_location = 's3://{bucket}/biketrain/model'.format(bucket=bucket_name)
s3_training_file_location = 's3://{bucket}/{key}'.format(bucket=bucket_name,
                                                         key=training_file_key)

In [3]:
# Show both S3 URIs, one per line, as a sanity check before uploading.
print(s3_model_output_location, s3_training_file_location, sep='\n')

s3://weisurya-sagemaker-playground/biketrain/model
s3://weisurya-sagemaker-playground/biketrain/bike_train_numeric_columns.recordio


In [4]:
# Writing to and reading from S3 is straightforward.
# Files are referred to as objects in S3.
# The file name is referred to as the key name in S3.
# Files stored in S3 are automatically replicated across 3 different availability zones 
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    """Upload a local file to S3.

    Args:
        filename: Path of the local file to upload; opened in binary mode.
        bucket: Name of the destination S3 bucket.
        key: Object key (the "file name" inside the bucket) to write to.

    Returns:
        Whatever ``upload_fileobj`` returns for the target object.
    """
    with open(filename, 'rb') as source_file:
        # The S3 object handle is built only after the local file opened successfully.
        target_object = boto3.Session().resource('s3').Bucket(bucket).Object(key)
        return target_object.upload_fileobj(source_file)

In [5]:
# Upload the local RecordIO training file to the configured bucket/key.
write_to_s3(filename='bike_train_numeric_columns.recordio',
            bucket=bucket_name,
            key=training_file_key)

## Training Algorithm Docker Image
### AWS maintains a separate image for every region and algorithm

In [8]:
# Registry paths for the PCA algorithm image provided by SageMaker, keyed by region.
#  https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
# Each region's image is hosted under a different ECR account id.
pca_registry_accounts = {
    'us-west-2': '174872318107',
    'us-east-1': '382416733822',
    'us-east-2': '404615174143',
    'eu-west-1': '438346466558',
}
containers = {
    region: '{0}.dkr.ecr.{1}.amazonaws.com/pca:latest'.format(account, region)
    for region, account in pca_registry_accounts.items()
}

In [6]:
# IAM execution role; passed to the Estimator below.
role = get_execution_role()

In [7]:
# This role carries the permissions needed to train and deploy models.
# The SageMaker service is trusted to assume this role.
print(role)

arn:aws:iam::483061387878:role/service-role/AmazonSageMaker-ExecutionRole-20200106T033205


## Build Model

In [9]:
# SageMaker session object; handed to the Estimator via `sagemaker_session`.
sess = sagemaker.Session()

In [10]:
# Configure the training job:
#  - algorithm image: the PCA container registered for the current boto3 session's region
#  - compute: how many instances to use for distributed training, and of which type
#  - output_path: where the trained model artifacts are stored
#  - base_job_name: optional name given to the training job
# Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html
training_region = boto3.Session().region_name

estimator = sagemaker.estimator.Estimator(
    containers[training_region],
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name='pca-biketrain-v1',
)

In [11]:
# Hyperparameters appropriate for the PCA training algorithm.
pca_hyperparameters = {
    'feature_dim': 4,
    'num_components': 3,
    'subtract_mean': False,
    'algorithm_mode': 'regular',
    'mini_batch_size': 200,
}
estimator.set_hyperparameters(**pca_hyperparameters)

In [12]:
# Display the hyperparameters currently set on the estimator.
estimator.hyperparameters()

{'feature_dim': 4,
 'num_components': 3,
 'subtract_mean': False,
 'algorithm_mode': 'regular',
 'mini_batch_size': 200}

### Train the model

In [13]:
# This PCA training job reads its input from the "train" channel.
# Reference: Supported channels by algorithm
#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
estimator.fit({'train':s3_training_file_location})

2020-01-27 19:21:50 Starting - Starting the training job...
2020-01-27 19:21:51 Starting - Launching requested ML instances......
2020-01-27 19:22:50 Starting - Preparing the instances for training......
2020-01-27 19:24:03 Downloading - Downloading input data...
2020-01-27 19:24:49 Training - Training image download completed. Training in progress.
2020-01-27 19:24:49 Uploading - Uploading generated training model.[34mDocker entrypoint called with argument(s): train[0m
[34m[01/27/2020 19:24:46 INFO 139648230414144] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'_num_gpus': u'auto', u'_log_level': u'info', u'subtract_mean': u'true', u'force_dense': u'true', u'epochs': 1, u'algorithm_mode': u'regular', u'extra_components': u'-1', u'_kvstore': u'dist_sync', u'_num_kv_servers': u'auto'}[0m
[34m[01/27/2020 19:24:46 INFO 139648230414144] Reading provided configuration from /opt/ml/input/config/hyperparameters.json: {


2020-01-27 19:24:55 Completed - Training job completed
Training seconds: 52
Billable seconds: 52


## Deploy Model

In [14]:
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m4.xlarge',
                             endpoint_name = 'pca-biketrain-v1')

-----------------!

## Run Predictions

In [15]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Requests are sent as CSV text; responses are parsed with the JSON deserializer.
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = json_deserializer

In [16]:
# Send a single sample with 4 feature values (matching feature_dim=4) to the endpoint.
predictor.predict([[-1.333660693,-1.092736969,0.993213054,1.567753667]])

{'projections': [{'projection': [1.6828124523162842,
    0.45077428221702576,
    -1.8276870250701904]}]}

## Summary

1. Ensure Training, Test and Validation data are in S3 Bucket
2. Select Algorithm Container Registry Path - Path varies by region
3. Configure Estimator for training - Specify Algorithm container, instance count, instance type, model output location
4. Specify algorithm specific hyper parameters
5. Train model
6. Deploy model - Specify instance count, instance type and endpoint name
7. Run Predictions