In [1]:
# Import libraries
from sagemaker import get_execution_role
import boto3, sys, os
import sagemaker

# S3 prefix
bucket = 'sagemaker-getting-start-test'
prefix = 'sagemaker/sklearn-randomforest'

sagemaker_session = sagemaker.Session()
# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()
my_region = boto3.session.Session().region_name # set the region of the instance
print("Execution role is " + role)
print("Success - the MySageMakerInstance is in the " + my_region + ".")

Execution role is arn:aws:iam::251344623468:role/service-role/AmazonSageMaker-ExecutionRole-20191017T203175
Success - the MySageMakerInstance is in the ap-northeast-1.


In [2]:
s3 = boto3.resource('s3')

try:
    if my_region == 'ap-northeast-1':
        s3.create_bucket(Bucket=bucket)
    else:
        s3.create_bucket(Bucket=bucket, CreateBucketConfiguration={'LocationConstraint': my_region})
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ', e)

S3 error:  An error occurred (IllegalLocationConstraintException) when calling the CreateBucket operation: The unspecified location constraint is incompatible for the region specific endpoint this request was sent to.


In [3]:
# Upload csv train data to S3
WORK_DIRECTORY = 'data'
train_input = sagemaker_session.upload_data("{}/train_tweets.csv".format(WORK_DIRECTORY), bucket=bucket, key_prefix="{}/{}".format(prefix, WORK_DIRECTORY))

In [4]:
# train data and save a model
account = sagemaker_session.boto_session.client('sts').get_caller_identity()['Account']
region = sagemaker_session.boto_session.region_name
container_name = 'sklearn-rf-container'
image_full = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account, region, container_name)

clf = sagemaker.estimator.Estimator(image_full, role, 1, 'ml.c4.2xlarge', 
                                    output_path="s3://{}/{}/output".format(bucket, prefix),
                                    sagemaker_session=sagemaker_session)

# training tweet data with RandomForestClassifier in scikit-learn container
clf.fit(train_input)

2020-06-03 09:45:10 Starting - Starting the training job...
2020-06-03 09:45:12 Starting - Launching requested ML instances.........
2020-06-03 09:46:53 Starting - Preparing the instances for training...
2020-06-03 09:47:31 Downloading - Downloading input data...
2020-06-03 09:47:43 Training - Downloading the training image...
2020-06-03 09:48:25 Training - Training image download completed. Training in progress.[34mStarting the training.[0m
[34mObtained 12055 features in dataset[0m
[34mX shape: (59440, 12055)[0m
[34my shape: (59440,)[0m

2020-06-03 09:50:13 Uploading - Uploading generated training model
2020-06-03 09:50:13 Completed - Training job completed
[34mTraining complete.[0m
Training seconds: 162
Billable seconds: 162


In [5]:
from sagemaker.predictor import csv_serializer
predictor = clf.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge", serializer=csv_serializer)

-------------!

In [6]:
# load test payload
import numpy as np
import pandas as pd
test_data = pd.read_csv("{}/payload_tweet.csv".format(WORK_DIRECTORY), header=None)
print("test_data: {}".format(test_data.shape))

test_data: (2, 12055)


In [7]:
predictions = predictor.predict(test_data.values).decode('utf-8')
predictions_array = np.fromstring(predictions, sep=' ') # and turn the prediction into an array
print("Predicted values: {}".format(predictions_array))

Predicted values: [0. 0.]
