In [1]:
import sagemaker
from sagemaker import get_execution_role

# S3 settings. `prefix` becomes the leading part of the key prefix used when
# the data folder is uploaded further down.
# NOTE(review): `bucket` is not referenced by the cells visible in this
# notebook (the training output path uses the session's default bucket) --
# confirm whether it is still needed.
bucket = "sagemaker-getting-start-test"
prefix = "sagemaker/scikit-iris"

# One SageMaker session shared by the upload, training and deployment cells.
sagemaker_session = sagemaker.Session()

# SageMaker-compatible role used by this Notebook Instance; handed to the
# estimator when the training job is configured.
role = get_execution_role()

In [2]:
import os

import numpy as np
from sklearn import datasets

# Load the Iris dataset and place the label column in front of the feature
# columns, so every row reads "label, feature, feature, ...".
iris = datasets.load_iris()
joined_iris = np.column_stack((iris.target, iris.data))

# Make sure the output folder exists before writing into it.
os.makedirs('./data', exist_ok=True)

# One printf-style spec per column (label first, then four features).
# savetxt joins a list of specs with `delimiter`, so each line comes out as
# "%1.1f, %1.3f, %1.3f, %1.3f, %1.3f".
column_formats = ['%1.1f'] + ['%1.3f'] * 4
np.savetxt('./data/iris.csv', joined_iris, delimiter=', ', fmt=column_formats)

In [3]:
# Local folder that holds the CSV; its name is also reused as the last
# component of the S3 key prefix.
WORK_DIRECTORY = 'data'

s3_key_prefix = "{}/{}".format(prefix, WORK_DIRECTORY)

# The value returned by upload_data is what gets passed to clf.fit(...) as the
# training input later on.
train_input = sagemaker_session.upload_data(path=WORK_DIRECTORY, key_prefix=s3_key_prefix)

In [4]:
# Look up the AWS account id and region of the current session; both are
# substituted into the ECR image URI below.
boto_session = sagemaker_session.boto_session
account = boto_session.client('sts').get_caller_identity()['Account']
region = boto_session.region_name
image_full = '{}.dkr.ecr.{}.amazonaws.com/sklearn-container:latest'.format(account, region)

# output_path points at "output" inside the session's default bucket.
model_output_path = "s3://{}/output".format(sagemaker_session.default_bucket())

# Positional arguments, in order: image, role, instance count, instance type.
# They are kept positional on purpose so the call does not depend on the
# keyword names of a particular SDK version.
clf = sagemaker.estimator.Estimator(
    image_full,
    role,
    1,
    'ml.c4.2xlarge',
    output_path=model_output_path,
    sagemaker_session=sagemaker_session,
)

# Original author's note: this trains the gradient boosting classifier model.
# The algorithm itself is supplied by the container image, not by this
# notebook, so that cannot be confirmed from here.
clf.fit(train_input)

2019-11-17 05:28:07 Starting - Starting the training job...
2019-11-17 05:28:08 Starting - Launching requested ML instances.........
2019-11-17 05:29:39 Starting - Preparing the instances for training...
2019-11-17 05:30:24 Downloading - Downloading input data
2019-11-17 05:30:24 Training - Downloading the training image...
2019-11-17 05:31:07 Uploading - Uploading generated training model.
Starting the training.
X shape: (150,4)
y shape: (150,1)
Training complete.

2019-11-17 05:31:12 Completed - Training job completed
Training seconds: 54
Billable seconds: 54


In [20]:
from sagemaker.predictor import csv_serializer

# Deploy the trained estimator to a single-instance endpoint. The predictor
# that comes back is configured with the CSV serializer for its requests.
predictor = clf.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=csv_serializer,
)

Using already existing model: sklearn-container-2019-11-17-05-28-07-002


--------------------------------------------------------------------------------------!

In [21]:
import itertools
import pandas as pd

# Read back the CSV written earlier. There is no header row; column 0 holds
# the label and the remaining columns hold the features.
iris_frame = pd.read_csv("./data/iris.csv", header=None)

# Row positions 40-49 inside each block of 50 rows (blocks start at 0, 50 and
# 100) -- presumably one block per class, as the printed labels suggest.
block_starts = range(0, 150, 50)
within_block = range(40, 50)
indices = list(map(sum, itertools.product(block_starts, within_block)))

# NOTE(review): the slice drops the final position, so 29 of the 30 selected
# rows are used -- confirm whether that is intentional.
test_data = test_rows = iris_frame.iloc[indices[:-1]]
del test_rows
print("test_data: {}".format(test_data.shape))

# Split the selection into features (all columns but the first) and labels.
test_X = test_data.iloc[:, 1:]
test_y = test_data.iloc[:, 0]
print("test_X: {}".format(test_X.shape))
print("test_y: {}".format(test_y.shape))

test_data: (29, 5)
test_X: (29, 4)
test_y: (29,)


In [22]:
# Send the feature rows to the endpoint; the reply is decoded from UTF-8
# bytes into text before it is displayed.
raw_response = predictor.predict(test_X.values)
predicted_text = raw_response.decode('utf-8')
print("Predicted values:\n{}".format(predicted_text))

# Labels of the same rows, printed for a side-by-side comparison.
print("test_y values:\n{}".format(test_y.values))

Predicted values:
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0

test_y values:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 2. 2. 2.
 2. 2. 2. 2. 2.]


In [19]:
# Tear down the endpoint created by clf.deploy(...).
# This goes through the predictor returned by deploy() rather than the
# estimator: the estimator-level delete_endpoint() is deprecated in SageMaker
# Python SDK 1.x and removed in 2.x, while the predictor method exists in both.
predictor.delete_endpoint()