In [68]:
import numpy as np
import pandas as pd

import boto3
import re

import sagemaker
from sagemaker import get_execution_role
from sklearn.preprocessing import LabelEncoder
from sagemaker.predictor import csv_serializer

In [69]:
# Specify your bucket name
bucket_name = 'udemy-workshop-chloe'

training_folder = r'wwc-workshop/iris/data/'
validation_folder = r'wwc-workshop/iris/validation/'
test_folder = r'wwc-workshop/iris/test/'

s3_model_output_location = r's3://{0}/wwc-workshop/iris/model'.format(bucket_name)
s3_training_file_location = r's3://{0}/{1}'.format(bucket_name,training_folder)
s3_validation_file_location = r's3://{0}/{1}'.format(bucket_name,validation_folder)
s3_validation_file_location = r's3://{0}/{1}'.format(bucket_name,test_folder)

In [70]:
print(s3_model_output_location)
print(s3_training_file_location)

s3://udemy-workshop-chloe/wwc-workshop/iris/model
s3://udemy-workshop-chloe/wwc-workshop/iris/data/


In [71]:
# Write and Reading from S3 is just as easy
# files are referred as objects in S3.  
# file name is referred as key name in S3

# File stored in S3 is automatically replicated across 3 different availability zones 
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    with open(filename,'rb') as f: # Read in binary mode
        return boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)

In [72]:
write_to_s3('train.csv', 
            bucket_name,
            training_folder + 'train.csv')

In [73]:
write_to_s3('validation.csv',
            bucket_name,
            validation_folder + 'validation.csv')

In [74]:
write_to_s3('test.csv',
            bucket_name,
            test_folder + 'test.csv')

In [75]:
# Establish a session with AWS
sess = sagemaker.Session()

In [76]:
role = get_execution_role()

In [77]:
# Sagemaker API now maintains the algorithm container mapping for us
# Specify the region, algorithm and version
container = sagemaker.amazon.amazon_estimator.get_image_uri(
    sess.boto_region_name,
    "xgboost", 
    "latest")

print('Using SageMaker XGBoost container:\n{} ({})'.format(container, sess.boto_region_name))

	get_image_uri(region, 'xgboost', '0.90-1').


Using SageMaker XGBoost container:
644912444149.dkr.ecr.eu-west-2.amazonaws.com/xgboost:latest (eu-west-2)


In [78]:
# Configure the training job
# Specify type and number of instances to use
# S3 location where final artifacts needs to be stored

#   Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html

estimator = sagemaker.estimator.Estimator(
    container,
    role, 
    train_instance_count=1, 
    train_instance_type='ml.m4.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name ='iris-workshop-v1-3')

In [79]:
# Specify hyper parameters that appropriate for the training algorithm
# XGBoost Training Parameter Reference: 
#   https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

estimator.set_hyperparameters(max_depth=5,
                              objective="multi:softmax",
                              num_class=3,
                              num_round=50,
                              early_stopping_rounds=10)

In [80]:
estimator.hyperparameters()

{'max_depth': 5,
 'objective': 'multi:softmax',
 'num_class': 3,
 'num_round': 50,
 'early_stopping_rounds': 10}

In [81]:
training_input_config = sagemaker.session.s3_input(
    s3_data=s3_training_file_location,
    content_type='text/csv',
    s3_data_type='S3Prefix')

validation_input_config = sagemaker.session.s3_input(
    s3_data=s3_validation_file_location,
    content_type='text/csv',
    s3_data_type='S3Prefix'
)

data_channels = {'train': training_input_config, 'validation': validation_input_config}

In [82]:
print(training_input_config.config)
print(validation_input_config.config)

{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': 's3://udemy-workshop-chloe/wwc-workshop/iris/data/'}}, 'ContentType': 'text/csv'}
{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': 's3://udemy-workshop-chloe/wwc-workshop/iris/test/'}}, 'ContentType': 'text/csv'}


In [83]:
estimator.fit(data_channels)

2019-11-26 15:47:53 Starting - Starting the training job...
2019-11-26 15:47:54 Starting - Launching requested ML instances......
2019-11-26 15:48:54 Starting - Preparing the instances for training...
2019-11-26 15:49:51 Downloading - Downloading input data...
2019-11-26 15:50:22 Training - Training image download completed. Training in progress..[31mArguments: train[0m
[31m[2019-11-26:15:50:23:INFO] Running standalone xgboost training.[0m
[31m[2019-11-26:15:50:23:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8535.84mb[0m
[31m[2019-11-26:15:50:23:INFO] Determined delimiter of CSV input is ','[0m
[31m[15:50:23] S3DistributionType set as FullyReplicated[0m
[31m[15:50:23] 105x4 matrix with 420 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-11-26:15:50:23:INFO] Determined delimiter of CSV input is ','[0m
[31m[15:50:23] S3DistributionType set as FullyReplicated[0m
[31m[15:50:23] 1

In [84]:
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m4.xlarge',
                             endpoint_name = 'iris-workshop-v1-3')

-------------------------------------------------------------------------------------!

0 = Iris Setosa 
1 = Iris Versicolor 
3 = Iris Virginca

In [85]:
test = pd.read_csv('test.csv')

In [86]:
test.head()

Unnamed: 0,0,5.1,3.8,1.9,0.4
0,1,6.0,2.7,5.1,1.6
1,1,5.0,2.3,3.3,1.0
2,2,6.4,2.7,5.3,1.9
3,1,6.1,3.0,4.6,1.4
4,0,5.0,3.2,1.2,0.2


In [105]:
testLabels = test.iloc[:, 0]

In [96]:
testData = test.iloc[:, 1:]

In [97]:
testData = testData.to_numpy()

In [98]:
testData

array([[6. , 2.7, 5.1, 1.6],
       [5. , 2.3, 3.3, 1. ],
       [6.4, 2.7, 5.3, 1.9],
       [6.1, 3. , 4.6, 1.4],
       [5. , 3.2, 1.2, 0.2],
       [5.2, 2.7, 3.9, 1.4],
       [5.5, 2.5, 4. , 1.3],
       [5.9, 3. , 5.1, 1.8],
       [5.8, 2.7, 4.1, 1. ],
       [6.2, 2.8, 4.8, 1.8],
       [5.1, 3.7, 1.5, 0.4],
       [4.9, 3.1, 1.5, 0.1],
       [7.1, 3. , 5.9, 2.1],
       [6.9, 3.1, 5.1, 2.3]])

In [99]:
from sagemaker.predictor import csv_serializer, json_deserializer

predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = None

In [122]:
int(str(predictor.predict( [6. , 2.7, 5.1, 1.6]))[2])

2

In [109]:
def getLabel(num):
    if(num == 0):
        return "Iris-Setosa"
    if(num == 1):
        return "Iris-Versicolor"
    if(num == 2):
        return "Iris-Virginica"

In [126]:
count =0
for data in testData:
    print(data)
    print("Actual = "+ getLabel(testLabels[count]))
    prediction = int(str(predictor.predict(data))[2])
    print("Prediction= " + getLabel(prediction))
    print(" ")
    count +=1
    

[6.  2.7 5.1 1.6]
Actual = Iris-Versicolor
Prediction= Iris-Virginica
 
[5.  2.3 3.3 1. ]
Actual = Iris-Versicolor
Prediction= Iris-Versicolor
 
[6.4 2.7 5.3 1.9]
Actual = Iris-Virginica
Prediction= Iris-Virginica
 
[6.1 3.  4.6 1.4]
Actual = Iris-Versicolor
Prediction= Iris-Versicolor
 
[5.  3.2 1.2 0.2]
Actual = Iris-Setosa
Prediction= Iris-Setosa
 
[5.2 2.7 3.9 1.4]
Actual = Iris-Versicolor
Prediction= Iris-Versicolor
 
[5.5 2.5 4.  1.3]
Actual = Iris-Versicolor
Prediction= Iris-Versicolor
 
[5.9 3.  5.1 1.8]
Actual = Iris-Virginica
Prediction= Iris-Virginica
 
[5.8 2.7 4.1 1. ]
Actual = Iris-Versicolor
Prediction= Iris-Versicolor
 
[6.2 2.8 4.8 1.8]
Actual = Iris-Virginica
Prediction= Iris-Virginica
 
[5.1 3.7 1.5 0.4]
Actual = Iris-Setosa
Prediction= Iris-Setosa
 
[4.9 3.1 1.5 0.1]
Actual = Iris-Setosa
Prediction= Iris-Setosa
 
[7.1 3.  5.9 2.1]
Actual = Iris-Virginica
Prediction= Iris-Virginica
 
[6.9 3.1 5.1 2.3]
Actual = Iris-Virginica
Prediction= Iris-Virginica
 
