In [31]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   

# Define IAM role: the execution role of this notebook instance, passed to the estimator further down
role = get_execution_role()
# S3 key prefix under which this notebook stores its training data and model output
prefix = 'sagemaker/DEMO-xgboost-dm'
# each region has its XGBoost container; only the regions listed here are supported by this notebook
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}
my_region = boto3.session.Session().region_name # set the region of the instance
# Fail early with an actionable message instead of a bare KeyError/TypeError
# when the session's region is missing from the map above (or is not set at all).
if my_region not in containers:
    raise ValueError(
        "No XGBoost container is configured for region {!r}; supported regions are: {}".format(
            my_region, ', '.join(sorted(containers))))
print("Great! - your SageMaker Instance is in the " + my_region + " region. You will use the " + containers[my_region] + " container for your SageMaker endpoint to make inference requests.")

Great! - your SageMaker Instance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint to make inference requests.


In [32]:
# Fetch the census CSV from your own S3 bucket. The file is based on the publicly
# available census data from the ML repository curated by the University of California, Irvine.
from io import StringIO

s3 = boto3.resource('s3')
# place the adult_census.csv file in a bucket in your account
bucket_name = 'train-census-earnings-xgboost'
object_key = 'adult_census.csv'

# Read the whole object into memory, decode it as UTF-8 text,
# and parse that text into a pandas dataframe
csv_obj = s3.Object(bucket_name, object_key)
csv_body = csv_obj.get()['Body']
csv_string = csv_body.read().decode('utf-8')
raw_data = pd.read_csv(StringIO(csv_string))

# Preview the first rows
raw_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,y_no,y_yes
0,39,6,77516,1,13,3,9,4,1,2,2174,0,40,1,0,1
1,50,2,83311,1,13,1,5,3,1,2,0,0,13,1,0,1
2,38,1,215646,4,9,2,7,4,1,2,0,0,40,1,0,1
3,53,1,234721,3,7,1,7,3,5,2,0,0,40,1,0,1
4,28,1,338409,1,13,1,6,1,5,1,0,0,40,13,0,1


In [33]:
# Build the modelling frame by passing the raw frame through pandas' dummy-variable encoding.
# NOTE(review): the recorded preview of raw_data shows only numeric values, so this
# is presumably a pass-through for this dataset - confirm.
model_data = pd.get_dummies(data=raw_data)
model_data.head(n=5)

In [34]:
# Randomize the data and split it between train and test datasets on a 70% 30% split respectively
# A fixed random_state keeps the shuffle (and therefore the split) reproducible across runs.
shuffled_data = model_data.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(shuffled_data))
# Slice positionally with .iloc rather than np.split: np.split on a DataFrame goes through
# DataFrame.swapaxes, which newer pandas releases deprecate. The resulting rows are the same.
train_data = shuffled_data.iloc[:split_at]
test_data = shuffled_data.iloc[split_at:]
# Sanity check: the two parts must partition the full dataset
assert len(train_data) + len(test_data) == len(model_data)
print(train_data.shape, test_data.shape)

(22792, 16) (9769, 16)


In [35]:
# Reformat the training data for the XGBoost container: label (y_yes) in the first column,
# both label columns removed from the features, no header row and no index column.
train_features = train_data.drop(['y_no', 'y_yes'], axis=1)
train_table = pd.concat([train_data['y_yes'], train_features], axis=1)
train_table.to_csv('train.csv', index=False, header=False)

# Save the new train dataset to your S3 bucket as train.csv.
# S3 object keys always use '/' as separator, so the key is built explicitly
# instead of with os.path.join (whose separator depends on the local platform).
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')

# Describe the uploaded data (everything under <prefix>/train) as a CSV training input
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [36]:
# Open a SageMaker session and build the XGBoost estimator on top of the
# region-specific container image; model artifacts go to <bucket>/<prefix>/output.
session_sm = sagemaker.Session()
model_output_path = 's3://{}/{}/output'.format(bucket_name, prefix)
xgb = sagemaker.estimator.Estimator(
    containers[my_region],
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=model_output_path,
    sagemaker_session=session_sm,
)

# Hyperparameters for a binary classifier trained for 100 boosting rounds.
# NOTE(review): XGBoost documents rate_drop as a dart-booster setting and
# tweedie_variance_power as a reg:tweedie setting; with objective='binary:logistic'
# and no booster override they presumably have no effect - confirm before relying on them.
xgb_hyperparameters = {
    'alpha': 1.4697769189147052,
    'rate_drop': 0.3,
    'tweedie_variance_power': 1.4,
    'max_depth': 3,
    'eta': 0.27735070284333196,
    'min_child_weight': 1.758169167129938,
    'objective': 'binary:logistic',
    'num_round': 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

In [37]:
# With the data uploaded and the estimator configured, start the training job
# (gradient optimization on the ml.m4.xlarge instance requested by the estimator).
# The prepared S3 input is supplied as the 'train' channel.
training_channels = {'train': s3_input_train}
xgb.fit(training_channels)

2020-02-06 18:41:08 Starting - Starting the training job...
2020-02-06 18:41:10 Starting - Launching requested ML instances......
2020-02-06 18:42:13 Starting - Preparing the instances for training......
2020-02-06 18:43:21 Downloading - Downloading input data...
2020-02-06 18:43:55 Training - Downloading the training image..[34mArguments: train[0m
[34m[2020-02-06:18:44:15:INFO] Running standalone xgboost training.[0m
[34m[2020-02-06:18:44:15:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2020-02-06:18:44:15:INFO] File size need to be processed in the node: 0.83mb. Available memory size in the node: 8513.19mb[0m
[34m[2020-02-06:18:44:15:INFO] Determined delimiter of CSV input is ','[0m
[34m[18:44:15] S3DistributionType set as FullyReplicated[0m
[34m[18:44:15] 22792x14 matrix with 319088 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[18:44:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 40 extra no


2020-02-06 18:44:27 Uploading - Uploading generated training model
2020-02-06 18:44:27 Completed - Training job completed
Training seconds: 66
Billable seconds: 66


In [38]:
# Host the trained model behind an endpoint (a single ml.m4.xlarge instance)
# and keep the returned predictor for sending inference requests
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

-----------------!

In [39]:
# Predict whether census participants in the test dataset earned more than 50K
# Feature matrix for inference: the test rows without the two label columns
test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values #load the data into an array
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
# Parse the complete comma-separated response into a float array. No leading character
# is sliced off: dropping the first character would silently corrupt the first score
# whenever it does not begin with a redundant '0' (for example '1.2e-05').
predictions_array = np.fromstring(predictions.strip(), sep=',')
# Every test row must have received exactly one score
assert predictions_array.shape[0] == test_data_array.shape[0], "expected one prediction per test row"
print(predictions_array.shape)

(9769,)


In [40]:
# Evaluate the performance and accuracy of the model
def _pct(part, whole):
    """Return part/whole as a percentage, or 0.0 when whole is zero."""
    return part / whole * 100 if whole else 0.0

# Confusion matrix: rows are the observed y_yes label, columns the predicted label
# (scores rounded to 0/1, i.e. a 0.5 decision threshold).
observed_labels = test_data['y_yes'].astype(int)
predicted_labels = np.round(predictions_array).astype(int)
cm = pd.crosstab(index=observed_labels, columns=predicted_labels, rownames=['Observed'], colnames=['Predicted'])
# Force a full 2x2 table so the positional lookups below cannot fail
# when one class is never predicted or never observed.
cm = cm.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

tn = cm.iloc[0, 0]  # observed 0, predicted 0
fn = cm.iloc[1, 0]  # observed 1, predicted 0
tp = cm.iloc[1, 1]  # observed 1, predicted 1
fp = cm.iloc[0, 1]  # observed 0, predicted 1
p = _pct(tp + tn, tp + tn + fp + fn)  # overall share of correct predictions

# The percentages in the table are normalised per predicted class (per column).
# NOTE(review): the captions assume y_yes == 1 means "Over 50K". In the recorded run
# roughly three quarters of the test rows have y_yes == 1 - confirm the label encoding.
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "Under 50K", "Over 50K"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("Under 50K", _pct(tn, tn + fn), tn, _pct(fp, tp + fp), fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Over 50K", _pct(fn, tn + fn), fn, _pct(tp, tp + fp), tp))


Overall Classification Rate: 87.2%

Predicted      Under 50K      Over 50K
Observed
Under 50K      78% (1581)    10% (805)
Over 50K        22% (442)     90% (6941) 



In [41]:
# Clean up the SageMaker-related resources created by this notebook.
# 1) Delete the hosted endpoint.
endpoint_name = xgb_predictor.endpoint
sagemaker.Session().delete_endpoint(endpoint_name)
# 2) Empty the S3 bucket holding the training data and model artifacts.
#    This removes every object in the bucket (not only those under the notebook's prefix);
#    the bucket itself is not deleted by this cell.
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': '87E484C6598F7785',
   'HostId': 'uG3t84pjqI3jMkbALlwbu5Y1gifYb3m6ni1wxNwJSNWiQK83qm+YW3zE1FIMzZSdvTsKTqyAD/k=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'uG3t84pjqI3jMkbALlwbu5Y1gifYb3m6ni1wxNwJSNWiQK83qm+YW3zE1FIMzZSdvTsKTqyAD/k=',
    'x-amz-request-id': '87E484C6598F7785',
    'date': 'Thu, 06 Feb 2020 18:55:06 GMT',
    'connection': 'close',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'adult_census_clean.csv'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/train/train.csv'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2020-02-06-18-41-08-669/output/model.tar.gz'}]}]