In [1]:
import numpy as np
import pandas as pd
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

## Upload Data to S3

In [33]:
# Single source of truth for every S3 location used by this notebook.
bucket_name = "williamli-ml-sagemaker"  # S3 bucket that stores the data and model artifacts

# Key prefixes inside the bucket, one per data split. The trailing "/" matters:
# object keys are built further down by appending a file name to the prefix.
training_folder = "heart_disease/training/"
validation_folder = "heart_disease/validation/"
testing_folder = "heart_disease/testing/"

# Fully-qualified S3 URIs handed to the estimator and its input channels.
s3_model_output_location = f"s3://{bucket_name}/heart_disease/model"
s3_training_file_location = f"s3://{bucket_name}/{training_folder}"
s3_validation_file_location = f"s3://{bucket_name}/{validation_folder}"
# Added for parity with the other splits so the test prefix need not be hard-coded.
s3_testing_file_location = f"s3://{bucket_name}/{testing_folder}"

In [3]:
# Sanity check: echo the resolved S3 URIs before they are used for upload/training.
print(s3_model_output_location)
print(s3_training_file_location)
print(s3_validation_file_location)

s3://williamli-ml-sagemaker/heart_disease/model
s3://williamli-ml-sagemaker/heart_disease/training/
s3://williamli-ml-sagemaker/heart_disease/validation/


In [4]:
def write_to_s3(filename, bucket, key):
    """Upload a local file to ``s3://<bucket>/<key>``.

    Args:
        filename: Path of the local file to upload (opened in binary mode).
        bucket: Name of the destination S3 bucket.
        key: Object key to write inside the bucket.

    Returns:
        Whatever ``upload_fileobj`` returns for the target object.
    """
    with open(filename, 'rb') as stream:
        s3 = boto3.Session().resource('s3')
        target = s3.Bucket(bucket).Object(key)
        return target.upload_fileobj(stream)

In [5]:
# Push the training and validation splits to their respective S3 prefixes.
for local_csv, s3_key in (
    ("train_data.csv", training_folder + "heart_disease_train.csv"),
    ("validation_data.csv", validation_folder + "heart_disease_validation.csv"),
):
    write_to_s3(local_csv, bucket_name, s3_key)

In [34]:
# Upload the held-out test split; it is the input of the batch transform at the end.
# NOTE(review): this cell's execution count (34) shows it was run after the
# evaluation cells — on a fresh kernel it runs here, in document order.
write_to_s3("test_data.csv", bucket_name, testing_folder + "heart_disease_test.csv")

## Set up Training Docker Image

In [6]:
# Managed spot training settings consumed by the estimator below.
use_spot_instance = True
max_run = 3600

job_name = "xgboost-heart-disease-v1"

# max_wait and the checkpoint location are only set when spot instances are used.
if use_spot_instance:
    max_wait = 7200
    checkpoint_s3_uri = f"s3://{bucket_name}/heart_disease/checkpoints/{job_name}"
else:
    max_wait = None
    checkpoint_s3_uri = None

print(f"Checkpoint uri: {checkpoint_s3_uri}")

Checkpoint uri: s3://williamli-ml-sagemaker/heart_disease/checkpoints/xgboost-heart-disease-v1


In [7]:
# Establish a SageMaker session; reused below for the region lookup and by the estimator.
sess = sagemaker.Session()

In [8]:
# Resolve the execution role of this notebook environment; it is passed to the estimator below.
role = get_execution_role()

In [9]:
# Show which role was resolved.
# NOTE(review): the output exposes the AWS account id — consider clearing it before sharing.
print(role)

arn:aws:iam::432449956699:role/service-role/AmazonSageMaker-ExecutionRole-20220104T104116


In [10]:
# Look up the XGBoost training/inference image for the session's region.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sess.boto_region_name,
    version="1.2-2",
)
print(f"Using XGBoost Container {container}")

Using XGBoost Container 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-2


## Build Model

In [11]:
# Configure the training job: which image to run, on what hardware, and where
# the model artifact and (spot) checkpoints are written.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    sagemaker_session=sess,
    base_job_name=job_name,
    # compute
    instance_count=1,
    instance_type='ml.m5.xlarge',
    # spot training limits
    use_spot_instances=use_spot_instance,
    max_run=max_run,
    max_wait=max_wait,
    # S3 destinations
    output_path=s3_model_output_location,
    checkpoint_s3_uri=checkpoint_s3_uri,
)

In [12]:
# NOTE(review): "binary:hinge" makes XGBoost emit hard 0/1 class labels rather than
# probabilities, so the log-loss / ROC-AUC figures computed further down are based
# on labels, not scores. Switching to "binary:logistic" would require thresholding
# the predictions before classification_report — confirm which is intended.
xgb_hyperparameters = {
    "max_depth": 5,
    "objective": "binary:hinge",
    "num_round": 150,
}
estimator.set_hyperparameters(**xgb_hyperparameters)

In [13]:
# Display the hyperparameters currently registered on the estimator.
estimator.hyperparameters()

{'max_depth': 5, 'objective': 'binary:hinge', 'num_round': 150}

In [14]:
def _csv_channel(s3_prefix):
    """Describe a CSV input channel that reads every object under ``s3_prefix``."""
    return sagemaker.session.TrainingInput(
        s3_data=s3_prefix,
        content_type="csv",
        s3_data_type="S3Prefix",
    )


# One channel per split; both point at the prefixes defined in the upload section.
training_input_config = _csv_channel(s3_training_file_location)
validation_input_config = _csv_channel(s3_validation_file_location)

# Channel names as passed to estimator.fit().
data_channels = {
    "train": training_input_config,
    "validation": validation_input_config,
}

In [15]:
# Inspect the request payload each channel will contribute to the training job.
print(training_input_config.config)
print(validation_input_config.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://williamli-ml-sagemaker/heart_disease/training/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://williamli-ml-sagemaker/heart_disease/validation/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


In [16]:
# Launch the training job on the "train" and "validation" channels; the job's
# log stream (including per-round train/validation error) is echoed below.
estimator.fit(data_channels)

2022-04-03 19:48:45 Starting - Starting the training job...
2022-04-03 19:49:09 Starting - Launching requested ML instancesProfilerReport-1649015325: InProgress
.........
2022-04-03 19:50:29 Starting - Preparing the instances for training...
2022-04-03 19:51:14 Downloading - Downloading input data...
2022-04-03 19:51:30 Training - Downloading the training image...
2022-04-03 19:52:09 Training - Training image download completed. Training in progress..[34m[2022-04-03 19:52:08.493 ip-10-2-226-211.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2022-04-03:19:52:08:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2022-04-03:19:52:08:INFO] Failed to parse hyperparameter objective value binary:hinge to Json.[0m
[34mReturning the value itself[0m
[34m[2022-04-03:19:52:08:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2022-04-03:19:52:08:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2022-04-03:19:52:08:INFO


2022-04-03 19:52:46 Uploading - Uploading generated training model[34m[140]#011train-error:0.23151#011validation-error:0.23317[0m
[34m[141]#011train-error:0.23135#011validation-error:0.23303[0m
[34m[142]#011train-error:0.23127#011validation-error:0.23305[0m
[34m[143]#011train-error:0.23124#011validation-error:0.23299[0m
[34m[144]#011train-error:0.23120#011validation-error:0.23301[0m
[34m[145]#011train-error:0.23115#011validation-error:0.23293[0m
[34m[146]#011train-error:0.23108#011validation-error:0.23276[0m
[34m[147]#011train-error:0.23104#011validation-error:0.23265[0m
[34m[148]#011train-error:0.23097#011validation-error:0.23254[0m
[34m[149]#011train-error:0.23094#011validation-error:0.23247[0m

2022-04-03 19:53:10 Completed - Training job completed
ProfilerReport-1649015325: NoIssuesFound
Training seconds: 100
Billable seconds: 36
Managed Spot Training savings: 64.0%


## Deploy Model

In [18]:
# Host the trained model on a real-time endpoint named after the job.
# NOTE(review): nothing in this notebook deletes the endpoint afterwards; call
# predictor.delete_endpoint() once evaluation is finished to stop incurring cost.
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m5.xlarge',
                             endpoint_name=job_name)

-----!

## Evaluate Model

In [19]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

import boto3
from sagemaker import get_execution_role
import sagemaker
from sagemaker.serializers import CSVSerializer

In [20]:
from sklearn.metrics import log_loss, roc_auc_score, classification_report

In [21]:
# Re-attach to the deployed endpoint by name so the evaluation cells can run
# without re-deploying. The name must match the endpoint_name used at deploy time.
endpoint_name = "xgboost-heart-disease-v1"
predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name)

In [22]:
# Serialize request payloads (the NumPy chunks sent below) as CSV.
predictor.serializer = CSVSerializer()

In [23]:
# Column names are stored as one comma-separated line; the label column comes first.
with open("train_data_column_names.txt", 'r') as f:
    # Strip surrounding whitespace so a trailing newline (or spaces after commas)
    # in the file cannot end up inside a column name.
    train_col_names = [name.strip() for name in f.read().strip().split(',')]

In [24]:
# Display the parsed column names (label first, then the features).
train_col_names

['HeartDisease',
 'BMI',
 'Smoking',
 'AlcoholDrinking',
 'Stroke',
 'PhysicalHealth',
 'MentalHealth',
 'DiffWalking',
 'Sex',
 'AgeCategory',
 'Race',
 'Diabetic',
 'PhysicalActivity',
 'GenHealth',
 'SleepTime',
 'Asthma',
 'KidneyDisease',
 'SkinCancer']

In [25]:
# Load both splits with the same column names.
# NOTE(review): presumably the CSVs are header-less, which is why names are supplied — confirm.
train_df, validation_df = (
    pd.read_csv(csv_path, names=train_col_names)
    for csv_path in ("train_data.csv", "validation_data.csv")
)

In [26]:
# Peek at the first rows to confirm the columns line up with their names.
train_df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,21.58,1,0,0,0.0,0.0,0,0,12,5,0,0,0,9.0,0,0,0
1,1,28.8,1,0,0,2.0,10.0,0,1,11,5,0,1,2,6.0,0,0,0
2,0,24.41,1,0,0,0.0,0.0,0,1,12,5,0,1,2,8.0,0,0,1
3,1,37.25,1,0,0,0.0,0.0,0,1,9,5,0,0,2,8.0,0,0,0
4,0,24.48,1,0,0,1.0,0.0,0,1,7,5,0,1,4,7.0,0,0,0


In [27]:
# Column 0 is the label; every remaining column is a feature.
label_position = 0
feature_positions = slice(1, None)

X_train, y_train = train_df.iloc[:, feature_positions], train_df.iloc[:, label_position]
X_validation, y_validation = (
    validation_df.iloc[:, feature_positions],
    validation_df.iloc[:, label_position],
)

In [28]:
# Score the training features in 10 chunks so each request stays small.
train_predictions = []
for chunk in np.array_split(X_train.to_numpy(), 10):
    response = predictor.predict(chunk).decode("utf-8")
    # Accept comma- and/or newline-delimited responses and ignore empty tokens.
    chunk_predictions = [
        float(token)
        for token in response.replace("\n", ",").split(",")
        if token.strip()
    ]
    # Fail loudly if the endpoint returned a different number of predictions than
    # rows sent; otherwise predictions and labels would silently misalign.
    assert len(chunk_predictions) == len(chunk), (
        f"expected {len(chunk)} predictions, got {len(chunk_predictions)}"
    )
    print (chunk.shape)
    train_predictions.extend(chunk_predictions)

(40939, 17)
(40939, 17)
(40939, 17)
(40939, 17)
(40939, 17)
(40939, 17)
(40939, 17)
(40939, 17)
(40939, 17)
(40939, 17)


In [29]:
# Score the validation features in 10 chunks so each request stays small.
validation_predictions = []
for chunk in np.array_split(X_validation.to_numpy(), 10):
    response = predictor.predict(chunk).decode("utf-8")
    # Accept comma- and/or newline-delimited responses and ignore empty tokens.
    chunk_predictions = [
        float(token)
        for token in response.replace("\n", ",").split(",")
        if token.strip()
    ]
    # Fail loudly if the endpoint returned a different number of predictions than
    # rows sent; otherwise predictions and labels would silently misalign.
    assert len(chunk_predictions) == len(chunk), (
        f"expected {len(chunk)} predictions, got {len(chunk_predictions)}"
    )
    print (chunk.shape)
    validation_predictions.extend(chunk_predictions)

(17546, 17)
(17546, 17)
(17546, 17)
(17546, 17)
(17545, 17)
(17545, 17)
(17545, 17)
(17545, 17)
(17545, 17)
(17545, 17)


In [30]:
# NOTE(review): log loss expects probability estimates, but the model was trained
# with objective "binary:hinge", so the predictions here appear to be hard 0/1
# labels. The resulting value mostly reflects the error rate times a clipping
# penalty and is not a meaningful calibration measure — confirm before reporting.
print("Train Log Loss: ", log_loss(y_train, train_predictions))
print("Validation Log Loss: ", log_loss(y_validation, validation_predictions))

Train Log Loss:  7.976545148353147
Validation Log Loss:  8.029194230738419


In [31]:
# NOTE(review): ROC AUC is computed from the same predictions as above; if those
# are hard 0/1 labels (objective "binary:hinge") the curve has a single operating
# point, so this is not a ranking-quality measure — confirm.
print("Train ROC AUC: ", roc_auc_score(y_train, train_predictions))
print("Validation ROC AUC: ", roc_auc_score(y_validation, validation_predictions))

Train ROC AUC:  0.7690394219495932
Validation ROC AUC:  0.7675794767086515


In [32]:
# Per-class precision / recall / F1 on the validation split.
# (classification_report returns a pre-formatted string, hence the print.)
print(classification_report(y_validation, validation_predictions))

              precision    recall  f1-score   support

           0       0.81      0.70      0.75     87784
           1       0.73      0.84      0.78     87670

    accuracy                           0.77    175454
   macro avg       0.77      0.77      0.77    175454
weighted avg       0.77      0.77      0.77    175454



## Batch Transform

In [4]:
# Re-create the session, role and image URI so this section can run on its own
# (the execution counts show it was run in a separate kernel session).
sess = sagemaker.Session()
role = get_execution_role()
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sess.boto_region_name,
    version="1.2-2",
)
print(f"Using XGBoost Container {container}")

Using XGBoost Container 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-2


In [5]:
# S3 location of the model artifact written by the training job above.
# Hard-coded (including the job's timestamp suffix), so it must be updated after re-training.
model_data_path = "s3://williamli-ml-sagemaker/heart_disease/model/xgboost-heart-disease-v1-2022-04-03-19-48-45-180/output/model.tar.gz"

In [7]:
# Wrap the stored model artifact and inference image in a Model object so it can
# be used for batch transform without re-training.
trained_model = sagemaker.model.Model(
    model_data=model_data_path,
    image_uri=container,
    role=role)  # execution role resolved earlier in this section

# Optional alternative (not run): host the model on a real-time endpoint instead, e.g.
# trained_model.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')

In [15]:
test_data_path = "s3://williamli-ml-sagemaker/heart_disease/testing/heart_disease_test.csv"

# The recorded run of this cell failed with a ClientError. Without a split type the
# whole S3 object is sent as one request, which has to fit within max_payload
# (1 MB here). Splitting the CSV by line and batching records per request keeps
# every request under that limit; results are re-assembled one record per line.
# NOTE(review): the test CSV must contain only the feature columns (no header and
# no label column) for the model to accept it — confirm how test_data.csv was written.
transformer = trained_model.transformer(
    instance_count=1,
    instance_type="ml.m4.xlarge",
    strategy="MultiRecord",
    assemble_with="Line",
    max_payload=1,
)
transformer.transform(test_data_path, content_type="text/csv", split_type="Line")

........................................[34m[2022-04-10:20:08:40:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2022-04-10:20:08:40:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2022-04-10:20:08:40:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }
 

UnexpectedStatusException: Error for Transform job sagemaker-xgboost-2022-04-10-20-02-09-518: Failed. Reason: ClientError: See job logs for more information

In [12]:
# Show where the transform job writes its results (no output_path was given, so
# the SDK chose one).
# NOTE(review): execution count 12 precedes the transform cell's 15, so the path
# printed below comes from an earlier transformer object — re-run in order.
print(transformer.output_path)

s3://sagemaker-us-east-1-432449956699/sagemaker-xgboost-2022-04-10-17-19-48-641
