In [32]:
import os
import boto3
import re
import sagemaker

role = sagemaker.get_execution_role()
region = boto3.Session().region_name
bucket = "bdx-demo-sagemaker"
prefix = (
    "obesity-risks"  # place to upload training files within the bucket
)

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import time
import json
import sagemaker.amazon.common as smac

In [34]:
s3 = boto3.client("s3")
filename = "obesity_training_data_1.csv"
s3.download_file(bucket, "data/obesity_training_data.csv", filename)
data = pd.read_csv(filename, header=None)
data.columns = [
    "id",
    "diagnosis",
    "gender",
    "age",
    "drinking_freq",
    "snp1",
    "snp2",
    "snp3",
    "snp4",
    "snp5",
    "snp6",
    "snp7",
    "snp8",
    "snp9",
    "snp10",
    "snp11",
    "snp12",
    "snp13",
    "snp14",
    "snp15",
    "snp16",
    "snp17",
    "snp18",
    "snp19",
    "snp20",
    "snp21",
    "snp22"
]
# save the data
data.to_csv("data.csv", sep=",", index=False)
# print the shape of the data file
print(data.shape)
# show the top few rows
display(data.head())
# describe the data object
display(data.describe())
# we will also summarize the categorical field diganosis
display(data.diagnosis.value_counts())

(429582, 27)


Unnamed: 0,id,diagnosis,gender,age,drinking_freq,snp1,snp2,snp3,snp4,snp5,...,snp13,snp14,snp15,snp16,snp17,snp18,snp19,snp20,snp21,snp22
0,1000015,N,0,64,5,2,0,2,1,0,...,1,2,1,0,2,2,1,1,2,0
1,1000027,N,0,60,3,2,1,0,1,1,...,2,1,1,0,2,1,2,0,0,0
2,1000039,N,0,58,1,2,0,0,1,1,...,2,0,0,0,2,1,2,1,1,1
3,1000040,N,1,66,3,1,0,2,1,0,...,1,1,1,0,2,1,2,0,2,0
4,1000053,O,0,67,4,1,1,1,1,0,...,2,1,2,1,2,2,1,0,2,0


Unnamed: 0,id,gender,age,drinking_freq,snp1,snp2,snp3,snp4,snp5,snp6,...,snp13,snp14,snp15,snp16,snp17,snp18,snp19,snp20,snp21,snp22
count,429582.0,429582.0,429582.0,429582.0,429582.0,429582.0,429582.0,429582.0,429582.0,429582.0,...,429582.0,429582.0,429582.0,429582.0,429582.0,429582.0,429582.0,429582.0,429582.0,429582.0
mean,3514596.0,0.458823,56.859282,2.857524,1.199587,0.412729,1.130415,1.396956,0.489078,0.808942,...,1.350601,1.276085,0.856907,0.149443,1.804682,1.272188,1.12548,0.334656,1.485977,0.642406
std,1451054.0,0.498302,7.998803,1.490233,0.695285,0.57178,0.701569,0.661284,0.607122,0.697206,...,0.663342,0.680713,0.698568,0.37137,0.422957,0.681157,0.702015,0.528315,0.619095,0.661762
min,1000015.0,0.0,38.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2257101.0,0.0,51.0,2.0,1.0,0.0,1.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,2.0,1.0,1.0,0.0,1.0,0.0
50%,3516627.0,0.0,58.0,3.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,1.0,1.0,0.0,2.0,1.0,1.0,0.0,2.0,1.0
75%,4771964.0,1.0,63.0,4.0,2.0,1.0,2.0,2.0,1.0,1.0,...,2.0,2.0,1.0,0.0,2.0,2.0,2.0,1.0,2.0,1.0
max,6025052.0,1.0,73.0,6.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


N    325558
O    104024
Name: diagnosis, dtype: int64

In [35]:
rand_split = np.random.rand(len(data))
train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
test_list = rand_split >= 0.9

data_train = data[train_list]
data_val = data[val_list]
data_test = data[test_list]
train_y = ((data_train.iloc[:, 1] == "O") + 0).to_numpy()
train_X = data_train.iloc[:, 2:].to_numpy()

val_y = ((data_val.iloc[:, 1] == "O") + 0).to_numpy()
val_X = data_val.iloc[:, 2:].to_numpy()

test_y = ((data_test.iloc[:, 1] == "O") + 0).to_numpy()
test_X = data_test.iloc[:, 2:].to_numpy()

In [36]:
train_file = "linear_train.data"

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, train_X.astype("float32"), train_y.astype("float32"))
f.seek(0)

boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "train", train_file)
).upload_fileobj(f)

In [37]:
validation_file = "linear_validation.data"

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, val_X.astype("float32"), val_y.astype("float32"))
f.seek(0)

boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "validation", validation_file)
).upload_fileobj(f)

In [38]:
from sagemaker import image_uris

container = image_uris.retrieve(region=boto3.Session().region_name, framework="linear-learner")

In [39]:
linear_job = "DEMO-linear-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

print("Job name is:", linear_job)

linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": linear_job,
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.c4.2xlarge", "VolumeSizeInGB": 10},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train/".format(bucket, prefix),
                    "S3DataDistributionType": "ShardedByS3Key",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/validation/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
    ],
    "OutputDataConfig": {"S3OutputPath": "s3://{}/{}/".format(bucket, prefix)},
    "HyperParameters": {
        "feature_dim": "25",
        "mini_batch_size": "100",
        "predictor_type": "regressor",
        "epochs": "10",
        "num_models": "32",
        "loss": "absolute_loss",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 60 * 60},
}

Job name is: DEMO-linear-2022-04-18-21-15-39


In [40]:
%%time

region = boto3.Session().region_name
sm = boto3.client("sagemaker")

sm.create_training_job(**linear_training_params)

status = sm.describe_training_job(TrainingJobName=linear_job)["TrainingJobStatus"]
print(status)
sm.get_waiter("training_job_completed_or_stopped").wait(TrainingJobName=linear_job)
if status == "Failed":
    message = sm.describe_training_job(TrainingJobName=linear_job)["FailureReason"]
    print("Training failed with the following error: {}".format(message))
    raise Exception("Training job failed")

InProgress
CPU times: user 123 ms, sys: 11.8 ms, total: 135 ms
Wall time: 10min


In [41]:
linear_hosting_container = {
    "Image": container,
    "ModelDataUrl": sm.describe_training_job(TrainingJobName=linear_job)["ModelArtifacts"][
        "S3ModelArtifacts"
    ],
}

create_model_response = sm.create_model(
    ModelName=linear_job, ExecutionRoleArn=role, PrimaryContainer=linear_hosting_container
)

print(create_model_response["ModelArn"])

arn:aws:sagemaker:us-east-1:925680695682:model/demo-linear-2022-04-18-21-15-39


In [42]:
linear_endpoint_config = "DEMO-linear-endpoint-config-" + time.strftime(
    "%Y-%m-%d-%H-%M-%S", time.gmtime()
)
print(linear_endpoint_config)
create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=linear_endpoint_config,
    ProductionVariants=[
        {
            "InstanceType": "ml.m4.xlarge",
            "InitialInstanceCount": 1,
            "ModelName": linear_job,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])

DEMO-linear-endpoint-config-2022-04-18-21-34-49
Endpoint Config Arn: arn:aws:sagemaker:us-east-1:925680695682:endpoint-config/demo-linear-endpoint-config-2022-04-18-21-34-49


In [43]:
%%time

linear_endpoint = "DEMO-linear-endpoint-" + time.strftime("%Y%m%d%H%M", time.gmtime())
print(linear_endpoint)
create_endpoint_response = sm.create_endpoint(
    EndpointName=linear_endpoint, EndpointConfigName=linear_endpoint_config
)
print(create_endpoint_response["EndpointArn"])

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Status: " + status)

sm.get_waiter("endpoint_in_service").wait(EndpointName=linear_endpoint)

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

if status != "InService":
    raise Exception("Endpoint creation did not succeed")

DEMO-linear-endpoint-202204182136
arn:aws:sagemaker:us-east-1:925680695682:endpoint/demo-linear-endpoint-202204182136
Status: Creating
Arn: arn:aws:sagemaker:us-east-1:925680695682:endpoint/demo-linear-endpoint-202204182136
Status: InService
CPU times: user 137 ms, sys: 7.61 ms, total: 145 ms
Wall time: 5min 1s


In [45]:
def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv, arr, delimiter=",", fmt="%g")
    return csv.getvalue().decode().rstrip()

In [46]:
runtime = boto3.client("runtime.sagemaker")

payload = np2csv(test_X)
response = runtime.invoke_endpoint(
    EndpointName=linear_endpoint, ContentType="text/csv", Body=payload
)
result = json.loads(response["Body"].read().decode())
test_pred = np.array([r["score"] for r in result["predictions"]])

In [47]:
test_mae_linear = np.mean(np.abs(test_y - test_pred))
test_mae_baseline = np.mean(
    np.abs(test_y - np.median(train_y))
)  ## training median as baseline predictor

print("Test MAE Baseline :", round(test_mae_baseline, 3))
print("Test MAE Linear:", round(test_mae_linear, 3))

Test MAE Baseline : 0.24
Test MAE Linear: 0.24


In [48]:
test_pred_class = (test_pred > 0.5) + 0
test_pred_baseline = np.repeat(np.median(train_y), len(test_y))

prediction_accuracy = np.mean((test_y == test_pred_class)) * 100
baseline_accuracy = np.mean((test_y == test_pred_baseline)) * 100

print("Prediction Accuracy:", round(prediction_accuracy, 1), "%")
print("Baseline Accuracy:", round(baseline_accuracy, 1), "%")

Prediction Accuracy: 76.0 %
Baseline Accuracy: 76.0 %
