In [2]:
import os, boto3, re, sagemaker

In [5]:
# Region of the default boto3 session and the IAM role SageMaker will assume.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()

# Where code and model artifacts are stored in S3.
# Swap in your own bucket and/or prefix here if you prefer.
bucket = sagemaker.Session().default_bucket()

# Key prefix inside the bucket under which training files are uploaded.
prefix = 'sagemaker/terry-breast-cancer-prediction'

In [4]:
# Display the bucket name resolved by sagemaker.Session().default_bucket().
bucket

'sagemaker-ca-central-1-376999591241'

In [6]:
# Display the region name of the default boto3 session.
region

'ca-central-1'

In [7]:
# Display the execution role ARN returned by sagemaker.get_execution_role().
role

'arn:aws:iam::376999591241:role/service-role/AmazonSageMaker-ExecutionRole-20220327T171034'

In [9]:
import io, time, json, numpy as np, pandas as pd, matplotlib.pyplot as plt, sagemaker.amazon.common as smac

In [10]:
# Download the breast cancer CSV from the public SageMaker sample bucket.
s3 = boto3.client('s3')

filename = 'wdbc.csv'
s3.download_file('sagemaker-sample-files', 'datasets/tabular/breast_cancer/wdbc.csv', filename)

# The file ships without a header row, so read it raw and name the columns below.
data = pd.read_csv(filename, header=None)

# Column names taken from wdbc.names: an id, the diagnosis label, then the same
# ten measurements reported three times (mean, standard error, worst).
column_names = ['id', 'diagnosis']
column_names += [
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
    'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean',
    'fractal_dimension_mean',
]
column_names += [
    'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
    'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
    'fractal_dimension_se',
]
column_names += [
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
    'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst',
    'fractal_dimension_worst',
]
data.columns = column_names

# Keep a local copy that includes the header row.
data.to_csv('data.csv', sep=',', index=False)

# Shape first, then the rich summaries: leading rows, numeric description,
# and the class balance of the categorical diagnosis field.
print(data.shape)
for summary in (data.head(), data.describe(), data['diagnosis'].value_counts()):
    display(summary)

(569, 32)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,125020600.0,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


B    357
M    212
Name: diagnosis, dtype: int64

In [12]:
# Fixed seed so the 80/10/10 split (and every metric derived from it) is
# reproducible when the notebook is restarted and re-run.
SPLIT_SEED = 42

# One uniform draw in [0, 1) per row decides which partition the row lands in.
rand_split = np.random.default_rng(SPLIT_SEED).random(len(data))
train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
test_list = rand_split >= 0.9

data_train = data[train_list]
data_val = data[val_list]
data_test = data[test_list]

# The three masks partition [0, 1), so no row may be lost or duplicated.
assert len(data_train) + len(data_val) + len(data_test) == len(data)


def _features_and_labels(frame):
    """Split a partition of ``data`` into a feature matrix and a 0/1 label vector.

    Column 1 holds the diagnosis; 'M' is encoded as 1 and anything else as 0.
    Columns 2 onward are used as features (column 0, the id, is dropped).

    Returns:
        (features, labels) as numpy arrays.
    """
    labels = ((frame.iloc[:, 1] == 'M') + 0).to_numpy()
    features = frame.iloc[:, 2:].to_numpy()
    return features, labels


x_train, y_train = _features_and_labels(data_train)
x_val, y_val = _features_and_labels(data_val)
x_test, y_test = _features_and_labels(data_test)

In [13]:
def _upload_dense_tensor(features, labels, channel, object_name):
    """Serialize features/labels with smac.write_numpy_to_dense_tensor and upload to S3.

    The object is written to ``<prefix>/<channel>/<object_name>`` inside ``bucket``.
    Both arrays are cast to float32 before serialization.

    Args:
        features: numpy array of feature rows.
        labels: numpy array of labels, one per feature row.
        channel: sub-folder under ``prefix`` (e.g. 'train' or 'validation').
        object_name: file name of the uploaded object.
    """
    buffer = io.BytesIO()
    smac.write_numpy_to_dense_tensor(buffer, features.astype('float32'), labels.astype('float32'))
    buffer.seek(0)  # rewind so the upload starts from the first byte
    # S3 keys always use '/', so build the key explicitly instead of relying on
    # os.path.join, whose separator depends on the host platform.
    key = '{}/{}/{}'.format(prefix, channel, object_name)
    boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(buffer)


train_file = 'linear_train.data'
_upload_dense_tensor(x_train, y_train, 'train', train_file)

validation_file = 'linear_validation.data'
_upload_dense_tensor(x_val, y_val, 'validation', validation_file)

In [14]:
from sagemaker import image_uris

# Training image for the linear-learner algorithm in the current region.
container = image_uris.retrieve(region=boto3.Session().region_name, framework='linear-learner')

# Timestamped job name (UTC).
linear_job = 'DEMO-linear-' + time.strftime('%Y-%m-%d-%H-%M-%S', time.gmtime())
print('job name is:', linear_job)

# Input channels: both read every object under their S3 prefix.
train_channel = {
    'ChannelName': 'train',
    'DataSource': {
        'S3DataSource': {
            'S3DataType': 'S3Prefix',
            'S3Uri': 's3://{}/{}/train/'.format(bucket, prefix),
            'S3DataDistributionType': 'ShardedByS3Key',
        }
    },
    'CompressionType': 'None',
    'RecordWrapperType': 'None',
}
validation_channel = {
    'ChannelName': 'validation',
    'DataSource': {
        'S3DataSource': {
            'S3DataType': 'S3Prefix',
            'S3Uri': 's3://{}/{}/validation/'.format(bucket, prefix),
            'S3DataDistributionType': 'FullyReplicated',
        }
    },
    'CompressionType': 'None',
    'RecordWrapperType': 'None',
}

# Algorithm hyperparameters; the API takes every value as a string.
linear_hyperparameters = {
    'feature_dim': '30',
    'mini_batch_size': '100',
    'predictor_type': 'regressor',
    'epochs': '3',
    'num_models': '3',
    'loss': 'absolute_loss',
}

# Full request passed to create_training_job in a later cell.
linear_training_params = {
    'RoleArn': role,
    'TrainingJobName': linear_job,
    'AlgorithmSpecification': {'TrainingImage': container, 'TrainingInputMode': 'File'},
    'ResourceConfig': {'InstanceCount': 1, 'InstanceType': 'ml.c4.2xlarge', 'VolumeSizeInGB': 10},
    'InputDataConfig': [train_channel, validation_channel],
    'OutputDataConfig': {'S3OutputPath': 's3://{}/{}/'.format(bucket, prefix)},
    'HyperParameters': linear_hyperparameters,
    'StoppingCondition': {'MaxRuntimeInSeconds': 60 * 60},  # one hour cap
}

job name is: DEMO-linear-2022-03-27-21-56-09


In [None]:
region = boto3.Session().region_name
sm = boto3.client('sagemaker')

# Launch the training job described by linear_training_params.
sm.create_training_job(**linear_training_params)

# Status right after submission (informational only).
status = sm.describe_training_job(TrainingJobName=linear_job)['TrainingJobStatus']
print(status)

# Block until the job leaves the in-progress state. The waiter itself raises
# when the job ends in a failure state (see the boto3 waiter docs), so hold on
# to that error and look up the real failure reason before re-raising.
wait_error = None
try:
    sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=linear_job)
except Exception as exc:  # re-raised below, never swallowed
    wait_error = exc

# Re-read the job AFTER waiting: the status captured above is stale and would
# never report 'Failed'.
job_description = sm.describe_training_job(TrainingJobName=linear_job)
status = job_description['TrainingJobStatus']
print(status)
if status == 'Failed':
    message = job_description.get('FailureReason', 'no failure reason reported')
    print('training failed with the following error: {}'.format(message))
    raise Exception('training job failed') from wait_error
if wait_error is not None:
    # The wait failed for some other reason (e.g. polling gave up); surface it.
    raise wait_error

In [None]:
# Look up where the finished training job stored its model artifacts.
training_job_description = sm.describe_training_job(TrainingJobName=linear_job)
model_artifacts_url = training_job_description['ModelArtifacts']['S3ModelArtifacts']

# Hosting reuses the training image, pointed at the trained artifacts.
linear_hosting_container = {
    'Image': container,
    'ModelDataUrl': model_artifacts_url,
}

# Register the model under the same name as the training job.
create_model_response = sm.create_model(
    ModelName=linear_job,
    ExecutionRoleArn=role,
    PrimaryContainer=linear_hosting_container,
)
print(create_model_response['ModelArn'])

In [19]:
# Timestamped (UTC) name for the endpoint configuration.
config_timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.gmtime())
linear_endpoint_config = 'DEMO-linear-endpoint-config-' + config_timestamp
print(linear_endpoint_config)

# A single production variant serving the model registered above.
all_traffic_variant = {
    'InstanceType': 'ml.m4.xlarge',
    'InitialInstanceCount': 1,
    'ModelName': linear_job,
    'VariantName': 'AllTraffic',
}

create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=linear_endpoint_config,
    ProductionVariants=[all_traffic_variant],
)
endpoint_config_arn = create_endpoint_config_response['EndpointConfigArn']
print('endpoint config arn: ' + endpoint_config_arn)

DEMO-linear-endpoint-config-2022-03-27-22-02-46
endpoint config arn: arn:aws:sagemaker:ca-central-1:376999591241:endpoint-config/demo-linear-endpoint-config-2022-03-27-22-02-46


In [21]:
# Timestamped (UTC, minute resolution) endpoint name.
linear_endpoint = 'DEMO-linear-endpoint-' + time.strftime('%Y%m%d%H%M', time.gmtime())
print(linear_endpoint)

# Request the endpoint using the configuration created earlier.
create_endpoint_response = sm.create_endpoint(
    EndpointName=linear_endpoint,
    EndpointConfigName=linear_endpoint_config,
)
print(create_endpoint_response['EndpointArn'])

# Report the status immediately after the request.
resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp['EndpointStatus']
print('Status: ' + status)

# Block until the endpoint is in service, then report the final state.
endpoint_waiter = sm.get_waiter('endpoint_in_service')
endpoint_waiter.wait(EndpointName=linear_endpoint)

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp['EndpointStatus']
print('arn: ' + resp['EndpointArn'])
print('status: ' + status)

# Anything other than InService means the endpoint cannot be invoked.
if not status == 'InService':
    raise Exception('endpoint creation did not succeed')

DEMO-linear-endpoint-202203272206
arn:aws:sagemaker:ca-central-1:376999591241:endpoint/demo-linear-endpoint-202203272206
Status: Creating
arn: arn:aws:sagemaker:ca-central-1:376999591241:endpoint/demo-linear-endpoint-202203272206
status: InService


In [23]:
def np2csv(arr):
    """Render a numpy array as comma-separated text.

    Values are formatted with '%g' via np.savetxt; trailing whitespace
    (including the final newline) is stripped from the result.

    Args:
        arr: array accepted by np.savetxt.

    Returns:
        str: the CSV text.
    """
    with io.BytesIO() as buffer:
        np.savetxt(buffer, arr, fmt='%g', delimiter=',')
        raw_bytes = buffer.getvalue()
    text = raw_bytes.decode()
    return text.rstrip()

# Send the held-out features to the endpoint as one CSV payload.
runtime = boto3.client('runtime.sagemaker')
payload = np2csv(x_test)
response = runtime.invoke_endpoint(
    EndpointName=linear_endpoint,
    ContentType='text/csv',
    Body=payload,
)

# The response body is JSON with one 'score' per submitted row.
result = json.loads(response['Body'].read().decode())
test_pred = np.array([prediction['score'] for prediction in result['predictions']])

# Baseline predictor: always answer with the median training label.
train_median = np.median(y_train)

test_mae_linear = np.abs(y_test - test_pred).mean()
test_mae_baseline = np.abs(y_test - train_median).mean()

print('test MAE Baseline :', round(test_mae_baseline, 3))
print('test MAE Linear:', round(test_mae_linear, 3))

# Threshold the scores at 0.5 to get class predictions.
test_pred_class = np.where(test_pred > 0.5, 1, 0)
test_pred_baseline = np.full(len(y_test), train_median)

prediction_accuracy = (y_test == test_pred_class).mean() * 100
baseline_accuracy = (y_test == test_pred_baseline).mean() * 100

print('prediction accuracy:', round(prediction_accuracy, 1), '%')
print('baseline accuracy:', round(baseline_accuracy, 1), '%')

test MAE Baseline : 0.377
test MAE Linear: 0.176
prediction accuracy: 95.1 %
baseline accuracy: 62.3 %


In [24]:
# Delete the endpoint created above now that evaluation is finished.
# NOTE(review): only the endpoint is deleted here; the endpoint config and model created earlier are left in place — confirm whether they should be cleaned up too.
sm.delete_endpoint(EndpointName=linear_endpoint)

{'ResponseMetadata': {'RequestId': 'f089333f-6a09-493f-8000-0257dce9d08a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f089333f-6a09-493f-8000-0257dce9d08a',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sun, 27 Mar 2022 22:25:49 GMT'},
  'RetryAttempts': 0}}