In [None]:
import re
import boto3
import sagemaker
from sagemaker import get_execution_role

In [None]:
# SageMaker session wrapper; used below to look up the default bucket and
# later handed to the estimator.
sess = sagemaker.Session()
# Region of the default boto3 session; selects the regional sample-data bucket
# and the algorithm container image.
region = boto3.Session().region_name

# Source S3 bucket and key prefix that the MNIST archive is downloaded from
downloaded_data_bucket = f"sagemaker-example-files-prod-{region}"
downloaded_data_prefix = "datasets/image/MNIST"

# Destination bucket and key prefix for the uploaded training data and the
# model artifacts. default_bucket() returns the session's default bucket.
bucket = sess.default_bucket()
prefix = "sagemaker/DEMO-linear-mnist"

# Execution role returned by get_execution_role(); passed to the estimator
role = get_execution_role()

In [None]:
%%time
import pickle, gzip, numpy, json

# Fetch the gzipped MNIST pickle from the sample-data bucket into the working
# directory, then unpack its three splits.
s3 = boto3.client("s3")
s3.download_file(
    Bucket=downloaded_data_bucket,
    Key=f"{downloaded_data_prefix}/mnist.pkl.gz",
    Filename="mnist.pkl.gz",
)

# NOTE: pickle.load can execute arbitrary code embedded in the file, so this is
# only acceptable while the source bucket is trusted.
with gzip.open("mnist.pkl.gz", mode="rb") as f:
    train_set, valid_set, test_set = pickle.load(f, encoding="latin1")

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Default size for every figure created after this point; matplotlib reads the
# tuple as (width, height) in inches, so figures come out narrow and tall.
plt.rcParams["figure.figsize"] = (2, 10)

def show_digit(img, caption="", subplot=None):
    """Draw one flattened 28x28 image in grayscale with a caption.

    Args:
        img: Object with a ``reshape`` method holding the 784 pixel values
            of one image (for example a NumPy row vector).
        caption: Text used as the title of the axes.
        subplot: Matplotlib axes to draw on. When omitted, a new figure with
            a single axes is created.

    Returns:
        None. The image is drawn as a side effect.
    """
    if subplot is None:
        _, subplot = plt.subplots(1, 1)
    imgr = img.reshape((28, 28))
    subplot.axis("off")
    subplot.imshow(imgr, cmap="gray")
    # Title the axes that were actually drawn on. plt.title() targets pyplot's
    # "current" axes, which need not be the caller-supplied subplot.
    subplot.set_title(caption)

# Sanity check: render training example 30 with its label in the caption.
show_digit(train_set[0][30], f"This is a {train_set[1][30]}")


In [None]:
# Data conversion

import io
import numpy as np
import sagemaker.amazon.common as smac

In [None]:
# Cast the features straight to float32. The previous version converted every
# row to a Python list first, which produced the same values far more slowly.
# Assumes each split is a (features, labels) pair of NumPy arrays, as the
# .reshape()/.tolist() calls elsewhere in this notebook imply; np.asarray also
# accepts a plain sequence of rows.
train_set_vectors = np.asarray(train_set[0], dtype="float32")
# Binary target: 1.0 where the digit is 0, otherwise 0.0.
train_set_labels = np.where(np.asarray(train_set[1]) == 0, 1, 0).astype("float32")

validation_set_vectors = np.asarray(valid_set[0], dtype="float32")
validation_set_labels = np.where(np.asarray(valid_set[1]) == 0, 1, 0).astype("float32")

# Serialize each split (features plus labels) into its own in-memory buffer
# with SageMaker's dense-tensor writer.
train_set_buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(train_set_buf, train_set_vectors, labels=train_set_labels)

validation_set_buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(
    validation_set_buf,
    validation_set_vectors,
    labels=validation_set_labels,
)

In [None]:
# Rewind the buffer so that the upload further down reads the serialized
# records from the first byte. seek() returns the new position, which the
# notebook shows as this cell's output.
train_set_buf.seek(0)


In [None]:
# Rewind the buffer so that the upload further down reads the serialized
# records from the first byte. seek() returns the new position, which the
# notebook shows as this cell's output.
validation_set_buf.seek(0)

In [None]:
# S3 URI handed to the estimator as its output path for model artifacts
output_location = f"s3://{bucket}/{prefix}/output"
print(f"Training artifacts will be uploaded to {output_location}")

In [None]:
# upload training data to S3 bucket
import os

key = "recordio-pb-data"
s3_bucket = boto3.resource("s3").Bucket(bucket)

# S3 object keys are always "/"-separated, so they are built the same way as
# the s3:// URIs below instead of with os.path.join, which uses the local OS
# separator.
# Each buffer is rewound right before its upload so that re-running this cell
# uploads the full payload again instead of an empty object.
train_set_buf.seek(0)
s3_bucket.Object(f"{prefix}/train/{key}").upload_fileobj(train_set_buf)
validation_set_buf.seek(0)
s3_bucket.Object(f"{prefix}/validation/{key}").upload_fileobj(validation_set_buf)

# URIs of the uploaded objects; passed to the training and tuning jobs as the
# "train" and "validation" channels.
s3_train_data = f"s3://{bucket}/{prefix}/train/{key}"
print(f"Uploaded training data location:{s3_train_data}")
s3_validation_data = f"s3://{bucket}/{prefix}/validation/{key}"
print(f"Uploaded validation data location:{s3_validation_data}")

In [None]:
# Training with SageMaker Training
from sagemaker import image_uris

# True deploys from the tuner (hp_tuner); False deploys from the standalone
# estimator (linear).
deploy_amt_model = True

# Linear Learner container image for the current region
container = image_uris.retrieve(framework="linear-learner", region=region)

In [None]:
# Estimator for a single-instance Linear Learner training job; its artifacts
# are written under output_location.
linear = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    sagemaker_session=sess,
    output_path=output_location,
    instance_type="ml.c4.xlarge",
    instance_count=1,
)

linear.set_hyperparameters(
    **{
        "feature_dim": 784,  # 28 x 28 pixels per image
        "predictor_type": "binary_classifier",
        "mini_batch_size": 200,
    }
)

# Launch the training job on the uploaded training channel only
linear.fit(inputs={"train": s3_train_data})

In [None]:
# Training using hyperparameter tuner HPO
import time
from sagemaker.tuner import IntegerParameter, ContinuousParameter
from sagemaker.tuner import HyperparameterTuner

# Tuning job name: a fixed prefix plus the current UTC time to the second.
# NOTE(review): the resulting name is 31 characters long; SageMaker is believed
# to cap tuning job names at 32, so confirm before lengthening the prefix.
job_name = "DEMO-ll-mni-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
print(f"Tuning job name:{job_name}")

In [None]:
# Search space explored by the tuner
hyperparameter_ranges = {
    "wd": ContinuousParameter(1e-7, 1, scaling_type="Auto"),
    "learning_rate": ContinuousParameter(1e-5, 1, scaling_type="Auto"),
    "mini_batch_size": IntegerParameter(100, 200, scaling_type="Auto"),
}

max_jobs = 6  # total number of training jobs the tuning job may launch
max_parallel_jobs = 2  # number of those jobs allowed to run at the same time

# The tuner reuses the `linear` estimator's configuration and maximizes the
# validation metric. max_jobs has to be passed explicitly: it was previously
# defined but never forwarded, so the tuner fell back to its own default
# instead of the intended budget of 6 jobs.
hp_tuner = HyperparameterTuner(
    linear,
    "validation:binary_f_beta",
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=max_jobs,
    max_parallel_jobs=max_parallel_jobs,
    objective_type="Maximize",
)

# The objective is a validation metric, so the validation channel is supplied
# alongside the training channel.
hp_tuner.fit(
    inputs={"train": s3_train_data, "validation": s3_validation_data},
    job_name=job_name,
)

In [None]:
# deploy the model

# deploy_amt_model selects which object is deployed: the tuner when True, the
# standalone estimator otherwise. The endpoint settings are the same for both.
linear_predictor = (hp_tuner if deploy_amt_model else linear).deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
)

In [None]:
# Validate the model

from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# Send request payloads as CSV and parse the endpoint's responses as JSON
linear_predictor.serializer, linear_predictor.deserializer = (
    CSVSerializer(),
    JSONDeserializer(),
)

In [None]:
# Predict a single record. The 30:31 slice (rather than plain index 30) keeps
# the leading batch dimension, so the payload is a batch containing one row.
# initial_args pins the request ContentType to text/csv, matching the CSV
# serializer set on the predictor.
result = linear_predictor.predict(train_set[0][30:31], initial_args={"ContentType":"text/csv"})
print(result)

In [None]:
# batch prediction
import numpy as np

# Score the test set in 100 batches and collect the predicted label of every
# record, in order.
predictions = []
for array in np.array_split(test_set[0], 100):
    result = linear_predictor.predict(array)
    predictions.extend(r["predicted_label"] for r in result["predictions"])

# Convert once at the end so the labels can be cross-tabulated as an array
predictions = np.array(predictions)

In [None]:
import pandas as pd

# Confusion matrix: actual "digit is 0" labels (rows) against the endpoint's
# predicted labels (columns).
pd.crosstab(
    index=np.where(test_set[1] == 0, 1, 0),
    columns=predictions,
    rownames=["actuals"],
    colnames=["predictions"],
)

In [None]:
# Clean up: remove the model and then the endpoint behind linear_predictor.
# NOTE(review): the model is deleted first, presumably because the predictor
# resolves the model through the endpoint's configuration. Confirm before
# reordering these calls.
linear_predictor.delete_model()
linear_predictor.delete_endpoint()