In [None]:
# S3 bucket and key prefix under which this notebook stores its training data,
# model artifacts, test CSV and batch predictions.
bucket = 'weteh-sagemaker-examples'
prefix = 'sagemaker/DEMO-linear-mnist'
 
# Define IAM role
import boto3
from sagemaker import get_execution_role

# The role is pinned to an explicit ARN; get_execution_role is imported above but
# is not called in this cell.
# NOTE(review): this ARN is account-specific, so it must be edited before running
# in another account. Presumably get_execution_role() could replace it when the
# notebook runs inside SageMaker -- confirm before switching.
role = "arn:aws:iam::869530972998:role/SagemakerAdmin"

In [None]:
import pickle, gzip, numpy, urllib.request, json

# Load the MNIST dataset, published by deeplearning.net as a gzip-compressed pickle.
# https://docs.python.org/3/library/pickle.html
# pickle is Python's serialization module: it converts Python data structures to a
# binary form and back.
import os

MNIST_URL = "http://deeplearning.net/data/mnist/mnist.pkl.gz"
MNIST_FILE = "mnist.pkl.gz"

# Download only when the archive is not already on disk, so re-running this cell
# (or Restart & Run All) does not fetch it again. The transfer goes to a temporary
# name and is renamed on success, so an interrupted download never leaves a
# truncated archive that would later be mistaken for a complete one.
if not os.path.exists(MNIST_FILE):
    partial_download = MNIST_FILE + ".part"
    urllib.request.urlretrieve(MNIST_URL, partial_download)
    os.replace(partial_download, MNIST_FILE)

# gzip decompresses the archive and pickle deserializes its content.
# The output has the following format and structure:
# train_set: 50000 examples
# test_set: 10000 examples
# validation set: 10000 examples
# data structure: tuple (features, labels)
# features is an ndarray with 784 values per example (28 x 28, single channel).
#
# SECURITY: pickle.load can execute arbitrary code, and this file is fetched over
# plain HTTP. Only run this cell against a source you trust.
with gzip.open(MNIST_FILE, 'rb') as f:
    train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
    


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

def show_digit(img, caption='', subplot=None):
    """Draw one flattened 28x28 image in grayscale with a caption.

    Args:
        img: array with 784 values; it is reshaped to (28, 28) before drawing.
        caption: title text shown above the image.
        subplot: matplotlib Axes to draw on. When None, a new single-axes
            figure is created.
    """
    if subplot is None:
        # subplots returns (Figure, Axes); only the Axes is needed here.
        _, subplot = plt.subplots(1, 1)
    subplot.imshow(img.reshape((28, 28)), cmap='gray')
    # Title the axes that was actually drawn on. plt.title() would title whichever
    # axes is "current", which is wrong when the caller passes its own subplot.
    subplot.set_title(caption)
    


In [None]:
import random
# Pick a random training example and display it with its label as the caption.
img_idx = random.randrange(len(train_set[0]))
sample_pixels = train_set[0][img_idx]
sample_label = train_set[1][img_idx]
show_digit(sample_pixels, 'this is a {}'.format(sample_label))

In [None]:
import io
import numpy as np
import sagemaker.amazon.common as smac
import boto3
import os

key = 'recordio-pb-data'

# We need to convert the original pickled arrays into recordIO so that SageMaker can
# process the dataset at training time.
def write_recordio(data, data_type = 'train'):
    """Serialize one dataset split to recordIO and upload it to S3.

    The object is written to s3://<bucket>/<prefix>/<data_type>/<key>, using the
    module-level ``bucket``, ``prefix`` and ``key``.

    Args:
        data: (features, labels) pair. features holds one row of pixel values
            per example; labels holds the digit for each example.
        data_type: name of the split; used as the sub-folder under ``prefix``.
    """
    # A direct dtype conversion is sufficient; a per-row .tolist() round trip
    # produces the same float32 values and is not needed.
    vectors = np.asarray(data[0], dtype='float32')

    # In this example we only train a binary classifier:
    # the label is 1 when the digit is 0, and 0 for every other digit.
    labels = np.where(np.asarray(data[1]) == 0, 1, 0).astype('float32')

    buf = io.BytesIO()
    smac.write_numpy_to_dense_tensor(buf, vectors, labels)
    buf.seek(0)

    # S3 keys always use '/' as separator, so build the key explicitly instead of
    # with os.path.join (which yields backslashes on Windows). The printed URI is
    # derived from the same key so the two can never disagree.
    object_key = '{}/{}/{}'.format(prefix, data_type, key)
    # for 50k records, the data size is around 151MB
    boto3.resource('s3').Bucket(bucket).Object(object_key).upload_fileobj(buf)
    s3_data = 's3://{}/{}'.format(bucket, object_key)
    print('uploaded {} data location: {}'.format(data_type, s3_data))
    

In [None]:
# Upload the training split first, then the validation split, each under its own
# channel folder.
for split_name, split_data in (('train', train_set), ('validation', valid_set)):
    write_recordio(split_data, data_type=split_name)

In [None]:
# S3 URI handed to the estimator as its output path: <bucket>/<prefix>/output.
output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_location))

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the linear-learner training image for the region of the default boto3
# session. Example of a resolved URI (us-east-1):
# '382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:1'
aws_region = boto3.Session().region_name
container = get_image_uri(aws_region, 'linear-learner')

In [None]:
# setup sagemaker session
import boto3
import sagemaker

sess = sagemaker.Session()

# Estimator settings gathered in one place: a single ml.c4.xlarge training
# instance, running under `role`, writing artifacts to `output_location`.
# https://sagemaker.readthedocs.io/en/stable/estimators.html
estimator_settings = dict(
    train_instance_type='ml.c4.xlarge',
    role=role,
    train_instance_count=1,
    sagemaker_session=sess,
    output_path=output_location,
)
model = sagemaker.estimator.Estimator(container, **estimator_settings)


In [None]:
# Hyperparameters for the linear learner; feature_dim is the number of values in
# one training example.
# https://docs.aws.amazon.com/sagemaker/latest/dg/ll_hyperparameters.html
linear_learner_hyperparameters = {
    'feature_dim': len(train_set[0][0]),
    'predictor_type': 'binary_classifier',
    'mini_batch_size': 200,
}
model.set_hyperparameters(**linear_learner_hyperparameters)

In [None]:
# Map each training channel name to the S3 object uploaded for that split
# (s3://<bucket>/<prefix>/<channel>/<key>).
channels = {
    channel_name: 's3://{}/{}/{}/{}'.format(bucket, prefix, channel_name, key)
    for channel_name in ('train', 'validation')
}


In [None]:
# Launch the training job, reading the 'train' and 'validation' channels from the
# S3 locations listed in `channels`.
model.fit(channels)

In [None]:
# Write the test features to a local CSV file (one example per line, comma
# separated) to use as the batch transform input.
import numpy

test_csv_filename = 'test.csv'
test_vectors = numpy.array(test_set[0], dtype='float32')
numpy.savetxt(test_csv_filename, test_vectors, delimiter=",")


In [None]:
# Upload the local CSV under <prefix>/test; upload_data returns the value that is
# later passed to the transformer as its input location.
test_key_prefix = '{}/test'.format(prefix)
test_s3 = sess.upload_data(test_csv_filename, bucket=bucket, key_prefix=test_key_prefix)

In [None]:
# Create a batch transformer from the trained model. Predictions are written
# under <prefix>/predictions in the same bucket.
predictions_location = 's3://{}/{}/predictions'.format(bucket, prefix)
model_transformer = model.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    strategy='MultiRecord',
    assemble_with='Line',
    output_path=predictions_location,
)


In [None]:
# Start the batch transform over the uploaded CSV: the input is sent as text/csv
# and split on line boundaries (split_type='Line').
model_transformer.transform(test_s3, content_type='text/csv', split_type='Line')
# Wait for the transform job before moving on; the next cell reads its output
# object from S3.
model_transformer.wait()

In [None]:
import io
import pandas as pd
# Fetch the transform output (<prefix>/predictions/<input file name>.out) and
# parse it as line-delimited JSON into a DataFrame.
predictions_key = '{}/predictions/{}.out'.format(prefix, test_csv_filename)
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucket, Key=predictions_key)
raw_predictions = obj['Body'].read()
df = pd.read_json(io.BytesIO(raw_predictions), lines=True)

In [None]:
import json
# Compare predicted labels with the binary ground truth (1 when the digit is 0,
# otherwise 0) in a cross-tabulation.
predictions = df['predicted_label'].to_numpy()
is_zero_digit = test_set[1] == 0
true_test_labels = np.where(is_zero_digit, 1, 0)
pd.crosstab(true_test_labels, predictions, rownames=['actuals'], colnames=['predictions'])

In [None]:
# pandas_ml provides the ConfusionMatrix class imported in the next cell.
# NOTE(review): this install is unpinned, and `!pip` may target a different
# environment than the running kernel; consider `%pip install pandas_ml==<version>`
# with a known-good version.
!pip install pandas_ml

In [None]:
from pandas_ml import ConfusionMatrix
# Build a confusion matrix from the binary ground truth (1 = digit zero) and the
# labels predicted by the batch transform.
confusion_matrix = ConfusionMatrix(true_test_labels, predictions)

In [None]:
# Print the text rendering of the confusion matrix.
print("Confusion matrix:\n%s" % confusion_matrix)

In [None]:
# Plot the confusion matrix with the default settings.
# Alternative kept for reference: confusion_matrix.plot(normalized=True)
confusion_matrix.plot()
plt.show()