## Generate and Upload Housing Price Data

Run the cells below to generate daily housing price data pulled from the Boston housing price dataset and upload it to S3.

In [None]:
from sagemaker import get_execution_role, LinearLearnerPredictor
from datetime import date, datetime, timedelta
from sklearn.datasets import load_boston
import numpy as np
import boto3
import json
import csv
import os


def split_data_by_days(data, num_days):
    """Split ``data`` into exactly ``num_days`` contiguous chunks.

    Every row of ``data`` ends up in exactly one chunk and the original
    order is preserved. When ``len(data)`` is not a multiple of
    ``num_days`` the leftover rows are spread one apiece over the leading
    chunks, so no rows are dropped and chunk sizes differ by at most one.

    Args:
        data: A sized, sliceable sequence of rows (e.g. a list).
        num_days: Number of chunks to produce; must be at least 1.

    Returns:
        A list of ``num_days`` tuples of rows. Trailing chunks are empty
        tuples when ``data`` has fewer rows than ``num_days``.

    Raises:
        ValueError: If ``num_days`` is less than 1.
    """
    if num_days < 1:
        raise ValueError('num_days must be at least 1, got {}'.format(num_days))

    base_size, leftover = divmod(len(data), num_days)
    chunks = []
    start = 0
    for day in range(num_days):
        # The first `leftover` chunks take one extra row each so the
        # remainder is distributed instead of being discarded.
        stop = start + base_size + (1 if day < leftover else 0)
        chunks.append(tuple(data[start:stop]))
        start = stop
    return chunks

def write_to_csv(filename, data):
    """Write the rows in ``data`` to a comma-separated file.

    Args:
        filename: Destination path; ``.csv`` is appended when the path
            does not already end with it.
        data: Iterable of rows, each row an iterable of field values.
    """
    target = filename if filename.endswith('.csv') else '{}.csv'.format(filename)
    with open(target, 'w', newline='\n') as handle:
        # Fields are only quoted (with '|') when they contain a special
        # character such as the delimiter.
        csv.writer(
            handle,
            delimiter=',',
            quotechar='|',
            quoting=csv.QUOTE_MINIMAL,
        ).writerows(data)

In [None]:
num_days = 3 # number of days to split housing prices data into.
model_name = 'LinearLearner-HomePrices' # If you modified ModelPrefix in the CloudFormation template, change this to the value you set.

bucket = '<NAME OF YOUR BUCKET HERE>' # Set this to the name of the bucket created by the CloudFormation template. Can be found in the output of the template.
prefix = 'data/{}/train'.format(model_name)

role = get_execution_role()
region = boto3.Session().region_name

boston = load_boston()
target = boston.target
data = [np.ndarray.tolist(row) for row in boston.data[:, :]]

# Add target value as first column as expected by training algorithm
training_set = [[price] + features for price, features in zip(target, data)]

# Split data into separate datasets for each day
train_by_day = split_data_by_days(training_set, num_days)

# Resolve the bucket handle and today's date once, outside the loop: the
# handle is reusable, and a single date keeps the file names consistent
# even if the loop happens to run across midnight.
s3_bucket = boto3.resource('s3').Bucket(bucket)
today = date.today()

# Upload split datasets to S3, one file per day counting back from today
for day in range(num_days):
    current_date = today - timedelta(day)
    key = '{}.csv'.format(current_date)
    write_to_csv(key, train_by_day[day])
    # S3 object keys are '/'-separated regardless of the local OS, so build
    # the key with string formatting rather than os.path.join.
    object_key = '{}/{}'.format(prefix, key)
    s3_uri = 's3://{}/{}'.format(bucket, object_key)
    print('Uploading {} to {}'.format(key, s3_uri))
    s3_bucket.Object(object_key).upload_file(key)


## Generate JSON For Testing Execution

Run the cell below, then copy and paste the output JSON into the input of your Step Functions state machine in the Step Functions console to test the pipeline. This JSON is identical to the JSON that CloudWatch Events will send to Step Functions when triggering an execution of your state machine.

In [None]:
# Build a sample "Scheduled Event" payload stamped with the current UTC time
# and print it as JSON for pasting into the Step Functions console.
from datetime import timezone  # local to this cell; only needed for the timestamp

test_json = {
    "version": "0",
    "id": "89d1a02d-5ec7-412e-82f5-13505f849b41",
    "detail-type": "Scheduled Event",
    "source": "aws.events",
    "account": "123456789012",
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
    # datetime formats to the same string.
    "time": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    "region": "us-east-1",
    "resources": [
        "arn:aws:events:us-east-1:123456789012:rule/SampleRule",
    ],
    "detail": {},
}

print(json.dumps(test_json))

## Leverage Model Endpoint For Inference

Run the cells below to create a predictor object for the latest model deployed to the SageMaker endpoint and use it to perform inference on the model.

In [None]:
# Create the predictor object used by the inference cells below.
# model_name is passed as the predictor's only argument.
# NOTE(review): this presumably means the SageMaker endpoint is named after
# the model — confirm against the CloudFormation template.
predictor = LinearLearnerPredictor(model_name)

In [None]:
# Features of a single house whose price we want to infer, stored directly
# as a float32 numpy array of shape 1x13, since this is the format and
# shape the predictor expects
house1 = np.array(
    [0.04741, 0.0, 11.93, 0.0, 0.573, 6.03, 80.8, 2.505, 1.0, 273.0, 21.0, 396.9, 7.88],
    dtype='float32',
).reshape(1, 13)

# Predict medv house price
predictor.predict(house1)

In [None]:
# Batch inference: score several houses with a single request

# Second house, as a 1x13 float32 array like house1
house2 = np.array(
    [0.10959, 0.0, 11.93, 0.0, 0.573, 6.794, 89.3, 2.3889, 1.0, 273.0, 21.0, 393.45, 6.48],
    dtype='float32',
).reshape(1, 13)

# Stack both rows along axis 0 to get one 2x13 input
two_houses = np.concatenate((house1, house2), axis=0)

predictor.predict(two_houses)