<h1>Invoke SageMaker Prediction Service</h1>

In [20]:
## Standard Python Libraries
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

## SageMaker Libraries
import boto3
import re
from sagemaker import get_execution_role
import sagemaker

In [21]:
# Attach to an existing real-time endpoint by name; the endpoint must
# already be deployed, nothing is created here.
# NOTE: the name below is specific to one deployment; replace it with the
# name of your own endpoint before running.
# Disabled alternative endpoint name, kept for reference:
#endpoint_name = 'xgboost-biketrain-v1'
endpoint_name = 'sagemaker-xgboost-2020-07-06-12-05-58-891'
predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)

In [22]:
# Configure how request payloads are encoded and how responses are handled.
# (json_deserializer is imported but not used in this notebook.)
from sagemaker.predictor import csv_serializer, json_deserializer

# Requests are sent to the endpoint as CSV text.
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
# No deserializer is set, so predict() returns the raw response bytes;
# the cells that follow decode and parse them by hand.
predictor.deserializer = None

In [23]:
# Load the test set. header=None tells pandas the file has no header row,
# so the columns receive integer labels starting at 0.
df_all = pd.read_csv('bike_test.csv',header=None)

In [24]:
# Preview the first rows to confirm the file was parsed as expected.
df_all.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,20,3,0
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,1
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,2
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,3
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,4


In [25]:
# Every column label except the first one (column 0 holds the timestamp);
# these are the columns used as model input below.
df_all.columns[1:]

Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')

In [26]:
# The prediction call needs an array of feature rows; either a numpy array
# or a nested list of values such as [[19,1],[20,1]] is accepted.
# Column 0 (the timestamp) is excluded so that only the remaining columns
# are sent to the model.
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# selecting the columns and reading .values yields the same numpy array
# and works on both old and new pandas versions.
arr_test = df_all[df_all.columns[1:]].values

  app.launch_new_instance()


In [27]:
# Confirm the conversion produced a numpy array.
type(arr_test)

numpy.ndarray

In [28]:
# (number of rows, number of feature columns)
arr_test.shape

(6493, 13)

In [29]:
# Peek at the first five feature rows that will be sent to the endpoint.
arr_test[:5]

array([[1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.13650e+01, 5.60000e+01, 2.60027e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.36350e+01, 5.60000e+01, 0.00000e+00, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 1.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.36350e+01, 5.60000e+01, 0.00000e+00, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 2.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.28800e+01, 5.60000e+01, 1.10014e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 3.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.28800e+01, 5.60000e+01, 1.10014e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 4.00000e+00]])

In [30]:
# Smoke test: score only the first two rows before sending the full data set.
result = predictor.predict(arr_test[:2])

In [31]:
# The response is a raw bytes object (the predictor has no deserializer), so
# it must be decoded, e.g. result.decode("utf-8"), before the comma-separated
# values can be parsed into numbers.
result

b'2.1686649322509766,1.5279628038406372'

In [32]:
# Size of the full input that is about to be scored in chunks.
arr_test.shape

(6493, 13)

In [33]:
# For a large number of predictions, split the input into chunks and query
# the prediction service once per chunk instead of sending everything in a
# single request.
# np.array_split takes the number of chunks wanted and tolerates a row count
# that does not divide evenly.
predictions = []
for arr in np.array_split(arr_test,10):
    if len(arr) == 0:
        # array_split yields empty chunks when there are fewer rows than
        # chunks; there is nothing to score for those.
        continue
    # No deserializer is configured on the predictor, so the response
    # arrives as raw bytes and has to be decoded before parsing.
    response_text = predictor.predict(arr).decode("utf-8")
    # Values are expected to be comma-separated. Newlines are treated as
    # separators too and blank tokens are dropped, so a newline-delimited
    # response parses as well.
    # NOTE(review): whether this endpoint ever returns newline-delimited
    # output is not confirmed; the handling is purely defensive.
    chunk_predictions = [
        float(token)
        for token in response_text.replace('\n', ',').split(',')
        if token.strip()
    ]
    # Fail fast when the service did not return exactly one prediction per
    # row; otherwise predictions would silently misalign with the input rows.
    if len(chunk_predictions) != len(arr):
        raise ValueError(
            'Expected {} predictions for this chunk but received {}'.format(
                len(arr), len(chunk_predictions)))
    print(arr.shape)
    predictions += chunk_predictions

(650, 13)
(650, 13)
(650, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)


In [34]:
# Sanity check: this should equal the number of input rows (arr_test.shape[0]).
len(predictions)

6493

In [35]:
# Apply expm1 (exp(x) - 1, the inverse of log1p) to the raw predictions.
# NOTE(review): presumably the model was trained on log1p(count), which is
# why the output is back-transformed here; confirm against the training code.
np.expm1(predictions)

array([  7.74659894,   3.60877827,   2.69729228, ..., 118.54129091,
        90.78625745,  47.57346446])

In [36]:
# Store the back-transformed predictions as a new 'count' column next to the
# input rows. This relies on predictions being in the same order as df_all.
df_all['count'] = np.expm1(predictions)

In [37]:
# Preview the frame again to verify the new 'count' column.
df_all.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,count
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,20,3,0,7.746599
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,1,3.608778
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,2,2.697292
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,3,2.100575
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,4,1.786934


In [38]:
# Give the timestamp column (integer label 0, because the file was read
# without a header) a proper name for the output file.
df_all = df_all.rename(columns={0: 'datetime'})
# Write only the timestamp and the predicted count; index=False keeps the
# DataFrame row index out of the CSV.
df_all[['datetime','count']].to_csv('predicted_count_cloud.csv',index=False)