In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

<h1>PCA Cloud Prediction Invocation Template</h1>
<h4>Invoke SageMaker Prediction Service</h4>

In [2]:
import boto3
import re
from sagemaker import get_execution_role
import sagemaker

In [3]:
# Acquire a realtime endpoint
endpoint_name = 'pca-biketrain-v1'
predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)

In [4]:
from sagemaker.predictor import csv_serializer, json_deserializer

predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = json_deserializer

In [5]:
# We are not going to use numeric features: 'temp','atemp','humidity','windspeed'
# Instead, we are going to use new components (aka features) generated by PCA for model training and testing
columns = ['count', 'season', 'holiday', 'workingday', 'weather','year', 'month', 'day', 'dayofweek','hour']

# PCA Training
colums_for_pca = ['temp','atemp','humidity','windspeed']

In [6]:
df = pd.read_csv('train_normalized.csv')
df_test = pd.read_csv('test_normalized.csv')

In [7]:
df.head(2)

Unnamed: 0,count,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2.833213,1,0,0,1,-1.333661,-1.092737,0.993213,-1.567754,2011,1,1,5,0
1,3.713572,1,0,0,1,-1.438907,-1.182421,0.941249,-1.567754,2011,1,1,5,1


In [8]:
df_test.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2011-01-20 00:00:00,1,0,1,1,-1.228414,-1.450292,-0.305883,1.617227,2011,1,20,3,0
1,2011-01-20 01:00:00,1,0,1,1,-1.228414,-1.182421,-0.305883,-1.567754,2011,1,20,3,1


In [9]:
df[colums_for_pca].head()

Unnamed: 0,temp,atemp,humidity,windspeed
0,-1.333661,-1.092737,0.993213,-1.567754
1,-1.438907,-1.182421,0.941249,-1.567754
2,-1.438907,-1.182421,0.941249,-1.567754
3,-1.333661,-1.092737,0.68143,-1.567754
4,-1.333661,-1.092737,0.68143,-1.567754


In [10]:
test = df[colums_for_pca].head().as_matrix()

  if __name__ == '__main__':


In [11]:
test

array([[-1.33366069, -1.09273697,  0.99321305, -1.56775367],
       [-1.43890721, -1.18242083,  0.94124921, -1.56775367],
       [-1.43890721, -1.18242083,  0.94124921, -1.56775367],
       [-1.33366069, -1.09273697,  0.68142998, -1.56775367],
       [-1.33366069, -1.09273697,  0.68142998, -1.56775367]])

In [13]:
result = predictor.predict(test)

In [14]:
result

{'projections': [{'projection': [-0.5232375860214233,
    -1.7736060619354248,
    -1.7270781993865967]},
  {'projection': [-0.5697786808013916,
    -1.7339260578155518,
    -1.8610637187957764]},
  {'projection': [-0.5697786808013916,
    -1.7339260578155518,
    -1.8610637187957764]},
  {'projection': [-0.7436795234680176,
    -1.554117202758789,
    -1.7062432765960693]},
  {'projection': [-0.7436795234680176,
    -1.554117202758789,
    -1.7062432765960693]}]}

In [15]:
l = [values['projection'] for values in result['projections']]

In [16]:
l

[[-0.5232375860214233, -1.7736060619354248, -1.7270781993865967],
 [-0.5697786808013916, -1.7339260578155518, -1.8610637187957764],
 [-0.5697786808013916, -1.7339260578155518, -1.8610637187957764],
 [-0.7436795234680176, -1.554117202758789, -1.7062432765960693],
 [-0.7436795234680176, -1.554117202758789, -1.7062432765960693]]

In [17]:
df_temp = pd.DataFrame(l)

In [18]:
df_temp

Unnamed: 0,0,1,2
0,-0.523238,-1.773606,-1.727078
1,-0.569779,-1.733926,-1.861064
2,-0.569779,-1.733926,-1.861064
3,-0.74368,-1.554117,-1.706243
4,-0.74368,-1.554117,-1.706243


In [19]:
# For large number of predictions, we can split the input data and
# Query the prediction service.
# array_split is convenient to specify how many splits are needed
def get_projection(arr_features):
    projections = []
    for arr in np.array_split(arr_features,100):        
        if arr.shape[0] > 0:
            print (arr.shape)
            result = predictor.predict(arr)
            projections += [values['projection'] for values in result['projections']]
    return projections
        

In [20]:
def replace_features(predictor, df, colums_for_pca):
    
    arr_features = df[colums_for_pca].as_matrix()
    
    projections = get_projection(arr_features)
    df_projection = pd.DataFrame(projections)
    
    tcols = []
    # New column names
    for i in range(df_projection.shape[1]):       
        tcols.append('component_' + str(i))
    
    df_projection.columns = tcols
    print ('components:',tcols)
    
    
    for col in df_projection.columns:
        df[col] = df_projection[col]
    
    df.drop(colums_for_pca, inplace=True, axis=1)
    
    return tcols

In [21]:
df.head(2)

Unnamed: 0,count,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2.833213,1,0,0,1,-1.333661,-1.092737,0.993213,-1.567754,2011,1,1,5,0
1,3.713572,1,0,0,1,-1.438907,-1.182421,0.941249,-1.567754,2011,1,1,5,1


In [22]:
new_cols = replace_features(predictor,df,colums_for_pca)

  app.launch_new_instance()


(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
components: ['component_0', 'component_1', 'component_2']


In [23]:
replace_features(predictor,df_test,colums_for_pca)

(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)


  app.launch_new_instance()


(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(64, 4)
(64, 4)
(64, 4)
(64, 4)
(64, 4)
(64, 4)
(64, 4)
components: ['component_0', 'component_1', 'component_2']


['component_0', 'component_1', 'component_2']

In [24]:
for col in new_cols:
    columns.append(col)

In [25]:
columns

['count',
 'season',
 'holiday',
 'workingday',
 'weather',
 'year',
 'month',
 'day',
 'dayofweek',
 'hour',
 'component_0',
 'component_1',
 'component_2']

In [26]:
## Training, Validation and Test Set
### Target Variable as first column followed by input features
### Training, Validation files do not have a column header

In [27]:
# Training = 70% of the data
# Validation = 30% of the data
# Randomize the datset
np.random.seed(5)
l = list(df.index)
np.random.shuffle(l)
df = df.iloc[l]

In [28]:
rows = df.shape[0]
train = int(.7 * rows)
test = int(.3 * rows)

In [29]:
rows, train, test

(10886, 7620, 3265)

In [30]:
# Write Training Set
df[:train].to_csv('bike_train_pca.csv'
                          ,index=False,header=False
                          ,columns=columns)

In [31]:
# Write Validation Set
df[train:].to_csv('bike_validation_pca.csv'
                          ,index=False,header=False
                          ,columns=columns)

In [32]:
# Test Data has only input features
df_test.to_csv('bike_test_pca.csv',index=False)

In [33]:
# Write Column List
with open('bike_train_column_list_pca.txt','w') as f:
    f.write(','.join(columns))