In [None]:
library(reticulate) 
sagemaker <- import('sagemaker')

In [None]:
# Open a SageMaker session; later cells use it for S3 uploads and cleanup.
session <- sagemaker$Session()

# Execution role that is handed to the training job further down.
role_arn <- sagemaker$get_execution_role()

# Default bucket for the session. Per the original note it is created with
# the name format sagemaker-<aws-region-name>-<aws account number>.
bucket <- session$default_bucket()

In [None]:
# Load the raw training and test data.
# Data source:
# http://web.archive.org/web/20161224072740/http:/groupware.les.inf.puc-rio.br/har
train_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"

rawTrainData <- read.csv(file = url(train_url))
rawTestData <- read.csv(file = url(test_url))

# Rows x columns of each raw set.
dim(rawTrainData)
dim(rawTestData)

In [None]:
# Per-column summary of the raw training data, for a first look at the columns.
summary(rawTrainData)

In [None]:
library(readr)

# Keep an unmodified copy of both raw sets in S3 under the 'data' prefix.
# The files are written without a header row.
write_csv(rawTrainData, 'pml-training.csv', col_names = FALSE)
session$upload_data(
  path = 'pml-training.csv',
  bucket = bucket,
  key_prefix = 'data'
)

write_csv(rawTestData, 'pml-testing.csv', col_names = FALSE)
session$upload_data(
  path = 'pml-testing.csv',
  bucket = bucket,
  key_prefix = 'data'
)

In [None]:
# DATA CLEANSING
library(caret)

# 1 - Check the variance of the columns.
# apply() on a data frame first coerces it to a matrix; with any non-numeric
# column (e.g. user_name, classe) that matrix is character, so var() cannot
# produce real variances. Restrict the check to the numeric columns instead.
vapply(Filter(is.numeric, rawTrainData), var, numeric(1), na.rm = TRUE)

# Remove columns with near-zero variance. The column positions are computed on
# the training data and applied to both sets so their columns stay aligned.
nearZeroVarCols <- nearZeroVar(rawTrainData)
if (length(nearZeroVarCols) > 0) {
  training <- rawTrainData[, -nearZeroVarCols]
  testing <- rawTestData[, -nearZeroVarCols]
} else {
  # Negative indexing with an empty vector would select zero columns,
  # so keep everything when nothing was flagged.
  training <- rawTrainData
  testing <- rawTestData
}
dim(training)
dim(testing)

In [None]:
# 2 - Keep only the columns that contain no NA values.
# Each set is filtered on its own NA pattern.
training <- training[, vapply(training, function(col) !anyNA(col), logical(1))]
testing <- testing[, vapply(testing, function(col) !anyNA(col), logical(1))]

dim(training)
dim(testing)
head(training)

In [None]:
# 3 - Drop the first 5 columns. The original note names user_name,
# raw_timestamp_part_1, raw_timestamp_part_2 and cvtd_timestamp; the fifth is
# presumably a row-index column - confirm against the head() output above.
training <- training[, -seq_len(5)]
testing <- testing[, -seq_len(5)]

dim(training)
dim(testing)
head(training)

In [None]:
# classe is the prediction target. Move it to the first column (SageMaker
# requires the label first) and map A, B, C, D, E to 0, 1, 2, 3, 4.
library(tidyverse)
training1 <- relocate(training, classe)

# Before / after the column move.
head(training)
head(training1)

# Recode the letters to numbers, then store the result as a factor.
training1$classe <- as.factor(
  recode(training1$classe, "A" = 0, "B" = 1, "C" = 2, "D" = 3, "E" = 4)
)
head(training1)

In [None]:
# Create training and validation sets (70% / 30%, split on classe).
# Fix the RNG seed so the split, and every metric computed from it later,
# is the same on every run.
set.seed(12345)
subSamples <- createDataPartition(y = training1$classe, p = 0.70, list = FALSE)
subTraining <- training1[subSamples, ]
# Held-out set: kept aside and not used to build the model.
subValidation <- training1[-subSamples, ]

In [None]:
# Per-column summary of the 70% training partition.
summary(subTraining)

In [None]:
# Save subTraining WITH its header row, so the column names are available
# alongside the headerless files written for SageMaker.
write_csv(subTraining, 'col_headers.csv', col_names = TRUE) 

In [None]:
# Preview the first rows of the training partition.
head(subTraining)


In [None]:
# Only subTraining is used to train the model: split it again (70% / 30%)
# into the set the algorithm fits on and the set it validates against.
# Fix the RNG seed so this split is reproducible.
set.seed(54321)
subSamples1 <- createDataPartition(y = subTraining$classe, p = 0.70, list = FALSE)
modTraining <- subTraining[subSamples1, ]
modValidation <- subTraining[-subSamples1, ]

dim(modTraining)
dim(modValidation)

In [None]:
# Write each split without a header row, push it to S3 under the 'data'
# prefix, and wrap the returned S3 location as a CSV training input.

# Training channel.
write_csv(modTraining, 'clean_train1.csv', col_names = FALSE)
s3_train <- session$upload_data(
  path = 'clean_train1.csv',
  bucket = bucket,
  key_prefix = 'data'
)
s3_train_input <- sagemaker$inputs$TrainingInput(
  s3_data = s3_train,
  content_type = 'csv'
)

# Validation channel.
write_csv(modValidation, 'clean_valid1.csv', col_names = FALSE)
s3_valid <- session$upload_data(
  path = 'clean_valid1.csv',
  bucket = bucket,
  key_prefix = 'data'
)
s3_valid_input <- sagemaker$inputs$TrainingInput(
  s3_data = s3_valid,
  content_type = 'csv'
)

In [None]:
##TRAIN AND DEPLOY A MODEL USING XGBoost. XGBoost is available as one of the core Sagemaker Algorithms

In [None]:
# Retrieve the XGBoost container image URI for the session's region.
# A specific version is pinned (instead of 'latest') because minor differences
# between versions can break code.
# image_uris$retrieve() is the SDK v2 replacement for the deprecated
# amazon_estimator$get_image_uri(), matching the other v2 APIs used in this
# notebook (image_uri, inputs$TrainingInput, serializers).
xgboost_container <- sagemaker$image_uris$retrieve(
  framework = "xgboost",
  region = session$boto_session$region_name,
  version = "1.2-1"
)

xgboost_container

In [None]:
# Model artifacts are written under <bucket>/output.
s3_output <- paste0("s3://", bucket, "/output")

# Configure the training job. The instance/volume/runtime arguments use the
# SDK v2 names (instance_count, instance_type, volume_size, max_run); the
# train_* spellings are deprecated v1 names and were inconsistent with the
# v2-only image_uri argument already used here.
estimator <- sagemaker$estimator$Estimator(
  image_uri = xgboost_container,
  role = role_arn,
  instance_count = 1L,
  instance_type = "ml.m5.large",
  volume_size = 30L,
  max_run = 3600L,
  input_mode = "File",
  output_path = s3_output,
  output_kms_key = NULL,
  base_job_name = NULL,
  sagemaker_session = NULL
)

In [None]:
# Hyperparameters, taken from the model previously trained with the caret
# package. This implicitly assumes SageMaker's XGBoost behaves like caret's.
# One difference noted: the SageMaker algorithm does not appear to support
# k-fold cross validation (the caret run used 5-fold).
estimator$set_hyperparameters(
  max_depth = 6L,
  eta = 0.05,
  gamma = 0.01,
  min_child_weight = 0.5,
  subsample = 0.5,
  # Multiclass problem; num_class is required for multi:softmax.
  objective = "multi:softmax",
  num_class = 5L,
  num_round = 100L,
  colsample_bytree = 1L
)

In [None]:
# Display the hyperparameters currently set on the estimator.
estimator$hyperparameters()

In [None]:
#FIT THE MODEL

In [None]:
# Training job names must be unique, so stamp the name with the full date and
# time. A time-of-day-only stamp can collide with a job from another day.
job_name <- paste(
  "sagemaker-train-xgboost",
  format(Sys.time(), "%Y-%m-%d-%H-%M-%S"),
  sep = "-"
)

# Channels passed to the training job.
input_data <- list(
  "train" = s3_train_input,
  "validation" = s3_valid_input
)

In [None]:
# Launch the training job on the train/validation channels.
estimator$fit(
  inputs = input_data,
  job_name = job_name
)

In [None]:
# Inspect the fitted estimator: model_data (presumably the location of the
# trained model artifact - confirm), then the estimator object itself.
estimator$model_data
estimator

In [None]:
#DEPLOY

In [None]:
# Deploy the trained model behind a single-instance endpoint. Requests are
# serialized as CSV (text/csv).
serializer <- sagemaker$serializers$CSVSerializer(content_type = 'text/csv')
model_endpoint <- estimator$deploy(
  initial_instance_count = 1L,
  instance_type = 'ml.t2.medium',
  serializer = serializer
)

In [None]:
# Shape and first rows of the cleaned test set.
dim(testing)
head(testing)


In [None]:
# Write the cleaned test set without a header row and upload it to S3 under
# the 'data' prefix.
write_csv(testing, 'test_data.csv', col_names = FALSE)
s3_test <- session$upload_data(
  path = 'test_data.csv',
  bucket = bucket,
  key_prefix = 'data'
)

# Wrap the uploaded file as a CSV input and display it.
s3_test_input <- sagemaker$inputs$TrainingInput(
  s3_data = s3_test,
  content_type = 'csv'
)
s3_test_input

In [None]:
# Check how the endpoint will encode requests: content type and serializer.
model_endpoint$content_type 
model_endpoint$serializer 


In [None]:
# Drop column 54 before scoring so only feature columns are sent.
# NOTE(review): the index is hard-coded; presumably column 54 is the test
# set's non-feature id column - confirm against head(testing).
testing1 <- testing[, -54]
head(testing1)

# Score the first 20 test rows through the endpoint.
test_sample <- as.matrix(testing1[seq_len(20), ])
predictions <- model_endpoint$predict(test_sample)
predictions

In [None]:
# The endpoint response is comma-separated; split it and convert the pieces
# to numbers.
predictions1 <- as.numeric(
  str_split(predictions, pattern = ',', simplify = TRUE)
)
predictions1

In [None]:
# Put the predicted class (as an integer) in front of the 20 scored rows.
output1 <- cbind(
  predicted_classe = as.integer(predictions1),
  testing1[seq_len(20), ]
)
head(output1)

In [None]:
# Shape and first rows of the held-out validation set.
dim(subValidation)
head(subValidation)

In [None]:
# Drop the label (first column) so only the features are sent to the endpoint.
val1 <- subValidation[, -1]
head(val1)

# Score every held-out row. The row count is taken from the data instead of
# being hard-coded, so a different partition size cannot introduce NA rows
# or silently skip rows.
valset <- as.matrix(val1)
predictions2 <- model_endpoint$predict(valset)

In [None]:
# The endpoint response is comma-separated; split it and convert to numbers.
predictions2 <- str_split(predictions2, pattern = ',', simplify = TRUE)
predictions2 <- as.numeric(predictions2)

# There must be exactly one prediction per held-out row; fail loudly rather
# than pairing predictions with the wrong rows.
stopifnot(length(predictions2) == nrow(subValidation))

# Put the predicted class (as an integer) in front of the validation rows.
output2 <- cbind(predicted_classe = as.integer(predictions2), subValidation)
head(output2)

In [None]:
# Confusion matrix (rows: predicted, columns: actual), accuracy and
# out-of-sample error on the held-out validation set.
# Both sides are given the full 0-4 label set so the table is always 5 x 5;
# otherwise a class that is never predicted would make the table non-square
# and diag() would no longer line up predicted with actual.
confMat_Val <- table(
  factor(output2$predicted_classe, levels = 0:4),
  factor(output2$classe, levels = 0:4)
)
accuracy_Val <- sum(diag(confMat_Val)) / sum(confMat_Val)
oose_Val <- 1 - accuracy_Val

confMat_Val
accuracy_Val
oose_Val

In [None]:
#FINALLY DELETE THE ENDPOINT

In [None]:
# Delete the endpoint created by deploy() above.
# endpoint_name is the SDK v2 attribute; `endpoint` is its deprecated v1 name.
session$delete_endpoint(model_endpoint$endpoint_name)