In [1]:
install.packages("properties")
library(properties)

envProps <- read.properties("../env.properties")

Sys.setenv(
    "AWS_ACCESS_KEY_ID" = envProps$access_key, 
    "AWS_SECRET_ACCESS_KEY" = envProps$secret_key,
    "AWS_DEFAULT_REGION" = 'us-east-1')


Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

“incomplete final line found on '../env.properties'”


Clear the workspace

In [2]:
rm(list = ls())

## SageMaker Processing

## Reinstall numpy to workaround conflict between reticulate and RStudio v4.x due to BLAS library dependency
## The reinstall process takes 5 minutes.
## https://github.com/rstudio/reticulate/issues/1257

In [None]:
# %%bash
# python_path=$(which python)
# echo $python_path
# sudo --set-home $python_path -m pip install --no-user --force-reinstall --no-binary numpy numpy

In [10]:
library(tidyverse)

data_file <- 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
abalone <- read_csv(file = data_file, col_names = FALSE)
names(abalone) <- c('sex', 'length', 'diameter', 'height', 'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight', 'rings')
head(abalone)

“‘timedatectl’ indicates the non-existent timezone name ‘n/a’”
“Your system is mis-configured: ‘/etc/localtime’ is not a symlink”
“‘/etc/localtime’ is not identical to any known timezone file”
── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all con

sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


## Imports

In [3]:
suppressWarnings(library(reticulate))
path_to_python <- system("which python", intern = TRUE)
use_python(path_to_python)
sagemaker <- import('sagemaker')

In [5]:
role = sagemaker$get_execution_role()
session = sagemaker$Session()
s3_bucket_name = "my_bucket"
s3_prefix = "use-case/data"

account_id <- session$account_id()
region <- session$boto_region_name

algo_ecr_name_tag = "sagemaker-prebuilt-image-xyz:latest"

In [6]:
container_uri <- paste(account_id, "dkr.ecr", region, "amazonaws.com/", algo_ecr_name_tag, sep=".")
print(container_uri)

[1] "482851446821.dkr.ecr.us-east-1.amazonaws.com/.sagemaker-prebuilt-image-xyz:latest"


In [9]:
row_s3_uri = "<TO FILL IN>"
output_uri <- paste("s3://", s3_bucket_name,"/", s3_prefix, "/output",  sep="")
print(output_uri)

[1] "s3://my_bucket/use-case/data/output"


In [None]:
processor <- sagemaker$processing$ScriptProcessor(image_uri = container_uri,
                                                   command=list("Rscript"),
                                                   role = role,
                                                   instance_count=1L,
                                                   instance_type="ml.c5.xlarge")
processor$run(
    code="./preprocessing.R",
    job_name=paste("r-processing", as.integer(as.numeric(Sys.time())), sep="-"),
    inputs=[ProcessingInput(source=raw_s3, destination="/opt/ml/processing/input")],
        outputs=[
            ProcessingOutput(
                output_name="csv",
                destination="{}/csv".format(output_uri),
                source="/opt/ml/processing/output"
        ],
)