## SageMaker Processing Deep Dive

In [None]:
import sagemaker
import boto3
from sagemaker import get_execution_role
from sagemaker.s3 import S3Uploader, S3Downloader

In [None]:
# AWS session and SageMaker client for this notebook.
sess = boto3.Session()
sm = sess.client('sagemaker')

# Execution role handed to the processors created further down (role=role).
role = sagemaker.get_execution_role()

# Region configured on the boto3 session; it is used below to build the
# S3 bucket name and the ECR image URI.
region_name = sess.region_name

In [None]:
# Ask STS who the caller is and keep the account ID from the response.
sts_client = sess.client('sts', region_name=region_name)
caller_identity = sts_client.get_caller_identity()
account_id = caller_identity["Account"]

In [None]:
# S3 URL of the raw CSV; the bucket name embeds the region and the account ID.
input_data_s3_url = (
    "s3://sagemaker-{}-{}/sagemaker/DEMO-xgboost-churn/data/RawData.csv"
    .format(region_name, account_id)
)

In [None]:
# Show the resolved S3 URL of the raw input data.
print(input_data_s3_url)

### Using the Scikit-learn Processor

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor
# Processor backed by the scikit-learn container, running on a single
# ml.m5.xlarge instance under the notebook's execution role.
sklearn_processor = SKLearnProcessor(
    framework_version='0.23-1',
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
)

In [None]:
%%time
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Run preprocessing.py as a processing job. The raw-data S3 URL is passed to
# the script as the --input-data argument, and three named outputs are
# declared (train / validation / test).
sklearn_processor.run(
    code='preprocessing.py',
    outputs=[
        ProcessingOutput(output_name='train',
                         source='/opt/ml/processing/train'),
        ProcessingOutput(output_name='validation',
                         source='/opt/ml/processing/validation'),
        ProcessingOutput(output_name='test',
                         source='/opt/ml/processing/test'),
    ],
    arguments=['--input-data', input_data_s3_url],
)

preprocessing_job_description = sklearn_processor.jobs[-1].describe()

# Index the S3 destinations by output name. A direct lookup raises KeyError
# right here if an expected output is missing, instead of leaving a variable
# unset (or holding a stale value from an earlier run of this cell).
output_config = preprocessing_job_description['ProcessingOutputConfig']
output_s3_uris = {
    output['OutputName']: output['S3Output']['S3Uri']
    for output in output_config['Outputs']
}
preprocessed_training_data = output_s3_uris['train']
preprocessed_validation_data = output_s3_uris['validation']
preprocessed_test_data = output_s3_uris['test']


In [None]:
# Print the S3 URI of each preprocessed split, one per line.
for split_uri in (
    preprocessed_training_data,
    preprocessed_validation_data,
    preprocessed_test_data,
):
    print(split_uri)

### Using a Build-Your-Own Processing Container

In [None]:
# ECR URI of the custom processing image, built from the account ID and region.
image_uri = (
    "{}.dkr.ecr.{}.amazonaws.com/smstudio-custom:customer-churn-sm-processing"
    .format(account_id, region_name)
)

In [None]:
# Show the resolved ECR image URI for the custom processing container.
print(image_uri)

In [None]:
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput

# Generic processor that runs the custom container image on a single
# ml.m5.xlarge instance under the notebook's execution role.
custom_processor = Processor(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
)

# No `code` argument is passed here, unlike the SKLearnProcessor run: only the
# --input-data argument and the three named outputs are supplied.
custom_processor.run(
    outputs=[
        ProcessingOutput(output_name='train',
                         source='/opt/ml/processing/train'),
        ProcessingOutput(output_name='validation',
                         source='/opt/ml/processing/validation'),
        ProcessingOutput(output_name='test',
                         source='/opt/ml/processing/test'),
    ],
    arguments=['--input-data', input_data_s3_url],
)

preprocessing_job_description = custom_processor.jobs[-1].describe()

# Index the S3 destinations by output name. These variables were already set
# by the scikit-learn job earlier in the notebook, so a direct lookup is used:
# a missing output raises KeyError instead of silently keeping the previous
# job's URI.
output_config = preprocessing_job_description['ProcessingOutputConfig']
output_s3_uris = {
    output['OutputName']: output['S3Output']['S3Uri']
    for output in output_config['Outputs']
}
preprocessed_training_data = output_s3_uris['train']
preprocessed_validation_data = output_s3_uris['validation']
preprocessed_test_data = output_s3_uris['test']

In [None]:
# Print the S3 URI of each split produced by the custom-container job.
for split_uri in (
    preprocessed_training_data,
    preprocessed_validation_data,
    preprocessed_test_data,
):
    print(split_uri)