In [1]:
import sagemaker
import os
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker import get_execution_role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
role = get_execution_role()
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name

In [3]:
model_img = sagemaker.image_uris.retrieve("sklearn", region, "1.2-1") # This function returns path to latest docker image
print(model_img)

141502667606.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3


In [6]:
os.makedirs("02_input_output", exist_ok=True) # Create folder for training code

In [8]:
%%writefile 02_input_output/sample_data.csv
a,b,c
2,3,4
5,6,7
8,9,10

Overwriting 02_input_output/sample_data.csv


In [23]:
%%writefile 02_input_output/process_data.py
from __future__ import print_function
import pandas as pd
import os

if __name__ == '__main__':

    for key, value in os.environ.items():
        print(f"{key}: {value}")

    input_path = "/opt/ml/processing/input/myinput/"
    output_path = '/opt/ml/processing/output/'

    input_file_path = os.path.join(input_path, "sample_data.csv")
    output_file_path = os.path.join(output_path, "output.csv")
    
    # Read the CSV file
    df = pd.read_csv(input_file_path)
    
    # Calculate the sum of all columns
    column_sums = df.sum()
    
    # Store the sums in a text file
    with open(output_file_path, 'w') as f:
        for column, sum_value in column_sums.items():
            f.write(f'{column}: {sum_value}\n')

Overwriting 02_input_output/process_data.py


In [24]:
!aws s3 cp 02_input_output/process_data.py s3://sagemaker-bucket-ds/PROCESSING/02_CODE/
!aws s3 cp 02_input_output/sample_data.csv s3://sagemaker-bucket-ds/PROCESSING/02_INPUT/

upload: 02_input_output/process_data.py to s3://sagemaker-bucket-ds/PROCESSING/02_CODE/process_data.py
upload: 02_input_output/sample_data.csv to s3://sagemaker-bucket-ds/PROCESSING/02_INPUT/sample_data.csv


In [25]:
# Define the ScriptProcessor
script_processor = ScriptProcessor(
    role=role,
    image_uri=model_img,
    command=['python3'],
    instance_count=1,
    instance_type='ml.t3.medium',
    volume_size_in_gb=5,
    max_runtime_in_seconds=3600,
    base_job_name='02-processing-job',
    sagemaker_session=sagemaker_session
)

In [26]:
# Define the Processing Inputs and Outputs
processing_inputs = [

    ProcessingInput(
        source='s3://sagemaker-bucket-ds/PROCESSING/02_INPUT/',
        destination='/opt/ml/processing/input/myinput/',
        input_name='INPUT1'
    )
]

processing_outputs = [
    ProcessingOutput(
        source='/opt/ml/processing/output/',
        destination='s3://sagemaker-bucket-ds/PROCESSING/02_OUTPUT/',
        output_name='OUTPUT1'
    )
]

In [27]:
# Run the Processing Job
script_processor.run(
    code='s3://sagemaker-bucket-ds/PROCESSING/02_CODE/process_data.py',
    inputs=processing_inputs,
    outputs=processing_outputs
)

INFO:sagemaker:Creating processing-job with name 02-processing-job-2024-07-10-14-58-23-149


............................................................................................[34mPATH: /miniconda3/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin[0m
[34mHOSTNAME: ip-10-0-244-125.eu-west-1.compute.internal[0m
[34mAWS_REGION: eu-west-1[0m
[34mAWS_CONTAINER_CREDENTIALS_RELATIVE_URI: /v2/credentials/U-A3py6Mt4msUOE8E4cvZP9zuO1MgFPsfLnCYgLZLM4[0m
[34mDEBIAN_FRONTEND: noninteractive[0m
[34mPIP_ROOT_USER_ACTION: ignore[0m
[34mPYTHONDONTWRITEBYTECODE: 1[0m
[34mPYTHONUNBUFFERED: 1[0m
[34mPYTHONIOENCODING: UTF-8[0m
[34mLANG: C.UTF-8[0m
[34mLC_ALL: C.UTF-8[0m
[34mSAGEMAKER_SKLEARN_VERSION: 1.2-1[0m
[34mSAGEMAKER_TRAINING_MODULE: sagemaker_sklearn_container.training:main[0m
[34mSAGEMAKER_SERVING_MODULE: sagemaker_sklearn_container.serving:main[0m
[34mSKLEARN_MMS_CONFIG: /home/model-server/config.properties[0m
[34mSM_INPUT: /opt/ml/input[0m
[34mSM_INPUT_TRAINING_CONFIG_FILE: /opt/ml/input/config/hyperparameters.json[0m
[34mSM_INPU