In [19]:
import sagemaker
import os
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker import get_execution_role
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn

In [20]:
# Default SageMaker session and the AWS region it is bound to.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name

# Execution role that is handed to the processor further down.
role = get_execution_role()

In [21]:
# Local staging folder for the processing script, its requirements and the sample data.
os.makedirs(name="03_framework_processor", exist_ok=True)

In [40]:
%%writefile 03_framework_processor/requirements.txt
# Extra dependency bundled into sourcedir.tar.gz; process_data.py imports torch only to confirm it was installed.
torch==1.13.1

Overwriting 03_framework_processor/requirements.txt


In [41]:
%%writefile 03_framework_processor/sample_data.csv
a,b,c
2,3,4
5,6,7
8,9,10

Overwriting 03_framework_processor/sample_data.csv


In [48]:
%%writefile 03_framework_processor/process_data.py
"""Processing entry point: sum every column of the input CSV and write the totals.

When executed as a script it reads ``sample_data.csv`` from
``/opt/ml/processing/input/myinput/`` and writes ``output.csv`` to
``/opt/ml/processing/output/``.
"""
import os

import pandas as pd
import torch  # imported only to confirm that requirements.txt was installed

INPUT_DIR = "/opt/ml/processing/input/myinput/"
OUTPUT_DIR = '/opt/ml/processing/output/'

# Name fragments that suggest an environment variable holds a credential.
_SENSITIVE_MARKERS = ("SECRET", "TOKEN", "PASSWORD", "CREDENTIAL", "KEY")


def _mask_if_sensitive(name, value):
    """Return ``value`` unchanged, or a placeholder when ``name`` looks like a credential."""
    upper_name = name.upper()
    if any(marker in upper_name for marker in _SENSITIVE_MARKERS):
        return "<redacted>"
    return value


def write_column_sums(input_file_path, output_file_path):
    """Sum each column of a CSV file and write one ``column: sum`` line per column.

    Args:
        input_file_path: Path of the CSV file to read.
        output_file_path: Path of the file to write. Its parent directory is
            created when it does not exist yet.

    Returns:
        The pandas Series of per-column sums that was written.
    """
    df = pd.read_csv(input_file_path)
    column_sums = df.sum()

    # Do not rely on the output directory having been created beforehand.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # NOTE: the content is "name: value" lines, not comma-separated values,
    # even though the script names the file output.csv.
    with open(output_file_path, 'w') as f:
        for column, sum_value in column_sums.items():
            f.write(f'{column}: {sum_value}\n')

    return column_sums


if __name__ == '__main__':

    print(torch.__version__)  # check the version of torch

    # Log the container environment for debugging, masking likely secrets so
    # their values do not end up in the job logs.
    for key, value in os.environ.items():
        print(f"{key}: {_mask_if_sensitive(key, value)}")

    input_file_path = os.path.join(INPUT_DIR, "sample_data.csv")
    output_file_path = os.path.join(OUTPUT_DIR, "output.csv")

    write_column_sums(input_file_path, output_file_path)

Overwriting 03_framework_processor/process_data.py


In [49]:
import tarfile

# Files that make up the source bundle: the entry-point script and its requirements.
files = ['03_framework_processor/process_data.py', '03_framework_processor/requirements.txt']

# Write a gzip-compressed tarball; each file is stored under its bare file name,
# i.e. at the archive root without the folder prefix.
with tarfile.open(name='03_framework_processor/sourcedir.tar.gz', mode='w:gz') as archive:
    for path in files:
        archive.add(path, arcname=os.path.basename(path))

In [50]:
# Copy the source bundle and the sample input to the S3 prefixes that the
# processing job is later pointed at (03_CODE for source_dir, 03_INPUT for the input).
!aws s3 cp 03_framework_processor/sourcedir.tar.gz s3://sagemaker-bucket-ds/PROCESSING/03_CODE/
!aws s3 cp 03_framework_processor/sample_data.csv s3://sagemaker-bucket-ds/PROCESSING/03_INPUT/

upload: 03_framework_processor/sourcedir.tar.gz to s3://sagemaker-bucket-ds/PROCESSING/03_CODE/sourcedir.tar.gz
upload: 03_framework_processor/sample_data.csv to s3://sagemaker-bucket-ds/PROCESSING/03_INPUT/sample_data.csv


In [45]:
# Estimator class and framework version that the processor is built on.
est_cls = SKLearn
framework_version_str = "0.20.0"

# A single ml.t3.medium instance; job names start with "03-processing".
script_processor = FrameworkProcessor(
    estimator_cls=est_cls,
    framework_version=framework_version_str,
    role=role,
    instance_type="ml.t3.medium",
    instance_count=1,
    base_job_name="03-processing",
)

In [46]:
# Input channel: the 03_INPUT prefix on S3 is mapped to the "myinput" folder
# that process_data.py reads from.
input_from_s3 = ProcessingInput(
    input_name='INPUT1',
    source='s3://sagemaker-bucket-ds/PROCESSING/03_INPUT/',
    destination='/opt/ml/processing/input/myinput/',
)
processing_inputs = [input_from_s3]

# Output channel: the folder process_data.py writes to is mapped to the
# 03_OUTPUT prefix on S3.
output_to_s3 = ProcessingOutput(
    output_name='OUTPUT1',
    source='/opt/ml/processing/output/',
    destination='s3://sagemaker-bucket-ds/PROCESSING/03_OUTPUT/',
)
processing_outputs = [output_to_s3]

In [None]:
# Start the processing job. The entry point is looked up inside the source
# bundle that was uploaded to S3.
script_processor.run(
    source_dir="s3://sagemaker-bucket-ds/PROCESSING/03_CODE/sourcedir.tar.gz",  # archive with the script and requirements.txt
    code='process_data.py',
    outputs=processing_outputs,
    inputs=processing_inputs,
)

INFO:sagemaker.processing:Uploaded s3://sagemaker-bucket-ds/PROCESSING/03_CODE/sourcedir.tar.gz to s3://sagemaker-bucket-ds/PROCESSING/03_CODE/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-bucket-ds/PROCESSING/03_CODE/runproc.sh
INFO:sagemaker:Creating processing-job with name 03-processing-2024-07-10-15-05-27-452


.................................................................