In [6]:
import boto3
import sagemaker
import sagemaker.session
import os
from sagemaker.workflow.pipeline_context import PipelineSession

In [7]:
# Plain SageMaker session; only used here to look up the account's default bucket.
sagemaker_session = sagemaker.session.Session()
default_bucket = sagemaker_session.default_bucket()

# Region configured for the default boto3 session, and the execution role
# that is later handed to the processor and to the pipeline upsert.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()

# Session object given to the processor below so that it is used when the
# pipeline step is built.
pipeline_session = PipelineSession()

In [8]:
# Local folder that holds the processing script written by the next cell
script_dir = "01_simple_processing_pipeline"
os.makedirs(script_dir, exist_ok=True)

In [11]:
%%writefile 01_simple_processing_pipeline/processing.py

import os
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.model_selection import train_test_split


if __name__ == '__main__':
    
    iris = datasets.load_iris()
    df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    df["class"] = pd.Series(iris.target)
    df = df[df['class'].isin([0, 1])] # Lets keep only class 0 and 1 to have binary classification
    df = df[[list(df.columns)[-1]] + list(df.columns)[:-1]] # Reorder target as the first column
    df.columns = df.columns.str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
    
    train_df, test_df = train_test_split(df, test_size=0.33, random_state=42, stratify=df["class"])
    
    iris_train = train_df.to_numpy()
    np.savetxt('/opt/ml/processing/output/iris_train.csv', iris_train, delimiter=',', fmt='%1.1f, %1.3f, %1.3f, %1.3f, %1.3f')
    
    iris_test = test_df.to_numpy()
    np.savetxt('/opt/ml/processing/output/iris_test.csv', iris_test, delimiter=',', fmt='%1.1f, %1.3f, %1.3f, %1.3f, %1.3f')
    
    iris_inference = test_df.iloc[:, 1:].to_numpy()
    np.savetxt('/opt/ml/processing/output/iris_inference.csv', iris_inference, delimiter=',', fmt='%1.3f, %1.3f, %1.3f, %1.3f')
    
    column_names_list = ','.join(df.columns)
    with open('/opt/ml/processing/output//column_names.csv', 'w') as file:
        file.write(column_names_list)

Writing 01_simple_processing_pipeline/processing.py


In [24]:
from sagemaker.sklearn.processing import SKLearnProcessor

# Framework version string passed to the scikit-learn processor
framework_version = "0.23-1"

# Gather the processor settings in one mapping, then build the processor from it.
processor_settings = dict(
    role=role,
    sagemaker_session=pipeline_session,
    base_job_name="01-simple-processing-pipeline",
    framework_version=framework_version,
    instance_count=1,
    instance_type="ml.t3.medium",
)
sklearn_processor = SKLearnProcessor(**processor_settings)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [25]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

# Output declaration: the container directory the script writes to (`source`)
# and the S3 prefix given as `destination`.
# NOTE(review): the bucket name is hard-coded here while `default_bucket` from
# the setup cell is unused; confirm that this is intended.
preprocessing_output = ProcessingOutput(
    source="/opt/ml/processing/output",
    destination="s3://sagemaker-bucket-ds/PIPELINE/01_simple_processing_pipeline/",
    output_name="default",
)

# The processor was built with the pipeline session; the value returned by
# run() is what gets handed to the step as `step_args`.
processor_args = sklearn_processor.run(
    code="01_simple_processing_pipeline/processing.py",
    inputs=[],
    outputs=[preprocessing_output],
)

step_process = ProcessingStep(name="Preprocessing", step_args=processor_args)

In [26]:
from sagemaker.workflow.pipeline import Pipeline

# Plain string literal: the name contains no placeholders, so the f-string
# prefix was superfluous and has been dropped.
pipeline_name = "01-simple-processing-pipeline"

# The pipeline consists of the single preprocessing step defined above.
pipeline = Pipeline(
    name=pipeline_name,
    steps=[step_process],
)

In [27]:
# Create or update the pipeline definition under the execution role, then
# display the service response as the cell output.
upsert_response = pipeline.upsert(role_arn=role)
upsert_response



{'PipelineArn': 'arn:aws:sagemaker:eu-west-1:211125740051:pipeline/01-simple-processing-pipeline',
 'ResponseMetadata': {'RequestId': '74b95fae-6392-4c44-ae8d-f8f7662e8008',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '74b95fae-6392-4c44-ae8d-f8f7662e8008',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '97',
   'date': 'Thu, 11 Jul 2024 10:02:23 GMT'},
  'RetryAttempts': 0}}

In [28]:
# Start the pipeline and keep the returned object in `execution` for later use.
execution = pipeline.start()