In [49]:
import os

# Point the SageMaker SDK's user-config override at the current working
# directory, i.e. where this notebook expects its config file to live.
os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"] = os.getcwd()

After this we instantiate a SageMaker Session for our Notebook and retrieve metadata such as the Execution Role and Region.

In [50]:
import sagemaker
from sagemaker.workflow.function_step import step

# Session object for this notebook; the role, bucket and region are read from it.
sagemaker_session = sagemaker.session.Session()
# Execution role; passed to pipeline.upsert(role_arn=...) further down.
role = sagemaker.get_execution_role()
# Default S3 bucket and AWS region associated with the session.
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [51]:
# Pipeline parameter holding the instance type shared by every @step below;
# defaults to ml.c5.2xlarge.
from sagemaker.workflow.parameters import ParameterString
instance_type = ParameterString(name="TrainInstanceType", default_value="ml.c5.2xlarge")

#### Step Orchestration

In [52]:
# step one
@step(
  name="preprocess",
  instance_type=instance_type,
  keep_alive_period_in_seconds=600,
)
def create_data() -> tuple:
  """Generate a small synthetic dataset for a linear-regression demo.

  Returns:
    ``(X, y)`` where ``X`` has shape (100, 1) and ``y`` is ``2*X + 1``
    plus random noise scaled by 0.1, also of shape (100, 1).
  """
  import numpy as np

  # Seed the global NumPy RNG so every run draws the same samples.
  np.random.seed(0)

  n_samples = 100
  features = np.random.rand(n_samples, 1)
  # Drawn after the features, so the RNG is consumed in a fixed order.
  noise = 0.1 * np.random.randn(n_samples, 1)
  targets = 2 * features + 1 + noise

  return (features, targets)

In [53]:
# step two
@step(
  name="training",
  instance_type=instance_type,
  keep_alive_period_in_seconds=600
)
def train_model(data: tuple):
  """Fit a linear regression on ``data`` and upload the serialized model to S3.

  Args:
    data: ``(X, y)`` pair as produced by the preprocessing step.

  Returns:
    Tuple ``(model_filename, bucket_name, s3_file_name, X_test, y_test)``:
    the local model file name, the S3 bucket and key it was uploaded to, and
    the held-out test split for the evaluation step.

  Raises:
    botocore ClientError subclasses if the bucket cannot be created (for
    example, the name is owned by another account) or the upload fails.
  """
  import joblib
  from sklearn.model_selection import train_test_split
  from sklearn.linear_model import LinearRegression
  import boto3

  # use boto3 to work with S3
  s3 = boto3.client("s3")

  # unique bucket name
  bucket_name = "sagemaker-pipelie-step-richard-0304"

  # Outside us-east-1, S3 rejects CreateBucket unless the request names the
  # region as a LocationConstraint, so pass the client's own region along.
  create_bucket_kwargs = {"Bucket": bucket_name}
  region_name = s3.meta.region_name
  if region_name and region_name != "us-east-1":
    create_bucket_kwargs["CreateBucketConfiguration"] = {
      "LocationConstraint": region_name
    }
  try:
    s3.create_bucket(**create_bucket_kwargs)
  except s3.exceptions.BucketAlreadyOwnedByYou:
    # The bucket is left over from a previous pipeline run; reuse it.
    pass

  # unpack data
  X, y = data[0], data[1]

  # Split the data into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # Create a Linear Regression Model and train it on the training data
  model = LinearRegression()
  model.fit(X_train, y_train)

  # Serialize trained model for inference
  model_filename = "model.joblib"
  joblib.dump(model, model_filename)

  # Upload model artifacts to s3
  s3_file_name = "model-artifacts/model.joblib" # key to store model artifacts

  # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/upload_file.html
  # S3.Client.upload_file(Filename, Bucket, Key, ExtraArgs=None, Callback=None, Config=None)
  s3.upload_file(model_filename, bucket_name, s3_file_name)

  artifacts = (model_filename, bucket_name, s3_file_name, X_test, y_test)
  return artifacts


In [54]:
# step three
@step(
  name="inference_evaluation",
  instance_type=instance_type,
  keep_alive_period_in_seconds=600
)
def model_inference(artifacts: tuple) -> float:
  """Download the trained model from S3 and score it on the held-out split.

  Args:
    artifacts: ``(model_filename, bucket_name, s3_file_name, X_test, y_test)``
      as returned by the training step.

  Returns:
    Root-mean-squared error of the model's predictions on ``X_test``.
  """
  import joblib
  from sklearn.metrics import mean_squared_error
  import numpy as np
  import boto3

  s3 = boto3.client("s3")

  # load up artifacts from previous step
  model_filename, bucket_name, s3_file_name, X_test, y_test = artifacts

  # download model.joblib
  # S3.Client.download_file(Bucket, Key, Filename, ExtraArgs=None, Callback=None, Config=None)
  s3.download_file(bucket_name, s3_file_name, model_filename)

  # model loading + inference
  # NOTE: joblib.load unpickles the file; only load artifacts this pipeline produced.
  serialized_model = joblib.load(model_filename)
  # The estimator object is not callable; predictions come from .predict().
  preds = serialized_model.predict(X_test)

  # evaluation
  mse = mean_squared_error(y_test, preds)
  rmse = float(np.sqrt(mse))

  return rmse

#### Pipeline Orchestration and Execution

In [55]:
# stitch together pipeline
from sagemaker.workflow.pipeline import Pipeline

# Chain the steps: each call's result is fed to the next step as its input.
data = create_data()
artifacts = train_model(data)
rmse = model_inference(artifacts)

# Only the final step is listed explicitly. The upsert log further down shows
# the preprocess and training steps being uploaded as well, so the upstream
# steps appear to be picked up through the data dependencies above.
pipeline = Pipeline(name="sklearn-pipeline", parameters=[instance_type], steps=[rmse])

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [56]:
# Register the pipeline definition under the notebook's execution role.
pipeline.upsert(role_arn=role)
# Kick off a run; the returned handle is used to inspect the execution.
execution = pipeline.start()
# Last expression of the cell: displays the execution metadata
# (ARNs, PipelineExecutionStatus, timestamps).
execution.describe()
# Uncomment to block until the pipeline run has finished:
# execution.wait()

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.Dependencies
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.IncludeLocalWorkDir
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.CustomFileFilter.IgnoreNamePatterns
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.RoleArn


2024-03-04 07:49:56,286 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-ap-northeast-2-532805286864/sklearn-pipeline/inference_evaluation/2024-03-04-07-49-54-308/function
2024-03-04 07:49:56,393 sagemaker.remote_function INFO     Uploading serialized function arguments to s3://sagemaker-ap-northeast-2-532805286864/sklearn-pipeline/inference_evaluation/2024-03-04-07-49-54-308/arguments
2024-03-04 07:49:56,725 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/tmp/tmptjsvn2je/requirements.txt'
2024-03-04 07:49:56,768 sagemaker.remote_function INFO     Successfully uploaded dependencies and pre execution scripts to 's3://sagemaker-ap-northeast-2-532805286864/sklearn-pipeline/inference_evaluation/2024-03-04-07-49-54-308/pre_exec_script_and_dependencies'
2024-03-04 07:49:56,771 sagemaker.remote_function INFO     Copied user workspace to '/tmp/tmpjiiddkbw/temp_workspace/sagemaker_remote_function_workspace'
2024-03-

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.Dependencies
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.IncludeLocalWorkDir
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.CustomFileFilter.IgnoreNamePatterns
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.RoleArn


2024-03-04 07:49:58,346 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-ap-northeast-2-532805286864/sklearn-pipeline/training/2024-03-04-07-49-54-308/function
2024-03-04 07:49:58,445 sagemaker.remote_function INFO     Uploading serialized function arguments to s3://sagemaker-ap-northeast-2-532805286864/sklearn-pipeline/training/2024-03-04-07-49-54-308/arguments
2024-03-04 07:49:58,531 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/tmp/tmpst50z2tt/requirements.txt'
2024-03-04 07:49:58,573 sagemaker.remote_function INFO     Successfully uploaded dependencies and pre execution scripts to 's3://sagemaker-ap-northeast-2-532805286864/sklearn-pipeline/training/2024-03-04-07-49-54-308/pre_exec_script_and_dependencies'


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.Dependencies
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.IncludeLocalWorkDir
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.CustomFileFilter.IgnoreNamePatterns
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.RoleArn


2024-03-04 07:50:00,036 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-ap-northeast-2-532805286864/sklearn-pipeline/preprocess/2024-03-04-07-49-54-308/function
2024-03-04 07:50:00,136 sagemaker.remote_function INFO     Uploading serialized function arguments to s3://sagemaker-ap-northeast-2-532805286864/sklearn-pipeline/preprocess/2024-03-04-07-49-54-308/arguments
2024-03-04 07:50:00,225 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/tmp/tmpnqed4sz4/requirements.txt'
2024-03-04 07:50:00,277 sagemaker.remote_function INFO     Successfully uploaded dependencies and pre execution scripts to 's3://sagemaker-ap-northeast-2-532805286864/sklearn-pipeline/preprocess/2024-03-04-07-49-54-308/pre_exec_script_and_dependencies'
2024-03-04 07:50:00,668 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-ap-northeast-2-532805286864/sklearn-pipeline/inference_evaluation/2024-03-04-07-

{'PipelineArn': 'arn:aws:sagemaker:ap-northeast-2:532805286864:pipeline/sklearn-pipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:ap-northeast-2:532805286864:pipeline/sklearn-pipeline/execution/n75gdf6a5ye7',
 'PipelineExecutionDisplayName': 'execution-1709538602034',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2024, 3, 4, 16, 50, 1, 963000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 3, 4, 16, 50, 1, 963000, tzinfo=tzlocal()),
 'CreatedBy': {},
 'LastModifiedBy': {},
 'ResponseMetadata': {'RequestId': '4abfb7f4-c28c-4e71-ad9c-3a9819f7161b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '4abfb7f4-c28c-4e71-ad9c-3a9819f7161b',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '407',
   'date': 'Mon, 04 Mar 2024 07:50:02 GMT'},
  'RetryAttempts': 0}}

In [62]:
execution.list_steps()

[{'StepName': 'preprocess',
  'StartTime': datetime.datetime(2024, 3, 4, 16, 50, 3, 384000, tzinfo=tzlocal()),
  'StepStatus': 'Executing',
  'Metadata': {'TrainingJob': {'Arn': 'arn:aws:sagemaker:ap-northeast-2:532805286864:training-job/pipelines-n75gdf6a5ye7-preprocess-DV4hOQUU91'}},
  'AttemptCount': 1}]