## Part X.1 - Configure CI/CD Pipeline for Custom Model

University of San Diego - MS Applied AI

AAI-540 Team 5

October 21, 2024

### Environment Setup

In [301]:
# Setup environment by running the shared setup notebook in this kernel's namespace.
# NOTE(review): later cells call `get_store_dataset_from_offline_feature_group_date_sort`
# and read `store_sales_feature_group`, neither of which is defined in this notebook —
# presumably both come from this setup notebook; confirm.
%run 0-Environment_Setup.ipynb

[0mStored 's3_datalake_path_csv' (str)
Stored 'local_data_path_csv' (str)
Stored 's3_datalake_path_parquet' (str)


In [None]:
# Upgrade sagemaker version
!pip install -U sagemaker

In [303]:
# Import modules
import sys

import boto3
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession

In [304]:
# Set session variables
# The SageMaker session is created first because the region and the default
# bucket are both derived from it (nothing here relies on names leaked from
# another notebook or from a previous run of this cell).
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_session.region_name
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

# Low-level SageMaker client, used later to look up registered model packages
sm_client = boto3.client('sagemaker', region_name=region)

# Session that makes processor/estimator/model calls return pipeline step
# arguments instead of launching jobs immediately
pipeline_session = PipelineSession()

In [305]:
# Name of the model package group that new models are registered to
model_package_group_name = "custom-model-package-group"

# S3 prefixes the pipeline reads its input from and writes its output to
base_path = "s3://{}/store-sales-forecasting/pipelines".format(bucket)
input_data_path = base_path + "/input/"
output_data_path = base_path + "/output/"
print(input_data_path, output_data_path, sep="\n")

s3://sagemaker-us-east-1-342408968837/store-sales-forecasting/pipelines/input/
s3://sagemaker-us-east-1-342408968837/store-sales-forecasting/pipelines/output/


In [306]:
# Query the offline feature store, ordered by date and then by store number,
# and preview the first rows
sales_features_store_df = get_store_dataset_from_offline_feature_group_date_sort(
    store_sales_feature_group
)
sales_features_store_df.head()

Running 
    SELECT *
    FROM
        "store_sales_feature_group_offline_1728878780"
    ORDER BY
        date ASC, store_nbr ASC
    


INFO:sagemaker:Query f3b608c4-8584-480c-9912-34c19b4baa57 is being executed.
INFO:sagemaker:Query f3b608c4-8584-480c-9912-34c19b4baa57 successfully executed.


Unnamed: 0,date,store_nbr,sales,oil,onpromotion,is_holiday,city,state,cluster,year,...,month_sin,day_cos,day_sin,dow_cos,dow_sin,sales_record_id,event_time,write_time,api_invocation_time,is_deleted
0,2013-01-01,1,0.0,93.14,0,1,18,12,13,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:1,1728879000.0,2024-10-14 04:11:58.235,2024-10-14 04:06:44.000,False
1,2013-01-01,2,0.0,93.14,0,1,18,12,13,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:2,1728879000.0,2024-10-14 04:11:58.188,2024-10-14 04:06:45.000,False
2,2013-01-01,3,0.0,93.14,0,1,18,12,8,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:3,1728879000.0,2024-10-14 04:11:58.119,2024-10-14 04:06:45.000,False
3,2013-01-01,4,0.0,93.14,0,1,18,12,9,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:4,1728879000.0,2024-10-14 04:11:57.992,2024-10-14 04:06:45.000,False
4,2013-01-01,5,0.0,93.14,0,1,21,14,4,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:5,1728879000.0,2024-10-14 04:11:58.307,2024-10-14 04:06:45.000,False


In [307]:
# Save the data from the feature store locally and upload to the pipeline input S3 path
sales_features_store_df.to_csv("input_data.csv")
!aws s3 cp "input_data.csv" $input_data_path

### Pipeline Configuration

In [448]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)

# Pipeline parameters and their default values

# Number of instances used by the processing steps
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)

# Instance type for model training
instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge",
)

# Approval status assigned to a model when it is registered
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval",
)

# S3 location of the pipeline input data
input_data = ParameterString(name="InputData", default_value=input_data_path)

# S3 location for the pipeline output data
batch_data = ParameterString(name="BatchData", default_value=output_data_path)

# RMSE score threshold that determines if we keep the model
rmse_threshold = ParameterFloat(name="RmseThreshold", default_value=0.65)

### Data Preprocessing Pipeline Step

In [479]:
%%writefile custom-model-code/preprocessing.py

# Define a preprocessing script that will run in the pipeline
# This script will take the data in the feature store, split it, and transform
# it into the format expected by the model

import json
import argparse
import os
import requests
import tempfile

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Select features to use in the model
def get_store_features(row):
    """Return the model's feature values for one row, in a fixed order.

    `row` is anything indexable by column name (for example a DataFrame row).
    The order is: sales, oil, onpromotion, is_holiday, hash_0 .. hash_9,
    month_cos, month_sin, dow_cos, dow_sin. `day_cos` and `day_sin` are not
    included.
    """
    hashed_columns = [f"hash_{idx}" for idx in range(10)]
    selected_columns = (
        ["sales", "oil", "onpromotion", "is_holiday"]
        + hashed_columns
        + ["month_cos", "month_sin", "dow_cos", "dow_sin"]
    )
    return [row[column] for column in selected_columns]

# Split the data sets into input windows and associated targets
def generate_windows(data, input_seq_length, target_seq_length, stride):
    """Slice `data` along axis 1 into input windows and their target windows.

    `data` is indexed as data[:, day, feature]. Window starts advance by
    `stride`; a start is used only if the input window plus the target window
    that follows it fit inside axis 1. The target is feature index 0 over the
    `target_seq_length` positions right after the input window.

    Returns a tuple `(windows, targets)` of NumPy arrays built by stacking the
    per-start slices along a new leading axis.
    """
    total_days = data.shape[1]
    span = input_seq_length + target_seq_length

    # Starting positions for which both the input and its target fit
    starts = [start for start in range(0, total_days, stride) if start + span <= total_days]

    inputs = [data[:, start:start + input_seq_length, :] for start in starts]
    labels = [data[:, start + input_seq_length:start + span, 0] for start in starts]

    return np.array(inputs), np.array(labels)



if __name__ == "__main__":
    # Preprocessing job entry point: reads the input CSV, builds a
    # (store, date, feature) array, splits it by date, standardises the
    # continuous features, windows each split and writes the results to the
    # processing output directories.

    # Base directory inside the pipeline
    base_dir = "/opt/ml/processing"

    # Load the data
    df = pd.read_csv(f"{base_dir}/input/input_data.csv", index_col=0)

    # Apply feature selection function. The first `num_continuous_features`
    # entries of each feature vector (sales, oil, onpromotion) are the ones
    # that get standardised below.
    df["features"] = df.apply(get_store_features, axis=1)
    num_continuous_features = 3

    # Drop unneeded columns
    df = df[["date", "store_nbr", "features"]]

    # Pivot the data to be in the format (store number, date, features)
    df_pivoted = df.pivot(index="store_nbr", columns="date", values="features")

    # Convert the data to an array
    stacked_df = np.array(df_pivoted.values.tolist())

    # Split the data by date: the first 80% is train, the next 10% is test and
    # the remainder, minus the final 7 days, is validation
    n = stacked_df.shape[1]
    train_data = stacked_df[:, :int(n*0.8), :]
    test_data = stacked_df[:, int(n*0.8):int(n*0.9), :]
    val_data = stacked_df[:, int(n*0.9):-7, :]

    # Withhold the last 7 days of the data for forecasting
    forecast_data = stacked_df[:, -7:, :]

    # Get the mean and standard deviation for normalization
    scaler = StandardScaler()

    # Flatten the first 2 dimensions into (stores*instances, features)
    train_data_2d = train_data.reshape(-1, train_data.shape[2])
    test_data_2d = test_data.reshape(-1, test_data.shape[2])
    val_data_2d = val_data.reshape(-1, val_data.shape[2])
    forecast_data_2d = forecast_data.reshape(-1, forecast_data.shape[2])

    # Scale just the continuous features. The scaler is fitted on the training
    # split only and then applied unchanged to the other splits.
    train_data_2d[:, :num_continuous_features] = scaler.fit_transform(train_data_2d[:, :num_continuous_features])
    test_data_2d[:, :num_continuous_features] = scaler.transform(test_data_2d[:, :num_continuous_features])
    val_data_2d[:, :num_continuous_features] = scaler.transform(val_data_2d[:, :num_continuous_features])
    forecast_data_2d[:, :num_continuous_features] = scaler.transform(forecast_data_2d[:, :num_continuous_features])

    # Add Gaussian noise to the continuous features of the TRAINING split only.
    # The noise is a training-time augmentation: adding it to the test,
    # validation or forecast data would corrupt the evaluation targets and the
    # inputs used for the real forecast. The generator is seeded so repeated
    # pipeline runs on the same input produce the same training set.
    noise_seed = 42
    noise_std = 0.2
    noise_rng = np.random.default_rng(noise_seed)
    train_noise = noise_rng.normal(0, noise_std, train_data_2d[:, :num_continuous_features].shape)
    train_data_2d[:, :num_continuous_features] = train_data_2d[:, :num_continuous_features] + train_noise

    # Reshape the data back to its original dimensions
    train_data = train_data_2d.reshape(train_data.shape)
    test_data = test_data_2d.reshape(test_data.shape)
    val_data = val_data_2d.reshape(val_data.shape)
    forecast_data = forecast_data_2d.reshape(forecast_data.shape)

    # Generate windows for train/test/val sets
    input_seq_length = 7
    target_seq_length = 1
    stride = 1

    # Create the input and target windows for the data splits
    train_inputs, train_targets = generate_windows(train_data, input_seq_length, target_seq_length, stride)
    print(f"Train inputs shape: {train_inputs.shape}")
    print(f"Train targets shape: {train_targets.shape}")

    test_inputs, test_targets = generate_windows(test_data, input_seq_length, target_seq_length, stride)
    print(f"Test inputs shape: {test_inputs.shape}")
    print(f"Test targets shape: {test_targets.shape}")

    val_inputs, val_targets = generate_windows(val_data, input_seq_length, target_seq_length, stride)
    print(f"Validation inputs shape: {val_inputs.shape}")
    print(f"Validation targets shape: {val_targets.shape}")

    # Save data splits
    np.save(f"{base_dir}/train/train_inputs.npy", train_inputs)
    np.save(f"{base_dir}/train/train_targets.npy", train_targets)

    np.save(f"{base_dir}/test/test_inputs.npy", test_inputs)
    np.save(f"{base_dir}/test/test_targets.npy", test_targets)

    np.save(f"{base_dir}/validation/val_inputs.npy", val_inputs)
    np.save(f"{base_dir}/validation/val_targets.npy", val_targets)

    # Save the evaluation data for the batch transform evaluation job:
    # one JSON document per validation window, newline-separated with no
    # trailing newline after the last document
    with open(f"{base_dir}/transform-input/validation_data.ndjson", "w") as f:
        f.write("\n".join(json.dumps({"input_1": window.tolist()}) for window in val_inputs))

    # Save the forecasting data as a single JSON document
    with open(f"{base_dir}/forecast-input/forecast_data.ndjson", "w") as f:
        f.write(json.dumps({"input_1": forecast_data.tolist()}))



Overwriting custom-model-code/preprocessing.py


In [453]:
from sagemaker.sklearn.processing import SKLearnProcessor

# SKLearn processing container that runs the preprocessing script
sklearn_processor = SKLearnProcessor(
    role=role,
    sagemaker_session=pipeline_session,
    base_job_name="sklearn-custom-model-process",
    framework_version="1.2-1",
    instance_count=processing_instance_count,
    instance_type="ml.m5.xlarge",
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [454]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

# Input of the data processing step: the location held by the InputData
# pipeline parameter, made available in the container's input directory
processing_inputs = [
    ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
]

# Outputs of the data processing step: each named output is read from the
# container directory of the same name
processing_outputs = [
    ProcessingOutput(output_name=output_name, source=f"/opt/ml/processing/{output_name}")
    for output_name in ("train", "validation", "test", "transform-input", "forecast-input")
]

# Set the arguments for the data processing step
processor_args = sklearn_processor.run(
    code="custom-model-code/preprocessing.py",
    inputs=processing_inputs,
    outputs=processing_outputs,
)

# Define the data processing step
step_process = ProcessingStep(name="CustomModelProcess", step_args=processor_args)

### Model Training Pipeline Step

In [455]:
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tensorflow import TensorFlow

# Define regex patterns for capturing training metrics from the training logs.
# The first capture group of each pattern is the metric value.
#  - Raw strings are used so backslashes reach the regex engine unchanged.
#  - The value pattern accepts plain decimals and scientific notation
#    (e.g. "0.1234", "1.2345e-04") and captures the whole number.
#  - "\b" keeps the training-metric patterns from also matching the "val_"
#    variants ("loss: " is otherwise a substring of "val_loss: ").
#  - The epoch pattern captures only the epoch number from "Epoch 3/50".
metric_value_pattern = r"([0-9]+(?:\.[0-9]+)?(?:e[-+]?[0-9]+)?)"
metric_definitions = [
    {'Name': 'loss', 'Regex': r"\bloss: " + metric_value_pattern},
    {'Name': 'root_mean_squared_error', 'Regex': r"\broot_mean_squared_error: " + metric_value_pattern},
    {'Name': 'mean_absolute_error', 'Regex': r"\bmean_absolute_error: " + metric_value_pattern},
    {'Name': 'val_loss', 'Regex': r"\bval_loss: " + metric_value_pattern},
    {'Name': 'val_root_mean_squared_error', 'Regex': r"\bval_root_mean_squared_error: " + metric_value_pattern},
    {'Name': 'val_mean_absolute_error', 'Regex': r"\bval_mean_absolute_error: " + metric_value_pattern},
    {'Name': 'epoch', 'Regex': r"\bEpoch ([0-9]+)"},
]

# Specify tensorflow image to train the model in.
# The image lookup needs a concrete instance type string, so the literal
# default training instance type is used here rather than the pipeline parameter.
image_uri = sagemaker.image_uris.retrieve(
    framework='tensorflow',
    region=region,
    version='2.6.0',
    image_scope='training',
    instance_type='ml.m5.xlarge'
)

# Define an estimator using custom training logic.
# The training instance type comes from the `TrainingInstanceType` pipeline
# parameter (`instance_type`, default "ml.m5.xlarge") so it can be changed per
# execution instead of being hard-coded.
model_path = f"s3://{bucket}/CustomModelTrain"
custom_model_train = Estimator(
    entry_point='custom-model-code/train.py',
    image_uri=image_uri,
    instance_type=instance_type,
    instance_count=1,
    output_path=model_path,
    role=role,
    sagemaker_session=pipeline_session,
    hyperparameters={
        'batch_size': 10,
        'epochs': 50,
        'learning_rate': 0.002,
        'l2_regularization': 0.004,
        'dropout': 0.2
    },
    metric_definitions=metric_definitions
)

# Build the training step arguments. Each channel reads the matching output
# of the preprocessing step.
train_args = custom_model_train.fit(
    inputs={
        "train": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
            content_type="application/x-npy",
        ),
        "test": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
            content_type="application/x-npy",
        ),
        "validation": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                "validation"
            ].S3Output.S3Uri,
            content_type="application/x-npy",
        ),
    }
)


INFO:sagemaker.image_uris:Defaulting to only available Python version: py38


In [456]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

# Training step, built from the estimator's fit arguments
step_train = TrainingStep(step_args=train_args, name="CustomModelTrain")

In [457]:
from sagemaker.model import Model

# Model built from the artifacts of the training step and served with a
# TensorFlow inference image.
# NOTE(review): the image URI is hard-coded to us-east-1 — confirm it matches
# the region the pipeline runs in.
custom_model = Model(
    role=role,
    sagemaker_session=pipeline_session,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    image_uri='763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-inference:2.6-cpu',
)

In [458]:
from sagemaker.inputs import CreateModelInput
from sagemaker.workflow.model_step import ModelStep

# Step that creates the SageMaker model from the trained artifacts
create_model_args = custom_model.create(instance_type="ml.m5.xlarge")
step_create_model = ModelStep(name="CustomModel", step_args=create_model_args)

### Batch Transform Evaluation Pipeline Step

In [460]:
from sagemaker.transformer import Transformer
from sagemaker.inputs import TransformInput
from sagemaker.workflow.steps import TransformStep

# Batch transform that generates predictions on the validation set with the
# newly created model; results are written under the pipeline output prefix
transformer = Transformer(
    model_name=step_create_model.properties.ModelName,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=output_data_path + "transform-results",
    strategy="MultiRecord",
    assemble_with="Line",
    accept="application/jsonlines",
)

# Input is the validation data written out by the preprocessing step, one JSON document per line
transform_input = TransformInput(
    data=step_process.properties.ProcessingOutputConfig.Outputs["transform-input"].S3Output.S3Uri,
    content_type="application/jsonlines",
    split_type="Line",
)

# Create the batch transform step
step_transform_eval = TransformStep(
    name="CustomModelBatchTransform",
    transformer=transformer,
    inputs=transform_input,
)

In [478]:
%%writefile custom-model-code/evaluation.py

# Define an evaluation script that will run in the pipeline
# This script evaluates the predictions made on the validation set and
# logs an evaluation report with RMSE and MAE scores

import os
import json
import pathlib
import numpy as np

from sklearn.metrics import mean_absolute_error, mean_squared_error


if __name__ == "__main__":
    # Evaluation job entry point: compares the batch transform predictions
    # with the validation targets and writes an evaluation report.

    # Set base directory inside the pipeline
    base_dir = "/opt/ml/processing"

    # Load validation set true target values
    val_targets = np.load(os.path.join(f"{base_dir}/validation", "val_targets.npy"))
    print(val_targets.shape)

    # Load predictions from the batch transform job (one JSON document per line)
    predictions = []
    with open(f"{base_dir}/transform-results/validation_data.ndjson.out", "r") as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (e.g. a trailing newline) rather than failing in json.loads
            if not line:
                continue
            predictions.extend(json.loads(line)["predictions"])

    # Convert the predictions back into a numpy array
    predictions_array = np.array(predictions)
    print(predictions_array.shape)

    # Flatten the targets and predictions for computing metrics
    targets_flat = val_targets.flatten()
    predictions_flat = predictions_array.flatten()

    # Compute the RMSE, MAE, and standard deviation of the residuals.
    # RMSE is taken as the square root of the MSE instead of passing
    # `squared=False`, which newer scikit-learn releases no longer accept.
    # Values are cast to built-in floats so they serialise cleanly to JSON.
    rmse = float(np.sqrt(mean_squared_error(targets_flat, predictions_flat)))
    mae = float(mean_absolute_error(targets_flat, predictions_flat))
    std = float(np.std(targets_flat - predictions_flat))
    print(f"RMSE: {rmse} MAE: {mae}")

    # Write the evaluation metrics out to an evaluation report
    report_dict = {
        "regression_metrics": {
            "rmse": {"value": rmse, "standard_deviation": std},
            "mae": {"value": mae, "standard_deviation": std}
        },
    }

    output_dir = "/opt/ml/processing/evaluation"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    evaluation_path = f"{output_dir}/evaluation.json"
    with open(evaluation_path, "w") as f:
        f.write(json.dumps(report_dict))

Overwriting custom-model-code/evaluation.py


In [464]:
from sagemaker.sklearn.processing import SKLearnProcessor

# SKLearn processing container that runs the evaluation script
sklearn_eval_processor = SKLearnProcessor(
    role=role,
    sagemaker_session=pipeline_session,
    base_job_name="sklearn-custom-eval-process",
    framework_version="1.2-1",
    instance_count=processing_instance_count,
    instance_type="ml.m5.xlarge",
)

# Inputs of the evaluation step: the validation split from preprocessing and
# the predictions written by the evaluation batch transform
validation_targets_input = ProcessingInput(
    source=step_process.properties.ProcessingOutputConfig.Outputs["validation"].S3Output.S3Uri,
    destination="/opt/ml/processing/validation",
)
transform_results_input = ProcessingInput(
    source=step_transform_eval.properties.TransformOutput.S3OutputPath,
    destination="/opt/ml/processing/transform-results",
)

# Set the input, output, and script for the evaluation step
eval_args = sklearn_eval_processor.run(
    code="custom-model-code/evaluation.py",
    inputs=[validation_targets_input, transform_results_input],
    outputs=[
        ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
    ],
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [465]:
from sagemaker.workflow.properties import PropertyFile

# Property file describing where the evaluation report is written:
# `evaluation.json` inside the step's "evaluation" output
evaluation_report = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json",
)

# Evaluation step; the property file lets later steps read values from the report
step_eval = ProcessingStep(
    name="CustomModelEval",
    property_files=[evaluation_report],
    step_args=eval_args,
)

### Model Registration Pipeline Step

In [466]:
from sagemaker.model_metrics import MetricsSource, ModelMetrics

# Point the model metrics at the evaluation report produced by the evaluation
# step (the first output configured on that step)
evaluation_s3_prefix = step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        content_type="application/json",
        s3_uri="{}/evaluation.json".format(evaluation_s3_prefix),
    )
)

# Register the model to the custom model package group
register_args = custom_model.register(
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics,
    inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
    transform_instances=["ml.m5.xlarge"],
)

# Create the model registration step
# NOTE: This step will only run if the RMSE threshold is not exceeded in the eval step
step_register = ModelStep(step_args=register_args, name="CustomModelRegister")



### Forecasting Pipeline Steps

In [None]:
# Get the container image and S3 artifacts for the last model.
# Packages are listed newest first, so the first summary is the most recently
# created package in the group.
# NOTE(review): the list is not filtered by approval status, so the newest
# package may still be pending approval — confirm this is intended.
model_packages = sm_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name, SortBy="CreationTime", SortOrder="Descending")

package_summaries = model_packages['ModelPackageSummaryList']
if not package_summaries:
    # Fail with an actionable message instead of an IndexError on an empty list
    raise RuntimeError(
        f"No model packages found in model package group '{model_package_group_name}'; "
        "register a model before configuring the fallback forecasting steps."
    )

model_package = sm_client.describe_model_package(ModelPackageName=package_summaries[0]["ModelPackageArn"])

# The first container of the inference specification holds the image and model artifacts
previous_model_container = model_package['InferenceSpecification']['Containers'][0]
previous_model_image = previous_model_container['Image']
previous_model_artifacts = previous_model_container['ModelDataUrl']

In [480]:
# Rebuild the most recent model from the registry. It is the fallback used
# for forecasting when the model trained in this pipeline exceeds the RMSE
# threshold.
custom_model_existing = Model(
    role=role,
    sagemaker_session=pipeline_session,
    model_data=previous_model_artifacts,
    image_uri=previous_model_image,
)

# Step that recreates the existing model for inference
recreate_model_args = custom_model_existing.create(instance_type="ml.m5.xlarge")
step_recreate_existing_model = ModelStep(
    name="PreviousCustomModel",
    step_args=recreate_model_args,
)

In [469]:
from sagemaker.transformer import Transformer

# Settings shared by both forecasting batch transforms; only the model differs
forecast_transform_settings = dict(
    instance_type="ml.m5.xlarge",
    instance_count=1,
    strategy="MultiRecord",
    assemble_with="Line",
    output_path=f"{output_data_path}sales-forecast",
    accept="application/jsonlines",
)

# Batch transform that forecasts with the model created in this pipeline
new_forecast_transformer = Transformer(
    model_name=step_create_model.properties.ModelName,
    **forecast_transform_settings,
)

# Batch transform that forecasts with the model recreated from the registry
old_forecast_transformer = Transformer(
    model_name=step_recreate_existing_model.properties.ModelName,
    **forecast_transform_settings,
)

In [470]:
# Input of the forecasting transforms: the forecast data written by the preprocessing step
transform_input_forecast = TransformInput(
    data=step_process.properties.ProcessingOutputConfig.Outputs["forecast-input"].S3Output.S3Uri,
    content_type="application/jsonlines",
    split_type="Line",
)

# Batch transform step that runs if the conditional check succeeds (new model)
step_transform_forecast_new = TransformStep(
    name="CustomModelBatchForecastNew",
    transformer=new_forecast_transformer,
    inputs=transform_input_forecast,
)

# Batch transform step that runs if the conditional check fails (existing model)
step_transform_forecast_existing = TransformStep(
    name="CustomModelBatchForecastExisting",
    transformer=old_forecast_transformer,
    inputs=transform_input_forecast,
)


### Conditional Pipeline Step

In [472]:
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet


# RMSE value read from the evaluation report of the evaluation step
evaluated_rmse = JsonGet(
    step_name=step_eval.name,
    property_file=evaluation_report,
    json_path="regression_metrics.rmse.value",
)

# Condition that holds when the evaluated RMSE is at or below the threshold parameter
cond_lte = ConditionLessThanOrEqualTo(left=evaluated_rmse, right=rmse_threshold)

# If the condition holds, register the new model and forecast with it;
# otherwise recreate the most recent model from the registry and forecast
# with that model instead
step_cond = ConditionStep(
    name="CustomModelRMSECond",
    conditions=[cond_lte],
    if_steps=[step_register, step_transform_forecast_new],
    else_steps=[step_recreate_existing_model, step_transform_forecast_existing],
)

### Pipeline Execution

In [473]:
from sagemaker.workflow.pipeline import Pipeline

# Assemble the pipeline: its parameters and its top-level steps (the steps
# gated by the RMSE check are attached to the condition step)
pipeline_name = "CustomModelPipeline"
pipeline_parameters = [
    processing_instance_count,
    instance_type,
    model_approval_status,
    input_data,
    batch_data,
    rmse_threshold,
]
pipeline_steps = [
    step_process,
    step_train,
    step_create_model,
    step_transform_eval,
    step_eval,
    step_cond,
]
pipeline = Pipeline(
    name=pipeline_name,
    parameters=pipeline_parameters,
    steps=pipeline_steps,
)

In [475]:
# Create the pipeline, or update its definition if it already exists,
# passing `role` as the role ARN for the pipeline
pipeline.upsert(role_arn=role)



{'PipelineArn': 'arn:aws:sagemaker:us-east-1:342408968837:pipeline/CustomModelPipeline',
 'ResponseMetadata': {'RequestId': '8eb65de6-893f-4afe-b891-435df14df6b7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '8eb65de6-893f-4afe-b891-435df14df6b7',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '87',
   'date': 'Sat, 19 Oct 2024 00:52:52 GMT'},
  'RetryAttempts': 0}}

In [476]:
# Start the pipeline execution (no parameter overrides are passed here)
# and display the description of the execution that was started
execution = pipeline.start()
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:342408968837:pipeline/CustomModelPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:342408968837:pipeline/CustomModelPipeline/execution/oot6k1g9zwmg',
 'PipelineExecutionDisplayName': 'execution-1729299173064',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2024, 10, 19, 0, 52, 52, 967000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 10, 19, 0, 52, 52, 967000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:342408968837:user-profile/d-2cr7fbmrqyrg/jlawton',
  'UserProfileName': 'jlawton',
  'DomainId': 'd-2cr7fbmrqyrg',
  'IamIdentity': {'Arn': 'arn:aws:sts::342408968837:assumed-role/LabRole/SageMaker',
   'PrincipalId': 'AROAU7OJKHKCWCCLLHI6O:SageMaker'}},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:342408968837:user-profile/d-2cr7fbmrqyrg/jlawton',
  'UserProfileName': 'jlawton',
  'DomainId': 'd-2cr7fbmrqyrg',
  'IamIdenti