In [2]:
import os
import boto3
import re
import time
from time import gmtime, strftime
import json
from datetime import datetime, timedelta
from sagemaker import get_execution_role, session
import pandas as pd

# Region configured on the default boto3 session.
region = boto3.Session().region_name

# Execution role of this notebook; passed to the processor, estimator and pipeline below.
role = get_execution_role()
print(f"RoleArn: {role}")

RoleArn: arn:aws:iam::874163252636:role/service-role/AmazonSageMaker-ExecutionRole-20201201T202376


In [3]:
# Bucket returned by the SageMaker session's default_bucket().
bucket = session.Session(boto3.Session()).default_bucket()
print(f"Demo Bucket: {bucket}")

# Key prefix shared by every artifact of this demo. The trailing id is pinned;
# it was presumably produced once with int(time.time()) and then frozen.
prefix = "sagemaker/demo-preprocess-custom-monitor-batch-transform/1682420795"

# Training dataset paths
training_prefix = prefix + "/training"
training_data_prefix = f"{training_prefix}/data"
s3_training_data_uri = f"s3://{bucket}/{training_data_prefix}/training-dataset-with-header.csv"

# Baseline dataset paths (the training pipeline's preprocessing step writes here)
baseline_prefix = prefix + "/baseline"
baseline_data_prefix = f"{baseline_prefix}/data"
s3_baseline_data_uri = f"s3://{bucket}/{baseline_data_prefix}"

# Location of the model tar file
s3_model_artifact_uri = f"s3://{bucket}/{prefix}/model-artifacts"

for label, uri in (
    ("Training data uri", s3_training_data_uri),
    ("Model Artifacts", s3_model_artifact_uri),
    ("Baseline data uri", s3_baseline_data_uri),
):
    print(f"{label}: {uri}")

Demo Bucket: sagemaker-ap-south-1-874163252636
Training data uri: s3://sagemaker-ap-south-1-874163252636/sagemaker/demo-preprocess-custom-monitor-batch-transform/1682420795/training/data/training-dataset-with-header.csv
Model Artifacts: s3://sagemaker-ap-south-1-874163252636/sagemaker/demo-preprocess-custom-monitor-batch-transform/1682420795/model-artifacts
Baseline data uri: s3://sagemaker-ap-south-1-874163252636/sagemaker/demo-preprocess-custom-monitor-batch-transform/1682420795/baseline/data


In [4]:
import sagemaker
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.steps import TrainingStep
from sagemaker.workflow.steps import ProcessingStep

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

from sagemaker.model import Model
from sagemaker.image_uris import retrieve
from sagemaker.inputs import TrainingInput

In [5]:
# Session passed as `sagemaker_session` to the processor and estimator below;
# the values their run()/fit() calls return are used as pipeline step arguments.
pipeline_session = PipelineSession()

Upload the training dataset to S3 so that the training pipeline can read it.

In [6]:
# Copy the local training CSV to its S3 location, then list the object to confirm the upload.
!aws s3 cp test_data/training-dataset-with-header.csv {s3_training_data_uri}
!aws s3 ls {s3_training_data_uri}

upload: test_data/training-dataset-with-header.csv to s3://sagemaker-ap-south-1-874163252636/sagemaker/demo-preprocess-custom-monitor-batch-transform/1682420795/training/data/training-dataset-with-header.csv
2023-05-02 11:11:56     375873 training-dataset-with-header.csv


In [7]:
# Pipeline parameters; the defaults are the S3 locations computed earlier in the notebook.

# Where the preprocessing step writes its output and the training step reads its "train" channel.
baseline_input_data = ParameterString(name="BaselineData", default_value=s3_baseline_data_uri)

# Output path handed to the estimator.
model_location = ParameterString(name="ModelSaveLocation", default_value=s3_model_artifact_uri)

# S3 URI of the raw training CSV consumed by the preprocessing step.
training_input_data = ParameterString(name="TrainingData", default_value=s3_training_data_uri)



### Preprocessing

Create the script that performs the preprocessing, then define the processor that runs it.

In [8]:
# Create the folder the %%writefile cells below write into; a no-op if it already exists.
# Pure Python instead of a shell call so the cell does not depend on a POSIX `mkdir`.
os.makedirs("training_code", exist_ok=True)

In [9]:
%%writefile training_code/preprocess.py
import argparse
import boto3
import pandas as pd
import numpy as np
import sys
import subprocess

base_dir = '/opt/ml/processing'


def _install_requirements():
    """Install the extra packages listed in the requirements file shipped with the job."""
    subprocess.check_call([
        sys.executable, "-m", "pip", "install", "-r",
        "/opt/ml/processing/input/code/custom_packages/requirements.txt",
    ])


def preprocess(data):
    """Placeholder for the real preprocessing; currently returns ``data`` unchanged."""
    # do some pre-processing
    return data


def main(args):
    """Read the input CSV, preprocess it and write the training-ready CSV.

    Args:
        args: namespace with a ``file_name`` attribute naming the CSV under
            ``{base_dir}/input``.
    """
    # read training data for preprocessing from {base_dir}/input
    data = pd.read_csv(f'{base_dir}/input/{args.file_name}')
    processed_data = preprocess(data)
    # store the training-ready processed data at {base_dir}/output (no header row, no index column)
    processed_data.to_csv(f'{base_dir}/output/training_processed_data.csv', header=False, index=False)


def parse_args(argv=None):
    """Parse the command line.

    ``--file_name`` is required: without it the script would try to read
    ``{base_dir}/input/None``.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with a ``file_name`` attribute.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--file_name', required=True, help='file name for preprocessing')
    return parser.parse_args(argv)


if __name__ == "__main__":
    # Install only when executed as a script, so importing this module has no side effects.
    _install_requirements()
    main(parse_args())

Overwriting training_code/preprocess.py


In [10]:
# Version of the scikit-learn container used by the processor.
framework_version = "1.2-1"

sklearn_processor = SKLearnProcessor(
    role=role,
    framework_version=framework_version,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    sagemaker_session=pipeline_session,
)

# Inputs: the raw training CSV, plus the custom_packages folder whose
# requirements.txt preprocess.py installs. The single output is written to the
# location held by the BaselineData parameter.
processor_args = sklearn_processor.run(
    code="training_code/preprocess.py",
    arguments=["--file_name", "training-dataset-with-header.csv"],
    inputs=[
        ProcessingInput(
            source=training_input_data,
            destination="/opt/ml/processing/input",
        ),
        ProcessingInput(
            source="custom_packages/",
            destination="/opt/ml/processing/input/code/custom_packages/",
        ),
    ],
    outputs=[
        ProcessingOutput(
            source="/opt/ml/processing/output",
            destination=baseline_input_data,
        ),
    ],
)

# The value returned by run() becomes the definition of the pipeline step.
step_process = ProcessingStep(name="TrainingPreProcessing", step_args=processor_args)



### Training

In [11]:
%%writefile training_code/train.py
import argparse
import os
import boto3
import json
import pandas as pd
import numpy as np
import xgboost as xgb
from io import StringIO


if __name__ == "__main__":
    # Command-line options; each default falls back to the matching SM_* environment variable.
    cli = argparse.ArgumentParser()
    # --sm-model-dir: directory the fitted model is saved to
    cli.add_argument("--sm-model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    # --train: directory holding the training CSV
    cli.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))

    # Arguments the parser does not know are tolerated and discarded.
    options, _unknown = cli.parse_known_args()
    output_dir = options.sm_model_dir
    input_dir = options.train

    # The training file is read without a header row.
    frame = pd.read_csv(f"{input_dir}/training_processed_data.csv", header=None, index_col=False)
    print(frame.head())

    # Column 0 is used as the label; every remaining column is a feature.
    labels = frame.iloc[:, 0].copy()
    features = frame.iloc[:, 1:].copy()
    # Relabel the feature columns 0..n-1, the labels read_csv(header=None) assigns in input_fn.
    features.columns = range(features.shape[1])

    # Fit the classifier
    classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    classifier.fit(features, labels)

    # Persist it under the file name model_fn loads
    classifier.save_model(os.path.join(output_dir, 'model.json'))
    

def model_fn(model_dir):
    """Load the saved classifier from ``model_dir``.

    Args:
        model_dir: directory containing ``model.json``.

    Returns:
        An ``xgb.XGBClassifier`` with the saved model loaded into it.
    """
    classifier = xgb.XGBClassifier()
    model_path = os.path.join(model_dir, "model.json")
    classifier.load_model(model_path)
    return classifier

"""
input_fn
    request_body: The body of the request sent to the model.
    request_content_type: (string) specifies the format/variable type of the request
"""
def input_fn(request_body, request_content_type):
    """Parse a CSV request payload into a DataFrame.

    Args:
        request_body: The body of the request sent to the model, as ``str``
            or as UTF-8 encoded ``bytes``.
        request_content_type: (string) format of the request; only
            ``"text/csv"`` is accepted.

    Returns:
        pandas.DataFrame read without a header row, so its columns are
        labelled 0..n-1.

    Raises:
        ValueError: if ``request_content_type`` is not ``"text/csv"``.
    """
    if request_content_type != "text/csv":
        raise ValueError("This model only supports text/csv input")
    # StringIO only accepts str; decode raw payloads instead of failing with TypeError.
    if isinstance(request_body, (bytes, bytearray)):
        request_body = request_body.decode("utf-8")
    return pd.read_csv(StringIO(request_body), header=None, index_col=False)

"""
predict_fn
    input_data: the DataFrame returned by input_fn above
    model: the XGBClassifier returned by model_fn above
"""
def predict_fn(input_data, model):
    """Run the model on the parsed request data.

    Args:
        input_data: value returned by ``input_fn``.
        model: model object returned by ``model_fn``.

    Returns:
        Whatever ``model.predict(input_data)`` returns.
    """
    predictions = model.predict(input_data)
    return predictions

"""
output_fn
    prediction: the returned value from predict_fn above
    content_type: the content type the endpoint expects to be returned. Ex: JSON, string
"""
def output_fn(prediction, content_type):
    """Serialise predictions as newline-separated text.

    Args:
        prediction: value returned by ``predict_fn``; must provide ``tolist()``.
        content_type: requested response content type; accepted but not used.

    Returns:
        str with one prediction per line and no trailing newline.
    """
    lines = map(str, prediction.tolist())
    return "\n".join(lines)

Overwriting training_code/train.py


In [12]:
from sagemaker.sklearn import SKLearn

# scikit-learn estimator that runs training_code/train.py; the requirements file
# is shipped alongside the source through `dependencies`.
sk_estimator = SKLearn(
    entry_point="train.py",
    role=role,
    instance_count=1,
    instance_type="ml.c5.xlarge",
    py_version="py3",
    # Reuse the version chosen for the processing step so both containers stay in sync.
    framework_version=framework_version,
    script_mode=True,
    source_dir='training_code',
    dependencies=['custom_packages/requirements.txt'],
    output_path=model_location,
    sagemaker_session=pipeline_session,
)

# The estimator uses a PipelineSession, so fit() does not launch a training job
# here; it returns the arguments that define the training step.
train_args = sk_estimator.fit({"train": baseline_input_data})

# The "train" channel is wired through a pipeline parameter rather than a property
# of the processing step, so the ordering between the steps is declared explicitly.
step_train = TrainingStep(name='ModelTraining', step_args=train_args, depends_on=[step_process])

In [13]:
# Assemble the two steps and the three parameters into one pipeline definition.
pipeline_name = "TrainingPipeline"
pipeline = Pipeline(
    name=pipeline_name,
    steps=[step_process, step_train],
    parameters=[baseline_input_data, model_location, training_input_data],
)

In [14]:
# Create or update the pipeline definition under the execution role, then start
# an execution. No parameter overrides are passed, so the defaults apply.
pipeline.upsert(role_arn=role)
execution = pipeline.start()