In [1]:
import logging

logging.getLogger("sagemaker").setLevel(logging.WARNING)
logging.getLogger("boto3").setLevel(logging.WARNING)

import boto3
import sagemaker
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.parameters import (
    ParameterString,
    ParameterFloat,
    ParameterInteger,
)
from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CacheConfig
from sagemaker.sklearn.processing import SKLearnProcessor



sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\tochi\AppData\Local\sagemaker\sagemaker\config.yaml


In [2]:
# Two sessions are created up front: a regular SageMaker session and a
# PipelineSession (the latter is handed to the Model built for the ModelStep).
sagemaker_session = sagemaker.Session()
pipeline_session = PipelineSession()

# Look up the caller's AWS account ID through STS and derive the execution
# role ARN from it instead of hard-coding the account number.
sts_client = boto3.client("sts")
caller_identity = sts_client.get_caller_identity()
account = caller_identity.get("Account")
role = (
    f"arn:aws:iam::{account}"
    ":role/service-role/AmazonSageMakerServiceCatalogProductsUseRole"
)

In [3]:
# Pipeline parameters: values that can be overridden per execution. All three
# are registered on the Pipeline object further down.
years_to_filter = ParameterString(
    name="Historical_Years",
    default_value="30",
)
# NOTE(review): no step in the visible notebook references this parameter;
# the steps hard-code "ml.m5.large" instead.
train_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.large",
)
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="Approved",
)

# Step-caching settings shared by the processing and training steps
# (caching enabled, expire_after="1d").
cache_config = CacheConfig(enable_caching=True, expire_after="1d")

## Data Ingestion

In [4]:
# Ingestion runs as a SageMaker Processing job on the scikit-learn container.
data_ingestion_processor = SKLearnProcessor(
    role=role,
    framework_version="1.0-1",
    command=["python3"],
    instance_type="ml.m5.large",
    instance_count=1,
)

# Whatever the script writes to /opt/ml/processing/output is published under
# the "snp500" output name; the preprocessing step looks it up by that name.
snp500_output = sagemaker.processing.ProcessingOutput(
    output_name="snp500",
    source="/opt/ml/processing/output",
    destination="s3://aws-portfolio-projects/snp500-data/input_data/",
)

# Pipeline step wrapping the ingestion script. Per the original author's note,
# the script fetches its data from the Yahoo Finance API.
step_data_ingestion = ProcessingStep(
    name="DataIngestion",
    processor=data_ingestion_processor,
    code="training_scripts/data_ingestion.py",
    inputs=[],
    outputs=[snp500_output],
    # The Historical_Years pipeline parameter is forwarded on the command line.
    job_arguments=["--years-to-filter", years_to_filter],
    cache_config=cache_config,
)

## Data Processing Pipeline

In [5]:
# Preprocessing also runs on the scikit-learn 1.0-1 processing container.
data_preprocessor = SKLearnProcessor(
    role=role,
    framework_version="1.0-1",
    instance_count=1,
    instance_type="ml.m5.large",
)

# Input: the S3 location of the ingestion step's "snp500" output, made
# available to the script under /opt/ml/processing/input.
raw_data_input = sagemaker.processing.ProcessingInput(
    source=step_data_ingestion.properties.ProcessingOutputConfig.Outputs[
        "snp500"
    ].S3Output.S3Uri,
    destination="/opt/ml/processing/input",
)

# Output: the training data the script writes to /opt/ml/processing/output/train.
train_data_output = sagemaker.processing.ProcessingOutput(
    output_name="train_data",
    source="/opt/ml/processing/output/train",
    destination="s3://aws-portfolio-projects/snp500-data/train_data/",
)

# Command-line arguments for data_processing.py; the paths mirror the
# input/output locations declared above.
preprocessing_arguments = [
    "--input_path",
    "/opt/ml/processing/input/snp500.csv",
    "--output_dir",
    "/opt/ml/processing/output/train",
]

# Data processing step
step_data_processing = ProcessingStep(
    name="DataPreprocessing",
    processor=data_preprocessor,
    code="training_scripts/data_processing.py",
    inputs=[raw_data_input],
    outputs=[train_data_output],
    job_arguments=preprocessing_arguments,
    cache_config=cache_config,
)

## Model Training Pipeline

In [6]:
from sagemaker.xgboost.estimator import XGBoost

# Script-mode XGBoost estimator: training_scripts/train.py runs in the
# XGBoost 1.5-1 container on a single ml.m5.large instance, and model
# artifacts are written below the given S3 output path.
xgboost_estimator = XGBoost(
    entry_point="training_scripts/train.py",
    framework_version="1.5-1",
    py_version="py3",
    role=role,
    instance_type="ml.m5.large",
    instance_count=1,
    output_path="s3://aws-portfolio-projects/snp500-data/model_artifacts/",
    base_job_name="xgboost-stockmarket-training-job",
    disable_profiler=True,
)

# The "train" channel reads the CSV data published by the preprocessing step.
train_channel = sagemaker.inputs.TrainingInput(
    s3_data=step_data_processing.properties.ProcessingOutputConfig.Outputs[
        "train_data"
    ].S3Output.S3Uri,
    content_type="text/csv",
)

# Model training step
step_train = TrainingStep(
    name="ModelTraining",
    estimator=xgboost_estimator,
    inputs={"train": train_channel},
    cache_config=cache_config,
)

## Model Evaluation

In [7]:
from sagemaker.workflow.properties import PropertyFile
from sagemaker.processing import ScriptProcessor

# The evaluation script runs inside the XGBoost 1.5-1 image, resolved for the
# region of the current session.
evaluation_image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sagemaker_session.boto_region_name,
    version="1.5-1",
)

evaluation_processor = ScriptProcessor(
    image_uri=evaluation_image_uri,
    command=["python3"],
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    base_job_name="model-evaluation",
    sagemaker_session=sagemaker_session,
)

# Property file over evaluation.json in the "evaluation" output, so that
# later steps can read values from the report.
evaluation_report = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json",
)

# Inputs: the trained model artifact and the preprocessed training data.
model_artifact_input = sagemaker.processing.ProcessingInput(
    source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    destination="/opt/ml/processing/model",
)
evaluation_data_input = sagemaker.processing.ProcessingInput(
    source=step_data_processing.properties.ProcessingOutputConfig.Outputs[
        "train_data"
    ].S3Output.S3Uri,
    destination="/opt/ml/processing/input",
)

# Output: whatever the script writes to /opt/ml/processing/evaluation.
evaluation_output = sagemaker.processing.ProcessingOutput(
    output_name="evaluation",
    source="/opt/ml/processing/evaluation",
    destination="s3://aws-portfolio-projects/snp500-data/evaluation_results/",
)

# Command-line arguments for evaluate_model.py; the paths mirror the
# input/output locations declared above.
evaluation_arguments = [
    "--input-path",
    "/opt/ml/processing/input/train.csv",
    "--model-path",
    "/opt/ml/processing/model",
    "--output-path",
    "/opt/ml/processing/evaluation",
]

# Evaluation step
step_evaluation = ProcessingStep(
    name="ModelEvaluation",
    processor=evaluation_processor,
    code="training_scripts/evaluate_model.py",
    inputs=[model_artifact_input, evaluation_data_input],
    outputs=[evaluation_output],
    property_files=[evaluation_report],
    job_arguments=evaluation_arguments,
    cache_config=cache_config,
)

## Model Registration

In [8]:
from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.workflow.step_collections import RegisterModel

# S3 prefix that the evaluation step uploads its (first and only) output to.
evaluation_output_uri = step_evaluation.arguments["ProcessingOutputConfig"][
    "Outputs"
][0]["S3Output"]["S3Uri"]

# Attach the evaluation report to the registered model package.
# The configured destination ends with "/", so strip it before joining;
# otherwise the URI contains "//" and points at a key that does not exist.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri="{}/evaluation.json".format(evaluation_output_uri.rstrip("/")),
        content_type="application/json",
    )
)

# Register the trained model in the "StockPredictionModels" package group.
step_register = RegisterModel(
    name="RegisterModel",
    estimator=xgboost_estimator,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.m5.large"],
    transform_instances=["ml.m5.large"],
    model_package_group_name="StockPredictionModels",
    # Use the ModelApprovalStatus pipeline parameter (default "Approved") so
    # the status can be overridden per execution instead of being fixed here.
    approval_status=model_approval_status,
    model_metrics=model_metrics,
)

## Model Deployment

In [14]:
from sagemaker.model import Model
from sagemaker.xgboost.model import XGBoostPredictor
from sagemaker.workflow.steps import CreateModelStep  # NOTE(review): not used in this cell
from sagemaker.workflow.model_step import ModelStep

# Serving image: XGBoost 1.5-1, resolved for the pipeline session's region
# and the ml.m5.large instance type.
inference_image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=pipeline_session.boto_region_name,
    version="1.5-1",
    instance_type="ml.m5.large",
)

# Model built from the training step's artifact, with a custom inference
# script. It is bound to the PipelineSession because its create() call below
# is passed to a ModelStep as step arguments.
model = Model(
    image_uri=inference_image_uri,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    entry_point="training_scripts/inference.py",
    predictor_cls=XGBoostPredictor,
    role=role,
    sagemaker_session=pipeline_session,
)

# Create the ModelStep
create_model_args = model.create(instance_type="ml.m5.large")
step_create_model = ModelStep(
    name="StockMarketPrediction",
    step_args=create_model_args,
)

In [15]:
# Custom Lambda step: deploys the model created by the pipeline to an endpoint.
import time
from sagemaker.workflow.lambda_step import (
    LambdaStep,
    LambdaOutput,
    LambdaOutputTypeEnum,
)
from sagemaker.lambda_helper import Lambda


# Build the Lambda execution role ARN from the caller's account (resolved via
# STS earlier in the notebook) rather than hard-coding an account ID, so the
# notebook is portable and no account number is committed with it.
lambda_role = f"arn:aws:iam::{account}:role/sagemaker-pipeline-lambda-role"

# A timestamp suffix keeps per-run resource names unique.
current_time = time.strftime("%m-%d-%H-%M-%S", time.localtime())
# NOTE(review): model_name is not referenced anywhere in the visible notebook.
model_name = "stock-market-model" + current_time
endpoint_config_name = "stock-prediction-endpoint-config-" + current_time
# The endpoint name is fixed (not timestamped), so every run targets the same
# endpoint name.
endpoint_name = "stock-market-prediction-endpoint"

function_name = "sagemaker-lambda-step-endpoint-deploy" + current_time

# Lambda helper class can be used to create the Lambda function
func = Lambda(
    function_name=function_name,
    execution_role_arn=lambda_role,
    script="training_scripts/lambda_helper.py",
    handler="lambda_helper.lambda_handler",
)

# Values the step exposes from the Lambda's response.
# NOTE(review): these keys must match what lambda_helper.lambda_handler
# returns; that script is not visible here, so confirm "other_key" exists.
output_param_1 = LambdaOutput(
    output_name="statusCode", output_type=LambdaOutputTypeEnum.String
)
output_param_2 = LambdaOutput(
    output_name="body", output_type=LambdaOutputTypeEnum.String
)
output_param_3 = LambdaOutput(
    output_name="other_key", output_type=LambdaOutputTypeEnum.String
)

step_deploy_lambda = LambdaStep(
    name="DeployStep",
    lambda_func=func,
    inputs={
        # Resolved at execution time from the model-creation step.
        "model_name": step_create_model.properties.ModelName,
        "endpoint_config_name": endpoint_config_name,
        "endpoint_name": endpoint_name,
    },
    outputs=[output_param_1, output_param_2, output_param_3],
)

## Deployment Condition

In [16]:
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
# JsonGet lives in sagemaker.workflow.functions; the condition_step variant is
# the deprecated location.
from sagemaker.workflow.functions import JsonGet

# Gate: proceed only when the "precision" value read from the evaluation
# report (evaluation.json) is at least 0.5.
cond_gte = ConditionGreaterThanOrEqualTo(
    left=JsonGet(
        step_name=step_evaluation.name,
        property_file=evaluation_report,
        json_path="precision",
    ),
    right=0.5,
)

step_conditional_register = ConditionStep(
    name="stockmarket_precision_cond",
    conditions=[cond_gte],
    # step_create_model must be part of the pipeline: the deploy Lambda step
    # reads step_create_model.properties.ModelName, and a property reference
    # to a step that is not in the pipeline cannot be resolved.
    if_steps=[step_register, step_create_model, step_deploy_lambda],
    else_steps=[],
)

## Execute the pipeline

In [17]:
# Top-level steps in declaration order; steps nested inside the condition
# step are carried by step_conditional_register itself.
pipeline_steps = [
    step_data_ingestion,
    step_data_processing,
    step_train,
    step_evaluation,
    step_conditional_register,
]

# Parameters that can be overridden when an execution is started.
pipeline_parameters = [years_to_filter, train_instance_type, model_approval_status]

# Define the pipeline
pipeline = Pipeline(
    name="StockTrainingPipeline",
    parameters=pipeline_parameters,
    steps=pipeline_steps,
    sagemaker_session=sagemaker_session,
)

# Create or update the pipeline definition, start an execution and block
# until that execution finishes.
pipeline.upsert(role_arn=role)
execution = pipeline.start()
execution.wait()

## Schedule Training Pipeline

In [1]:
import pytz
from datetime import datetime, timedelta
from sagemaker.workflow.triggers import PipelineSchedule

# US Central Time (UTC-6 in winter, UTC-5 while daylight saving time is active)
central_tz = pytz.timezone('US/Central')

# First run: 8 AM US Central Time on the next Sunday (strictly after today).
now = datetime.now(central_tz)
# weekday(): Monday == 0 ... Sunday == 6, so (6 - weekday) % 7 is the number of
# days until Sunday. `or 7` moves a Sunday "today" to the following week.
days_until_sunday = (6 - now.weekday()) % 7 or 7
next_sunday = now + timedelta(days=days_until_sunday)
# pytz's localize() only accepts naive datetimes, so drop tzinfo first and let
# localize() choose the correct UTC offset for that calendar date.
schedule_time = central_tz.localize(
    next_sunday.replace(hour=8, minute=0, second=0, microsecond=0, tzinfo=None)
)

# Convert to UTC
schedule_time_utc = schedule_time.astimezone(pytz.utc)

# Weekly schedule: every Sunday at 14:00 UTC, i.e. 8 AM US Central Standard
# Time. A schedule takes exactly one of at/rate/cron, so only `cron` is set;
# `start_date` marks when the schedule becomes active.
# Cron fields: minutes hours day-of-month month day-of-week year (1 == Sunday).
# NOTE(review): the expression is fixed in UTC, so during daylight saving time
# the run happens at 9 AM Central — confirm this is acceptable.
pipeline_schedule = PipelineSchedule(
    name='training-pipeline-weekly-schedule',
    start_date=schedule_time_utc,
    cron='0 14 ? * 1 *',
    enabled=True,
)