In [3]:
# Do the imports at the top

import random
from zenml import ExternalArtifact, pipeline 
from zenml.client import Client
from zenml.logger import get_logger
from uuid import UUID

import os
from typing import Optional, List

from zenml import pipeline

from pipelines import feature_engineering

from steps import (
    data_loader,
    data_preprocessor,
    data_splitter,
    model_evaluator,
    model_trainer,
    inference_predict,
    inference_preprocessor
)

# Logger obtained from ZenML's logging helper, named after this module.
logger = get_logger(__name__)

# ZenML client instance, created once for the notebook session.
client = Client()

[1;35mNumExpr defaulting to 8 threads.[0m


# Start local with a simple training pipeline

First, let's run our training pipeline locally.

<img src="_assets/default_stack.png" alt="Drawing" style="width: 800px;"/>

In [2]:
# To start with, we use the default stack
# `zenml init` initializes a ZenML repository in the current working directory.
!zenml init
# Make `default` the active stack for this repository.
!zenml stack set default

[1;35mNumExpr defaulting to 8 threads.[0m
[?25l[32m⠋[0m Initializing ZenML repository at 
/home/apenner/PycharmProjects/zenml-projects/stack-showcase.
[2K[1A[2K[1A[2K[32m⠙[0m Initializing ZenML repository at 
/home/apenner/PycharmProjects/zenml-projects/stack-showcase.
[2K[1A[2K[1A[2K[32m⠹[0m Initializing ZenML repository at 
/home/apenner/PycharmProjects/zenml-projects/stack-showcase.
[2K[1A[2K[1A[2K[32m⠸[0m Initializing ZenML repository at 
/home/apenner/PycharmProjects/zenml-projects/stack-showcase.
[1;35mSetting the repo active workspace to 'default'.[0m
[33mSetting the repo active stack to default.[0m
[2K[1A[2K[1A[2K[32m⠼[0m Initializing ZenML repository at 
/home/apenner/PycharmProjects/zenml-projects/stack-showcase.
[2K[1A[2K[1A[2K[32m⠴[0m Initializing ZenML repository at 
/home/apenner/PycharmProjects/zenml-projects/stack-showcase.
[2K[1A[2K[1A[2K[2;36mZenML repository initialized at [0m
[2;35m/home/apenner/PycharmProjects/zen

In [3]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from zenml import step
from zenml.logger import get_logger

logger = get_logger(__name__)


@step
def data_loader() -> pd.DataFrame:
    """Load the scikit-learn breast cancer dataset as a single DataFrame.

    Returns:
        The dataset's ``frame`` (feature columns plus the ``target`` column)
        re-indexed with a fresh default integer index.
    """
    bunch = load_breast_cancer(as_frame=True)
    # `reset_index` returns a new frame here, so the DataFrame owned by the
    # sklearn Bunch is not mutated in place.
    dataset = bunch.frame.reset_index(drop=True)
    logger.info(f"Dataset with {len(dataset)} records loaded!")
    return dataset


In [4]:
# Call the step directly to preview the DataFrame it returns.
data_loader()

[1;35mDataset with 569 records loaded![0m
[1;35mDataset with 569 records loaded![0m


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [6]:
@pipeline
def training(
    train_dataset_id: Optional[UUID] = None,
    test_dataset_id: Optional[UUID] = None,
    min_train_accuracy: float = 0.0,
    min_test_accuracy: float = 0.0,
):
    """Model training pipeline.

    Trains a model on the train split and evaluates it on both splits.

    Args:
        train_dataset_id: ID of an existing train dataset artifact. Used only
            when `test_dataset_id` is given as well.
        test_dataset_id: ID of an existing test dataset artifact. Used only
            when `train_dataset_id` is given as well.
        min_train_accuracy: Forwarded to `model_evaluator` as the minimum
            accuracy threshold on the train set.
        min_test_accuracy: Forwarded to `model_evaluator` as the minimum
            accuracy threshold on the test set.
    """
    if train_dataset_id is None or test_dataset_id is None:
        # No complete pair of dataset IDs supplied: produce both splits by
        # executing the feature engineering pipeline.
        dataset_trn, dataset_tst = feature_engineering()
    else:
        # Reuse previously produced datasets instead of recomputing them.
        dataset_trn = ExternalArtifact(id=train_dataset_id)
        dataset_tst = ExternalArtifact(id=test_dataset_id)

    model = model_trainer(
        dataset_trn=dataset_trn,
    )

    model_evaluator(
        model=model,
        dataset_trn=dataset_trn,
        dataset_tst=dataset_tst,
        min_train_accuracy=min_train_accuracy,
        min_test_accuracy=min_test_accuracy,
    )


In [6]:
# Pipeline-level options: caching off, remaining settings read from the YAML
# file under configs/.
pipeline_args = {
    "enable_cache": False,
    "config_path": os.path.join("configs", "training.yaml"),
}
fe_t_configured = training.with_options(**pipeline_args)

In [7]:
# Run the configured training pipeline on the currently active stack.
fe_t_configured()

[1;35mInitiating a new run for the pipeline: [0m[1;36mtraining[1;35m.[0m
[1;35mRegistered new version: [0m[1;36m(version 6)[1;35m.[0m
[1;35mNew model version [0m[1;36m68[1;35m was created.[0m
[1;35mExecuting a new run.[0m
[1;35mCaching is disabled by default for [0m[1;36mtraining[1;35m.[0m
[1;35mUsing user: [0m[1;36malexej@zenml.io[1;35m[0m
[1;35mUsing stack: [0m[1;36mdefault[1;35m[0m
[1;35m  artifact_store: [0m[1;36mdefault[1;35m[0m
[1;35m  orchestrator: [0m[1;36mdefault[1;35m[0m
[1;35mStep [0m[1;36mdata_loader[1;35m has started.[0m
[1;35mDataset with 541 records loaded![0m
[1;35mStep [0m[1;36mdata_loader[1;35m has finished in [0m[1;36m7.496s[1;35m.[0m
[1;35mStep [0m[1;36mdata_splitter[1;35m has started.[0m
[1;35mStep [0m[1;36mdata_splitter[1;35m has finished in [0m[1;36m10.536s[1;35m.[0m
[1;35mStep [0m[1;36mdata_preprocessor[1;35m has started.[0m
[1;35mStep [0m[1;36mdata_preprocessor[1;35m has finished in 

# Let's outsource some compute to Sagemaker!

Let's farm out some compute to AWS by running the model training step as a SageMaker training job on an instance type with the CPU and memory we need.

<img src="_assets/local_sagmaker_so_stack.png" alt="Drawing" style="width: 800px;"/>

In [4]:
# Install the requirements of the ZenML `aws` and `s3` integrations into the local environment
!zenml integration install aws s3 -y

# Make `local-sagemaker-step-operator-stack` the active stack
!zenml stack set local-sagemaker-step-operator-stack

[1;35mNumExpr defaulting to 8 threads.[0m
[2K[32m⠏[0m Installing integrations...Collecting argparse
  Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)
[2K[32m⠹[0m Installing integrations...Installing collected packages: argparse
[2K[32m⠸[0m Installing integrations...Successfully installed argparse-1.4.0
You should consider upgrading via the '/home/apenner/.pyenv/versions/3.9.13/envs/demo/bin/python -m pip install --upgrade pip' command.[0m[33m
[2K[32m⠧[0m Installing integrations...
[1A[2K[1;35mNumExpr defaulting to 8 threads.[0m
[?25l[32m⠋[0m Setting the repository active stack to 
[2K[1A[2K[32m⠙[0m Setting the repository active stack to 
[2K[1A[2K[32m⠹[0m Setting the repository active stack to 
[2K[1A[2K[32m⠸[0m Setting the repository active stack to 
[2K[1A[2K[32m⠼[0m Setting the repository active stack to 
[2K[1A[2K[32m⠴[0m Setting the repository active stack to 
[2K[1A[2K[2;36mActive repository stack set to: [0m[2;32m'lo

In [8]:
# Route the step through the "sagemaker-eu" step operator.
# M5 Large is what we need for this big data!
step_args = {
    "step_operator": "sagemaker-eu",
    "settings": {
        "step_operator.sagemaker": {
            "estimator_args": {"instance_type": "ml.m5.large"},
        },
    },
}

# Update the step. We can also do this in YAML
model_trainer = model_trainer.with_options(**step_args)

# Same pipeline-level options as for the local run.
pipeline_args = {
    "enable_cache": False,
    "config_path": os.path.join("configs", "training.yaml"),
}
fe_t_configured = training.with_options(**pipeline_args)

In [None]:
# Run the training pipeline again; `model_trainer` now carries the step-operator options set in the previous cell.
fe_t_configured()

[1;35mInitiating a new run for the pipeline: [0m[1;36mtraining[1;35m.[0m
[1;35mReloading configuration file /home/apenner/PycharmProjects/zenml-projects/stack-showcase/.zen/config.yaml[0m
[1;35mReusing registered version: [0m[1;36m(version: 6)[1;35m.[0m
[1;35mNew model version [0m[1;36m69[1;35m was created.[0m
[1;35mBuilding Docker image(s) for pipeline [0m[1;36mtraining[1;35m.[0m
[1;35mBuilding Docker image [0m[1;36m715803424590.dkr.ecr.eu-central-1.amazonaws.com/zenml:training-model_trainer-sagemaker_step_operator[1;35m.[0m
[1;35m- Including integration requirements: [0m[1;36maws-profile-manager[1;35m, [0m[1;36mboto3<=1.26.76[1;35m, [0m[1;36mkubernetes[1;35m, [0m[1;36ms3fs>2022.3.0,<=2023.4.0[1;35m, [0m[1;36msagemaker==2.117.0[1;35m, [0m[1;36mscikit-learn<1.3[1;35m[0m
[33mCould not import Azure service connector: No module named 'azure.identity'.[0m
[1;35mStep 1/9 : FROM zenmldocker/zenml:0.50.0-py3.9[0m
[1;35mStep 2/9 : WORKDIR /ap

# Let's run the entire pipeline on Airflow now


<img src="_assets/airflow_stack.png" alt="Drawing" style="width: 800px;"/>


In [None]:
# Install the Airflow integration and the extra Airflow packages first.
# `%pip` (rather than `!pip`) installs into the environment of the running
# kernel; the version specifier is quoted so the shell passes it through as-is.
!zenml integration install airflow -y
%pip install apache-airflow-providers-docker "apache-airflow~=2.5.0"

# Then activate the Airflow stack and bring it up.
!zenml stack set sagemaker-airflow-stack
!zenml stack up

In [None]:
# Run the same configured pipeline; the active stack was switched to `sagemaker-airflow-stack` in the previous cell.
fe_t_configured()

# Switch to full Sagemaker Stack

![Full SageMaker stack](_assets/sagemaker_stack.png)


In [None]:
# Make `sagemaker-stack` the active stack
!zenml stack set sagemaker-stack

In [None]:
# Run the same configured pipeline; the active stack was switched to `sagemaker-stack` in the previous cell.
fe_t_configured()