In [1]:
# Do the imports at the top

import random
from zenml import ExternalArtifact, pipeline 
from zenml.client import Client
from zenml.logger import get_logger
from uuid import UUID

import os
from typing import Optional, List

from zenml import pipeline

from pipelines import feature_engineering

from steps import (
    data_loader,
    data_preprocessor,
    data_splitter,
    model_evaluator,
    model_trainer,
    inference_predict,
    inference_preprocessor
)

# Logger for this notebook, named after the current module.
logger = get_logger(__name__)

# ZenML client instance; not referenced by any other cell visible in this notebook section.
client = Client()

[1;35mNumExpr defaulting to 8 threads.[0m


# Start local with a simple training pipeline

First, let's run our training pipeline locally.

<img src="_assets/default_stack.png" alt="Drawing" style="width: 800px;"/>

In [2]:
# To start with, we use the default stack:
# run `zenml init` in the working directory, then make `default` the active stack.
!zenml init
!zenml stack set default

# We also need to connect to a remote ZenML instance.
# The command is left commented out; supply the server URL before enabling it.
# !zenml connect --url ...

[1;35mNumExpr defaulting to 8 threads.[0m
[?25l[2;36mFound existing ZenML repository at path [0m
[2;32m'/home/apenner/PycharmProjects/zenml-projects/stack-showcase'[0m[2;36m.[0m
[2;32m⠋[0m[2;36m Initializing ZenML repository at [0m
[2;36m/home/apenner/PycharmProjects/zenml-projects/stack-showcase.[0m
[2K[1A[2K[1A[2K[32m⠋[0m Initializing ZenML repository at 
/home/apenner/PycharmProjects/zenml-projects/stack-showcase.

[1A[2K[1A[2K[1A[2K[1;35mNumExpr defaulting to 8 threads.[0m
[2K[2;36mActive repository stack set to: [0m[2;32m'default'[0m.
[2K[32m⠙[0m Setting the repository active stack to 'default'...t'...[0m
[1A[2K

In [3]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from zenml import step
from zenml.logger import get_logger

logger = get_logger(__name__)


@step
def data_loader() -> pd.DataFrame:
    """Load the scikit-learn breast cancer dataset as a single DataFrame.

    Note:
        This definition shadows the ``data_loader`` step imported from
        ``steps`` in the first cell of the notebook.

    Returns:
        The ``frame`` attribute of ``load_breast_cancer(as_frame=True)``,
        re-indexed with ``reset_index(drop=True)``.
    """
    bunch = load_breast_cancer(as_frame=True)
    # Re-index into a new frame rather than mutating the loaded one in place,
    # and keep the Bunch and the DataFrame under separate names.
    dataset = bunch.frame.reset_index(drop=True)
    logger.info(f"Dataset with {len(dataset)} records loaded!")
    return dataset


In [None]:
# Call the step defined in the previous cell on its own as a quick check.
data_loader()

In [4]:
from zenml.config import DockerSettings

# Docker settings attached to the pipeline below; `pyarrow` is listed as an
# extra requirement for the images built for it.
docker_settings = DockerSettings(requirements=["pyarrow"])

@pipeline(settings={"docker": docker_settings})
def training(
    train_dataset_id: Optional[UUID] = None,
    test_dataset_id: Optional[UUID] = None,
    min_train_accuracy: float = 0.0,
    min_test_accuracy: float = 0.0,
):
    """Run feature engineering, train a model and evaluate it.

    Args:
        train_dataset_id: Optional UUID of a training dataset. Accepted for
            interface compatibility; not referenced in this body.
        test_dataset_id: Optional UUID of a test dataset. Accepted for
            interface compatibility; not referenced in this body.
        min_train_accuracy: Forwarded unchanged to ``model_evaluator``.
        min_test_accuracy: Forwarded unchanged to ``model_evaluator``.
    """
    # The feature engineering pipeline yields the train and test datasets.
    train_split, test_split = feature_engineering()

    # Only the training split is handed to the trainer.
    trained_model = model_trainer(dataset_trn=train_split)

    # The evaluator receives the model, both splits and the accuracy thresholds.
    model_evaluator(
        model=trained_model,
        dataset_trn=train_split,
        dataset_tst=test_split,
        min_train_accuracy=min_train_accuracy,
        min_test_accuracy=min_test_accuracy,
    )


In [None]:
# Pipeline-level options: caching off, run configuration read from configs/training.yaml.
pipeline_args = {
    "enable_cache": False,
    "config_path": os.path.join("configs", "training.yaml"),
}
fe_t_configured = training.with_options(**pipeline_args)

In [None]:
# Run the training pipeline with the options configured in the previous cell.
fe_t_configured()

# Let's outsource some compute to Sagemaker!

Let's farm out some compute to AWS: the training step will run as a SageMaker training job on an instance type with a fixed number of CPUs and amount of memory.














<img src="_assets/local_sagmaker_so_stack.png" alt="Drawing" style="width: 800px;"/>

In [5]:
# This pip installs the requirements of the `aws` and `s3` integrations locally
!zenml integration install aws s3 -y

# This changes the active stack (per its name: a local stack with a SageMaker step operator)
!zenml stack set local-sagemaker-step-operator-stack

[1;35mNumExpr defaulting to 8 threads.[0m
[2K[32m⠏[0m Installing integrations...Collecting attrs<23,>=20.3.0
  Using cached attrs-22.2.0-py3-none-any.whl (60 kB)
[2K[32m⠋[0m Installing integrations...Collecting argparse
  Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)
[2K[32m⠴[0m Installing integrations...Installing collected packages: argparse, attrs
  Attempting uninstall: attrs
    Found existing installation: attrs 23.1.0
[2K[32m⠦[0m Installing integrations...    Uninstalling attrs-23.1.0:
      Successfully uninstalled attrs-23.1.0
[2K[32m⠧[0m Installing integrations...[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behavior is the source of the following dependency conflicts.
cattrs 23.2.3 requires attrs>=23.1.0, but you have attrs 22.2.0 which is incompatible.[0m[31m
[0mSuccessfully installed argparse-1.4.0 attrs-22.2.0
You should consider upgrading via the '/home/apenner/.pyenv/ve

In [6]:
# Step-level options: send the trainer step to the "sagemaker-eu" step
# operator and request an ml.m5.large instance. The same could be set in YAML.
step_args = {
    "step_operator": "sagemaker-eu",
    "settings": {
        "step_operator.sagemaker": {
            "estimator_args": {"instance_type": "ml.m5.large"},
        },
    },
}

# Rebind the notebook-level name to the configured step.
model_trainer = model_trainer.with_options(**step_args)

# Pipeline-level options are the same as for the local run.
pipeline_args = {
    "enable_cache": False,
    "config_path": os.path.join("configs", "training.yaml"),
}
fe_t_configured = training.with_options(**pipeline_args)

In [None]:
# Run the training pipeline again with the step and pipeline options set in the previous cell.
fe_t_configured()

# Let's run the entire pipeline on Airflow now


<img src="_assets/airflow_stack.png" alt="Drawing" style="width: 800px;"/>


In [7]:
!zenml stack set sagemaker-airflow-stack
!zenml integration install airflow -y
!pip install apache-airflow-providers-docker apache-airflow~=2.5.0
!zenml stack up

[1;35mNumExpr defaulting to 8 threads.[0m
[2K[2;36mActive repository stack set to: [0m[2;32m'sagemaker-airflow-stack'[0m.
[2K[32m⠴[0m Setting the repository active stack to 'sagemaker-airflow-stack'...k'...[0m
[1A[2K[1;35mNumExpr defaulting to 8 threads.[0m
[2K[32m⠏[0m Installing integrations.....Collecting apache-airflow~=2.4.0
  Using cached apache_airflow-2.4.3-py3-none-any.whl (6.5 MB)
Collecting attrs>=22.1.0
[2K[32m⠸[0m Installing integrations...  Using cached attrs-23.1.0-py3-none-any.whl (61 kB)
[2K[32m⠇[0m Installing integrations...Installing collected packages: attrs, apache-airflow
  Attempting uninstall: attrs
    Found existing installation: attrs 22.2.0
    Uninstalling attrs-22.2.0:
      Successfully uninstalled attrs-22.2.0
[2K[32m⠏[0m Installing integrations...  Attempting uninstall: apache-airflow
    Found existing installation: apache-airflow 2.5.3
[2K[32m⠸[0m Installing integrations...    Uninstalling apache-airflow-2.5.3:
      Succe

In [8]:
# Run the configured training pipeline with `sagemaker-airflow-stack` active (set in the previous cell).
fe_t_configured()

[1;35mInitiating a new run for the pipeline: [0m[1;36mtraining[1;35m.[0m
[1;35mReloading configuration file /home/apenner/PycharmProjects/zenml-projects/stack-showcase/.zen/config.yaml[0m
[1;35mReusing registered version: [0m[1;36m(version: 8)[1;35m.[0m
[1;35mNew model version [0m[1;36m11[1;35m was created.[0m
[1;35mBuilding Docker image(s) for pipeline [0m[1;36mtraining[1;35m.[0m
[1;35mBuilding Docker image [0m[1;36m715803424590.dkr.ecr.eu-central-1.amazonaws.com/zenml:training-orchestrator[1;35m.[0m
[1;35m- Including user-defined requirements: [0m[1;36mpyarrow[1;35m[0m
[1;35m- Including integration requirements: [0m[1;36mapache-airflow~=2.4.0[1;35m, [0m[1;36maws-profile-manager[1;35m, [0m[1;36mboto3<=1.26.76[1;35m, [0m[1;36mkubernetes[1;35m, [0m[1;36ms3fs>2022.3.0,<=2023.4.0[1;35m, [0m[1;36msagemaker==2.117.0[1;35m, [0m[1;36mscikit-learn<1.3[1;35m[0m
[33mCould not import Azure service connector: No module named 'azure.identity'.

# Switch to full Sagemaker Stack

![Sagemaker local stack](_assets/sagemaker_stack.png)


In [None]:
# Make `sagemaker-stack` the active stack for the next run.
!zenml stack set sagemaker-stack

In [None]:
# Run the configured training pipeline once more with `sagemaker-stack` active.
fe_t_configured()