In [4]:
# Initialize (or re-use) the ZenML repository in the working directory,
# then make the `default` stack the active one for the cells that follow.
!zenml init
!zenml stack set default

[1;35mNumExpr defaulting to 8 threads.[0m
[?25l[2;36mFound existing ZenML repository at path [0m
[2;32m'/home/apenner/PycharmProjects/template-starter/template'[0m[2;36m.[0m
[2;32m⠋[0m[2;36m Initializing ZenML repository at [0m
[2;36m/home/apenner/PycharmProjects/template-starter/template.[0m
[2K[1A[2K[1A[2K[32m⠋[0m Initializing ZenML repository at 
/home/apenner/PycharmProjects/template-starter/template.

[1A[2K[1A[2K[1A[2K[1;35mNumExpr defaulting to 8 threads.[0m
[2K[2;36mActive repository stack set to: [0m[2;32m'default'[0m.
[2K[32m⠙[0m Setting the repository active stack to 'default'...t'...[0m
[1A[2K

In [5]:
# Do the imports at the top

import random
from zenml import ExternalArtifact, pipeline 
from zenml.client import Client
from zenml.logger import get_logger
from uuid import UUID

import os
from typing import Optional, List

from zenml import pipeline

from steps import (
    data_loader,
    data_preprocessor,
    data_splitter,
    model_evaluator,
    model_trainer,
    inference_predict,
    inference_preprocessor
)

# Logger for this notebook, created through ZenML's `get_logger` helper.
logger = get_logger(__name__)

# ZenML client instance.
# NOTE(review): not referenced by any cell visible in this notebook section.
client = Client()

# Run the feature engineering pipeline

In [7]:
@pipeline
def feature_engineering(
    test_size: float = 0.2,
    drop_na: Optional[bool] = None,
    normalize: Optional[bool] = None,
    drop_columns: Optional[List[str]] = None,
    target: Optional[str] = "target",
):
    """Load the data, split it into train/test sets and preprocess both.

    The pipeline chains three steps: `data_loader` -> `data_splitter` ->
    `data_preprocessor`. Each step consumes the output of the previous one.

    Args:
        test_size: Size of holdout set for training 0.0..1.0; handed to
            `data_splitter`.
        drop_na: If `True` NA values will be removed from dataset; handed
            to `data_preprocessor`.
        normalize: If `True` dataset will be normalized with MinMaxScaler;
            handed to `data_preprocessor`.
        drop_columns: List of columns to drop from dataset; handed to
            `data_preprocessor`.
        target: Name of target column in dataset; handed to both
            `data_loader` and `data_preprocessor`.

    Returns:
        The train and test datasets as produced by `data_preprocessor`.
    """
    ### TEMPLATE EXAMPLE - REPLACE WITH YOUR OWN STEPS ###
    # A fresh seed in [0, 100] is drawn every time the pipeline is composed,
    # so the loader's random_state differs between runs.
    loader_seed = random.randint(0, 100)
    loaded_data = data_loader(random_state=loader_seed, target=target)

    split_trn, split_tst = data_splitter(dataset=loaded_data, test_size=test_size)

    # The third output of the preprocessor is not needed here.
    prepared_trn, prepared_tst, _ = data_preprocessor(
        dataset_trn=split_trn,
        dataset_tst=split_tst,
        drop_na=drop_na,
        normalize=normalize,
        drop_columns=drop_columns,
        target=target,
    )

    return prepared_trn, prepared_tst

In [8]:
# Attach the YAML configuration for the feature engineering pipeline.
pipeline_args = {
    "config_path": os.path.join("configs", "feature_engineering.yaml"),
}
fe_p_configured = feature_engineering.with_options(**pipeline_args)

In [9]:
# Trigger a run of the configured feature engineering pipeline and keep
# whatever the call returns for later inspection.
latest_run = fe_p_configured()

[1;35mInitiating a new run for the pipeline: [0m[1;36mfeature_engineering[1;35m.[0m
[1;35mReloading configuration file /home/apenner/PycharmProjects/template-starter/template/.zen/config.yaml[0m
[1;35mReusing registered version: [0m[1;36m(version: 1)[1;35m.[0m
[1;35mNew model version [0m[1;36m38[1;35m was created.[0m
[1;35mExecuting a new run.[0m
[1;35mUsing user: [0m[1;36malexej@zenml.io[1;35m[0m
[1;35mUsing stack: [0m[1;36mdefault[1;35m[0m
[1;35m  artifact_store: [0m[1;36mdefault[1;35m[0m
[1;35m  orchestrator: [0m[1;36mdefault[1;35m[0m
[1;35mStep [0m[1;36mdata_loader[1;35m has started.[0m
[1;35mDataset with 541 records loaded![0m
[1;35mStep [0m[1;36mdata_loader[1;35m has finished in [0m[1;36m7.423s[1;35m.[0m
[1;35mStep [0m[1;36mdata_splitter[1;35m has started.[0m
[1;35mStep [0m[1;36mdata_splitter[1;35m has finished in [0m[1;36m10.617s[1;35m.[0m
[1;35mStep [0m[1;36mdata_preprocessor[1;35m has started.[0m
[1;35mSt

# Run the training pipeline

![title](_assets/default_stack.png)

In [10]:
@pipeline
def training(
    train_dataset_id: Optional[UUID] = None,
    test_dataset_id: Optional[UUID] = None,
    min_train_accuracy: float = 0.0,
    min_test_accuracy: float = 0.0,
):
    """
    Model training pipeline.

    This is a pipeline that obtains a train and a test dataset, trains a
    model on the train set and evaluates it against both sets.

    The datasets come from one of two places: when both dataset IDs are
    given they are referenced as existing artifacts; otherwise the
    `feature_engineering` pipeline is invoked (with its defaults) to
    produce them.

    Args:
        train_dataset_id: ID of an existing artifact to use as the train
            dataset. Ignored unless `test_dataset_id` is also provided.
        test_dataset_id: ID of an existing artifact to use as the test
            dataset. Ignored unless `train_dataset_id` is also provided.
        min_train_accuracy: Forwarded to the `model_evaluator` step;
            presumably the minimum acceptable accuracy on the train set
            (confirm against the step implementation).
        min_test_accuracy: Forwarded to the `model_evaluator` step;
            presumably the minimum acceptable accuracy on the test set
            (confirm against the step implementation).
    """
    ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
    # Link all the steps together by calling them and passing the output
    # of one step as the input of the next step.

    # Both IDs are required to reuse existing datasets; if either one is
    # missing, fall back to executing the feature engineering pipeline.
    if train_dataset_id is None or test_dataset_id is None:
        dataset_trn, dataset_tst = feature_engineering()
    else:
        dataset_trn = ExternalArtifact(id=train_dataset_id)
        dataset_tst = ExternalArtifact(id=test_dataset_id)

    # Only the train dataset is passed to the trainer.
    model = model_trainer(
        dataset_trn=dataset_trn,
    )

    model_evaluator(
        model=model,
        dataset_trn=dataset_trn,
        dataset_tst=dataset_tst,
        min_train_accuracy=min_train_accuracy,
        min_test_accuracy=min_test_accuracy,
    )


In [11]:
# Attach the YAML configuration for the training pipeline.
pipeline_args = {
    "config_path": os.path.join("configs", "training.yaml"),
}
fe_t_configured = training.with_options(**pipeline_args)

In [12]:
# Run the configured training pipeline on the currently active stack.
fe_t_configured()

[1;35mInitiating a new run for the pipeline: [0m[1;36mtraining[1;35m.[0m
[1;35mReusing registered version: [0m[1;36m(version: 2)[1;35m.[0m
[1;35mNew model version [0m[1;36m39[1;35m was created.[0m
[1;35mExecuting a new run.[0m
[1;35mUsing user: [0m[1;36malexej@zenml.io[1;35m[0m
[1;35mUsing stack: [0m[1;36mdefault[1;35m[0m
[1;35m  artifact_store: [0m[1;36mdefault[1;35m[0m
[1;35m  orchestrator: [0m[1;36mdefault[1;35m[0m
[1;35mStep [0m[1;36mdata_loader[1;35m has started.[0m
[1;35mDataset with 541 records loaded![0m
[1;35mStep [0m[1;36mdata_loader[1;35m has finished in [0m[1;36m6.700s[1;35m.[0m
[1;35mStep [0m[1;36mdata_splitter[1;35m has started.[0m
[1;35mStep [0m[1;36mdata_splitter[1;35m has finished in [0m[1;36m11.217s[1;35m.[0m
[1;35mStep [0m[1;36mdata_preprocessor[1;35m has started.[0m
[1;35mStep [0m[1;36mdata_preprocessor[1;35m has finished in [0m[1;36m15.339s[1;35m.[0m
[1;35mCaching [0m[1;36mdisabled[1;

# Switch the stack

![title](_assets/airflow_stack.png)

In [14]:
!zenml stack set sagemaker-airflow-stack
!zenml integration install airflow aws
!pip install apache-airflow-providers-docker apache-airflow~=2.5.0
!zenml stack up

[1;35mNumExpr defaulting to 8 threads.[0m
[2K[2;36mActive repository stack set to: [0m[2;32m'sagemaker-airflow-stack'[0m.
[2K[32m⠧[0m Setting the repository active stack to 'sagemaker-airflow-stack'...k'...[0m
[1A[2K[1;35mNumExpr defaulting to 8 threads.[0m
[2;36mAll required packages for integration [0m[2;32m'airflow'[0m[2;36m are already installed.[0m
[2;36mAll required packages for integration [0m[2;32m'aws'[0m[2;36m are already installed.[0m
You should consider upgrading via the '/home/apenner/.pyenv/versions/3.9.13/envs/demo/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m[1;35mNumExpr defaulting to 8 threads.[0m
[1;35mFound credentials in shared credentials file: ~/.aws/credentials[0m
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/apenner/.config/sagemaker/config.yaml
[2;36mProvisioning resources for acti

In [15]:
# Build a variant of the trainer step that carries the "sagemaker-eu"
# step operator setting.
step_args = {"step_operator": "sagemaker-eu"}
model_trainer_configured = model_trainer.with_options(**step_args)

@pipeline
def training(
    train_dataset_id: Optional[UUID] = None,
    test_dataset_id: Optional[UUID] = None,
    min_train_accuracy: float = 0.0,
    min_test_accuracy: float = 0.0,
):
    """
    Model training pipeline (trainer configured with a step operator).

    This redefinition of `training` has the same structure as the earlier
    one in this notebook, except that the model is trained through
    `model_trainer_configured` (the `model_trainer` step with the
    `step_operator` option set) instead of the plain `model_trainer` step.

    The datasets come from one of two places: when both dataset IDs are
    given they are referenced as existing artifacts; otherwise the
    `feature_engineering` pipeline is invoked (with its defaults) to
    produce them.

    Args:
        train_dataset_id: ID of an existing artifact to use as the train
            dataset. Ignored unless `test_dataset_id` is also provided.
        test_dataset_id: ID of an existing artifact to use as the test
            dataset. Ignored unless `train_dataset_id` is also provided.
        min_train_accuracy: Forwarded to the `model_evaluator` step;
            presumably the minimum acceptable accuracy on the train set
            (confirm against the step implementation).
        min_test_accuracy: Forwarded to the `model_evaluator` step;
            presumably the minimum acceptable accuracy on the test set
            (confirm against the step implementation).
    """
    ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
    # Link all the steps together by calling them and passing the output
    # of one step as the input of the next step.

    # Both IDs are required to reuse existing datasets; if either one is
    # missing, fall back to executing the feature engineering pipeline.
    if train_dataset_id is None or test_dataset_id is None:
        dataset_trn, dataset_tst = feature_engineering()
    else:
        dataset_trn = ExternalArtifact(id=train_dataset_id)
        dataset_tst = ExternalArtifact(id=test_dataset_id)

    # Train via the step variant that has the step operator configured.
    model = model_trainer_configured(
        dataset_trn=dataset_trn,
    )

    model_evaluator(
        model=model,
        dataset_trn=dataset_trn,
        dataset_tst=dataset_tst,
        min_train_accuracy=min_train_accuracy,
        min_test_accuracy=min_test_accuracy,
    )

In [16]:
# Re-create the configured training pipeline so that it wraps the
# `training` definition from the previous cell.
pipeline_args = {
    "config_path": os.path.join("configs", "training.yaml"),
}
fe_t_configured = training.with_options(**pipeline_args)

In [17]:
# Run the re-configured training pipeline; the active stack at this point
# is the one selected by the preceding `zenml stack set` cell.
fe_t_configured()

[1;35mInitiating a new run for the pipeline: [0m[1;36mtraining[1;35m.[0m
[1;35mReloading configuration file /home/apenner/PycharmProjects/template-starter/template/.zen/config.yaml[0m
[1;35mFound credentials in shared credentials file: ~/.aws/credentials[0m
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/apenner/.config/sagemaker/config.yaml
[1;35mRegistered new version: [0m[1;36m(version 4)[1;35m.[0m
[1;35mNew model version [0m[1;36m40[1;35m was created.[0m
[1;35mBuilding Docker image(s) for pipeline [0m[1;36mtraining[1;35m.[0m
[1;35mBuilding Docker image [0m[1;36m715803424590.dkr.ecr.eu-central-1.amazonaws.com/zenml:training-orchestrator[1;35m.[0m
[1;35m- Including integration requirements: [0m[1;36maiohttp>=3.8.1[1;35m, [0m[1;36mapache-airflow~=2.4.0[1;35m, [0m[1;36maws-profile-manager[1;35m, [0m[1;36mboto3<=1.26.7

# Switch to the full SageMaker stack

![title](_assets/sagemaker_stack.png)

In [20]:
# Make `sagemaker-stack` the active stack for the next run.
!zenml stack set sagemaker-stack

[1;35mNumExpr defaulting to 8 threads.[0m
[2K[2;36mActive repository stack set to: [0m[2;32m'sagemaker-stack'[0m.
[2K[32m⠸[0m Setting the repository active stack to 'sagemaker-stack'...k'...[0m
[1A[2K

In [None]:
# Re-run the same configured training pipeline object, now against the
# newly activated stack; no pipeline code changes are needed.
fe_t_configured()

[1;35mInitiating a new run for the pipeline: [0m[1;36mtraining[1;35m.[0m
[1;35mReloading configuration file /home/apenner/PycharmProjects/template-starter/template/.zen/config.yaml[0m
[1;35mReusing registered version: [0m[1;36m(version: 4)[1;35m.[0m
[1;35mNew model version [0m[1;36m42[1;35m was created.[0m
[1;35mBuilding Docker image(s) for pipeline [0m[1;36mtraining[1;35m.[0m
[1;35mBuilding Docker image [0m[1;36m715803424590.dkr.ecr.eu-central-1.amazonaws.com/zenml:training-orchestrator[1;35m.[0m
[1;35m- Including integration requirements: [0m[1;36maws-profile-manager[1;35m, [0m[1;36mboto3<=1.26.76[1;35m, [0m[1;36mkubernetes[1;35m, [0m[1;36ms3fs>2022.3.0,<=2023.4.0[1;35m, [0m[1;36msagemaker==2.117.0[1;35m, [0m[1;36mscikit-learn<1.3[1;35m[0m
[1;35mStep 1/8 : FROM zenmldocker/zenml:0.50.0-py3.9[0m
[1;35mStep 2/8 : WORKDIR /app[0m
[1;35mStep 3/8 : COPY .zenml_integration_requirements .[0m
[1;35mStep 4/8 : RUN pip install --default-tim