In [None]:
# Install the ZenML `sklearn` and `mlflow` integrations used by the rest of
# this notebook (`-y` presumably answers the confirmation prompt — confirm).
!zenml integration install sklearn mlflow -y

import IPython

# Automatically restart the kernel after the install. Presumably this is so
# the freshly installed packages are importable in the cells that follow;
# continue from the next cell once the kernel is back.
IPython.Application.instance().kernel.do_shutdown(restart=True)

In [1]:
# Initialize a ZenML repository in the notebook's working directory.
!zenml init

[?25l[2;36mFound existing ZenML repository at path [0m
[2;32m'/Users/strickvl/coding/zenml/repos/zenml/examples/quickstart/new_quickstart'[0m[2;36m.[0m
[2;32m⠋[0m[2;36m [0m[2;36mInitializing ZenML repository at [0m
[2;36m/Users/strickvl/coding/zenml/repos/zenml/examples/quickstart/new_quickstart.[0m
[2K[1A[2K[1A[2K[32m⠋[0m Initializing ZenML repository at 
/Users/strickvl/coding/zenml/repos/zenml/examples/quickstart/new_quickstart.

[1A[2K[1A[2K[1A[2K

In [2]:
# Register the MLflow experiment tracker
!zenml experiment-tracker register mlflow_tracker --flavor=mlflow

# Register the MLflow model registry
!zenml model-registry register mlflow_registry --flavor=mlflow

# Register the MLflow model deployer
!zenml model-deployer register mlflow_deployer --flavor=mlflow

# Register a new stack with the new stack components
!zenml stack register quickstart_stack -a default\
                                       -o default\
                                       -d mlflow_deployer\
                                       -e mlflow_tracker\
                                       -r mlflow_registry\

!zenml stack set quickstart_stack

Connected to the ZenML server: 'http://127.0.0.1:8237'
Running with active workspace: 'default' (repository)
Active repository stack set to: 'quickstart_stack'

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from zenml import step
from zenml.steps import Output


@step(enable_cache=True)
def training_data_loader() -> (
    Output(
        X_train=pd.DataFrame,
        X_test=pd.DataFrame,
        y_train=pd.Series,
        y_test=pd.Series,
    )
):
    """Fetch the Census Income dataset and return a train/test split.

    The raw CSV is read from the UCI archive, rows containing missing values
    ("?" in the file) are dropped, a handful of columns is kept, and the
    string-valued "sex" and "income" columns are label-encoded.

    Returns:
        A 4-tuple ``(X_train, X_test, y_train, y_test)`` where the X values
        are DataFrames of features and the y values are the encoded
        "income" Series. 20% of the rows go to the test split.
    """
    source_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

    # The column names are supplied explicitly via `names=`.
    all_columns = [
        "age", "workclass", "fnlwgt", "education", "education-num",
        "marital-status", "occupation", "relationship", "race", "sex",
        "capital-gain", "capital-loss", "hours-per-week", "native-country",
        "income",
    ]
    kept_columns = ["age", "education-num", "hours-per-week", "sex", "income"]

    # Missing values are removed across ALL columns before narrowing down to
    # the kept ones, so a gap in any column discards the row.
    census = pd.read_csv(
        source_url,
        names=all_columns,
        na_values="?",
        skipinitialspace=True,
    ).dropna()
    census = census[kept_columns]

    # Turn the two categorical columns into integer codes; each column gets
    # its own freshly fitted encoder.
    for categorical in ("sex", "income"):
        census[categorical] = LabelEncoder().fit_transform(census[categorical])

    features = census.drop(columns="income")
    target = census["income"]

    # Fixed seed keeps the split identical between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train, y_test

In [11]:
# Call the loader step directly (outside of a pipeline) and unpack the four
# splits it returns so they can be inspected in the notebook.
X_train, X_test, y_train, y_test = training_data_loader()

In [12]:
# Display the training features returned by the loader.
X_train

Unnamed: 0,age,education-num,hours-per-week,sex
19863,53,5,70,1
24342,49,13,70,1
10027,28,10,40,1
25710,60,6,40,0
13824,53,9,40,1
...,...,...,...,...
32171,40,9,35,1
5875,41,10,40,1
935,37,9,99,1
17056,56,7,40,0


In [13]:
import mlflow

from sklearn.base import ClassifierMixin
from sklearn.ensemble import RandomForestClassifier

from zenml.client import Client

# Experiment tracker component of the currently active stack; its `.name` is
# passed to the `@step(..., experiment_tracker=...)` decorators of the trainer
# steps in this notebook.
# NOTE(review): if the active stack has no experiment tracker this is
# presumably None and the `.name` access will fail — confirm.
experiment_tracker = Client().active_stack.experiment_tracker


@step(enable_cache=True, experiment_tracker=experiment_tracker.name)
def random_forest_trainer_mlflow(
    X_train: pd.DataFrame,
    y_train: pd.Series,
) -> RandomForestClassifier:
    """Train a sklearn Random Forest classifier and log to MLflow.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    mlflow.sklearn.autolog()  # log all model hparams and metrics to MLflow
    # Seed the estimator so repeated runs produce the same model; 42 matches
    # the seed already used for the train/test split in the data loader.
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train.to_numpy(), y_train.to_numpy())
    train_acc = model.score(X_train.to_numpy(), y_train.to_numpy())
    print(f"Train accuracy: {train_acc}")
    return model

In [14]:
from sklearn.linear_model import SGDClassifier


@step(enable_cache=True, experiment_tracker=experiment_tracker.name)
def sgd_trainer_mlflow(
    X_train: pd.DataFrame,
    y_train: pd.Series,
) -> ClassifierMixin:
    """Train a SGD classifier and log to MLflow.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``SGDClassifier``.
    """
    mlflow.sklearn.autolog()  # log all model hparams and metrics to MLflow
    # SGD shuffles the training data, so seed it to make runs repeatable;
    # 42 matches the seed already used for the train/test split.
    model = SGDClassifier(random_state=42)
    model.fit(X_train.to_numpy(), y_train.to_numpy())
    train_acc = model.score(X_train.to_numpy(), y_train.to_numpy())
    print(f"Train accuracy: {train_acc}")
    return model

In [15]:
@step
def evaluator(
    X_test: pd.DataFrame,
    y_test: pd.Series,
    model1: ClassifierMixin,
    model2: ClassifierMixin,
) -> ClassifierMixin:
    """Score two fitted classifiers on the test set and return the better one.

    Both test accuracies are printed. ``model1`` is returned only when its
    accuracy is strictly higher; on a tie ``model2`` is returned.
    """
    contenders = (model1, model2)

    # Score both models first, then report, so the printed lines always
    # appear together after all scoring has succeeded.
    accuracies = [
        clf.score(X_test.to_numpy(), y_test.to_numpy()) for clf in contenders
    ]
    for clf, accuracy in zip(contenders, accuracies):
        print(f"Test accuracy ({clf.__class__.__name__}): {accuracy}")

    first_acc, second_acc = accuracies
    if first_acc > second_acc:
        return model1
    return model2

In [26]:
from zenml import pipeline
from zenml.integrations.mlflow.steps.mlflow_registry import (
    mlflow_register_model_step,
)
from zenml.model_registries.base_model_registry import (
    ModelRegistryModelMetadata,
)


@pipeline(enable_cache=True)
def training_pipeline() -> ClassifierMixin:
    """Load the data, train both classifiers and return the evaluator's pick."""
    train_features, test_features, train_labels, test_labels = (
        training_data_loader()
    )

    # Both trainers receive the same training split.
    forest_model = random_forest_trainer_mlflow(
        X_train=train_features, y_train=train_labels
    )
    sgd_model = sgd_trainer_mlflow(
        X_train=train_features, y_train=train_labels
    )

    return evaluator(
        X_test=test_features,
        y_test=test_labels,
        model1=forest_model,
        model2=sgd_model,
    )


# Run the training pipeline on the active stack.
training_pipeline()

Registered pipeline training_pipeline (version 3).
Running pipeline training_pipeline on stack quickstart_stack (caching enabled)
Step training_data_loader has started.
Using cached version of training_data_loader.
Step random_forest_trainer_mlflow has started.
Using cached version of random_forest_trainer_mlflow.
Step sgd_trainer_mlflow has started.
Using cached version of sgd_trainer_mlflow.
Step evaluator has started.
Using cached version of evaluator.
Pipeline run training_pipeline-2023_06_13-15_22_56_666974 has finished in 2.084s.
Dashboard URL: http://127.0.0.1:8237/workspaces/default/pipelines/448b1867-285b-401e-b91d-0a77ab040b11/runs


In [27]:
@pipeline
def register_and_deploy_model() -> None:
    """Run the training pipeline and pass its result to the MLflow register step.

    The model is registered under the name "zenml-quickstart-model". Despite
    the function name, no deployment step is invoked in this body.

    NOTE(review): ``best_model`` is whatever ``training_pipeline()`` returns
    while the pipeline is being defined. That is presumably a step-output
    placeholder rather than the fitted estimator, in which case ``arch``
    records the placeholder's class name, not the classifier's — confirm.
    """
    best_model = training_pipeline()

    registration_parameters = {
        "name": "zenml-quickstart-model",
        "metadata": ModelRegistryModelMetadata(
            arch=best_model.__class__.__name__
        ),
        "description": "The first run of the Quickstart pipeline.",
    }
    register_model = mlflow_register_model_step.with_options(
        parameters=registration_parameters
    )
    register_model(best_model)

In [28]:
# Run the pipeline that trains and then registers the selected model.
register_and_deploy_model()

Registered pipeline register_and_deploy_model (version 1).
Running pipeline register_and_deploy_model on stack quickstart_stack (caching enabled)
Step training_data_loader has started.
Using cached version of training_data_loader.
Step random_forest_trainer_mlflow has started.
Using cached version of random_forest_trainer_mlflow.
Step sgd_trainer_mlflow has started.
Using cached version of sgd_trainer_mlflow.
Step evaluator has started.
Using cached version of evaluator.
Step mlflow_register_model_step has started.
Using cached version of mlflow_register_model_step.
[1;35mPipeline run [0m[33mregister_and_deploy_model-2023_06_13-15_23_24_476188[1;35