# Iris Classification Pipeline with ZenML

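This notebook demonstrates a ZenML pipeline for iris classification. Its steps load and split the data, train an SVM classifier, evaluate it on the held-out set, explain it with SHAP values, and check for data drift between the training and test sets using a two-sample Kolmogorov–Smirnov test.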

In [None]:
from zenml.client import Client

In [None]:
# Make the stack named "default" the active ZenML stack for this notebook.
Client().activate_stack("default")
# Alternative stack name, kept for reference; uncomment to activate it instead:
# Client().activate_stack("ihopeitworks2")

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from zenml import step
from zenml import log_artifact_metadata
from typing import Tuple, Dict, Any
from typing_extensions import Annotated

def safe_metadata(data: Any) -> Dict[str, Any]:
    """Summarise ``data`` as a small metadata dictionary.

    The result always carries the object's ``shape``. When ``data`` is a
    pandas DataFrame, the column labels are included as a list as well.

    Args:
        data: Any object exposing a ``shape`` attribute (for example a
            DataFrame, a Series or a NumPy array).

    Returns:
        A dict with a ``"shape"`` entry and, for DataFrames only, a
        ``"columns"`` entry.
    """
    if not isinstance(data, pd.DataFrame):
        return {"shape": data.shape}
    return {"shape": data.shape, "columns": [*data.columns]}

@step
def load_data() -> Tuple[
    Annotated[pd.DataFrame, "X_train"],
    Annotated[pd.DataFrame, "X_test"],
    Annotated[pd.Series, "y_train"],
    Annotated[pd.Series, "y_test"],
]:
    """Fetch the iris dataset and split it into train and test parts.

    The split holds out 20% of the rows for testing and is seeded with
    ``random_state=42``. For each of the four outputs, a ``dataset_info``
    metadata entry built by ``safe_metadata`` is logged against the
    matching artifact name.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` in that order.
    """
    dataset = load_iris(as_frame=True)
    features, target = dataset.data, dataset.target
    splits = train_test_split(features, target, test_size=0.2, random_state=42)

    # train_test_split yields the pieces in exactly this order.
    output_names = ("X_train", "X_test", "y_train", "y_test")
    for output_name, split in zip(output_names, splits):
        log_artifact_metadata(
            artifact_name=output_name,
            metadata={"dataset_info": safe_metadata(split)},
        )

    X_train, X_test, y_train, y_test = splits
    return X_train, X_test, y_train, y_test

In [3]:
import pandas as pd
from sklearn.svm import SVC
from zenml import step, ArtifactConfig
from zenml import log_model_metadata, log_artifact_metadata
from typing_extensions import Annotated

@step
def train_model(
    X_train: pd.DataFrame,
    y_train: pd.Series,
) -> Annotated[SVC, ArtifactConfig(name="model", is_model_artifact=True)]:
    """Train an RBF-kernel SVM classifier with probability estimates enabled.

    After fitting, the training accuracy and basic model information are
    logged as model metadata, and the model type, kernel and per-class
    support-vector counts are logged against the ``model`` artifact.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``SVC`` instance.
    """
    # With probability=True, SVC shuffles the data internally to calibrate
    # probability estimates. Seeding it keeps predict_proba reproducible
    # between runs (same seed value as the train/test split).
    model = SVC(kernel='rbf', probability=True, random_state=42)
    model.fit(X_train, y_train)
    train_accuracy = model.score(X_train, y_train)

    log_model_metadata(
        metadata={
            "training_metrics": {
                # Cast to a plain float so the value is a built-in type.
                "train_accuracy": float(train_accuracy),
            },
            "model_info": {
                "model_type": type(model).__name__,
                "kernel": model.kernel,
            }
        }
    )

    log_artifact_metadata(
        artifact_name="model",
        metadata={
            "model_details": {
                "type": type(model).__name__,
                "kernel": model.kernel,
                # n_support_ is a NumPy array; convert it to a plain list.
                "n_support": model.n_support_.tolist(),
            }
        }
    )

    return model

In [4]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from zenml import step
from zenml import log_model_metadata, log_artifact_metadata
from typing import Tuple
from typing_extensions import Annotated

@step
def evaluate_model(
    model: SVC,
    X_test: pd.DataFrame,
    y_test: pd.Series,
) -> Tuple[
    Annotated[np.ndarray, "predictions"],
    Annotated[np.ndarray, "probabilities"]
]:
    """Score the fitted model on the test set and return its outputs.

    The test accuracy is logged as model metadata. The shape and distinct
    values of the predictions, and the shape and min/max of the class
    probabilities, are logged against their respective output artifacts.

    Args:
        model: Fitted classifier.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        ``(predictions, probabilities)`` as produced by ``model.predict``
        and ``model.predict_proba`` on ``X_test``.
    """
    predictions = model.predict(X_test)
    probabilities = model.predict_proba(X_test)
    accuracy = float(model.score(X_test, y_test))

    log_model_metadata(
        metadata={"evaluation_metrics": {"test_accuracy": accuracy}}
    )

    prediction_info = {
        "shape": predictions.shape,
        "unique_values": np.unique(predictions).tolist(),
    }
    log_artifact_metadata(
        artifact_name="predictions",
        metadata={"prediction_info": prediction_info},
    )

    probability_info = {
        "shape": probabilities.shape,
        "min": float(np.min(probabilities)),
        "max": float(np.max(probabilities)),
    }
    log_artifact_metadata(
        artifact_name="probabilities",
        metadata={"probability_info": probability_info},
    )

    return predictions, probabilities

In [5]:
import pandas as pd
import shap
from sklearn.svm import SVC
from zenml import step
from zenml import log_artifact_metadata
from typing_extensions import Annotated

class SHAPVisualization:
    """Plain container pairing SHAP values with their feature names."""

    def __init__(self, shap_values, feature_names):
        """Store both arguments unchanged as same-named attributes."""
        self.shap_values, self.feature_names = shap_values, feature_names

@step
def explain_model(
    model: SVC,
    X_train: pd.DataFrame
) -> Annotated[SHAPVisualization, "shap_visualization"]:
    """Compute SHAP values for the classifier and wrap them for output.

    A ``shap.KernelExplainer`` is built around ``model.predict_proba`` and
    used to explain the first 100 rows of ``X_train``. A summary of the
    resulting values is logged as metadata on the step's
    ``shap_visualization`` output.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_train: Training features; also the source of the background sample.

    Returns:
        A ``SHAPVisualization`` holding the SHAP values and the training
        column labels.
    """
    # shap.sample(X_train, 100) supplies the explainer's background data set.
    explainer = shap.KernelExplainer(model.predict_proba, shap.sample(X_train, 100))
    shap_values = explainer.shap_values(X_train.iloc[:100])

    if isinstance(shap_values, list):
        # One (n_samples, n_features) array per class.
        shapes = [arr.shape for arr in shap_values]
        n_classes = len(shap_values)
        n_features = shap_values[0].shape[1]
    else:
        # NOTE(review): newer SHAP releases appear to return one array, assumed
        # here to be (n_samples, n_features[, n_classes]) - confirm against the
        # installed SHAP version.
        shapes = [shap_values.shape]
        n_features = shap_values.shape[1]
        n_classes = shap_values.shape[2] if len(shap_values.shape) == 3 else 1

    log_artifact_metadata(
        # Must match the declared output name; the previous "shap_values" does
        # not correspond to any output of this step.
        artifact_name="shap_visualization",
        metadata={
            "shap_info": {
                "shape": shapes,
                "n_classes": n_classes,
                "n_features": n_features,
            }
        }
    )

    return SHAPVisualization(shap_values, X_train.columns)

In [8]:
import pandas as pd
from scipy.stats import ks_2samp
from zenml import step
from zenml import log_artifact_metadata
from typing import Dict
from typing_extensions import Annotated

@step
def detect_data_drift(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
) -> Annotated[Dict[str, float], "drift_metrics"]:
    """Compare train and test feature distributions column by column.

    For every column of ``X_train`` a two-sample Kolmogorov-Smirnov test is
    run against the same column of ``X_test``, and the p-value is kept.
    Columns whose p-value is below 0.05 are logged as
    ``high_drift_features`` on the ``drift_metrics`` artifact.

    Args:
        X_train: Training features.
        X_test: Test features; must contain every column of ``X_train``.

    Returns:
        Mapping from column name to the KS-test p-value.
    """
    drift_metrics = {
        column: ks_2samp(X_train[column], X_test[column])[1]  # index 1 = p-value
        for column in X_train.columns
    }

    flagged = [name for name, p_value in drift_metrics.items() if p_value < 0.05]
    log_artifact_metadata(
        artifact_name="drift_metrics",
        metadata={"drift_summary": {"high_drift_features": flagged}},
    )

    return drift_metrics

In [9]:
from zenml import pipeline, Model
from zenml.config import DockerSettings

@pipeline(
    enable_cache=False,
    settings={"docker": DockerSettings(python_package_installer="uv", requirements="requirements.txt")},
    model=Model(name="high_risk_classification")
)
def iris_classification_pipeline():
    """Wire the iris steps together: load, train, evaluate, explain, drift.

    The trained model feeds both evaluation and explanation, and the
    train/test feature frames feed the drift check. Caching is disabled
    in the decorator, and the pipeline is associated with the
    ``high_risk_classification`` model.
    """
    X_train, X_test, y_train, y_test = load_data()
    model = train_model(X_train, y_train)
    evaluate_model(model, X_test, y_test)
    explain_model(model, X_train)
    # The step's output is not used by any later step, so it is not bound
    # to a local name.
    detect_data_drift(X_train, X_test)

In [10]:

# Run the pipeline by calling the decorated pipeline function defined above.
iris_classification_pipeline()

[1;35mInitiating a new run for the pipeline: [0m[1;36miris_classification_pipeline[1;35m.[0m
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/htahir1/Library/Application Support/sagemaker/config.yaml
[1;35mArchiving notebook code...[0m
[33mCould not import GCP service connector: No module named 'google.api_core'.[0m
[33mCould not import Azure service connector: No module named 'azure'.[0m
[33mCould not import HyperAI service connector: No module named 'paramiko'.[0m
[1;35mUploading code to [0m[1;36ms3://zenml-cxwkvj-339712793861/code_uploads/a757941243d0d58c87b4f6982a9d6bcb5030cc7d.tar.gz[1;35m (Size: 1.83 KiB).[0m
[1;35mCode upload finished.[0m
[1;35mNew model version [0m[1;36m18[1;35m was created.[0m
[1;35mDashboard URL for Model Version with name 18 : [0m[34mhttps://cloud.zenml.io/organizations/fc992c14-d960-4db7-812e-