We will train a small fully connected neural network on the Fashion MNIST dataset using PyTorch.

The model and training code follow the [PyTorch Quickstart](https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html).

However, we change the train() and test() functions to log their losses and accuracies
to TensorBoard instead of printing them, following the [PyTorch TensorBoard Tutorial](https://pytorch.org/docs/stable/tensorboard.html).

This is what ML code typically looks like in research; the following shows how to transition it to production with ZenML.

Additionally, we log all hyperparameters and final metrics to MLflow.

Furthermore, in addition to the data loading, training, and testing from the PyTorch quickstart,
we also deploy the model with MLflow and define an inference pipeline that sends
inference requests to it.

In [None]:
!zenml integration install pytorch mlflow deepchecks dash -y

In [None]:
import mlflow
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

from zenml.steps import step, Output
from zenml.pipelines import pipeline
from zenml.integrations.mlflow.mlflow_step_decorator import enable_mlflow

In [None]:
@step
def load_data() -> Output(
    train_dataloader=DataLoader, test_dataloader=DataLoader
):
    """Build the Fashion MNIST train and test dataloaders.

    Both splits are downloaded into the local ``data`` directory if needed,
    converted with ``ToTensor()``, and wrapped in a ``DataLoader`` with a
    batch size of 64.

    Adapted from https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html.

    Returns:
        train_dataloader: Loader over the training split.
        test_dataloader: Loader over the test split.
    """

    def _fashion_mnist_loader(is_train):
        # One split of the open dataset, fetched on demand.
        split = datasets.FashionMNIST(
            root="data",
            train=is_train,
            download=True,
            transform=ToTensor(),
        )
        return DataLoader(split, batch_size=64)

    # Training split first, then the held-out test split.
    return _fashion_mnist_loader(True), _fashion_mnist_loader(False)

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier producing 10 logits per sample.

    Each input is flattened to 28 * 28 features and passed through three
    linear layers (784 -> 512 -> 512 -> 10) with ReLU activations in between.
    """

    def __init__(self):
        super().__init__()
        in_features, hidden, n_classes = 28 * 28, 512, 10
        self.flatten = nn.Flatten()
        # Layer order is kept stable so the state_dict keys do not change.
        layers = [
            nn.Linear(in_features, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_classes),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (unnormalized) class logits for the batch ``x``."""
        return self.linear_relu_stack(self.flatten(x))

@step
def load_model() -> nn.Module:
    """Instantiate the untrained ``NeuralNetwork`` and print its architecture.

    Adapted from https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html.

    Returns:
        A freshly initialized ``NeuralNetwork``.
    """
    network = NeuralNetwork()
    print(network)
    return network

In [None]:
import os
from zenml.steps import StepContext
from torch.utils.tensorboard import SummaryWriter


def train(dataloader, model, loss_fn, optimizer, device, tensorboard_writer, global_step):
    """Train a model for one epoch and log progress to TensorBoard.

    Adapted from https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html.

    Args:
        dataloader: Iterable of ``(inputs, targets)`` batches.
        model: The ``nn.Module`` that is optimized in place.
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar loss.
        optimizer: Optimizer that updates the parameters of ``model``.
        device: Device the batches are moved to before the forward pass.
        tensorboard_writer: Object exposing ``add_scalar(tag, value, step)``.
        global_step: Offset added to the number of samples seen so far in
            this epoch to obtain the TensorBoard step.

    Returns:
        The training accuracy in percent over all samples of the epoch
        (``0.0`` if the dataloader yields no batches).
    """
    model.train()
    correct, seen = 0.0, 0
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)
        correct += (pred.argmax(1) == y).type(torch.float).sum().item()
        # Count actual samples so a smaller final batch is handled correctly.
        seen += len(X)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log the running loss/accuracy every 100 batches.
        if batch % 100 == 0:
            current_step = global_step + seen
            tensorboard_writer.add_scalar("Loss/train", loss.item(), current_step)
            tensorboard_writer.add_scalar(
                "Accuracy/train", 100 * correct / seen, current_step
            )

    # Report the accuracy of the whole epoch, not the value from the last
    # logging step (which ignored every batch after it).
    return 100 * correct / seen if seen else 0.0


def test(dataloader, model, loss_fn, device, tensorboard_writer, global_step):
    """Test a model on the validation / test dataset.
    
    From https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    test_accuracy = 100 * correct / size
    tensorboard_writer.add_scalar("Loss/test", test_loss, global_step)
    tensorboard_writer.add_scalar("Accuracy/test", test_accuracy, global_step)
    return test_accuracy

@enable_mlflow  # setup MLflow
@step(enable_cache=False)
def train_test(
    model: nn.Module,
    train_dataloader: DataLoader,
    test_dataloader: DataLoader,
    context: StepContext,
) -> Output(trained_model=nn.Module, test_acc=float):
    """Train and simultaneously evaluate a torch model on given dataloaders.

    Adjusted from https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html.

    Losses and accuracies are written to TensorBoard logs stored in a
    ``logs`` directory under the ``trained_model`` output artifact URI.
    Hyperparameters, the final train/test accuracies and the model itself
    are logged to MLflow.

    Args:
        model: The model to train; it is moved to the GPU when available.
        train_dataloader: Loader over the training data.
        test_dataloader: Loader over the evaluation data.
        context: ZenML step context, used to look up the artifact URI.

    Returns:
        trained_model: The trained model.
        test_acc: Test accuracy in percent after the last epoch.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using {device} device")
    model = model.to(device)
    loss_fn = nn.CrossEntropyLoss()
    learning_rate = 1e-3
    epochs = 5
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    mlflow.log_params({"epochs": epochs, "learning_rate": learning_rate})

    # train() advances its TensorBoard step by the number of *samples* seen,
    # so the per-epoch offset must be in samples too; otherwise the curves
    # of consecutive epochs overlap.
    samples_per_epoch = len(train_dataloader.dataset)
    log_dir = os.path.join(
        context.get_output_artifact_uri(output_name="trained_model"), "logs"
    )
    tensorboard_writer = SummaryWriter(log_dir)
    train_acc, test_acc = 0.0, 0.0
    try:
        for t in range(epochs):
            print(f"Epoch {t+1}\n-------------------------------")
            epoch_start = t * samples_per_epoch
            train_acc = train(
                train_dataloader, model, loss_fn, optimizer, device,
                tensorboard_writer, epoch_start,
            )
            # Evaluation happens after the epoch, so log it at the epoch end.
            test_acc = test(
                test_dataloader, model, loss_fn, device,
                tensorboard_writer, epoch_start + samples_per_epoch,
            )
    finally:
        # Flush pending events to disk even if training fails midway.
        tensorboard_writer.close()
    print("Done!")
    mlflow.log_metric("Train Accuracy", train_acc)
    mlflow.log_metric("Test Accuracy", test_acc)
    mlflow.pytorch.log_model(model, "model")
    return model, test_acc

In [None]:
@step
def deployment_trigger(test_acc: float) -> bool:
    """Return ``True`` when ``test_acc`` is strictly greater than 0.9.

    NOTE(review): the ``test_acc`` computed by ``test()`` in this notebook is
    a percentage (``100 * correct / size``), so this threshold corresponds to
    0.9 %, not 90 %. Confirm which was intended before changing it: raising
    the threshold to 90 would prevent deployment of any model below that
    accuracy.
    """
    min_accuracy = 0.9
    return test_acc > min_accuracy

In [None]:
from zenml.integrations.mlflow.steps import mlflow_model_deployer_step

In [None]:
from zenml.pipelines import pipeline


@pipeline(enable_cache=False)
def training_pipeline(
    load_training_data,
    model_definition,
    train_test,
    deployment_trigger,
    model_deployer,
):
    """Connect the steps: load data, build model, train/evaluate, decide, deploy.

    Args:
        load_training_data: Step returning the train and test dataloaders.
        model_definition: Step returning the untrained model.
        train_test: Step returning the trained model and its test accuracy.
        deployment_trigger: Step turning the test accuracy into a decision.
        model_deployer: Step receiving the decision and the trained model.
    """
    train_loader, test_loader = load_training_data()
    untrained_model = model_definition()
    trained_model, accuracy = train_test(untrained_model, train_loader, test_loader)
    should_deploy = deployment_trigger(accuracy)
    model_deployer(should_deploy, trained_model)

In [None]:
# Instantiate each step, bind it to its slot in the pipeline, and run it.
training_pipeline(
    load_training_data=load_data(),
    model_definition=load_model(),
    train_test=train_test(),
    deployment_trigger=deployment_trigger(),
    model_deployer=mlflow_model_deployer_step(),
).run()

In [None]:
!zenml model-deployer models list

## TensorBoard Experiment Tracking

In [None]:
from zenml.integrations.tensorboard.visualizers import (
    stop_tensorboard_server,
    visualize_tensorboard,
)

In [None]:
# Open the TensorBoard visualization for the `train_test` step of
# `training_pipeline`.
visualize_tensorboard(
    pipeline_name="training_pipeline",
    step_name="train_test",
)

## MLflow Experiment Tracking

Lastly, remember how we added MLflow experiment tracking to our `train_test` step before?
The `@enable_mlflow` decorator automatically configured and initialized MLflow, so the `mlflow.log_metric` and `mlflow.pytorch.log_model` calls inside the step logged the final metrics and the trained model there.

Let's start up the MLflow UI and check it out!

In [None]:
from zenml.environment import Environment
from zenml.integrations.mlflow.mlflow_utils import get_tracking_uri


def open_mlflow_ui(port=4997):
    """Start the MLflow UI for the tracking URI from ``get_tracking_uri()``.

    When running in Google Colab, an ngrok tunnel to ``port`` is opened first
    and its public URL is printed.

    Args:
        port: Port passed to ``mlflow ui`` (and to ngrok in Colab).
    """
    if Environment.in_google_colab():
        # Imported here so pyngrok is only needed when running in Colab.
        from pyngrok import ngrok

        public_url = ngrok.connect(port)
        print(f"\x1b[31mIn Colab, use this URL instead: {public_url}!\x1b[0m")

    # IPython shell escape; the `{...}` expressions are interpolated from
    # Python before the command is executed.
    !mlflow ui --backend-store-uri="{get_tracking_uri()}" --port={port}


open_mlflow_ui()

In [None]:
@pipeline(enable_cache=False)
def inference_pipeline(
    inference_data_loader,
    prediction_service_loader,
    predictor,
):
    """Load inference data and a prediction service, then run the predictor.

    Args:
        inference_data_loader: Step returning the data to predict on.
        prediction_service_loader: Step returning the model deployment service.
        predictor: Step receiving the service and the data.
    """
    data = inference_data_loader()
    service = prediction_service_loader()
    predictor(service, data)

In [None]:
import numpy as np

@step
def inference_data_loader() -> np.ndarray:
    """Load some (random) inference data.

    Returns:
        An array of shape ``(10, 1, 28, 28)`` with values in ``[0, 1)``:
        10 random images of 28x28 pixels, matching the ``28 * 28`` input
        features that ``NeuralNetwork`` flattens each sample to.
    """
    # 28x28 (not 24x24): the model's first linear layer expects 784 features.
    # float32 matches what ToTensor() produces for the training images.
    return np.random.rand(10, 1, 28, 28).astype(np.float32)

In [None]:
from zenml.services import BaseService
from zenml.repository import Repository


@step(enable_cache=False)
def prediction_service_loader() -> BaseService:
    """Load the running model server deployed by ``training_pipeline``.

    Looks up, via the active stack's model deployer, the running servers
    created by the ``mlflow_model_deployer_step`` step of the
    ``training_pipeline`` pipeline and returns the first one.

    Returns:
        The first matching running model server.

    Raises:
        RuntimeError: If no running model server was found.
    """
    model_deployer = Repository().active_stack.model_deployer
    services = model_deployer.find_model_server(
        pipeline_name="training_pipeline",
        pipeline_step_name="mlflow_model_deployer_step",
        running=True,
    )
    # Fail with an actionable message instead of a bare IndexError.
    if not services:
        raise RuntimeError(
            "No running model server found for step "
            "'mlflow_model_deployer_step' of pipeline 'training_pipeline'. "
            "Run the training pipeline first and make sure the deployment "
            "trigger allowed the model to be deployed."
        )
    return services[0]

In [None]:
@step
def predictor(
    service: BaseService,
    data: np.ndarray,
) -> Output(predictions=list):
    """Run an inference request against a prediction service.

    Args:
        service: The prediction service to query; it is started first.
        data: The input array sent to ``service.predict``.

    Returns:
        A single-element list holding the argmax (over the last axis) of the
        service's output, converted with ``tolist()``.
    """
    service.start(timeout=10)  # should be a NOP if already started
    raw_output = service.predict(data)
    class_ids = raw_output.argmax(axis=-1)
    predictions = [class_ids.tolist()]
    print(f"Prediction is: {predictions}")
    return predictions


# @step
# def predict(model: nn.Module, test_data: Dataset):

#     classes = [
#         "T-shirt/top",
#         "Trouser",
#         "Pullover",
#         "Dress",
#         "Coat",
#         "Sandal",
#         "Shirt",
#         "Sneaker",
#         "Bag",
#         "Ankle boot",
#     ]

#     model.eval()
#     x, y = test_data[0][0], test_data[0][1]
#     with torch.no_grad():
#         pred = model(x)
#         predicted, actual = classes[pred[0].argmax(0)], classes[y]
#         print(f'Predicted: "{predicted}", Actual: "{actual}"')

In [None]:
# Instantiate the inference steps, bind them to the pipeline, and run it.
inference_pipeline(
    inference_data_loader=inference_data_loader(),
    prediction_service_loader=prediction_service_loader(),
    predictor=predictor(),
).run()