In [2]:
import mlflow
import torch

from torch import nn
from torch.utils.data import DataLoader
from torchinfo import summary
from torchmetrics import Accuracy
from torchvision import datasets
from torchvision.transforms import ToTensor

# Send all MLflow runs in this notebook to the local tracking server.
mlflow.set_tracking_uri('http://localhost:8080')

# Download the FashionMNIST training split into ./data (skipped if already present).
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=64)

# Pick the best available accelerator instead of hard-coding "mps", which
# raises on any machine without Apple-silicon MPS support.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"


# Define the model.
class NeuralNetwork(nn.Module):
    """Three-layer fully connected classifier.

    Inputs are flattened to 28 * 28 = 784 features per sample and passed
    through two 512-unit ReLU layers followed by a 10-unit linear output.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # Layers are created in forward order so parameter initialisation
        # consumes the RNG in the same sequence as the layers are applied.
        stack_layers = [
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*stack_layers)

    def forward(self, x):
        """Return the raw (un-normalised) class logits for a batch ``x``."""
        return self.linear_relu_stack(self.flatten(x))


def train(dataloader, model, loss_fn, metrics_fn, optimizer, epoch=0):
    """Run one training epoch, logging loss and accuracy to MLflow every 100 batches.

    Args:
        dataloader: Sized iterable yielding ``(X, y)`` batches.
        model: Module to optimise; it is put into training mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        metrics_fn: Callable ``metrics_fn(pred, y)`` returning a scalar value.
        optimizer: Optimizer that updates ``model``'s parameters.
        epoch: Zero-based epoch index. It only offsets the MLflow ``step`` so
            that points logged in later epochs do not reuse the steps of
            earlier ones. Defaults to 0.
    """
    model.train()
    batches_per_epoch = len(dataloader)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        pred = model(X)
        loss = loss_fn(pred, y)
        accuracy = metrics_fn(pred, y)

        # Backpropagation.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            # Log real numbers, not pre-formatted strings.
            loss_value = loss.item()
            accuracy_value = float(accuracy)
            # Global batch index: increases monotonically across epochs.
            step = epoch * batches_per_epoch + batch
            mlflow.log_metric("loss", loss_value, step=step)
            mlflow.log_metric("accuracy", accuracy_value, step=step)
            print(
                f"loss: {loss_value:3f} accuracy: {accuracy_value:3f} [{batch} / {batches_per_epoch}]"
            )


epochs = 3
learning_rate = 1e-3
loss_fn = nn.CrossEntropyLoss()
metric_fn = Accuracy(task="multiclass", num_classes=10).to(device)
model = NeuralNetwork().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

with mlflow.start_run():
    # Read hyperparameters from the objects actually used for training so the
    # logged values cannot drift from the real configuration.
    params = {
        "epochs": epochs,
        "learning_rate": learning_rate,
        "batch_size": train_dataloader.batch_size,
        "loss_function": loss_fn.__class__.__name__,
        "metric_function": metric_fn.__class__.__name__,
        "optimizer": optimizer.__class__.__name__,
    }
    # Log training parameters.
    mlflow.log_params(params)

    # Log model summary. The torchinfo table contains non-ASCII tree glyphs,
    # so write it as UTF-8 rather than relying on the platform default encoding.
    with open("model_summary.txt", "w", encoding="utf-8") as f:
        f.write(str(summary(model)))
    mlflow.log_artifact("model_summary.txt")

    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_dataloader, model, loss_fn, metric_fn, optimizer, epoch=t)

    # Save the trained model to MLflow.
    mlflow.pytorch.log_model(model, "model")

Epoch 1
-------------------------------
loss: 2.303074 accuracy: 0.078125 [0 / 938]
loss: 2.287631 accuracy: 0.218750 [100 / 938]
loss: 2.273887 accuracy: 0.250000 [200 / 938]
loss: 2.268203 accuracy: 0.390625 [300 / 938]
loss: 2.253303 accuracy: 0.328125 [400 / 938]
loss: 2.223333 accuracy: 0.515625 [500 / 938]
loss: 2.235642 accuracy: 0.312500 [600 / 938]
loss: 2.203082 accuracy: 0.437500 [700 / 938]
loss: 2.211604 accuracy: 0.296875 [800 / 938]
loss: 2.176794 accuracy: 0.531250 [900 / 938]
Epoch 2
-------------------------------
loss: 2.176627 accuracy: 0.375000 [0 / 938]
loss: 2.168037 accuracy: 0.406250 [100 / 938]
loss: 2.118265 accuracy: 0.468750 [200 / 938]
loss: 2.137598 accuracy: 0.484375 [300 / 938]
loss: 2.086800 accuracy: 0.515625 [400 / 938]
loss: 2.024076 accuracy: 0.578125 [500 / 938]
loss: 2.065770 accuracy: 0.484375 [600 / 938]
loss: 1.984468 accuracy: 0.593750 [700 / 938]
loss: 2.017170 accuracy: 0.453125 [800 / 938]
loss: 1.930746 accuracy: 0.578125 [900 / 938]
Epoc



🏃 View run enchanting-robin-477 at: http://localhost:8080/#/experiments/0/runs/b0f1ee61fedb4519a1e0f82e1be48860
🧪 View experiment at: http://localhost:8080/#/experiments/0


In [3]:
import os

import lightning as L
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Subset
from torchmetrics import Accuracy
from torchvision import transforms
from torchvision.datasets import MNIST

import mlflow.pytorch
from mlflow import MlflowClient


class MNISTModel(L.LightningModule):
    """Single linear layer (784 -> 10) followed by a ReLU, trained with Adam."""

    def __init__(self):
        super().__init__()
        self.l1 = torch.nn.Linear(28 * 28, 10)
        self.accuracy = Accuracy(task="multiclass", num_classes=10)

    def forward(self, x):
        """Flatten each sample in ``x`` and return the ReLU of the linear layer."""
        batch_size = x.size(0)
        flat = x.view(batch_size, -1)
        return torch.relu(self.l1(flat))

    def training_step(self, batch, batch_nb):
        """Compute cross-entropy loss for one batch and log loss and accuracy."""
        inputs, targets = batch
        logits = self(inputs)
        loss = F.cross_entropy(logits, targets)
        predicted = torch.argmax(logits, dim=1)
        acc = self.accuracy(predicted, targets)

        # Values passed to `self.log` are picked up by MLflow autologging.
        self.log("train_loss", loss, on_epoch=True)
        self.log("acc", acc, on_epoch=True)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=0.02."""
        return torch.optim.Adam(self.parameters(), lr=0.02)


def print_auto_logged_info(r):
    """Print the id, model artifacts, params, metrics and user tags of a run.

    Args:
        r: Run object exposing ``info.run_id`` and ``data.tags``,
            ``data.params`` and ``data.metrics``.
    """
    # Keep only tags whose key does not start with the "mlflow." prefix.
    tags = {}
    for key, value in r.data.tags.items():
        if key.startswith("mlflow."):
            continue
        tags[key] = value

    # List what was stored under the run's "model" artifact directory.
    run_id = r.info.run_id
    client = MlflowClient()
    artifacts = [entry.path for entry in client.list_artifacts(run_id, "model")]

    print(f"run_id: {r.info.run_id}")
    print(f"artifacts: {artifacts}")
    print(f"params: {r.data.params}")
    print(f"metrics: {r.data.metrics}")
    print(f"tags: {tags}")


# Build the Lightning module to be trained.
mnist_model = MNISTModel()

# Fetch the MNIST training split into the current working directory.
train_ds = MNIST(
    root=os.getcwd(),
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
# Restrict training to the first 32 samples so the demo finishes quickly.
indices = torch.arange(32)
train_ds = Subset(train_ds, indices)
train_loader = DataLoader(train_ds, batch_size=8)

# Trainer limited to three epochs.
trainer = L.Trainer(max_epochs=3)

# Enable MLflow autologging for PyTorch before fitting.
mlflow.pytorch.autolog()

# Fit inside an explicit run so its id is available afterwards.
with mlflow.start_run() as run:
    trainer.fit(mnist_model, train_loader)

# Re-fetch the finished run and show what autologging recorded.
finished_run = mlflow.get_run(run_id=run.info.run_id)
print_auto_logged_info(finished_run)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to /Users/yinnnyou/workspace/mlflow_framework/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 38644308.57it/s]


Extracting /Users/yinnnyou/workspace/mlflow_framework/MNIST/raw/train-images-idx3-ubyte.gz to /Users/yinnnyou/workspace/mlflow_framework/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to /Users/yinnnyou/workspace/mlflow_framework/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 1641516.28it/s]

Extracting /Users/yinnnyou/workspace/mlflow_framework/MNIST/raw/train-labels-idx1-ubyte.gz to /Users/yinnnyou/workspace/mlflow_framework/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz





Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to /Users/yinnnyou/workspace/mlflow_framework/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 12340220.36it/s]


Extracting /Users/yinnnyou/workspace/mlflow_framework/MNIST/raw/t10k-images-idx3-ubyte.gz to /Users/yinnnyou/workspace/mlflow_framework/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to /Users/yinnnyou/workspace/mlflow_framework/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 3435622.86it/s]
INFO: GPU available: True (mps), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (mps), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
/Users/yinnnyou/anaconda3/envs/mlflow/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages

Extracting /Users/yinnnyou/workspace/mlflow_framework/MNIST/raw/t10k-labels-idx1-ubyte.gz to /Users/yinnnyou/workspace/mlflow_framework/MNIST/raw

Epoch 0: 100%|██████████| 4/4 [00:01<00:00,  2.10it/s, v_num=0]



Epoch 1: 100%|██████████| 4/4 [00:00<00:00, 102.73it/s, v_num=0]



Epoch 2: 100%|██████████| 4/4 [00:00<00:00, 104.96it/s, v_num=0]

INFO: `Trainer.fit` stopped: `max_epochs=3` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|██████████| 4/4 [00:00<00:00, 86.08it/s, v_num=0] 




🏃 View run marvelous-shrew-671 at: http://localhost:8080/#/experiments/0/runs/07e0143010d6403b9e47d93d41e6c98b
🧪 View experiment at: http://localhost:8080/#/experiments/0
run_id: 07e0143010d6403b9e47d93d41e6c98b
artifacts: ['model/MLmodel', 'model/conda.yaml', 'model/data', 'model/python_env.yaml', 'model/requirements.txt']
params: {'foreach': 'None', 'fused': 'None', 'betas': '(0.9, 0.999)', 'differentiable': 'False', 'weight_decay': '0', 'epochs': '3', 'eps': '1e-08', 'capturable': 'False', 'lr': '0.02', 'maximize': 'False', 'optimizer_name': 'Adam', 'amsgrad': 'False'}
metrics: {'acc_step': 0.0, 'train_loss_epoch': 1.6329033374786377, 'train_loss_step': 2.323289632797241, 'train_loss': 1.6329033374786377, 'acc_epoch': 0.3125, 'acc': 0.3125}
tags: {'Mode': 'training'}
