# Training A `PyTorch` Classifier With And Without `Horovod`

This test uses the MNIST dataset to train a model with PyTorch, once with Horovod and once without it. It then verifies that:

  * The accuracy was not degraded by training with Horovod.
  * The Horovod run was faster (only expected when training on a large dataset).

## General Configurations

In [None]:
# The test installs the latest versions so that it always runs against the most recent releases.
!pip install plotly torch torchvision

In [None]:
import os

# Absolute path of the directory to save the data in (resolved relative to the current working directory):
DATA_PATH = os.path.abspath("./data")

# Absolute path of the directory to save the code in (the training script is written there):
SCRIPTS_PATH = os.path.abspath("./scripts")

# Number of epochs to train (to increase the training time without increasing the memory usage):
N_EPOCHS = 4

# Number of ranks (horovod workers) to deploy for the open mpi job (used as the mpijob's replicas):
N_RANKS = 4

Prepare the directories:

In [None]:
# Create both working directories; `exist_ok=True` makes this safe to re-run when they already exist:
os.makedirs(DATA_PATH, exist_ok=True)
os.makedirs(SCRIPTS_PATH, exist_ok=True)

## 1. Training Code

1. Get the MNIST data from `torchvision.datasets`.
2. Initialize a model.
3. Run training on the training set with validation on the testing set.

The accuracy score is logged as a run result by MLRun's auto-logging.

In [None]:
%%writefile {SCRIPTS_PATH}/mnist_trainer.py
from typing import Tuple
import os
import time
import torch
import torchvision

import mlrun
import mlrun.frameworks.pytorch as mlrun_torch


def get_datasets(data_path: str, batch_size: int) -> Tuple[torch.utils.data.DataLoader, torch.utils.data.DataLoader]:
    """
    Download the MNIST training and testing splits and wrap each of them in a data loader.

    :param data_path:  Root directory for the downloaded files. The training split is stored under
                       "mnist_training_files" and the testing split under "mnist_validation_files".
    :param batch_size: The batch size used by both data loaders.

    :returns: A tuple of (training data loader, testing data loader). Only the training loader shuffles.
    """
    def build_transform():
        # Convert each image to a tensor and normalize it with mean 0.1307 and std 0.3081:
        return torchvision.transforms.Compose(
            [
                torchvision.transforms.ToTensor(),
                torchvision.transforms.Normalize((0.1307,), (0.3081,)),
            ]
        )

    # Download the data (each split gets its own directory and its own transform instance):
    training_data = torchvision.datasets.MNIST(
        os.path.join(data_path, "mnist_training_files"),
        train=True,
        download=True,
        transform=build_transform(),
    )
    testing_data = torchvision.datasets.MNIST(
        os.path.join(data_path, "mnist_validation_files"),
        train=False,
        download=True,
        transform=build_transform(),
    )

    # Initialize and return the data loaders:
    return (
        torch.utils.data.DataLoader(training_data, batch_size=batch_size, shuffle=True),
        torch.utils.data.DataLoader(testing_data, batch_size=batch_size, shuffle=False),
    )


class MNISTModel(torch.nn.Module):
    """
    A small fully connected classifier for MNIST.

    The input is flattened to 784 features and passed through two hidden layers of 128 units with ReLU
    activations. The output holds 10 raw class scores (logits), one per class.
    """

    def __init__(self):
        super().__init__()

        # Add the layers. There is deliberately no softmax at the end: the training code uses
        # `torch.nn.CrossEntropyLoss`, which expects unnormalized logits and applies the log-softmax itself.
        # Feeding it probabilities would apply softmax twice and flatten the gradients.
        self.layers = torch.nn.Sequential(
            torch.nn.Flatten(),
            torch.nn.Linear(in_features=784, out_features=128),
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=128, out_features=128),
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=128, out_features=10),
        )

    def forward(self, x):
        """
        Run a forward pass.

        :param x: A batch of inputs that flattens to 784 features per sample.

        :returns: The raw class scores (logits), 10 per sample.
        """
        return self.layers(x)

    
def accuracy(y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
    """
    Compute the fraction of samples whose predicted class equals the true label.

    :param y_pred: The per-class scores; the predicted class is the argmax over dimension 1.
    :param y_true: The true class indices, one per sample.

    :returns: The accuracy as a scalar tensor (`nan` for an empty batch).
    """
    # Compare with tensor operations instead of Python's builtin `sum`, which iterates element by element:
    return (torch.argmax(y_pred, 1) == y_true).float().mean()
    

@mlrun.handler(outputs=["time"])
def train(context: mlrun.MLClientCtx, scripts_path: str, data_path: str, n_epochs: int):
    """
    Train the MNIST model through MLRun's PyTorch training and measure how long the run takes.

    :param context:      The MLRun context, passed on to `mlrun_torch.train`.
    :param scripts_path: The directory holding "mnist_trainer.py", passed as the custom objects directory.
    :param data_path:    The directory to download the MNIST data into.
    :param n_epochs:     The number of epochs to train.

    :returns: The elapsed time in seconds, measured with `time.time()` and logged as the "time" output.
    """
    # Start the timer:
    started_at = time.time()

    # Get the data:
    training_loader, validation_loader = get_datasets(data_path=data_path, batch_size=32)

    # Initialize the model and its optimizer:
    network = MNISTModel()
    sgd = torch.optim.SGD(network.parameters(), lr=0.01)

    # Train (the loss is created here as it is only needed by the training call):
    mlrun_torch.train(
        model=network,
        training_set=training_loader,
        loss_function=torch.nn.CrossEntropyLoss(),
        optimizer=sgd,
        validation_set=validation_loader,
        metric_functions=[accuracy],
        epochs=n_epochs,
        use_cuda=False,
        custom_objects_map={"mnist_trainer.py": "MNISTModel"},
        custom_objects_directory=scripts_path,
        context=context,
    )

    # Stop the timer and return the elapsed time:
    return time.time() - started_at

## 2. Create a Project

1. Create the MLRun project.
2. Create an MLRun function of the training code.

In [None]:
import shutil
import numpy as np
import mlrun

In [None]:
# Get the test's project, creating it if it does not exist yet:
project = mlrun.get_or_create_project(
    name="horovod-pytorch-test",
    context="./",
    user_project=True,
)

In [None]:
# Register the training script as a regular job function, mount the storage and deploy it:
job_function = project.set_function(
    os.path.join(SCRIPTS_PATH, "mnist_trainer.py"),
    name="train_job",
    kind="job",
    image="mlrun/mlrun",
    handler="train",
    requirements=["torch", "torchvision", "tensorboard"],
)
job_function.apply(mlrun.auto_mount())
job_function.deploy()

In [None]:
# Register the same training script as an open mpi function: (note: torch is pinned to 2.0.0 due to ML-5669)
mpijob_function = project.set_function(
    os.path.join(SCRIPTS_PATH, "mnist_trainer.py"),
    name="train_mpijob",
    kind="mpijob",
    image="mlrun/mlrun",
    handler="train",
    requirements=["torchvision", "tensorboard", "horovod[pytorch]", "torch==2.0.0"],
)
mpijob_function.apply(mlrun.auto_mount())

# One replica per horovod rank:
mpijob_function.spec.replicas = N_RANKS

# Install the pinned torch version as a build command and deploy with horovod's PyTorch support enabled:
mpijob_function.with_commands(["pip install torch==2.0.0"])
mpijob_function.deploy(builder_env={"HOROVOD_WITH_PYTORCH": "1"})

## 3. Run As A Job

Run the training as a `job` and store the results.

In [None]:
# Run the training as a regular job:
job_run = job_function.run(
    name="training_job",
    params=dict(scripts_path=SCRIPTS_PATH, data_path=DATA_PATH, n_epochs=N_EPOCHS),
)

# Keep the run time and the validation accuracy for the comparison:
job_time, job_accuracy = (
    job_run.status.results[key] for key in ("time", "validation_accuracy")
)

## 4. Run As a MPIJob

Run the training as an `mpijob` and store the results.

In [None]:
# Run the training as an open mpi job (horovod):
mpijob_run = mpijob_function.run(
    name="training_mpijob",
    params=dict(scripts_path=SCRIPTS_PATH, data_path=DATA_PATH, n_epochs=N_EPOCHS),
)

# Keep the run time and the validation accuracy for the comparison:
mpijob_time, mpijob_accuracy = (
    mpijob_run.status.results[key] for key in ("time", "validation_accuracy")
)

## 5. Compare Runtimes

1. Print a summary message.
2. Verify that:
  * The mpijob run took less time (only on stronger machines).
  * The accuracy value is equal between the runs.

In [None]:
# Delete the test outputs (the data directory and the scripts directory).
# The run times and accuracies were already stored in variables, so they can still be printed afterwards:
shutil.rmtree(DATA_PATH)
shutil.rmtree(SCRIPTS_PATH)

# Delete the MLRun project (using the "cascading" deletion strategy):
mlrun.get_run_db().delete_project(name=project.name, deletion_strategy="cascading")

In [None]:
# Print the test's collected results (run times are rounded to 2 decimal places):
print(
    "Job:\n"
    f"\t{job_time:.2f} Seconds\n"
    f"\tAccuracy: {job_accuracy}"
)
print(
    "Open MPI Job (Horovod):\n"
    f"\t{mpijob_time:.2f} Seconds\n"
    f"\tAccuracy: {mpijob_accuracy}\n"
)

#  Verification: (Only possible to test on a stronger machine as the test requires big data and longer training)
# assert mpijob_time < job_time
# assert np.isclose(job_accuracy, mpijob_accuracy, atol=0.1)