# Install dependencies


In [1]:
!pip install -qU torch==2.7.1 torchvision==0.22.1 "ray[client,train]"==2.47.1

# Import dependencies


In [2]:
import os
from typing import Dict

import torch
from filelock import FileLock
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import Normalize, ToTensor

import ray
import ray.train
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

# Define functions


In [3]:
def get_dataloaders(batch_size):
    """Build the FashionMNIST train and test ``DataLoader`` objects.

    Both splits are stored under ``~/data`` (downloaded if missing) and share
    one transform: ``ToTensor`` followed by ``Normalize((0.5,), (0.5,))``.

    Args:
        batch_size: Batch size passed to both loaders.

    Returns:
        A ``(train_dataloader, test_dataloader)`` tuple. Only the training
        loader is created with ``shuffle=True``.
    """
    normalize = transforms.Compose([ToTensor(), Normalize((0.5,), (0.5,))])
    lock_path = os.path.expanduser("~/data.lock")

    # Dataset construction (including any download) happens while holding the
    # file lock -- presumably so processes sharing the same home directory do
    # not download into ~/data at the same time.
    with FileLock(lock_path):
        # Train split first, then test split, same order as a sequential load.
        splits = {
            is_train: datasets.FashionMNIST(
                root="~/data",
                train=is_train,
                download=True,
                transform=normalize,
            )
            for is_train in (True, False)
        }

    train_loader = DataLoader(splits[True], batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(splits[False], batch_size=batch_size)
    return train_loader, test_loader


# Model Definition
class NeuralNetwork(nn.Module):
    """Fully connected classifier producing 10 raw class scores per sample.

    The input is flattened and passed through two hidden layers of width 512
    (each followed by ReLU and 25% dropout) and a final linear layer with
    10 outputs.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Dropout(0.25),
            # No activation after the output layer: the scores are fed to
            # nn.CrossEntropyLoss, which expects unnormalized logits. A
            # trailing ReLU would clamp every negative logit to zero.
            # The dropped layer had no parameters, so state_dict keys
            # (indices 0, 3 and 6) are unchanged.
            nn.Linear(512, 10),
        )

    def forward(self, x):
        """Return the raw logits for a batch ``x``.

        Args:
            x: Input batch; all dimensions after the first are flattened and
                must contain 28 * 28 values per sample.

        Returns:
            A tensor with 10 unnormalized scores per sample.
        """
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


def train_func_per_worker(config: Dict):
    """Training loop executed by each Ray Train worker.

    Args:
        config: Dictionary with the keys ``"lr"`` (SGD learning rate),
            ``"epochs"`` (number of epochs) and ``"batch_size_per_worker"``
            (batch size used for this worker's data loaders).

    After every epoch the worker evaluates on its test loader and reports
    ``{"loss", "accuracy"}`` through ``ray.train.report``.
    """
    learning_rate = config["lr"]
    num_epochs = config["epochs"]
    per_worker_batch = config["batch_size_per_worker"]

    # Build the loaders inside the worker, then let Ray Train prepare them for
    # distributed use (sharding across workers and device placement).
    train_loader, test_loader = get_dataloaders(batch_size=per_worker_batch)
    train_loader = ray.train.torch.prepare_data_loader(train_loader)
    test_loader = ray.train.torch.prepare_data_loader(test_loader)

    # Let Ray Train wrap the model for distributed training and move it to
    # the worker's device.
    net = ray.train.torch.prepare_model(NeuralNetwork())

    criterion = nn.CrossEntropyLoss()
    sgd = torch.optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)

    for epoch in range(num_epochs):
        if ray.train.get_context().get_world_size() > 1:
            # With several workers the distributed sampler needs the epoch
            # number to shuffle differently on each epoch.
            train_loader.sampler.set_epoch(epoch)

        # ---- training pass ----
        net.train()
        print(f"Train Epoch {epoch}")
        for inputs, targets in train_loader:
            batch_loss = criterion(net(inputs), targets)

            sgd.zero_grad()
            batch_loss.backward()
            sgd.step()

        # ---- evaluation pass ----
        net.eval()
        loss_sum = 0
        correct = 0
        seen = 0
        with torch.no_grad():
            print(f"Test Epoch {epoch}")
            for inputs, targets in test_loader:
                outputs = net(inputs)

                loss_sum += criterion(outputs, targets).item()
                seen += targets.shape[0]
                correct += (outputs.argmax(1) == targets).sum().item()

        # Mean loss per batch and fraction of correct predictions, both
        # computed over this worker's test loader only.
        ray.train.report(
            metrics={
                "loss": loss_sum / len(test_loader),
                "accuracy": correct / seen,
            }
        )


@ray.remote
def train_fashion_mnist(num_workers=2, use_gpu=False):
    """Run distributed FashionMNIST training with a Ray ``TorchTrainer``.

    Declared as a Ray remote task, so it is invoked with
    ``train_fashion_mnist.remote(...)``.

    Args:
        num_workers: Number of training workers requested in the
            ``ScalingConfig``.
        use_gpu: Whether the ``ScalingConfig`` requests GPUs for the workers.

    Returns:
        None. The training result is printed rather than returned.
    """
    global_batch_size = 32

    # Split the global batch across workers. Integer division yields 0 once
    # num_workers exceeds global_batch_size, and a DataLoader rejects a
    # non-positive batch size, so keep at least one sample per worker.
    batch_size_per_worker = max(1, global_batch_size // num_workers)

    train_config = {
        "lr": 1e-3,
        "epochs": 10,
        "batch_size_per_worker": batch_size_per_worker,
    }

    # Configure computation resources
    scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)

    # Initialize a Ray TorchTrainer
    trainer = TorchTrainer(
        train_loop_per_worker=train_func_per_worker,
        train_loop_config=train_config,
        scaling_config=scaling_config,
    )

    # [4] Start distributed training
    # Run `train_func_per_worker` on all workers
    # =============================================
    result = trainer.fit()
    print(f"Training result: {result}")

# Connect to Ray Cluster


In [4]:
# Connect this notebook to the cluster through the `ray://` address of the
# head service. `runtime_env` lists the same torch/torchvision pins as the
# install cell -- presumably so the cluster-side workers use matching versions.
ray.init(
    address="ray://raycluster-cpu-head-svc.default.svc.cluster.local:10001",
    runtime_env={"pip": ["torch==2.7.1", "torchvision==0.22.1"]},
)

2025-06-24 08:57:19,945	INFO client_builder.py:242 -- Passing the following kwargs to ray.init() on the server: log_to_driver
SIGTERM handler is not set because current thread is not the main thread.
    Ray: 2.47.1
    Python: 3.9.23
This process on Ray Client was started with:
    Ray: 2.47.1
    Python: 3.9.13



0,1
Python version:,3.9.23
Ray version:,2.47.1
Dashboard:,http://10.0.2.38:8265


# Train model remotely


In [5]:
# Submit the remote training task with 7 CPU workers and wait for it to
# finish. `train_fashion_mnist` has no return statement, so the value is None.
ray.get(train_fashion_mnist.remote(num_workers=7, use_gpu=False))

[36m(train_fashion_mnist pid=1474, ip=10.0.3.56)[0m 
[36m(train_fashion_mnist pid=1474, ip=10.0.3.56)[0m View detailed results here: /home/ray/ray_results/TorchTrainer_2025-06-24_01-57-35
[36m(train_fashion_mnist pid=1474, ip=10.0.3.56)[0m To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2025-06-24_01-30-18_669596_1/artifacts/2025-06-24_01-57-35/TorchTrainer_2025-06-24_01-57-35/driver_artifacts`
[36m(train_fashion_mnist pid=1474, ip=10.0.3.56)[0m 
[36m(train_fashion_mnist pid=1474, ip=10.0.3.56)[0m Training started with configuration:
[36m(train_fashion_mnist pid=1474, ip=10.0.3.56)[0m ╭─────────────────────────────────────────────────╮
[36m(train_fashion_mnist pid=1474, ip=10.0.3.56)[0m │ Training config                                 │
[36m(train_fashion_mnist pid=1474, ip=10.0.3.56)[0m ├─────────────────────────────────────────────────┤
[36m(train_fashion_mnist pid=1474, ip=10.0.3.56)[0m │ train_loop_config/batch_size_per_wor

[36m(RayTrainWorker pid=1491, ip=10.0.3.127)[0m Setting up process group for: env:// [rank=0, world_size=7]
[36m(TorchTrainer pid=1429, ip=10.0.3.127)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=1429, ip=10.0.3.127)[0m - (node_id=ac5bf144236e88f5666b2801da4157ebf187d620c16523cb981c2379, ip=10.0.3.127, pid=1491) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=1429, ip=10.0.3.127)[0m - (node_id=ac5bf144236e88f5666b2801da4157ebf187d620c16523cb981c2379, ip=10.0.3.127, pid=1494) world_rank=1, local_rank=1, node_rank=0
[36m(TorchTrainer pid=1429, ip=10.0.3.127)[0m - (node_id=ac5bf144236e88f5666b2801da4157ebf187d620c16523cb981c2379, ip=10.0.3.127, pid=1493) world_rank=2, local_rank=2, node_rank=0
[36m(TorchTrainer pid=1429, ip=10.0.3.127)[0m - (node_id=ac5bf144236e88f5666b2801da4157ebf187d620c16523cb981c2379, ip=10.0.3.127, pid=1496) world_rank=3, local_rank=3, node_rank=0
[36m(TorchTrainer pid=1429, ip=10.0.3.127)[0m - (node_id=ac5bf144236e88f5

[36m(RayTrainWorker pid=1624, ip=10.0.3.56)[0m Train Epoch 0
[36m(RayTrainWorker pid=1491, ip=10.0.3.127)[0m Train Epoch 0
[36m(RayTrainWorker pid=1494, ip=10.0.3.127)[0m Train Epoch 0
[36m(RayTrainWorker pid=1493, ip=10.0.3.127)[0m Train Epoch 0
[36m(RayTrainWorker pid=1496, ip=10.0.3.127)[0m Train Epoch 0
[36m(RayTrainWorker pid=1495, ip=10.0.3.127)[0m Train Epoch 0
[36m(RayTrainWorker pid=1492, ip=10.0.3.127)[0m Train Epoch 0
[36m(RayTrainWorker pid=1624, ip=10.0.3.56)[0m Test Epoch 0
[36m(RayTrainWorker pid=1491, ip=10.0.3.127)[0m Test Epoch 0
[36m(RayTrainWorker pid=1494, ip=10.0.3.127)[0m Test Epoch 0
[36m(RayTrainWorker pid=1493, ip=10.0.3.127)[0m Test Epoch 0
[36m(RayTrainWorker pid=1496, ip=10.0.3.127)[0m Test Epoch 0
[36m(RayTrainWorker pid=1495, ip=10.0.3.127)[0m Test Epoch 0
[36m(RayTrainWorker pid=1492, ip=10.0.3.127)[0m Test Epoch 0
[36m(RayTrainWorker pid=1494, ip=10.0.3.127)[0m Train Epoch 1
[36m(RayTrainWorker pid=1493, ip=10.0.3.127)[0m

[33m(raylet)[0m [2025-06-24 01:58:28,029 E 370 370] (raylet) node_manager.cc:3193: 1 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: b89779e09a020306819af4ad6700cca55d525c6ee1ebcfde386d6698, IP: 10.0.2.38) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 10.0.2.38`
[33m(raylet)[0m 
[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


[36m(RayTrainWorker pid=1491, ip=10.0.3.127)[0m Test Epoch 2
[36m(RayTrainWorker pid=1494, ip=10.0.3.127)[0m Test Epoch 2
[36m(RayTrainWorker pid=1493, ip=10.0.3.127)[0m Test Epoch 2
[36m(RayTrainWorker pid=1496, ip=10.0.3.127)[0m Test Epoch 2
[36m(RayTrainWorker pid=1495, ip=10.0.3.127)[0m Test Epoch 2
[36m(RayTrainWorker pid=1492, ip=10.0.3.127)[0m Test Epoch 2
[36m(RayTrainWorker pid=1624, ip=10.0.3.56)[0m Test Epoch 2
[36m(RayTrainWorker pid=1491, ip=10.0.3.127)[0m Train Epoch 3
[36m(RayTrainWorker pid=1494, ip=10.0.3.127)[0m Train Epoch 3
[36m(RayTrainWorker pid=1493, ip=10.0.3.127)[0m Train Epoch 3
[36m(RayTrainWorker pid=1495, ip=10.0.3.127)[0m Train Epoch 3
[36m(RayTrainWorker pid=1492, ip=10.0.3.127)[0m Train Epoch 3
[36m(train_fashion_mnist pid=1474, ip=10.0.3.56)[0m 
[36m(train_fashion_mnist pid=1474, ip=10.0.3.56)[0m Training finished iteration 3 at 2025-06-24 01:58:44. Total running time: 1min 9s
[36m(train_fashion_mnist pid=1474, ip=10.0.3.56)

[36m(train_fashion_mnist pid=1474, ip=10.0.3.56)[0m Wrote the latest version of all result files and experiment state to '/home/ray/ray_results/TorchTrainer_2025-06-24_01-57-35' in 0.0030s.


[36m(autoscaler +4m52s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.
[36m(autoscaler +4m52s)[0m Removing 1 nodes of type cpu-group (idle).
[36m(autoscaler +4m52s)[0m Resized to 7 CPUs.


# Shut down Ray workers


In [6]:
# Close the connection opened by `ray.init` in the earlier cell.
ray.shutdown()