# Install dependencies


In [1]:
!pip install -qU torch==2.4.0 torchvision==0.19.0 "ray[client,train]"==2.34.0

# Import dependencies


In [2]:
import os
from typing import Dict

import torch
from filelock import FileLock
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import Normalize, ToTensor
from tqdm import tqdm

import ray
import ray.train
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

# Define functions


In [3]:
def get_dataloaders(batch_size):
    # Transform to normalize the input images
    transform = transforms.Compose([ToTensor(), Normalize((0.5,), (0.5,))])

    with FileLock(os.path.expanduser("~/data.lock")):
        # Download training data from open datasets
        training_data = datasets.FashionMNIST(
            root="~/data",
            train=True,
            download=True,
            transform=transform,
        )

        # Download test data from open datasets
        test_data = datasets.FashionMNIST(
            root="~/data",
            train=False,
            download=True,
            transform=transform,
        )

    # Create data loaders
    train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    return train_dataloader, test_dataloader


# Model Definition
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(512, 10),
            nn.ReLU(),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


def train_func_per_worker(config: Dict):
    lr = config["lr"]
    epochs = config["epochs"]
    batch_size = config["batch_size_per_worker"]

    # Get dataloaders inside the worker training function
    train_dataloader, test_dataloader = get_dataloaders(batch_size=batch_size)

    # [1] Prepare Dataloader for distributed training
    # Shard the datasets among workers and move batches to the correct device
    # =======================================================================
    train_dataloader = ray.train.torch.prepare_data_loader(train_dataloader)
    test_dataloader = ray.train.torch.prepare_data_loader(test_dataloader)

    model = NeuralNetwork()

    # [2] Prepare and wrap your model with DistributedDataParallel
    # Move the model to the correct GPU/CPU device
    # ============================================================
    model = ray.train.torch.prepare_model(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

    # Model training loop
    for epoch in range(epochs):
        if ray.train.get_context().get_world_size() > 1:
            # Required for the distributed sampler to shuffle properly across epochs.
            train_dataloader.sampler.set_epoch(epoch)

        model.train()
        for X, y in tqdm(train_dataloader, desc=f"Train Epoch {epoch}"):
            pred = model(X)
            loss = loss_fn(pred, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        test_loss, num_correct, num_total = 0, 0, 0
        with torch.no_grad():
            for X, y in tqdm(test_dataloader, desc=f"Test Epoch {epoch}"):
                pred = model(X)
                loss = loss_fn(pred, y)

                test_loss += loss.item()
                num_total += y.shape[0]
                num_correct += (pred.argmax(1) == y).sum().item()

        test_loss /= len(test_dataloader)
        accuracy = num_correct / num_total

        # [3] Report metrics to Ray Train
        # ===============================
        ray.train.report(metrics={"loss": test_loss, "accuracy": accuracy})


@ray.remote
def train_fashion_mnist(num_workers=2, use_gpu=False):
    global_batch_size = 32

    train_config = {
        "lr": 1e-3,
        "epochs": 10,
        "batch_size_per_worker": global_batch_size // num_workers,
    }

    # Configure computation resources
    scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)

    # Initialize a Ray TorchTrainer
    trainer = TorchTrainer(
        train_loop_per_worker=train_func_per_worker,
        train_loop_config=train_config,
        scaling_config=scaling_config,
    )

    # [4] Start distributed training
    # Run `train_func_per_worker` on all workers
    # =============================================
    result = trainer.fit()
    print(f"Training result: {result}")

# Connect to Ray Cluster


In [4]:
ray.init(
    address="ray://raycluster-head-svc.default.svc.cluster.local:10001",
    runtime_env={"pip": ["torch==2.4.0", "torchvision==0.19.0"]},
)

    Ray: 2.34.0
    Python: 3.9.19
This process on Ray Client was started with:
    Ray: 2.34.0
    Python: 3.9.13



0,1
Python version:,3.9.19
Ray version:,2.34.0
Dashboard:,http://10.0.2.192:8265


# Train model remotely


In [5]:
ray.get(train_fashion_mnist.remote(num_workers=6, use_gpu=False))

[36m(autoscaler +8s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.
[36m(autoscaler +8s)[0m Adding 1 node(s) of type small-group.
[36m(autoscaler +8s)[0m Resized to 4 CPUs.
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m 
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m View detailed results here: /home/ray/ray_results/TorchTrainer_2024-08-20_08-07-16
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2024-08-20_07-55-50_885073_1/artifacts/2024-08-20_08-07-16/TorchTrainer_2024-08-20_08-07-16/driver_artifacts`
[36m(autoscaler +3m16s)[0m Adding 1 node(s) of type small-group.
[36m(autoscaler +3m16s)[0m Resized to 8 CPUs.
[36m(autoscaler +3m16s)[0m No available node types can fulfill resource requests {'bundle_group_66e7422e3d142a6d037037c6e2a102000000': 0.001}*1. Add suitable node types to this cluster to resolve t

[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m Ignore this message if the cluster is autoscaling. Training has not started in the last 60 seconds. This could be due to the cluster not having enough resources available. You asked for 7.0 CPUs and 0 GPUs, but the cluster only has 4.0 CPUs and 0 GPUs available. Stop the training and adjust the required resources (e.g. via the `ScalingConfig` or `resources_per_trial`, or `num_workers` for rllib), or add more resources to your cluster.


[36m(autoscaler +4m12s)[0m No available node types can fulfill resource requests {'bundle_group_66e7422e3d142a6d037037c6e2a102000000': 0.001}*1. Add suitable node types to this cluster to resolve this issue.
[36m(autoscaler +4m17s)[0m No available node types can fulfill resource requests {'bundle_group_66e7422e3d142a6d037037c6e2a102000000': 0.001}*1. Add suitable node types to this cluster to resolve this issue.
[36m(autoscaler +4m22s)[0m No available node types can fulfill resource requests {'bundle_group_66e7422e3d142a6d037037c6e2a102000000': 0.001}*1. Add suitable node types to this cluster to resolve this issue.
[36m(autoscaler +4m27s)[0m No available node types can fulfill resource requests {'bundle_group_66e7422e3d142a6d037037c6e2a102000000': 0.001}*1. Add suitable node types to this cluster to resolve this issue.
[36m(autoscaler +4m32s)[0m No available node types can fulfill resource requests {'bundle_group_66e7422e3d142a6d037037c6e2a102000000': 0.001}*1. Add suitable 

[36m(RayTrainWorker pid=876, ip=10.0.2.84)[0m Setting up process group for: env:// [rank=0, world_size=6]
[36m(TorchTrainer pid=829, ip=10.0.2.84)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=829, ip=10.0.2.84)[0m - (node_id=5ae2bf3e104d858e8a50bde2092e640995c80f2dbaf1626944bad9dd, ip=10.0.2.84, pid=876) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=829, ip=10.0.2.84)[0m - (node_id=5ae2bf3e104d858e8a50bde2092e640995c80f2dbaf1626944bad9dd, ip=10.0.2.84, pid=877) world_rank=1, local_rank=1, node_rank=0
[36m(TorchTrainer pid=829, ip=10.0.2.84)[0m - (node_id=5ae2bf3e104d858e8a50bde2092e640995c80f2dbaf1626944bad9dd, ip=10.0.2.84, pid=878) world_rank=2, local_rank=2, node_rank=0
[36m(TorchTrainer pid=829, ip=10.0.2.84)[0m - (node_id=0d576c11aafb62c15a9de755c70bb52bd0500d299da122e9022fef66, ip=10.0.2.66, pid=457) world_rank=3, local_rank=0, node_rank=1
[36m(TorchTrainer pid=829, ip=10.0.2.84)[0m - (node_id=0d576c11aafb62c15a9de755c70bb52bd0500d

[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to /home/ray/data/FashionMNIST/raw/train-images-idx3-ubyte.gz
[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to /home/ray/data/FashionMNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/26421880 [00:00<?, ?it/s][0m 
  0%|          | 0/26421880 [00:00<?, ?it/s][0m 
  0%|          | 32768/26421880 [00:00<02:05, 210690.92it/s]
  0%|          | 32768/26421880 [00:00<02:07, 207735.98it/s]
  0%|          | 65536/26421880 [00:00<02:05, 209579.85it/s]
  0%|          | 65536/26421880 [00:00<02:07, 206806.50it/s]
  0%|          | 98304/26421880 [00:00<02:05, 209299.20it/s]
  0%|          | 131072/26421880 [00:00<01:27, 300694.20it/s]
  0%|          | 131072/26421880 [00:00<02:05, 209027.20it/s]
  1%|          | 229376/26421880 [00:00<01:01, 426507.80it/s]
  1%|          | 163840/26421880 [00:00<02:05, 209031.45it/s]
  2%|▏         | 458752/26421880 [00:00<00:32, 793226.92it/s]
  1%|          | 196608/26421880 [00:00<02:05, 209006.02it/s]
  4%|▎         | 950272/26421880 [00:00<00:16, 1575236.21it/s]
  7%|▋         | 1867776/26421880 [00:01<00:08, 2948215.44it/s]
  1%|          | 229376/26421880 [00:01<02:05, 208965.58it/s]
  1%|          | 262144/26421880 [00

[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m Extracting /home/ray/data/FashionMNIST/raw/train-images-idx3-ubyte.gz to /home/ray/data/FashionMNIST/raw


100%|██████████| 26421880/26421880 [00:03<00:00, 7784940.33it/s] 
  4%|▍         | 1015808/26421880 [00:03<01:01, 415912.31it/s]
  4%|▍         | 1081344/26421880 [00:03<01:00, 416627.44it/s]


[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m 
[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz


  4%|▍         | 1179648/26421880 [00:03<00:52, 476842.41it/s]
  5%|▍         | 1245184/26421880 [00:03<00:54, 460131.48it/s]
  5%|▌         | 1343488/26421880 [00:04<00:49, 508918.96it/s]


[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to /home/ray/data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


  5%|▌         | 1441792/26421880 [00:04<00:45, 543787.75it/s]
  6%|▌         | 1540096/26421880 [00:04<00:43, 568513.21it/s]
  0%|          | 0/29515 [00:00<?, ?it/s]6)[0m 
  6%|▌         | 1638400/26421880 [00:04<00:42, 585895.83it/s]
100%|██████████| 29515/29515 [00:00<00:00, 183235.59it/s]
  7%|▋         | 1736704/26421880 [00:04<00:41, 598368.56it/s]


[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m Extracting /home/ray/data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to /home/ray/data/FashionMNIST/raw
[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m 
[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz


  7%|▋         | 1835008/26421880 [00:04<00:40, 607134.30it/s]
  7%|▋         | 1966080/26421880 [00:05<00:36, 675288.92it/s]


[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to /home/ray/data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


  8%|▊         | 2097152/26421880 [00:05<00:33, 723325.41it/s]
  8%|▊         | 2228224/26421880 [00:05<00:31, 756541.55it/s]
  0%|          | 0/4422102 [00:00<?, ?it/s][0m 
  1%|          | 32768/4422102 [00:00<00:20, 210281.53it/s]
  9%|▉         | 2359296/26421880 [00:05<00:30, 779229.37it/s]
  9%|▉         | 2490368/26421880 [00:05<00:30, 796322.51it/s]
  1%|▏         | 65536/4422102 [00:00<00:20, 209876.36it/s]
  3%|▎         | 131072/4422102 [00:00<00:14, 305588.38it/s]
 10%|█         | 2654208/26421880 [00:05<00:27, 870066.62it/s]
  5%|▌         | 229376/4422102 [00:00<00:09, 433094.30it/s]
 11%|█         | 2785280/26421880 [00:05<00:27, 863907.91it/s]
 11%|█▏        | 2981888/26421880 [00:06<00:24, 976514.25it/s]
 10%|▉         | 425984/4422102 [00:00<00:05, 730780.62it/s]
 19%|█▉        | 851968/4422102 [00:00<00:02, 1409341.76it/s]
 12%|█▏        | 3145728/26421880 [00:06<00:23, 997582.92it/s]
 13%|█▎        | 3309568/26421880 [00:06<00:22, 1015972.55it/s]
 39%|███▊      | 1

[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m Extracting /home/ray/data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to /home/ray/data/FashionMNIST/raw
[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m 
[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz


 13%|█▎        | 3506176/26421880 [00:06<00:21, 1083940.00it/s]
 14%|█▍        | 3702784/26421880 [00:06<00:19, 1138436.06it/s]
 15%|█▍        | 3932160/26421880 [00:06<00:18, 1231363.89it/s]
 16%|█▌        | 4128768/26421880 [00:07<00:17, 1242700.61it/s]


[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to /home/ray/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


 17%|█▋        | 4390912/26421880 [00:07<00:16, 1364269.22it/s]
 17%|█▋        | 4620288/26421880 [00:07<00:15, 1398716.30it/s]
100%|██████████| 5148/5148 [00:00<00:00, 35166574.91it/s]
[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m Moving model to device: cpu
[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m Wrapping provided model in DistributedDataParallel.


[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m Extracting /home/ray/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to /home/ray/data/FashionMNIST/raw
[36m(RayTrainWorker pid=457, ip=10.0.2.66)[0m 


 18%|█▊        | 4882432/26421880 [00:07<00:14, 1476270.80it/s]
 19%|█▉        | 5144576/26421880 [00:07<00:13, 1539846.67it/s]
 21%|██        | 5439488/26421880 [00:07<00:12, 1637108.19it/s]
 22%|██▏       | 5734400/26421880 [00:07<00:12, 1710294.02it/s]
 23%|██▎       | 6029312/26421880 [00:08<00:11, 1769047.48it/s]
 24%|██▍       | 6356992/26421880 [00:08<00:10, 1865138.58it/s]
 25%|██▌       | 6717440/26421880 [00:08<00:09, 1985806.27it/s]
 27%|██▋       | 7077888/26421880 [00:08<00:09, 2079563.45it/s]
 28%|██▊       | 7438336/26421880 [00:08<00:08, 2155355.70it/s]
 30%|██▉       | 7831552/26421880 [00:08<00:08, 2261252.82it/s]
 31%|███▏      | 8257536/26421880 [00:09<00:07, 2397573.85it/s]
 33%|███▎      | 8683520/26421880 [00:09<00:07, 2493735.26it/s]
 35%|███▍      | 9142272/26421880 [00:09<00:06, 2623258.60it/s]
 36%|███▋      | 9633792/26421880 [00:09<00:06, 2770404.28it/s]
 38%|███▊      | 10125312/26421880 [00:09<00:05, 2879204.85it/s]
 40%|████      | 10682368/26421880 [00:

[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m Extracting /home/ray/data/FashionMNIST/raw/train-images-idx3-ubyte.gz to /home/ray/data/FashionMNIST/raw
[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m 
[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to /home/ray/data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/29515 [00:00<?, ?it/s]4)[0m 


[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m Extracting /home/ray/data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to /home/ray/data/FashionMNIST/raw
[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m 
[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz


100%|██████████| 29515/29515 [00:00<00:00, 188222.79it/s]


[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to /home/ray/data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/4422102 [00:00<?, ?it/s][0m 
  1%|          | 32768/4422102 [00:00<00:21, 204508.86it/s]
  1%|▏         | 65536/4422102 [00:00<00:21, 204214.29it/s]
  3%|▎         | 131072/4422102 [00:00<00:14, 297165.31it/s]
  5%|▌         | 229376/4422102 [00:00<00:09, 421632.55it/s]
 10%|█         | 458752/4422102 [00:00<00:05, 784708.20it/s]
 21%|██▏       | 950272/4422102 [00:00<00:02, 1558225.45it/s]
 42%|████▏     | 1867776/4422102 [00:01<00:00, 2916817.54it/s]


[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m Extracting /home/ray/data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to /home/ray/data/FashionMNIST/raw
[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m 
[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz


100%|██████████| 4422102/4422102 [00:01<00:00, 3415137.69it/s]


[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to /home/ray/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz
[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m Extracting /home/ray/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to /home/ray/data/FashionMNIST/raw
[36m(RayTrainWorker pid=877, ip=10.0.2.84)[0m 


100%|██████████| 5148/5148 [00:00<00:00, 34164995.24it/s]
Train Epoch 0:   0%|          | 0/2000 [00:00<?, ?it/s]
Train Epoch 0:   0%|          | 0/2000 [00:00<?, ?it/s]
[36m(RayTrainWorker pid=876, ip=10.0.2.84)[0m Moving model to device: cpu
[36m(RayTrainWorker pid=876, ip=10.0.2.84)[0m Wrapping provided model in DistributedDataParallel.
Train Epoch 0:   0%|          | 0/2000 [00:00<?, ?it/s]
Train Epoch 0:   0%|          | 0/2000 [00:00<?, ?it/s]
Train Epoch 0:   0%|          | 0/2000 [00:00<?, ?it/s]
Train Epoch 0:   0%|          | 0/2000 [00:00<?, ?it/s]
Train Epoch 0:   0%|          | 3/2000 [00:00<01:10, 28.28it/s]
Train Epoch 0:   0%|          | 3/2000 [00:00<01:13, 27.05it/s]
Train Epoch 0:   0%|          | 3/2000 [00:00<01:07, 29.37it/s]
Train Epoch 0:   0%|          | 3/2000 [00:00<01:10, 28.47it/s]
Train Epoch 0:   0%|          | 3/2000 [00:00<01:07, 29.47it/s]
Train Epoch 0:   0%|          | 4/2000 [00:00<01:06, 30.14it/s]
Train Epoch 0:   0%|          | 7/2000 [00:00<

[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m 
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m Training finished iteration 1 at 2024-08-20 08:11:07. Total running time: 3min 50s
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ╭───────────────────────────────╮
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ Training result               │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ├───────────────────────────────┤
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ checkpoint_dir_name           │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_this_iter_s      143.165 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_total_s          143.165 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ training_iteration          1 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ accuracy              0.78884 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ loss                  0.54086 │
[36m(train_fashion_mnist p

Train Epoch 1:   0%|          | 10/2000 [00:00<00:39, 50.96it/s]
Train Epoch 1:   0%|          | 10/2000 [00:00<00:38, 51.51it/s]
Train Epoch 1:   1%|          | 11/2000 [00:00<00:36, 54.93it/s]
Train Epoch 1:   1%|          | 18/2000 [00:00<00:32, 61.06it/s]
Train Epoch 1:   1%|          | 18/2000 [00:00<00:32, 61.33it/s]
Train Epoch 1:   1%|          | 19/2000 [00:00<00:31, 63.04it/s]
Train Epoch 1:   1%|          | 15/2000 [00:00<00:28, 68.75it/s]
Train Epoch 1:   1%|          | 15/2000 [00:00<00:29, 67.68it/s]
Train Epoch 1:   1%|          | 15/2000 [00:00<00:29, 68.27it/s]
Train Epoch 1:   1%|▏         | 26/2000 [00:00<00:30, 65.55it/s]
Train Epoch 1:   1%|▏         | 26/2000 [00:00<00:30, 65.64it/s]
Train Epoch 1:   1%|▏         | 27/2000 [00:00<00:29, 67.08it/s]
Train Epoch 1:   1%|          | 23/2000 [00:00<00:28, 70.24it/s]
Train Epoch 1:   1%|          | 23/2000 [00:00<00:28, 69.48it/s]
Train Epoch 1:   1%|          | 23/2000 [00:00<00:28, 69.81it/s]
Train Epoch 1:   2%|▏    

[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m 
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m Training finished iteration 2 at 2024-08-20 08:11:35. Total running time: 4min 18s
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ╭───────────────────────────────╮
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ Training result               │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ├───────────────────────────────┤
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ checkpoint_dir_name           │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_this_iter_s      28.6825 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_total_s          171.847 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ training_iteration          2 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ accuracy              0.82543 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ loss                  0.45826 │
[36m(train_fashion_mnist p

Train Epoch 2:   1%|          | 17/2000 [00:00<00:36, 54.79it/s]
Train Epoch 2:   1%|          | 17/2000 [00:00<00:35, 56.26it/s]
Train Epoch 2:   1%|          | 16/2000 [00:00<00:36, 55.08it/s]
Train Epoch 2:   1%|          | 15/2000 [00:00<00:28, 70.07it/s]
Train Epoch 2:   1%|          | 15/2000 [00:00<00:27, 71.51it/s]
Train Epoch 2:   1%|          | 11/2000 [00:00<00:36, 54.37it/s]
Train Epoch 2:   1%|▏         | 25/2000 [00:00<00:32, 61.25it/s]
Train Epoch 2:   1%|▏         | 25/2000 [00:00<00:31, 62.58it/s]
Train Epoch 2:   1%|          | 24/2000 [00:00<00:31, 62.06it/s]
Train Epoch 2:   1%|          | 23/2000 [00:00<00:27, 71.21it/s]
Train Epoch 2:   1%|          | 23/2000 [00:00<00:27, 72.00it/s]
Train Epoch 2:   1%|          | 19/2000 [00:00<00:31, 62.65it/s]
Train Epoch 2:   2%|▏         | 32/2000 [00:00<00:30, 65.56it/s]
Train Epoch 2:   2%|▏         | 31/2000 [00:00<00:27, 71.72it/s]
Train Epoch 2:   2%|▏         | 31/2000 [00:00<00:27, 72.14it/s]
Train Epoch 2:   1%|▏    

[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m 
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m Training finished iteration 3 at 2024-08-20 08:12:04. Total running time: 4min 47s
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ╭───────────────────────────────╮
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ Training result               │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ├───────────────────────────────┤
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ checkpoint_dir_name           │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_this_iter_s      28.9163 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_total_s          200.764 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ training_iteration          3 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ accuracy              0.84283 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ loss                  0.42585 │
[36m(train_fashion_mnist p

Train Epoch 3:   1%|          | 11/2000 [00:00<00:35, 55.42it/s]
Train Epoch 3:   1%|          | 12/2000 [00:00<00:33, 58.97it/s]
Train Epoch 3:   1%|          | 13/2000 [00:00<00:31, 62.88it/s]
Train Epoch 3:   1%|          | 16/2000 [00:00<00:27, 71.81it/s]
Train Epoch 3:   1%|          | 15/2000 [00:00<00:28, 69.31it/s]
Train Epoch 3:   1%|          | 15/2000 [00:00<00:28, 69.76it/s]
Train Epoch 3:   1%|          | 19/2000 [00:00<00:31, 62.68it/s]
Train Epoch 3:   1%|          | 20/2000 [00:00<00:30, 64.49it/s]
Train Epoch 3:   1%|          | 21/2000 [00:00<00:29, 66.39it/s]
Train Epoch 3:   1%|          | 24/2000 [00:00<00:27, 71.86it/s]
Train Epoch 3:   1%|          | 23/2000 [00:00<00:28, 70.34it/s]
Train Epoch 3:   1%|          | 23/2000 [00:00<00:27, 70.99it/s]
Train Epoch 3:   1%|▏         | 27/2000 [00:00<00:29, 67.34it/s]
Train Epoch 3:   1%|▏         | 28/2000 [00:00<00:28, 68.30it/s]
Train Epoch 3:   1%|▏         | 29/2000 [00:00<00:28, 69.02it/s]
Train Epoch 3:   2%|▏    

[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m 
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m Training finished iteration 4 at 2024-08-20 08:12:33. Total running time: 5min 16s
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ╭───────────────────────────────╮
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ Training result               │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ├───────────────────────────────┤
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ checkpoint_dir_name           │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_this_iter_s      28.7048 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_total_s          229.468 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ training_iteration          4 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ accuracy              0.85663 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ loss                  0.41158 │
[36m(train_fashion_mnist p

Train Epoch 4:   0%|          | 1/2000 [00:00<03:26,  9.67it/s]
Train Epoch 4:   0%|          | 2/2000 [00:00<01:48, 18.33it/s]
Train Epoch 4:   0%|          | 2/2000 [00:00<01:52, 17.74it/s]
Train Epoch 4:   0%|          | 5/2000 [00:00<00:43, 45.84it/s]
Train Epoch 4:   0%|          | 9/2000 [00:00<00:41, 48.18it/s]
Train Epoch 4:   0%|          | 10/2000 [00:00<00:39, 50.46it/s]
Train Epoch 4:   0%|          | 10/2000 [00:00<00:39, 50.08it/s]
Train Epoch 4:   1%|          | 13/2000 [00:00<00:32, 61.88it/s]
Train Epoch 4:   0%|          | 8/2000 [00:00<00:28, 70.81it/s]
Train Epoch 4:   0%|          | 8/2000 [00:00<00:28, 69.92it/s]
Train Epoch 4:   1%|          | 17/2000 [00:00<00:33, 59.71it/s]
Train Epoch 4:   1%|          | 18/2000 [00:00<00:32, 61.34it/s]
Train Epoch 4:   1%|          | 18/2000 [00:00<00:32, 60.92it/s]
Train Epoch 4:   1%|          | 21/2000 [00:00<00:29, 67.91it/s]
Train Epoch 4:   1%|          | 16/2000 [00:00<00:27, 72.57it/s]
Train Epoch 4:   1%|          | 

[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m 
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m Training finished iteration 5 at 2024-08-20 08:13:02. Total running time: 5min 45s
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ╭───────────────────────────────╮
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ Training result               │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ├───────────────────────────────┤
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ checkpoint_dir_name           │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_this_iter_s      28.6922 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_total_s          258.161 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ training_iteration          5 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ accuracy              0.86383 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ loss                  0.38492 │
[36m(train_fashion_mnist p

Train Epoch 5:   1%|          | 11/2000 [00:00<00:37, 53.34it/s]
Train Epoch 5:   1%|          | 11/2000 [00:00<00:37, 53.61it/s]
Train Epoch 5:   1%|          | 12/2000 [00:00<00:34, 57.25it/s]
Train Epoch 5:   1%|          | 14/2000 [00:00<00:29, 66.27it/s]
Train Epoch 5:   1%|          | 14/2000 [00:00<00:31, 63.71it/s]
Train Epoch 5:   1%|          | 15/2000 [00:00<00:28, 70.06it/s]
Train Epoch 5:   1%|          | 19/2000 [00:00<00:32, 61.79it/s]
Train Epoch 5:   1%|          | 19/2000 [00:00<00:31, 62.02it/s]
Train Epoch 5:   1%|          | 20/2000 [00:00<00:30, 63.96it/s]
Train Epoch 5:   1%|          | 22/2000 [00:00<00:28, 69.54it/s]
Train Epoch 5:   1%|          | 22/2000 [00:00<00:29, 68.08it/s]
Train Epoch 5:   1%|          | 23/2000 [00:00<00:27, 71.44it/s]
Train Epoch 5:   1%|▏         | 27/2000 [00:00<00:30, 64.99it/s]
Train Epoch 5:   1%|▏         | 27/2000 [00:00<00:30, 65.12it/s]
Train Epoch 5:   1%|▏         | 28/2000 [00:00<00:29, 66.30it/s]
Train Epoch 5:   1%|▏    

[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m 
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m Training finished iteration 6 at 2024-08-20 08:13:30. Total running time: 6min 13s
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ╭───────────────────────────────╮
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ Training result               │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ├───────────────────────────────┤
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ checkpoint_dir_name           │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_this_iter_s      28.4478 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_total_s          286.608 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ training_iteration          6 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ accuracy              0.86203 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ loss                  0.37738 │
[36m(train_fashion_mnist p

Train Epoch 6:   0%|          | 9/2000 [00:00<00:41, 47.71it/s]
Train Epoch 6:   1%|          | 14/2000 [00:00<00:30, 65.06it/s]
Train Epoch 6:   1%|          | 17/2000 [00:00<00:33, 59.66it/s]
Train Epoch 6:   1%|          | 11/2000 [00:00<00:37, 52.36it/s]
Train Epoch 6:   1%|          | 11/2000 [00:00<00:35, 55.30it/s]
Train Epoch 6:   1%|          | 22/2000 [00:00<00:28, 68.70it/s]
Train Epoch 6:   1%|          | 15/2000 [00:00<00:28, 69.64it/s]
Train Epoch 6:   1%|          | 15/2000 [00:00<00:28, 69.31it/s]
Train Epoch 6:   1%|▏         | 25/2000 [00:00<00:30, 64.66it/s]
Train Epoch 6:   1%|          | 19/2000 [00:00<00:32, 61.27it/s]
Train Epoch 6:   1%|          | 19/2000 [00:00<00:31, 63.09it/s]
Train Epoch 6:   1%|▏         | 29/2000 [00:00<00:28, 68.77it/s]
Train Epoch 6:   1%|          | 23/2000 [00:00<00:28, 70.36it/s]
Train Epoch 6:   1%|          | 23/2000 [00:00<00:27, 71.14it/s]
Train Epoch 6:   2%|▏         | 32/2000 [00:00<00:29, 66.06it/s]
Train Epoch 6:   1%|▏     

[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m 
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m Training finished iteration 7 at 2024-08-20 08:13:59. Total running time: 6min 42s
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ╭───────────────────────────────╮
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ Training result               │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ├───────────────────────────────┤
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ checkpoint_dir_name           │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_this_iter_s      28.6233 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_total_s          315.232 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ training_iteration          7 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ accuracy              0.87103 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ loss                  0.35866 │
[36m(train_fashion_mnist p

Train Epoch 7:   1%|          | 12/2000 [00:00<00:34, 58.05it/s]
Train Epoch 7:   1%|          | 13/2000 [00:00<00:31, 63.27it/s]
Train Epoch 7:   1%|          | 19/2000 [00:00<00:31, 63.49it/s]
Train Epoch 7:   1%|          | 15/2000 [00:00<00:28, 70.55it/s]
Train Epoch 7:   1%|          | 14/2000 [00:00<00:30, 65.52it/s]
Train Epoch 7:   1%|          | 15/2000 [00:00<00:28, 68.48it/s]
Train Epoch 7:   1%|          | 20/2000 [00:00<00:29, 66.10it/s]
Train Epoch 7:   1%|          | 21/2000 [00:00<00:28, 68.57it/s]
Train Epoch 7:   1%|▏         | 27/2000 [00:00<00:29, 67.99it/s]
Train Epoch 7:   1%|          | 23/2000 [00:00<00:26, 73.76it/s]
Train Epoch 7:   1%|          | 22/2000 [00:00<00:28, 70.15it/s]
Train Epoch 7:   1%|          | 23/2000 [00:00<00:27, 72.56it/s]
Train Epoch 7:   1%|▏         | 28/2000 [00:00<00:28, 68.74it/s]
Train Epoch 7:   1%|▏         | 29/2000 [00:00<00:28, 70.24it/s]
Train Epoch 7:   2%|▏         | 34/2000 [00:00<00:28, 68.02it/s]
Train Epoch 7:   2%|▏    

[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m 
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m Training finished iteration 8 at 2024-08-20 08:14:27. Total running time: 7min 10s
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ╭───────────────────────────────╮
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ Training result               │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ├───────────────────────────────┤
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ checkpoint_dir_name           │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_this_iter_s      28.6214 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_total_s          343.853 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ training_iteration          8 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ accuracy              0.86923 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ loss                  0.36118 │
[36m(train_fashion_mnist p

Train Epoch 8:   0%|          | 7/2000 [00:00<00:30, 64.53it/s]
Train Epoch 8:   0%|          | 8/2000 [00:00<00:28, 70.98it/s]
Train Epoch 8:   0%|          | 4/2000 [00:00<00:57, 34.69it/s]
Train Epoch 8:   0%|          | 6/2000 [00:00<00:34, 57.87it/s]
Train Epoch 8:   0%|          | 6/2000 [00:00<00:35, 56.37it/s]
Train Epoch 8:   0%|          | 6/2000 [00:00<00:34, 58.06it/s]
Train Epoch 8:   1%|          | 15/2000 [00:00<00:29, 68.43it/s]
Train Epoch 8:   1%|          | 16/2000 [00:00<00:27, 72.32it/s]
Train Epoch 8:   1%|          | 12/2000 [00:00<00:35, 56.04it/s]
Train Epoch 8:   1%|          | 14/2000 [00:00<00:29, 66.58it/s]
Train Epoch 8:   1%|          | 14/2000 [00:00<00:30, 65.86it/s]
Train Epoch 8:   1%|          | 14/2000 [00:00<00:29, 66.75it/s]
Train Epoch 8:   1%|          | 23/2000 [00:00<00:27, 72.52it/s]
Train Epoch 8:   1%|          | 20/2000 [00:00<00:30, 64.65it/s]
Train Epoch 8:   1%|          | 24/2000 [00:00<00:26, 74.36it/s]
Train Epoch 8:   1%|          |

[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m 
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m Training finished iteration 9 at 2024-08-20 08:14:56. Total running time: 7min 39s
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ╭───────────────────────────────╮
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ Training result               │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ├───────────────────────────────┤
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ checkpoint_dir_name           │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_this_iter_s      28.7684 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_total_s          372.621 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ training_iteration          9 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ accuracy              0.87762 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ loss                  0.35107 │
[36m(train_fashion_mnist p

Train Epoch 9:   0%|          | 5/2000 [00:00<00:40, 49.54it/s]
Train Epoch 9:   0%|          | 6/2000 [00:00<00:37, 53.28it/s]
Train Epoch 9:   0%|          | 5/2000 [00:00<00:45, 44.07it/s]
Train Epoch 9:   0%|          | 7/2000 [00:00<00:29, 67.59it/s]
Train Epoch 9:   0%|          | 7/2000 [00:00<00:32, 62.16it/s]
Train Epoch 9:   0%|          | 7/2000 [00:00<00:29, 68.58it/s]
Train Epoch 9:   1%|          | 13/2000 [00:00<00:32, 61.59it/s]
Train Epoch 9:   1%|          | 13/2000 [00:00<00:31, 62.25it/s]
Train Epoch 9:   1%|          | 12/2000 [00:00<00:34, 58.24it/s]
Train Epoch 9:   1%|          | 15/2000 [00:00<00:28, 69.09it/s]
Train Epoch 9:   1%|          | 15/2000 [00:00<00:29, 66.81it/s]
Train Epoch 9:   1%|          | 15/2000 [00:00<00:28, 69.61it/s]
Train Epoch 9:   1%|          | 21/2000 [00:00<00:30, 65.81it/s]
Train Epoch 9:   1%|          | 21/2000 [00:00<00:29, 66.04it/s]
Train Epoch 9:   1%|          | 19/2000 [00:00<00:31, 63.22it/s]
Train Epoch 9:   1%|          |

[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m 
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m Training finished iteration 10 at 2024-08-20 08:15:25. Total running time: 8min 8s
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ╭───────────────────────────────╮
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ Training result               │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m ├───────────────────────────────┤
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ checkpoint_dir_name           │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_this_iter_s      28.6079 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ time_total_s          401.229 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ training_iteration         10 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ accuracy              0.86743 │
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m │ loss                  0.35137 │
[36m(train_fashion_mnist p

# Shut down Ray workers


In [6]:
ray.shutdown()

[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m 
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m Training completed after 10 iterations at 2024-08-20 08:15:26. Total running time: 8min 9s
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m 
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m Training result: Result(
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m   metrics={'loss': 0.35136826510797675, 'accuracy': 0.8674265146970606},
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m   path='/home/ray/ray_results/TorchTrainer_2024-08-20_08-07-16/TorchTrainer_e36e0_00000_0_2024-08-20_08-07-16',
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m   filesystem='local',
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m   checkpoint=None
[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m )


[36m(train_fashion_mnist pid=441, ip=10.0.2.84)[0m Wrote the latest version of all result files and experiment state to '/home/ray/ray_results/TorchTrainer_2024-08-20_08-07-16' in 0.0028s.
