In [1]:
import os
from typing import Dict

import torch
from filelock import FileLock
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import Normalize, ToTensor
from tqdm import tqdm

import ray.train
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

In [2]:
def get_dataloaders(batch_size):
    # Transform to normalize the input images
    transform = transforms.Compose([ToTensor(), Normalize((0.5,), (0.5,))])

    with FileLock(os.path.expanduser("~/data.lock")):
        # Download training data from open datasets
        training_data = datasets.FashionMNIST(
            root="~/data",
            train=True,
            download=True,
            transform=transform,
        )

        # Download test data from open datasets
        test_data = datasets.FashionMNIST(
            root="~/data",
            train=False,
            download=True,
            transform=transform,
        )

    # Create data loaders
    train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    return train_dataloader, test_dataloader

In [3]:
# Model Definition
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(512, 10),
            nn.ReLU(),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [4]:
def train_func_per_worker(config: Dict):
    lr = config["lr"]
    epochs = config["epochs"]
    batch_size = config["batch_size_per_worker"]

    # Get dataloaders inside the worker training function
    train_dataloader, test_dataloader = get_dataloaders(batch_size=batch_size)

    # [1] Prepare Dataloader for distributed training
    # Shard the datasets among workers and move batches to the correct device
    # =======================================================================
    train_dataloader = ray.train.torch.prepare_data_loader(train_dataloader)
    test_dataloader = ray.train.torch.prepare_data_loader(test_dataloader)

    model = NeuralNetwork()

    # [2] Prepare and wrap your model with DistributedDataParallel
    # Move the model to the correct GPU/CPU device
    # ============================================================
    model = ray.train.torch.prepare_model(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

    # Model training loop
    for epoch in range(epochs):
        if ray.train.get_context().get_world_size() > 1:
            # Required for the distributed sampler to shuffle properly across epochs.
            train_dataloader.sampler.set_epoch(epoch)

        model.train()
        for X, y in tqdm(train_dataloader, desc=f"Train Epoch {epoch}"):
            pred = model(X)
            loss = loss_fn(pred, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        test_loss, num_correct, num_total = 0, 0, 0
        with torch.no_grad():
            for X, y in tqdm(test_dataloader, desc=f"Test Epoch {epoch}"):
                pred = model(X)
                loss = loss_fn(pred, y)

                test_loss += loss.item()
                num_total += y.shape[0]
                num_correct += (pred.argmax(1) == y).sum().item()

        test_loss /= len(test_dataloader)
        accuracy = num_correct / num_total

        # [3] Report metrics to Ray Train
        # ===============================
        ray.train.report(metrics={"loss": test_loss, "accuracy": accuracy})


In [5]:
def train_fashion_mnist(num_workers=2, use_gpu=False):
    global_batch_size = 32

    train_config = {
        "lr": 1e-3,
        "epochs": 10,
        "batch_size_per_worker": global_batch_size // num_workers,
    }

    # Configure computation resources
    scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)

    # Initialize a Ray TorchTrainer
    trainer = TorchTrainer(
        train_loop_per_worker=train_func_per_worker,
        train_loop_config=train_config,
        scaling_config=scaling_config,
    )

    # [4] Start distributed training
    # Run `train_func_per_worker` on all workers
    # =============================================
    result = trainer.fit()
    print(f"Training result: {result}")

In [6]:
train_fashion_mnist(num_workers=4, use_gpu=False)

2024-09-29 11:43:02,397	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m
2024-09-29 11:43:02,935	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `<FrameworkTrainer>(...)`.
2024-09-29 11:43:02,936	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2024-09-29 11:43:03 (running for 00:00:00.13)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 PENDING)




[36m(TorchTrainer pid=53623)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=53623)[0m - (node_id=2ed88b6ebc0fcf280abb7fc3226a29881fbdf419bec129f08b318fd4, ip=127.0.0.1, pid=53630) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=53623)[0m - (node_id=2ed88b6ebc0fcf280abb7fc3226a29881fbdf419bec129f08b318fd4, ip=127.0.0.1, pid=53631) world_rank=1, local_rank=1, node_rank=0
[36m(TorchTrainer pid=53623)[0m - (node_id=2ed88b6ebc0fcf280abb7fc3226a29881fbdf419bec129f08b318fd4, ip=127.0.0.1, pid=53632) world_rank=2, local_rank=2, node_rank=0
[36m(TorchTrainer pid=53623)[0m - (node_id=2ed88b6ebc0fcf280abb7fc3226a29881fbdf419bec129f08b318fd4, ip=127.0.0.1, pid=53633) world_rank=3, local_rank=3, node_rank=0
[36m(RayTrainWorker pid=53630)[0m Setting up process group for: env:// [rank=0, world_size=4]


[36m(RayTrainWorker pid=53630)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
[36m(RayTrainWorker pid=53630)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to /Users/atharvakulkarni/data/FashionMNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/26421880 [00:00<?, ?it/s]


== Status ==
Current time: 2024-09-29 11:43:08 (running for 00:00:05.13)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




  0%|          | 32768/26421880 [00:00<02:01, 216433.16it/s]
  0%|          | 65536/26421880 [00:00<02:10, 202472.26it/s]
  0%|          | 131072/26421880 [00:00<01:30, 289279.37it/s]
  1%|          | 229376/26421880 [00:00<01:04, 403294.88it/s]
  2%|▏         | 491520/26421880 [00:00<00:32, 799834.21it/s]
  4%|▎         | 983040/26421880 [00:01<00:16, 1504603.66it/s]
 11%|█         | 2883584/26421880 [00:01<00:06, 3903698.79it/s]
 13%|█▎        | 3407872/26421880 [00:01<00:05, 3957392.96it/s]
 27%|██▋       | 7077888/26421880 [00:01<00:01, 11290553.83it/s]
 32%|███▏      | 8454144/26421880 [00:01<00:01, 11153441.44it/s]
 40%|███▉      | 10485760/26421880 [00:01<00:01, 13396533.78it/s]
 45%|████▌     | 11993088/26421880 [00:01<00:01, 13105029.20it/s]
 52%|█████▏    | 13697024/26421880 [00:02<00:00, 14036279.94it/s]
 59%|█████▉    | 15532032/26421880 [00:02<00:00, 14643627.83it/s]
 66%|██████▌   | 17432576/26421880 [00:02<00:00, 15753470.49it/s]
 72%|███████▏  | 19070976/26421880 [00:02

[36m(RayTrainWorker pid=53630)[0m Extracting /Users/atharvakulkarni/data/FashionMNIST/raw/train-images-idx3-ubyte.gz to /Users/atharvakulkarni/data/FashionMNIST/raw
[36m(RayTrainWorker pid=53630)[0m 
[36m(RayTrainWorker pid=53630)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
[36m(RayTrainWorker pid=53630)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to /Users/atharvakulkarni/data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/29515 [00:00<?, ?it/s]
100%|██████████| 29515/29515 [00:00<00:00, 181621.02it/s]


[36m(RayTrainWorker pid=53630)[0m Extracting /Users/atharvakulkarni/data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to /Users/atharvakulkarni/data/FashionMNIST/raw
[36m(RayTrainWorker pid=53630)[0m 
[36m(RayTrainWorker pid=53630)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
[36m(RayTrainWorker pid=53630)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to /Users/atharvakulkarni/data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/4422102 [00:00<?, ?it/s]
  1%|          | 32768/4422102 [00:00<00:23, 189917.54it/s]


== Status ==
Current time: 2024-09-29 11:43:13 (running for 00:00:10.14)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




  1%|▏         | 65536/4422102 [00:00<00:35, 123376.05it/s]
  2%|▏         | 98304/4422102 [00:00<00:29, 145295.17it/s]
  3%|▎         | 131072/4422102 [00:00<00:26, 160948.97it/s]
  4%|▎         | 163840/4422102 [00:01<00:24, 171697.69it/s]
  4%|▍         | 196608/4422102 [00:01<00:23, 178711.00it/s]
  5%|▌         | 229376/4422102 [00:01<00:22, 184584.43it/s]
  6%|▌         | 262144/4422102 [00:01<00:22, 187703.48it/s]
  7%|▋         | 294912/4422102 [00:01<00:21, 190446.91it/s]
  7%|▋         | 327680/4422102 [00:01<00:20, 196419.00it/s]
  9%|▉         | 393216/4422102 [00:02<00:16, 247601.28it/s]
 10%|▉         | 425984/4422102 [00:02<00:17, 230968.48it/s]
 10%|█         | 458752/4422102 [00:02<00:17, 227640.87it/s]
 12%|█▏        | 524288/4422102 [00:02<00:14, 268875.50it/s]
 13%|█▎        | 557056/4422102 [00:02<00:15, 255828.01it/s]
 14%|█▍        | 622592/4422102 [00:02<00:13, 286541.81it/s]
 16%|█▌        | 688128/4422102 [00:03<00:11, 315102.89it/s]
 16%|█▋        | 720896/44

== Status ==
Current time: 2024-09-29 11:43:18 (running for 00:00:15.24)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




 46%|████▌     | 2031616/4422102 [00:05<00:03, 621379.85it/s]
 49%|████▉     | 2162688/4422102 [00:05<00:03, 665149.92it/s]
 53%|█████▎    | 2326528/4422102 [00:06<00:02, 756273.54it/s]
 56%|█████▌    | 2457600/4422102 [00:06<00:02, 757804.28it/s]
 59%|█████▉    | 2621440/4422102 [00:06<00:02, 811139.55it/s]
 63%|██████▎   | 2785280/4422102 [00:06<00:01, 858088.05it/s]
 67%|██████▋   | 2981888/4422102 [00:06<00:01, 950451.05it/s]
 72%|███████▏  | 3178496/4422102 [00:06<00:01, 1017579.23it/s]
 76%|███████▋  | 3375104/4422102 [00:07<00:00, 1064133.48it/s]
 82%|████████▏ | 3604480/4422102 [00:07<00:00, 1149034.06it/s]
 87%|████████▋ | 3833856/4422102 [00:07<00:00, 1262323.51it/s]
 93%|█████████▎| 4096000/4422102 [00:07<00:00, 1305300.23it/s]
100%|██████████| 4422102/4422102 [00:07<00:00, 569479.82it/s] 


[36m(RayTrainWorker pid=53630)[0m Extracting /Users/atharvakulkarni/data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to /Users/atharvakulkarni/data/FashionMNIST/raw
[36m(RayTrainWorker pid=53630)[0m 
[36m(RayTrainWorker pid=53630)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
[36m(RayTrainWorker pid=53630)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to /Users/atharvakulkarni/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz
[36m(RayTrainWorker pid=53630)[0m Extracting /Users/atharvakulkarni/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to /Users/atharvakulkarni/data/FashionMNIST/raw
[36m(RayTrainWorker pid=53630)[0m 


100%|██████████| 5148/5148 [00:00<00:00, 6021270.77it/s]
[36m(RayTrainWorker pid=53630)[0m Moving model to device: cpu
[36m(RayTrainWorker pid=53630)[0m Wrapping provided model in DistributedDataParallel.
Train Epoch 0:   0%|          | 0/1875 [00:00<?, ?it/s]
Train Epoch 0:   0%|          | 9/1875 [00:00<00:21, 86.85it/s]


== Status ==
Current time: 2024-09-29 11:43:23 (running for 00:00:20.31)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 0:   0%|          | 0/1875 [00:00<?, ?it/s][32m [repeated 3x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
Train Epoch 0:  49%|████▉     | 925/1875 [00:05<00:05, 184.09it/s][32m [repeated 188x across cluster][0m


== Status ==
Current time: 2024-09-29 11:43:28 (running for 00:00:25.39)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 0:  92%|█████████▏| 1728/1875 [00:09<00:00, 175.32it/s]
Train Epoch 0:  91%|█████████ | 1710/1875 [00:09<00:00, 179.76it/s][32m [repeated 167x across cluster][0m
Train Epoch 0: 100%|██████████| 1875/1875 [00:10<00:00, 178.33it/s]
Test Epoch 0:   0%|          | 0/313 [00:00<?, ?it/s]
Test Epoch 0:  28%|██▊       | 87/313 [00:00<00:00, 868.33it/s]
Test Epoch 0: 100%|██████████| 313/313 [00:00<00:00, 982.71it/s] 
Train Epoch 1:   0%|          | 0/1875 [00:00<?, ?it/s]
Test Epoch 0: 100%|██████████| 313/313 [00:00<00:00, 1021.63it/s]
Test Epoch 0: 100%|██████████| 313/313 [00:00<00:00, 951.37it/s]
Train Epoch 1:   7%|▋         | 128/1875 [00:00<00:11, 146.72it/s]


== Status ==
Current time: 2024-09-29 11:43:33 (running for 00:00:30.47)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 1:  13%|█▎        | 249/1875 [00:01<00:11, 145.98it/s]
Train Epoch 0:  99%|█████████▉| 1853/1875 [00:10<00:00, 165.97it/s][32m [repeated 31x across cluster][0m
Train Epoch 1:  33%|███▎      | 612/1875 [00:04<00:09, 139.32it/s][32m [repeated 153x across cluster][0m
Train Epoch 0: 100%|██████████| 1875/1875 [00:10<00:00, 178.34it/s][32m [repeated 3x across cluster][0m
Test Epoch 0:   0%|          | 0/313 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Test Epoch 0:  60%|██████    | 189/313 [00:00<00:00, 956.91it/s][32m [repeated 7x across cluster][0m
Test Epoch 0: 100%|██████████| 313/313 [00:00<00:00, 972.76it/s]
Train Epoch 1:   0%|          | 0/1875 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Train Epoch 1:  21%|██        | 392/1875 [00:02<00:09, 152.20it/s][32m [repeated 2x across cluster][0m


== Status ==
Current time: 2024-09-29 11:43:38 (running for 00:00:35.53)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 1:  68%|██████▊   | 1281/1875 [00:09<00:04, 138.23it/s][32m [repeated 184x across cluster][0m


== Status ==
Current time: 2024-09-29 11:43:43 (running for 00:00:40.61)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 1:  92%|█████████▏| 1725/1875 [00:12<00:01, 137.19it/s]
Test Epoch 1:   0%|          | 0/313 [00:00<?, ?it/s]
Test Epoch 1:  46%|████▌     | 144/313 [00:00<00:00, 1439.97it/s]
Test Epoch 1:  93%|█████████▎| 290/313 [00:00<00:00, 1439.86it/s]
Train Epoch 2:   0%|          | 0/1875 [00:00<?, ?it/s]
Train Epoch 2:   3%|▎         | 52/1875 [00:00<00:14, 126.49it/s][32m [repeated 133x across cluster][0m
Train Epoch 2:   4%|▍         | 81/1875 [00:00<00:13, 134.50it/s]


== Status ==
Current time: 2024-09-29 11:43:48 (running for 00:00:45.67)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 1: 100%|██████████| 1875/1875 [00:13<00:00, 135.31it/s][32m [repeated 47x across cluster][0m
Test Epoch 1:   0%|          | 0/313 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Test Epoch 1:  49%|████▊     | 152/313 [00:00<00:00, 1512.35it/s][32m [repeated 3x across cluster][0m
Test Epoch 1: 100%|██████████| 313/313 [00:00<00:00, 1455.80it/s][32m [repeated 7x across cluster][0m
Train Epoch 2:   0%|          | 0/1875 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Train Epoch 2:  43%|████▎     | 798/1875 [00:05<00:07, 141.80it/s][32m [repeated 185x across cluster][0m
Train Epoch 2:   4%|▍         | 81/1875 [00:00<00:13, 134.21it/s][32m [repeated 2x across cluster][0m


== Status ==
Current time: 2024-09-29 11:43:53 (running for 00:00:50.75)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 2:  81%|████████  | 1518/1875 [00:10<00:02, 135.64it/s][32m [repeated 188x across cluster][0m
Train Epoch 2:  92%|█████████▏| 1726/1875 [00:12<00:00, 150.91it/s]


== Status ==
Current time: 2024-09-29 11:43:58 (running for 00:00:55.82)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 2: 100%|██████████| 1875/1875 [00:13<00:00, 143.42it/s]
Test Epoch 2:   0%|          | 0/313 [00:00<?, ?it/s]
Test Epoch 2:  48%|████▊     | 150/313 [00:00<00:00, 1493.58it/s]
Test Epoch 2: 100%|██████████| 313/313 [00:00<00:00, 1505.16it/s]
Train Epoch 3:   0%|          | 0/1875 [00:00<?, ?it/s]
Train Epoch 3:  17%|█▋        | 325/1875 [00:02<00:10, 145.78it/s][32m [repeated 140x across cluster][0m
Train Epoch 2:  99%|█████████▉| 1854/1875 [00:12<00:00, 143.04it/s][32m [repeated 35x across cluster][0m


== Status ==
Current time: 2024-09-29 11:44:03 (running for 00:01:00.82)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 2: 100%|██████████| 1875/1875 [00:13<00:00, 143.39it/s][32m [repeated 3x across cluster][0m
Test Epoch 2:   0%|          | 0/313 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Test Epoch 2:  47%|████▋     | 147/313 [00:00<00:00, 1463.88it/s][32m [repeated 3x across cluster][0m
Test Epoch 2: 100%|██████████| 313/313 [00:00<00:00, 1481.89it/s][32m [repeated 3x across cluster][0m
Train Epoch 3:   0%|          | 0/1875 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Train Epoch 3:  43%|████▎     | 803/1875 [00:05<00:07, 142.36it/s]
Train Epoch 3:  55%|█████▍    | 1024/1875 [00:07<00:06, 133.47it/s][32m [repeated 178x across cluster][0m


== Status ==
Current time: 2024-09-29 11:44:08 (running for 00:01:05.90)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 3:  74%|███████▍  | 1386/1875 [00:09<00:03, 138.14it/s][32m [repeated 5x across cluster][0m
Train Epoch 3:  91%|█████████▏| 1712/1875 [00:12<00:01, 137.68it/s]
Train Epoch 3:  90%|█████████ | 1694/1875 [00:12<00:01, 123.37it/s][32m [repeated 176x across cluster][0m
Train Epoch 3:  93%|█████████▎| 1748/1875 [00:12<00:00, 156.84it/s]
Test Epoch 3:   0%|          | 0/313 [00:00<?, ?it/s]
Test Epoch 3:  52%|█████▏    | 164/313 [00:00<00:00, 1638.74it/s]
Test Epoch 3: 100%|██████████| 313/313 [00:00<00:00, 1530.49it/s]
Train Epoch 4:   0%|          | 0/1875 [00:00<?, ?it/s]
Test Epoch 3: 100%|██████████| 313/313 [00:00<00:00, 1412.52it/s]
Test Epoch 3: 100%|██████████| 313/313 [00:00<00:00, 1418.57it/s]


== Status ==
Current time: 2024-09-29 11:44:13 (running for 00:01:10.94)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 4:   9%|▉         | 177/1875 [00:01<00:12, 138.75it/s][32m [repeated 3x across cluster][0m
Train Epoch 3:  99%|█████████▉| 1855/1875 [00:13<00:00, 135.57it/s][32m [repeated 32x across cluster][0m
Train Epoch 4:  28%|██▊       | 531/1875 [00:03<00:08, 155.36it/s][32m [repeated 133x across cluster][0m
Train Epoch 3: 100%|██████████| 1875/1875 [00:13<00:00, 138.73it/s][32m [repeated 6x across cluster][0m
Test Epoch 3:   0%|          | 0/313 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Test Epoch 3:  87%|████████▋ | 272/313 [00:00<00:00, 1355.41it/s][32m [repeated 4x across cluster][0m
Test Epoch 3: 100%|██████████| 313/313 [00:00<00:00, 1365.03it/s]
Train Epoch 4:   0%|          | 0/1875 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m


== Status ==
Current time: 2024-09-29 11:44:18 (running for 00:01:16.00)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 4:  57%|█████▋    | 1060/1875 [00:07<00:05, 157.54it/s]
Train Epoch 4:  68%|██████▊   | 1281/1875 [00:08<00:04, 127.01it/s][32m [repeated 176x across cluster][0m


== Status ==
Current time: 2024-09-29 11:44:24 (running for 00:01:21.09)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 4:  92%|█████████▏| 1718/1875 [00:11<00:01, 144.44it/s]
Train Epoch 4:  81%|████████▏ | 1524/1875 [00:10<00:02, 132.90it/s][32m [repeated 6x across cluster][0m
Train Epoch 4: 100%|██████████| 1875/1875 [00:12<00:00, 144.27it/s]
Test Epoch 4:   0%|          | 0/313 [00:00<?, ?it/s]
Test Epoch 4:  42%|████▏     | 130/313 [00:00<00:00, 1296.33it/s]
Test Epoch 4:  92%|█████████▏| 288/313 [00:00<00:00, 1443.47it/s]
Train Epoch 5:   0%|          | 0/1875 [00:00<?, ?it/s]
Test Epoch 4: 100%|██████████| 313/313 [00:00<00:00, 1472.95it/s]
Train Epoch 5:   5%|▍         | 87/1875 [00:00<00:12, 144.33it/s][32m [repeated 134x across cluster][0m


== Status ==
Current time: 2024-09-29 11:44:29 (running for 00:01:26.13)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 4: 100%|██████████| 1875/1875 [00:13<00:00, 144.18it/s][32m [repeated 43x across cluster][0m
Train Epoch 5:  28%|██▊       | 521/1875 [00:03<00:09, 145.63it/s][32m [repeated 2x across cluster][0m
Train Epoch 4: 100%|██████████| 1875/1875 [00:13<00:00, 144.17it/s]
Test Epoch 4:   0%|          | 0/313 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Test Epoch 4:  87%|████████▋ | 272/313 [00:00<00:00, 1366.94it/s][32m [repeated 4x across cluster][0m
Test Epoch 4: 100%|██████████| 313/313 [00:00<00:00, 1440.13it/s][32m [repeated 3x across cluster][0m
Train Epoch 5:   0%|          | 0/1875 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Train Epoch 5:  43%|████▎     | 810/1875 [00:05<00:07, 144.79it/s][32m [repeated 184x across cluster][0m


== Status ==
Current time: 2024-09-29 11:44:34 (running for 00:01:31.21)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 5:  69%|██████▉   | 1303/1875 [00:09<00:03, 150.63it/s][32m [repeated 2x across cluster][0m
Train Epoch 5:  82%|████████▏ | 1537/1875 [00:10<00:02, 134.13it/s][32m [repeated 185x across cluster][0m
Train Epoch 5:  92%|█████████▏| 1725/1875 [00:11<00:01, 145.07it/s]


== Status ==
Current time: 2024-09-29 11:44:39 (running for 00:01:36.29)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 5:  98%|█████████▊| 1835/1875 [00:12<00:00, 145.36it/s]
Test Epoch 5:   0%|          | 0/313 [00:00<?, ?it/s]
Test Epoch 5:  45%|████▌     | 141/313 [00:00<00:00, 1406.81it/s]
Test Epoch 5: 100%|██████████| 313/313 [00:00<00:00, 1430.26it/s]
Train Epoch 6:   0%|          | 0/1875 [00:00<?, ?it/s]
Test Epoch 5: 100%|██████████| 313/313 [00:00<00:00, 1685.67it/s]
Train Epoch 6:   1%|          | 23/1875 [00:00<00:15, 118.50it/s][32m [repeated 2x across cluster][0m
Train Epoch 6:  20%|█▉        | 374/1875 [00:02<00:10, 144.74it/s][32m [repeated 136x across cluster][0m
Train Epoch 5: 100%|██████████| 1875/1875 [00:12<00:00, 144.85it/s][32m [repeated 41x across cluster][0m


== Status ==
Current time: 2024-09-29 11:44:44 (running for 00:01:41.35)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Test Epoch 5:   0%|          | 0/313 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Test Epoch 5:  47%|████▋     | 146/313 [00:00<00:00, 1459.72it/s][32m [repeated 3x across cluster][0m
Test Epoch 5: 100%|██████████| 313/313 [00:00<00:00, 1472.26it/s][32m [repeated 2x across cluster][0m
Train Epoch 6:   0%|          | 0/1875 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Train Epoch 6:  46%|████▌     | 857/1875 [00:05<00:07, 137.55it/s][32m [repeated 3x across cluster][0m
Train Epoch 6:  59%|█████▊    | 1100/1875 [00:07<00:05, 141.65it/s][32m [repeated 182x across cluster][0m


== Status ==
Current time: 2024-09-29 11:44:49 (running for 00:01:46.43)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 6:  66%|██████▌   | 1232/1875 [00:08<00:04, 143.33it/s]
Train Epoch 6:  91%|█████████ | 1709/1875 [00:11<00:01, 126.81it/s]
Train Epoch 6:  91%|█████████▏| 1712/1875 [00:11<00:01, 127.23it/s]
Train Epoch 6:  94%|█████████▍| 1762/1875 [00:12<00:00, 124.10it/s]
Train Epoch 6:  91%|█████████ | 1710/1875 [00:11<00:01, 126.71it/s][32m [repeated 161x across cluster][0m
Test Epoch 6:   0%|          | 0/313 [00:00<?, ?it/s]
Test Epoch 6:  30%|███       | 95/313 [00:00<00:00, 945.77it/s]
Test Epoch 6: 100%|██████████| 313/313 [00:00<00:00, 1224.90it/s]
Train Epoch 7:   0%|          | 0/1875 [00:00<?, ?it/s]
Test Epoch 6: 100%|██████████| 313/313 [00:00<00:00, 964.47it/s]


== Status ==
Current time: 2024-09-29 11:44:54 (running for 00:01:51.46)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 6: 100%|██████████| 1875/1875 [00:13<00:00, 140.88it/s][32m [repeated 47x across cluster][0m
Train Epoch 6: 100%|██████████| 1875/1875 [00:13<00:00, 140.60it/s][32m [repeated 2x across cluster][0m
Train Epoch 7:  27%|██▋       | 505/1875 [00:04<00:11, 122.03it/s][32m [repeated 150x across cluster][0m
Test Epoch 6:   0%|          | 0/313 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Test Epoch 6:  62%|██████▏   | 194/313 [00:00<00:00, 968.48it/s][32m [repeated 6x across cluster][0m
Train Epoch 7:  32%|███▏      | 608/1875 [00:04<00:09, 126.85it/s]
Train Epoch 7:   0%|          | 0/1875 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Test Epoch 6: 100%|██████████| 313/313 [00:00<00:00, 994.33it/s][32m [repeated 2x across cluster][0m


== Status ==
Current time: 2024-09-29 11:44:59 (running for 00:01:56.53)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 7:  61%|██████▏   | 1150/1875 [00:09<00:05, 121.74it/s][32m [repeated 182x across cluster][0m
Train Epoch 7:  51%|█████▏    | 965/1875 [00:07<00:06, 130.07it/s][32m [repeated 4x across cluster][0m


== Status ==
Current time: 2024-09-29 11:45:04 (running for 00:02:01.60)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 7:  91%|█████████▏| 1714/1875 [00:13<00:01, 123.38it/s]
Train Epoch 7:  93%|█████████▎| 1746/1875 [00:13<00:01, 128.70it/s]
Train Epoch 7:  91%|█████████ | 1706/1875 [00:13<00:01, 124.89it/s][32m [repeated 167x across cluster][0m
Test Epoch 7:   0%|          | 0/313 [00:00<?, ?it/s]
Test Epoch 7:  36%|███▌      | 113/313 [00:00<00:00, 1127.18it/s]
Test Epoch 7: 100%|██████████| 313/313 [00:00<00:00, 1135.53it/s]
Train Epoch 8:   0%|          | 0/1875 [00:00<?, ?it/s]


== Status ==
Current time: 2024-09-29 11:45:09 (running for 00:02:06.69)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 7: 100%|██████████| 1875/1875 [00:14<00:00, 125.88it/s][32m [repeated 49x across cluster][0m
Train Epoch 8:  28%|██▊       | 520/1875 [00:04<00:10, 123.72it/s][32m [repeated 153x across cluster][0m
Test Epoch 7:   0%|          | 0/313 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Test Epoch 7:  33%|███▎      | 102/313 [00:00<00:00, 1013.22it/s][32m [repeated 3x across cluster][0m
Test Epoch 7: 100%|██████████| 313/313 [00:00<00:00, 1053.05it/s][32m [repeated 3x across cluster][0m
Train Epoch 8:   0%|          | 0/1875 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Train Epoch 8:  34%|███▍      | 638/1875 [00:05<00:09, 128.02it/s]


== Status ==
Current time: 2024-09-29 11:45:14 (running for 00:02:11.76)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 8:  62%|██████▏   | 1156/1875 [00:09<00:05, 124.86it/s][32m [repeated 188x across cluster][0m
Train Epoch 8:  35%|███▍      | 652/1875 [00:05<00:09, 128.37it/s]
Train Epoch 8:  75%|███████▍  | 1405/1875 [00:11<00:03, 123.73it/s]


== Status ==
Current time: 2024-09-29 11:45:19 (running for 00:02:16.84)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 8:  91%|█████████▏| 1712/1875 [00:13<00:01, 124.95it/s]
Train Epoch 8:  91%|█████████ | 1699/1875 [00:13<00:01, 126.34it/s][32m [repeated 159x across cluster][0m
Train Epoch 8: 100%|██████████| 1875/1875 [00:14<00:00, 126.09it/s]
Test Epoch 8:   0%|          | 0/313 [00:00<?, ?it/s]
Test Epoch 8:  34%|███▍      | 107/313 [00:00<00:00, 1068.35it/s]
Test Epoch 8: 100%|██████████| 313/313 [00:00<00:00, 1143.97it/s]
Train Epoch 9:   0%|          | 0/1875 [00:00<?, ?it/s]
Test Epoch 8: 100%|██████████| 313/313 [00:00<00:00, 1077.53it/s]


== Status ==
Current time: 2024-09-29 11:45:24 (running for 00:02:21.84)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 9:   5%|▍         | 92/1875 [00:00<00:14, 126.06it/s][32m [repeated 4x across cluster][0m
Train Epoch 8:  99%|█████████▉| 1861/1875 [00:14<00:00, 128.50it/s][32m [repeated 47x across cluster][0m
Train Epoch 9:  28%|██▊       | 531/1875 [00:04<00:09, 136.27it/s][32m [repeated 150x across cluster][0m
Train Epoch 8: 100%|██████████| 1875/1875 [00:14<00:00, 126.27it/s][32m [repeated 3x across cluster][0m
Test Epoch 8:   0%|          | 0/313 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m
Test Epoch 8:  64%|██████▎   | 199/313 [00:00<00:00, 1000.76it/s][32m [repeated 4x across cluster][0m
Test Epoch 8: 100%|██████████| 313/313 [00:00<00:00, 1235.60it/s][32m [repeated 2x across cluster][0m
Train Epoch 9:   0%|          | 0/1875 [00:00<?, ?it/s][32m [repeated 3x across cluster][0m


== Status ==
Current time: 2024-09-29 11:45:29 (running for 00:02:26.89)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 9:  20%|██        | 376/1875 [00:03<00:11, 130.14it/s][32m [repeated 2x across cluster][0m
Train Epoch 9:  62%|██████▏   | 1163/1875 [00:09<00:05, 125.68it/s][32m [repeated 186x across cluster][0m


== Status ==
Current time: 2024-09-29 11:45:34 (running for 00:02:31.95)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Train Epoch 9:  65%|██████▍   | 1213/1875 [00:09<00:05, 127.31it/s][32m [repeated 3x across cluster][0m
Train Epoch 9:  92%|█████████▏| 1722/1875 [00:13<00:01, 125.48it/s]
Train Epoch 9:  91%|█████████ | 1707/1875 [00:13<00:01, 127.84it/s][32m [repeated 161x across cluster][0m
Train Epoch 9: 100%|██████████| 1875/1875 [00:14<00:00, 126.79it/s]
Test Epoch 9:   0%|          | 0/313 [00:00<?, ?it/s]
Test Epoch 9:  31%|███       | 97/313 [00:00<00:00, 962.88it/s]
Test Epoch 9: 100%|██████████| 313/313 [00:00<00:00, 1494.53it/s]
Test Epoch 9: 100%|██████████| 313/313 [00:00<00:00, 1056.42it/s]
2024-09-29 11:45:39,868	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/atharvakulkarni/ray_results/TorchTrainer_2024-09-29_11-43-01' in 0.0033s.
2024-09-29 11:45:39,870	INFO tune.py:1041 -- Total run time: 156.93 seconds (156.90 seconds for the tuning loop).


== Status ==
Current time: 2024-09-29 11:45:39 (running for 00:02:36.90)
Using FIFO scheduling algorithm.
Logical resource usage: 5.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-09-29_11-43-01_585844_53312/artifacts/2024-09-29_11-43-02/TorchTrainer_2024-09-29_11-43-01/driver_artifacts
Number of trials: 1/1 (1 TERMINATED)


Training result: Result(
  metrics={'loss': 0.3730754413829444, 'accuracy': 0.8652},
  path='/Users/atharvakulkarni/ray_results/TorchTrainer_2024-09-29_11-43-01/TorchTrainer_a884f_00000_0_2024-09-29_11-43-02',
  filesystem='local',
  checkpoint=None
)
