In [1]:
import os
import tempfile

import torch
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision.models import resnet18
from torchvision.datasets import FashionMNIST
from torchvision.transforms import ToTensor, Normalize, Compose

import ray.train.torch

def train_func():
    # Model, Loss, Optimizer
    model = resnet18(num_classes=10)
    model.conv1 = torch.nn.Conv2d(
        1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
    )
    # [1] Prepare model.
    model = ray.train.torch.prepare_model(model)
    # model.to("cuda")  # This is done by `prepare_model`
    criterion = CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=0.001)

    # Data
    transform = Compose([ToTensor(), Normalize((0.5,), (0.5,))])
    data_dir = os.path.join(tempfile.gettempdir(), "data")
    train_data = FashionMNIST(root=data_dir, train=True, download=True, transform=transform)
    train_loader = DataLoader(train_data, batch_size=128, shuffle=True)
    # [2] Prepare dataloader.
    train_loader = ray.train.torch.prepare_data_loader(train_loader)

    # Training
    for epoch in range(10):
        if ray.train.get_context().get_world_size() > 1:
            train_loader.sampler.set_epoch(epoch)

        for images, labels in train_loader:
            # This is done by `prepare_data_loader`!
            # images, labels = images.to("cuda"), labels.to("cuda")
            outputs = model(images)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # [3] Report metrics and checkpoint.
        metrics = {"loss": loss.item(), "epoch": epoch}
        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
            torch.save(
                model.module.state_dict(),
                os.path.join(temp_checkpoint_dir, "model.pt")
            )
            ray.train.report(
                metrics,
                checkpoint=ray.train.Checkpoint.from_directory(temp_checkpoint_dir),
            )
        if ray.train.get_context().get_world_rank() == 0:
            print(metrics)

# [4] Configure scaling and resource requirements.
scaling_config = ray.train.ScalingConfig(num_workers=2, use_gpu=False)

# [5] Launch distributed training job.
trainer = ray.train.torch.TorchTrainer(
    train_func,
    scaling_config=scaling_config,
    # [5a] If running in a multi-node cluster, this is where you
    # should configure the run's persistent storage that is accessible
    # across all worker nodes.
    # run_config=ray.train.RunConfig(storage_path="s3://..."),
)
result = trainer.fit()

# [6] Load the trained model.
with result.checkpoint.as_directory() as checkpoint_dir:
    model_state_dict = torch.load(os.path.join(checkpoint_dir, "model.pt"))
    model = resnet18(num_classes=10)
    model.conv1 = torch.nn.Conv2d(
        1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
    )
    model.load_state_dict(model_state_dict)

0,1
Current time:,2024-03-29 12:03:32
Running for:,00:39:57.91
Memory:,33.7/64.0 GiB

Trial name,status,loc,iter,total time (s),loss,epoch
TorchTrainer_7458a_00000,TERMINATED,127.0.0.1:75003,10,2393.8,0.0493913,9


[36m(RayTrainWorker pid=75027)[0m Setting up process group for: env:// [rank=0, world_size=2]
[36m(TorchTrainer pid=75003)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=75003)[0m - (ip=127.0.0.1, pid=75027) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=75003)[0m - (ip=127.0.0.1, pid=75028) world_rank=1, local_rank=1, node_rank=0
[36m(RayTrainWorker pid=75027)[0m Moving model to device: cpu
[36m(RayTrainWorker pid=75027)[0m Wrapping provided model in DistributedDataParallel.


[36m(RayTrainWorker pid=75028)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
[36m(RayTrainWorker pid=75028)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to /var/folders/0n/0gc794z54nv7z0mdwxbh998h0000gp/T/data/FashionMNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/26421880 [00:00<?, ?it/s]
  0%|          | 32768/26421880 [00:00<02:09, 203506.24it/s]
  7%|▋         | 1769472/26421880 [00:01<00:09, 2625544.46it/s]
 44%|████▍     | 11665408/26421880 [00:03<00:03, 4759212.57it/s]
 92%|█████████▏| 24379392/26421880 [00:04<00:00, 7713758.46it/s]
 96%|█████████▌| 25362432/26421880 [00:04<00:00, 5374700.63it/s]
100%|██████████| 26421880/26421880 [00:05<00:00, 5237262.29it/s]


[36m(RayTrainWorker pid=75028)[0m Extracting /var/folders/0n/0gc794z54nv7z0mdwxbh998h0000gp/T/data/FashionMNIST/raw/train-images-idx3-ubyte.gz to /var/folders/0n/0gc794z54nv7z0mdwxbh998h0000gp/T/data/FashionMNIST/raw
[36m(RayTrainWorker pid=75027)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
[36m(RayTrainWorker pid=75027)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to /var/folders/0n/0gc794z54nv7z0mdwxbh998h0000gp/T/data/FashionMNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/26421880 [00:00<?, ?it/s]
 83%|████████▎ | 21987328/26421880 [00:05<00:00, 7749730.09it/s][32m [repeated 60x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m


[36m(RayTrainWorker pid=75028)[0m 
[36m(RayTrainWorker pid=75028)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
[36m(RayTrainWorker pid=75028)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to /var/folders/0n/0gc794z54nv7z0mdwxbh998h0000gp/T/data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/29515 [00:00<?, ?it/s]
100%|██████████| 4422102/4422102 [00:01<00:00, 2516749.04it/s][32m [repeated 5x across cluster][0m
 83%|████████▎ | 3670016/4422102 [00:02<00:00, 2477183.58it/s][32m [repeated 27x across cluster][0m


[36m(RayTrainWorker pid=75027)[0m Using downloaded and verified file: /var/folders/0n/0gc794z54nv7z0mdwxbh998h0000gp/T/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz
[36m(RayTrainWorker pid=75027)[0m Extracting /var/folders/0n/0gc794z54nv7z0mdwxbh998h0000gp/T/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to /var/folders/0n/0gc794z54nv7z0mdwxbh998h0000gp/T/data/FashionMNIST/raw[32m [repeated 7x across cluster][0m
[36m(RayTrainWorker pid=75027)[0m [32m [repeated 7x across cluster][0m
[36m(RayTrainWorker pid=75027)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz[32m [repeated 5x across cluster][0m


[36m(RayTrainWorker pid=75028)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/Xingsheng_Qian/ray_results/TorchTrainer_2024-03-29_11-23-32/TorchTrainer_7458a_00000_0_2024-03-29_11-23-34/checkpoint_000000)
100%|██████████| 5148/5148 [00:00<00:00, 13761808.15it/s][32m [repeated 4x across cluster][0m
100%|██████████| 4422102/4422102 [00:02<00:00, 1753565.90it/s][32m [repeated 2x across cluster][0m


[36m(RayTrainWorker pid=75027)[0m {'loss': 0.20273448526859283, 'epoch': 0}
[36m(RayTrainWorker pid=75028)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to /var/folders/0n/0gc794z54nv7z0mdwxbh998h0000gp/T/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz[32m [repeated 4x across cluster][0m


[36m(RayTrainWorker pid=75028)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/Xingsheng_Qian/ray_results/TorchTrainer_2024-03-29_11-23-32/TorchTrainer_7458a_00000_0_2024-03-29_11-23-34/checkpoint_000001)[32m [repeated 2x across cluster][0m


[36m(RayTrainWorker pid=75027)[0m {'loss': 0.17315298318862915, 'epoch': 1}


[36m(RayTrainWorker pid=75028)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/Xingsheng_Qian/ray_results/TorchTrainer_2024-03-29_11-23-32/TorchTrainer_7458a_00000_0_2024-03-29_11-23-34/checkpoint_000002)[32m [repeated 2x across cluster][0m


[36m(RayTrainWorker pid=75027)[0m {'loss': 0.3881944715976715, 'epoch': 2}


[36m(RayTrainWorker pid=75028)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/Xingsheng_Qian/ray_results/TorchTrainer_2024-03-29_11-23-32/TorchTrainer_7458a_00000_0_2024-03-29_11-23-34/checkpoint_000003)[32m [repeated 2x across cluster][0m


[36m(RayTrainWorker pid=75027)[0m {'loss': 0.19966208934783936, 'epoch': 3}


[36m(RayTrainWorker pid=75028)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/Xingsheng_Qian/ray_results/TorchTrainer_2024-03-29_11-23-32/TorchTrainer_7458a_00000_0_2024-03-29_11-23-34/checkpoint_000004)[32m [repeated 2x across cluster][0m


[36m(RayTrainWorker pid=75027)[0m {'loss': 0.25064101815223694, 'epoch': 4}


[36m(RayTrainWorker pid=75028)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/Xingsheng_Qian/ray_results/TorchTrainer_2024-03-29_11-23-32/TorchTrainer_7458a_00000_0_2024-03-29_11-23-34/checkpoint_000005)[32m [repeated 2x across cluster][0m


[36m(RayTrainWorker pid=75027)[0m {'loss': 0.27079126238822937, 'epoch': 5}


[36m(RayTrainWorker pid=75028)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/Xingsheng_Qian/ray_results/TorchTrainer_2024-03-29_11-23-32/TorchTrainer_7458a_00000_0_2024-03-29_11-23-34/checkpoint_000006)[32m [repeated 2x across cluster][0m


[36m(RayTrainWorker pid=75027)[0m {'loss': 0.15984489023685455, 'epoch': 6}


[36m(RayTrainWorker pid=75027)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/Xingsheng_Qian/ray_results/TorchTrainer_2024-03-29_11-23-32/TorchTrainer_7458a_00000_0_2024-03-29_11-23-34/checkpoint_000007)[32m [repeated 2x across cluster][0m


[36m(RayTrainWorker pid=75027)[0m {'loss': 0.178643599152565, 'epoch': 7}


[36m(RayTrainWorker pid=75028)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/Xingsheng_Qian/ray_results/TorchTrainer_2024-03-29_11-23-32/TorchTrainer_7458a_00000_0_2024-03-29_11-23-34/checkpoint_000008)[32m [repeated 2x across cluster][0m


[36m(RayTrainWorker pid=75027)[0m {'loss': 0.13146929442882538, 'epoch': 8}


[36m(RayTrainWorker pid=75028)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/Xingsheng_Qian/ray_results/TorchTrainer_2024-03-29_11-23-32/TorchTrainer_7458a_00000_0_2024-03-29_11-23-34/checkpoint_000009)[32m [repeated 2x across cluster][0m


[36m(RayTrainWorker pid=75027)[0m {'loss': 0.04939134791493416, 'epoch': 9}


2024-03-29 12:03:32,897	INFO tune.py:1042 -- Total run time: 2397.93 seconds (2397.91 seconds for the tuning loop).


In [1]:
scaling_config = ray.train.ScalingConfig(num_workers=2, use_gpu=True)

NameError: name 'ray' is not defined