In [4]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torchvision import models

In [15]:
# def setup(rank, world_size, run, backend='nccl'):
#     # Setup for DDP: OS environment variables and initialization
#     os.environ['MASTER_ADDR'] = 'localhost'
#     os.environ['MASTER_PORT'] = '12355'
#     if not dist.is_initialized():
#         dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
#     run(rank, size)
#     torch.cuda.set_device(rank)
    
# def cleanup():
#     dist.destroy_process_group()

In [20]:
def train(rank, world_size):
    print(f"Starting process on rank {rank}.")

    # Setup for Distributed Data Parallel (DDP)
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

    # Model, dataset, dataloader setup
    model = models.resnet50(pretrained=True).cuda(rank)
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, 5)  # Example for 5 classes
    ddp_model = DDP(model, device_ids=[rank])

    # Example dataset and DataLoader
    transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
    dataset = datasets.FakeData(size=100, transform=transform)
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=10)

    # Example training loop
    criterion = nn.CrossEntropyLoss().cuda(rank)
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.01, momentum=0.9)
    for epoch in range(2):  # example epoch count
        ddp_model.train()
        for data, target in dataloader:
            data, target = data.cuda(rank), target.cuda(rank)
            optimizer.zero_grad()
            output = ddp_model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
        print(f"Rank {rank}, Epoch {epoch}, Loss: {loss.item()}")

    # Cleanup
    dist.destroy_process_group()
    

In [21]:
world_size = torch.cuda.device_count() if torch.cuda.is_available() else 1
print(f"World size: {world_size}")

World size: 2


In [26]:
if __name__ == "__main__":
    world_size = torch.cuda.device_count() if torch.cuda.is_available() else 1
    print(f"World size: {world_size}")
    
    # Spawn processes using torch.multiprocessing.spawn
    mp.spawn(setup,
             args=(world_size, train),  # Pass the train function to be executed after setup
             nprocs=world_size,
             join=True)


World size: 2


ProcessExitedException: process 1 terminated with exit code 1

In [27]:
import os
import sys
import tempfile
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp

from torch.nn.parallel import DistributedDataParallel as DDP


def setup(rank, size, run, backend='nccl'):
    """ Initialize the distributed environment. """
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend=backend, rank=rank, world_size=size)
    run(rank, size)
    
def cleanup():
    dist.destroy_process_group()

    
class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        return self.net2(self.relu(self.net1(x)))


def demo_basic(rank, world_size):
    print(f"Running basic DDP example on rank {rank}.\n")

    # create model and move it to GPU with id rank
    model = ToyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    cleanup()
    
if __name__ == "__main__":
    size = 2
    processes = []
    #mp.set_start_method("spawn")
    for rank in range(size):
        p = mp.Process(target=setup, args=(rank, size, demo_basic))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

Running basic DDP example on rank 0.
Running basic DDP example on rank 1.




Process Process-17:
Process Process-18:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/shared/centos7/anaconda3/2021.05/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/shared/centos7/anaconda3/2021.05/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/shared/centos7/anaconda3/2021.05/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/shared/centos7/anaconda3/2021.05/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-27-eba5693bc2f1>", line 18, in setup
    run(rank, size)
  File "<ipython-input-27-eba5693bc2f1>", line 18, in setup
    run(rank, size)
  File "<ipython-input-27-eba5693bc2f1>", line 39, in demo_basic
    model = ToyModel().to(rank)
  File "<ipython-input-27-eba5693bc2f1>", line 39, in demo_basic
    model = ToyModel().t

In [28]:
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torchvision import datasets, transforms
from torchvision.models import resnet50
from torch.utils.data import DataLoader, DistributedSampler

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()

def train(rank, world_size):
    setup(rank, world_size)
    
    # Configure your dataset and DataLoader here
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    
    data_dir = '/home/hindupur.v/varsha_hpp/dataset/lung_colon_image_set'
    dataset = datasets.ImageFolder(data_dir, transform=transform)
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    dataloader = DataLoader(dataset, batch_size=64, sampler=sampler)

    # Model setup
    model = resnet50(pretrained=True)
    num_ftrs = model.fc.in_features
    model.fc = torch.nn.Linear(num_ftrs, 5) 
    model = model.cuda(rank)
    model = DDP(model, device_ids=[rank])

    # Loss and optimizer
    criterion = torch.nn.CrossEntropyLoss().cuda(rank)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    
    # Training loop
    model.train()
    for epoch in range(10):  # Example: 10 epochs
        for inputs, labels in dataloader:
            inputs = inputs.cuda(rank)
            labels = labels.cuda(rank)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        
        if rank == 0:
            print(f"Epoch {epoch+1}, Loss: {loss.item()}")

    cleanup()

def main():
    world_size = torch.cuda.device_count()
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)

if __name__ == '__main__':
    main()


ProcessExitedException: process 0 terminated with exit code 1

In [6]:
import os
from torchvision.io import read_image
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from torch.utils.data import Dataset

class LungColonCancerDataset(Dataset):
    def __init__(self, data_dir, transform=None):
        """
        Args:
            data_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.data_dir = data_dir
        self.transform = transform
        self.samples = []

        # Load dataset
        for label in os.listdir(data_dir):
            class_dir = os.path.join(data_dir, label)
            if os.path.isdir(class_dir):
                for img in os.listdir(class_dir):
                    self.samples.append((os.path.join(class_dir, img), label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, label = self.samples[idx]
        image = read_image(img_path)
        if self.transform:
            image = self.transform(image)
        return image, label

# Define transformations
transforms = Compose([
    Resize((224, 224)),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


In [15]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
# from lung_colon_cancer_dataset import LungColonCancerDataset  # Hypothetical dataset class

import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import os
from torchvision import models

def ddp_setup(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

class Trainer:
    def __init__(
        self,
        model: torch.nn.Module,
        train_data: DataLoader,
        optimizer: torch.optim.Optimizer,
        gpu_id: int,
        save_every: int,
    ) -> None:
        self.gpu_id = gpu_id
        self.model = model.to(gpu_id)
        self.train_data = train_data
        self.optimizer = optimizer
        self.save_every = save_every
        self.model = DDP(model, device_ids=[gpu_id])

    # Other methods remain unchanged

def load_train_objs():
#   train_set = LungColonCancerDataset()  # Adjusted for your dataset
    data_dir = '/home/hindupur.v/varsha_hpp/dataset/lung_colon_image_set'
    train_set = LungColonCancerDataset(data_dir=data_dir, transform=transforms)
    model = models.resnet50(pretrained=True)  # Using ResNet50 as an example
    num_ftrs = model.fc.in_features
    num_classes = 5  # Adjust the number of classes accordingly
    model.fc = torch.nn.Linear(num_ftrs, num_classes)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    return train_set, model, optimizer

In [16]:
def main(rank: int, world_size: int, save_every: int, total_epochs: int, batch_size: int):
    ddp_setup(rank, world_size)
    dataset, model, optimizer = load_train_objs()
    train_data = prepare_dataloader(dataset, batch_size)
    trainer = Trainer(model, train_data, optimizer, rank, save_every)
    trainer.train(total_epochs)
    destroy_process_group()

In [20]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def cleanup():
    dist.destroy_process_group()

class LungColonCancerDataset(datasets.ImageFolder):
    def __init__(self, root_dir, transform=None):
        super(LungColonCancerDataset, self).__init__(root=root_dir, transform=transform)

# Assuming your dataset is organized correctly and transforms are defined
def train(rank, world_size):
    setup(rank, world_size)
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    data_dir = '/home/hindupur.v/varsha_hpp/dataset/lung_colon_image_set/'
    train_dataset = LungColonCancerDataset(root_dir=os.path.join(data_dir, 'train'), transform=transform)
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)
    train_loader = DataLoader(train_dataset, batch_size=4, sampler=train_sampler)

    model = models.resnet50(pretrained=True)
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, 5)  # Assuming 5 classes
    model = model.cuda(rank)
    ddp_model = nn.parallel.DistributedDataParallel(model, device_ids=[rank])

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001, momentum=0.9)

    for epoch in range(10):  # Adjust the number of epochs
        ddp_model.train()
        for inputs, labels in train_loader:
            inputs = inputs.cuda(rank)
            labels = labels.cuda(rank)
            optimizer.zero_grad()
            outputs = ddp_model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        if rank == 0:  # Only print from the first process
            print(f"Epoch {epoch+1}, Loss: {loss.item()}")

    cleanup()


In [None]:
if __name__ == '__main__':
    world_size = torch.cuda.device_count()
    processes = []
    for rank in range(world_size):
        p = mp.Process(target=train, args=(rank, world_size))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()



Epoch 1, Loss: 0.3714846670627594
Epoch 2, Loss: 0.007094573229551315
Epoch 3, Loss: 0.0030669146217405796
