# Data Parallel Training

# Data Parallel Training

- we can have multiple GPUs on a single machine
- how can we take advantage of them?
- data parallel training:
    - split input batch into multiple smaller batches
    - train the same model on each GPU
    - sum up the gradients across GPUs
    - update the weights consistently across GPUs
- implemented by `torch.nn.DataParallel`
- requires only a simple modification of the model: wrap it in `nn.DataParallel`

In [1]:
%pylab inline
!date; hostname; whoami; pwd; curl https://ipinfo.io/hostname; nvidia-smi -L
from imp import reload
from torch import nn, optim
from torch.nn import functional as F
from torchmore import layers, flex
import torch
from torchvision import datasets, transforms
from torchvision.datasets import imagenet
import os.path
from torch.utils import data as torchdata
import helpers

Populating the interactive namespace from numpy and matplotlib
Thu Dec 12 09:33:06 PST 2019
bragi
tmb
/home/tmb/exp/bigdata19
GPU 0: TITAN X (Pascal) (UUID: GPU-a964bb9a-cb1a-5036-e1d8-1217c1faa8e7)
GPU 1: TITAN X (Pascal) (UUID: GPU-a16b9686-b668-e8d5-ff5f-f85aea86d034)


# Mock Loader for Benchmarking

In [2]:
class MockLoader(object):
    """Endless stand-in for a data loader that always yields one fixed batch.

    A random input tensor and an all-zero ``int64`` target tensor are created
    once in the constructor; iterating yields that same pair forever.  It is
    intended for benchmarking, where no real data needs to be read.

    Args:
        shape: Shape of the input batch; ``shape[0]`` is used as the number
            of targets.
        device: Device the input batch is moved to.  Defaults to ``"cuda"``,
            which matches the previously hard-coded ``.cuda()`` call; pass
            ``"cpu"`` to use the loader on a machine without a GPU.
    """

    def __init__(self, shape, device="cuda"):
        self.shape = shape
        self.device = device
        # Sample on the CPU and then move, exactly as the original
        # ``torch.rand(shape).cuda()`` did, so the values drawn for a given
        # CPU seed do not change.
        self.data = torch.rand(shape).to(device)
        # Targets are deliberately left on the CPU, as before.
        # NOTE(review): presumably the trainer moves targets to the model's
        # device before computing the loss -- confirm against helpers.Trainer.
        self.targets = torch.zeros((shape[0],), dtype=torch.int64)

    def __iter__(self):
        """Yield the same ``(data, targets)`` pair indefinitely."""
        while True:
            yield self.data, self.targets
            
# Benchmark batch size; later cells divide the measured training time by it.
batch_size = 128
# Mock loader producing one fixed batch of shape (batch_size, 3, 224, 224).
training_dl = MockLoader((batch_size, 3, 224, 224))
# Pull a single batch so `inputs` and `targets` exist as notebook globals.
inputs, targets = next(iter(training_dl))

# Serial Model

In [3]:
from torchvision import models
def make_model():
    """Build the benchmark network.

    Returns:
        A new ``torchvision.models.resnet50`` instance created with the
        constructor's default arguments.
    """
    network = models.resnet50()
    return network

In [4]:
# Baseline run: the model from make_model() is moved to the GPU as-is,
# without any nn.DataParallel wrapper.
model = make_model()
trainer = helpers.Trainer(model.cuda())
trainer.set_lr(1e-6)
trainer.train_for(5000, training_dl, quiet=True)
# Training timer divided by the batch size.
# NOTE(review): presumably this is time per sample -- confirm what
# trainer.timers.training actually accumulates.
serial = trainer.timers.training/batch_size
# `del model` alone releases nothing: the trainer still references the model
# (ParallelAmpTrainer below uses it as `self.model`), so the old network
# would stay on the GPU while the next cell builds its own.  Drop both names.
del model, trainer

# DataParallel Model

Note: only one change compared to the serial version — the model is wrapped in `nn.DataParallel`.

In [5]:
# Same run as the serial benchmark, except the model is wrapped in
# nn.DataParallel.
model = make_model()
# Wrap the model that was just built.  The previous code called make_model()
# a second time here, constructing a network only to discard it.
model = nn.DataParallel(model)
trainer = helpers.Trainer(model.cuda())
trainer.set_lr(1e-6)
trainer.train_for(5000, training_dl, quiet=True)
# Training timer divided by the batch size.
# NOTE(review): presumably this is time per sample -- confirm what
# trainer.timers.training actually accumulates.
parallel = trainer.timers.training/batch_size
# The trainer also references the model, so delete both names; `del model`
# alone would leave the network alive.
del model, trainer

KeyboardInterrupt: 

In [None]:
# Show both measurements and their ratio serial/parallel (a value above 1
# means the DataParallel run had the smaller measurement).
print(serial, parallel, serial/parallel)

# Bigger Batch Size

In [None]:
# Repeat the DataParallel benchmark with a four times larger batch
# (512 instead of 128).
batch_size = 512
training_dl = MockLoader((batch_size, 3, 224, 224))

model = make_model()
# Wrap the model that was just built.  The previous code called make_model()
# a second time here, constructing a network only to discard it.
model = nn.DataParallel(model)
trainer = helpers.Trainer(model.cuda())
trainer.set_lr(1e-6)
trainer.train_for(5000, training_dl, quiet=True)
# Training timer divided by the (new) batch size.
# NOTE(review): presumably this is time per sample -- confirm what
# trainer.timers.training actually accumulates.
parallel512 = trainer.timers.training/batch_size

In [None]:
# Show all three measurements, followed by the ratio of the serial
# measurement to the batch-size-512 DataParallel measurement.
print(serial, parallel, parallel512, serial/parallel512)

# Combining DataParallel with FP16

In [None]:
from apex import amp

class ParallelAmpTrainer(helpers.Trainer):
    """``helpers.Trainer`` variant combining apex AMP with ``nn.DataParallel``.

    Only ``set_lr`` differs from the base class: it builds the optimizer,
    passes model and optimizer through ``amp.initialize`` and then wraps the
    resulting model in ``nn.DataParallel``.

    NOTE(review): apex normally expects the backward pass to go through
    ``amp.scale_loss``; whether ``helpers.Trainer`` does that is not visible
    here -- confirm.
    """

    def __init__(self, model):
        """Pass ``model`` straight through to ``helpers.Trainer``."""
        super().__init__(model)

    def set_lr(self, lr):
        """Create the SGD optimizer and enable mixed precision.

        Args:
            lr: Learning rate for ``optim.SGD`` (momentum is fixed at 0.9).

        Each call runs ``amp.initialize`` and adds a ``DataParallel``
        wrapper around ``self.model``.
        """
        sgd = optim.SGD(self.model.parameters(), lr=lr, momentum=0.9)
        # AMP is initialised on the bare model first; the DataParallel
        # wrapper is applied afterwards.
        amp_model, amp_optimizer = amp.initialize(
            self.model, sgd, opt_level="O1", loss_scale="dynamic")
        self.optimizer = amp_optimizer
        self.model = nn.DataParallel(amp_model)
        
# Same batch size and mock-loader shape as the "Bigger Batch Size" benchmark.
batch_size = 512
training_dl = MockLoader((batch_size, 3, 224, 224))

# The bare model is handed to the trainer; ParallelAmpTrainer.set_lr applies
# amp.initialize and the nn.DataParallel wrapper itself.
model = make_model()
trainer = ParallelAmpTrainer(model.cuda())
trainer.set_lr(1e-6)
trainer.train_for(5000, training_dl, quiet=True)
# Training timer divided by the batch size.
# NOTE(review): presumably this is time per sample -- confirm what
# trainer.timers.training actually accumulates.
parallelamp = trainer.timers.training/batch_size
# Show both measurements and the ratio parallel512/parallelamp.
print(parallel512, parallelamp, parallel512/parallelamp)

# DataParallel

- `DataParallel` is a simple way of using multiple GPUs
- effectively you end up with much bigger batch sizes
- may create an I/O bottleneck, because several GPUs consume input data faster than a single one
- consider using `DistributedDataParallel` even on a single node