In [1]:
! nvidia-smi

Wed Nov 14 13:20:51 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.73       Driver Version: 410.73       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:08:00.0 Off |                  N/A |
| 28%   30C    P0    46W / 250W |      0MiB / 10989MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:42:00.0 Off |                  N/A |
| 28%   34C    P0    75W / 250W |      0MiB / 10989MiB |     31%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:43:00.0 Off |                  N/

In [2]:
# GPU indices passed to nn.DataParallel as `device_ids`.
DEVICE_IDS = [0, 1, 2]
# Global mini-batch size: 1000 samples for every listed device.
BATCHSIZE = len(DEVICE_IDS) * 1000

In [3]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# torch.backends.cudnn.deterministic = True

# Convert each PIL image to a float tensor in [0, 1], then normalise every
# channel with mean 0.5 and std 0.5, which maps the values into [-1, 1].
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# CIFAR-10 training split; downloaded into ./data when not already present.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

# shuffle=False: batches are served in dataset order on every epoch.
# NOTE(review): presumably kept off for run-to-run comparability -- confirm.
# pin_memory=True: batches are returned in page-locked host memory, which is
# what allows host-to-GPU copies to be issued asynchronously.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCHSIZE,
                                          shuffle=False, pin_memory=True)

class VGG(nn.Module):
    """VGG-16 style convolutional classifier with batch normalisation.

    Implementation Ref: https://github.com/kuangliu/pytorch-cifar

    The feature extractor ends with 512 channels and the classifier is a
    single ``nn.Linear(512, 10)``, so the flattened feature map must hold
    exactly 512 values per sample (e.g. 3x32x32 inputs, which the five
    2x2 max-poolings shrink to 1x1).
    """

    def __init__(self):
        super(VGG, self).__init__()
        # Integers are conv output channel counts; 'M' marks a 2x2 max-pool.
        layout = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M',
                  512, 512, 512, 'M', 512, 512, 512, 'M']
        self.features = self._make_layers(layout)
        self.classifier = nn.Linear(512, 10)

    def forward(self, x):
        """Run the conv stack, flatten per sample, and return class scores."""
        feats = self.features(x)
        flat = feats.view(feats.size(0), -1)
        return self.classifier(flat)

    def _make_layers(self, cfg):
        """Build the feature extractor described by ``cfg``.

        Each integer entry adds Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU
        with that many output channels; each 'M' adds a 2x2, stride-2
        max-pool. A 1x1 average pool is appended at the end.
        """
        modules = []
        channels = 3  # the first convolution consumes 3-channel input
        for entry in cfg:
            if entry != 'M':
                modules.append(nn.Conv2d(channels, entry, kernel_size=3, padding=1))
                modules.append(nn.BatchNorm2d(entry))
                modules.append(nn.ReLU(inplace=True))
                channels = entry
            else:
                modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
        modules.append(nn.AvgPool2d(kernel_size=1, stride=1))
        return nn.Sequential(*modules)

net = VGG()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# nn.DataParallel expects the module's parameters on device_ids[0], so make
# that GPU the current device; the bare "cuda" string then resolves to it.
device = "cuda"
torch.cuda.set_device(DEVICE_IDS[0])
net.to(device)
net = nn.DataParallel(net, device_ids=DEVICE_IDS)

print('Training starts...')

for epoch in range(5):  # loop over the dataset multiple times
    for inputs, labels in trainloader:
        # `non_blocking=True` is the replacement for the old `async=True`
        # keyword, which is a syntax error now that `async` is reserved
        # (Python 3.7+). Tensors need no Variable wrapper for autograd.
        inputs = inputs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Report the loss of the last mini-batch processed in this epoch.
    print('[{:d}, {:5f}]'.format(epoch+1, loss.item()))

print('Done..')

Files already downloaded and verified
Training starts...
[1, 1.920340]
[2, 1.620958]
[3, 1.433068]
[4, 1.290872]
[5, 1.163199]
Done..


In [4]:
! nvidia-smi

Wed Nov 14 13:21:26 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.73       Driver Version: 410.73       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:08:00.0 Off |                  N/A |
| 33%   50C    P2    85W / 250W |   7559MiB / 10989MiB |     92%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:42:00.0 Off |                  N/A |
| 34%   52C    P2    79W / 250W |   7505MiB / 10989MiB |     91%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:43:00.0 Off |                  N/