In [1]:
"""
Optional: Data Parallelism
==========================
**Authors**: `Sung Kim <https://github.com/hunkim>`_ and `Jenny Kang <https://github.com/jennykang>`_

In this tutorial, we will learn how to use multiple GPUs using ``DataParallel``.

It's very easy to use GPUs with PyTorch. You can put the model on a GPU:

.. code:: python

    device = torch.device("cuda:0")
    model.to(device)

Then, you can copy all your tensors to the GPU:

.. code:: python

    mytensor = my_tensor.to(device)

Please note that just calling ``my_tensor.to(device)`` returns a new copy of
``my_tensor`` on GPU instead of rewriting ``my_tensor``. You need to assign it to
a new tensor and use that tensor on the GPU.

It's natural to execute your forward and backward propagations on multiple GPUs.
However, PyTorch will only use one GPU by default. You can easily run your
operations on multiple GPUs by making your model run in parallel using
``DataParallel``:

.. code:: python

    model = nn.DataParallel(model)

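That's the core behind this tutorial. We will explore it in more detail below.

.. note::

    The code below was adapted from this tutorial into a sweep over per-layer
    weight bit-widths of a small CIFAR-10 classifier. It trains on a single
    ``device`` and does not wrap the model in ``DataParallel``.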
"""


######################################################################
# Imports and parameters
# ----------------------
#
# Import PyTorch modules and define parameters.
#

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from modules.quantize import quantize, quantize_grad, QConv2d, QLinear, RangeBN
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import csv

# Sweep parameters
output_file = 'ant_trials.csv'  # CSV file that each run's accuracies are appended to
max_bits = 100000               # bit budget; the sweep only runs configurations strictly below it
# Weight bit-widths of the five layers. These zeros are placeholders: the
# sweep at the bottom of the file overwrites them before every run.
conv1_w = conv2_w = fc1_w = fc2_w = fc3_w = 0

######################################################################
# Device
#
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

######################################################################
# CIFAR-10 dataset
########################################################################
# The outputs of torchvision datasets are PILImage images in the range [0, 1].
# We transform them to Tensors normalized to the range [-1, 1].

# Settings shared by the training and test splits.
_DATA_ROOT = '../../../data'
_LOADER_OPTS = {'batch_size': 4, 'num_workers': 2}

# Convert each image to a tensor, then normalize all three channels with
# mean 0.5 and standard deviation 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Training split, served in shuffled mini-batches.
trainset = torchvision.datasets.CIFAR10(
    root=_DATA_ROOT, train=True, download=True, transform=transform)
trainloader = DataLoader(trainset, shuffle=True, **_LOADER_OPTS)

# Test split, served in a fixed order.
testset = torchvision.datasets.CIFAR10(
    root=_DATA_ROOT, train=False, download=True, transform=transform)
testloader = DataLoader(testset, shuffle=False, **_LOADER_OPTS)

# Human-readable class names, indexed by label.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

########################################################################
#


######################################################################
# Simple Model
# ------------
#
# The model that this script actually trains is the small CNN built by
# ``runNet`` below: two ``QConv2d`` layers followed by three ``QLinear``
# layers, each taking its weight bit-width from the parameters defined above.
#
# The single-linear-layer model from the original ``DataParallel`` tutorial
# (with its print statement for monitoring input and output tensor sizes) is
# kept commented out underneath for reference only; it is not used.
#

# class Model(nn.Module):
#     # Our model

#     def __init__(self, input_size, output_size):
#         super(Model, self).__init__()
#         self.fc = nn.Linear(input_size, output_size)

#     def forward(self, input):
#         output = self.fc(input)
#         print("\tIn Model: input size", input.size(),
#               "output size", output.size())

#         return output
class _QuantNet(nn.Module):
    """Small CIFAR-10 CNN built from ``QConv2d`` / ``QLinear`` layers.

    Two 5x5 convolutions (each followed by ReLU and 2x2 max-pooling) feed
    three linear layers that produce 10 class scores.

    Args:
        conv1_bits, conv2_bits, fc1_bits, fc2_bits, fc3_bits: value passed as
            ``num_bits_weight`` to the layer of the same name.
    """

    def __init__(self, conv1_bits, conv2_bits, fc1_bits, fc2_bits, fc3_bits):
        super(_QuantNet, self).__init__()
        self.conv1 = QConv2d(3, 6, 5, num_bits_weight=conv1_bits)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = QConv2d(6, 16, 5, num_bits_weight=conv2_bits)
        self.fc1 = QLinear(16 * 5 * 5, 120, num_bits_weight=fc1_bits)
        self.fc2 = QLinear(120, 84, num_bits_weight=fc2_bits)
        self.fc3 = QLinear(84, 10, num_bits_weight=fc3_bits)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten the 16 channels of 5x5 features for the linear layers.
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)


def _train(net, epochs=2, log_every=2000):
    """Train ``net`` in place on ``trainloader`` with Adam and cross-entropy.

    Prints the mean loss of every ``log_every`` consecutive mini-batches.
    """
    import torch.optim as optim

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.001)

    for epoch in range(epochs):
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(trainloader):
            inputs, labels = inputs.to(device), labels.to(device)

            # Gradients accumulate across backward() calls, so reset them.
            optimizer.zero_grad()

            loss = criterion(net(inputs), labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % log_every == log_every - 1:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / log_every))
                running_loss = 0.0

    print('Finished Training')


def _evaluate(net):
    """Run ``net`` over ``testloader`` once and count hits per class.

    Returns:
        ``(class_correct, class_total)``: two lists indexed by class label
        holding the number of correct predictions and the number of samples.
    """
    class_correct = [0] * len(classes)
    class_total = [0] * len(classes)
    # NOTE(review): the network is evaluated in training mode, exactly as
    # before. If the quantized layers behave differently after net.eval(),
    # confirm which mode is intended here.
    with torch.no_grad():
        for images, labels in testloader:
            images, labels = images.to(device), labels.to(device)
            _, predicted = torch.max(net(images), 1)
            # Pair every label with its hit flag so that any batch size,
            # including a final partial batch or a batch of one, is counted
            # correctly (the previous code assumed exactly 4 samples).
            hits = (predicted == labels).tolist()
            for label, hit in zip(labels.tolist(), hits):
                class_correct[label] += hit
                class_total[label] += 1
    return class_correct, class_total


def _percentage(hits, count):
    """Return ``100 * hits / count``, or 0.0 when ``count`` is zero."""
    return 100 * hits / count if count else 0.0


def runNet():
    """Train and evaluate one network, then append its accuracies to the CSV.

    The weight bit-widths are read from the module-level ``conv1_w``,
    ``conv2_w``, ``fc1_w``, ``fc2_w`` and ``fc3_w`` at call time. The overall
    and per-class test accuracies are printed, and one row
    ``[overall, class_0, ..., class_9]`` (percentages as strings) is appended
    to ``output_file``.
    """
    net = _QuantNet(conv1_w, conv2_w, fc1_w, fc2_w, fc3_w)
    net.to(device)

    _train(net)

    # A single pass over the test set yields both the overall and the
    # per-class numbers.
    class_correct, class_total = _evaluate(net)

    overall = _percentage(sum(class_correct), sum(class_total))
    print('Accuracy of the network on the 10000 test images: %d %%' % overall)

    per_class = [_percentage(hits, count)
                 for hits, count in zip(class_correct, class_total)]
    for name, accuracy in zip(classes, per_class):
        print('Accuracy of %5s : %2d %%' % (name, accuracy))

    # The context manager closes the file even if writing the row fails.
    with open(output_file, 'a', newline='') as outfile:
        csv.writer(outfile).writerow(
            [str(overall)] + [str(accuracy) for accuracy in per_class])

# Run the sweep only when this file is executed as the main program. The
# DataLoaders use worker processes (num_workers=2); on platforms that start
# workers by re-importing the main module (e.g. Windows), unguarded top-level
# training code would be executed again inside every worker.
if __name__ == "__main__":
    # Enumerate per-layer weight bit-widths: conv1 in 3..8, all others in 1..8.
    for a in range(3, 9):
        conv1_w = a
        for b in range(1, 9):
            conv2_w = b
            for c in range(1, 9):
                fc1_w = c
                for d in range(1, 9):
                    fc2_w = d
                    for e in range(1, 9):
                        fc3_w = e
                        # Weighted bit cost of this configuration.
                        # NOTE(review): 48000, 10080 and 840 equal the weight
                        # counts of fc1/fc2/fc3 (400*120, 120*84, 84*10), but
                        # 4704 and 1600 do not match the conv weight counts
                        # (3*6*5*5 = 450, 6*16*5*5 = 2400); they equal
                        # 6*28*28 and 16*10*10 instead. Confirm the intended
                        # coefficients.
                        totBits = (a * 4704 + b * 1600 + c * 48000
                                   + d * 10080 + e * 840)
                        # Train only configurations strictly under the budget.
                        if totBits < max_bits:
                            print()
                            runNet()


Files already downloaded and verified
Files already downloaded and verified



RuntimeError: cuda runtime error (30) : unknown error at ..\aten\src\THC\THCGeneral.cpp:87