# Neural Network on GPU



From Kaggle: 
"MNIST ("Modified National Institute of Standards and Technology") is the de facto “hello world” dataset of computer vision. Since its release in 1999, this classic dataset of handwritten images has served as the basis for benchmarking classification algorithms. As new machine learning techniques emerge, MNIST remains a reliable resource for researchers and learners alike."

[Read more.](https://www.kaggle.com/c/digit-recognizer)


<a title="By Josef Steppan [CC BY-SA 4.0 (https://creativecommons.org/licenses/by-sa/4.0)], from Wikimedia Commons" href="https://commons.wikimedia.org/wiki/File:MnistExamples.png"><img width="512" alt="MnistExamples" src="https://upload.wikimedia.org/wikipedia/commons/2/27/MnistExamples.png"/></a>

In [0]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets 


## STEP 1: LOADING DATASET

In [0]:
# Training split of MNIST, stored under ./data and fetched if it is not
# already there; every image goes through the ToTensor transform.
train_dataset = dsets.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)

# Held-out test split, read from the same ./data directory (no download
# is requested here).
test_dataset = dsets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
)

## STEP 2: MAKING DATASET ITERABLE

In [0]:
# Number of samples per mini-batch, shared by both loaders.
batch_size = 100
# Total number of training iterations to aim for (3000 is kept as an
# alternative value).
n_iters = 1200
# One epoch takes len(train_dataset) / batch_size iterations; convert the
# iteration budget into a whole number of epochs (fraction truncated).
num_epochs = int(n_iters / (len(train_dataset) / batch_size))

# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Test batches keep the dataset order.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)
 

## STEP 3: CREATE MODEL CLASS

In [0]:
class CNNModel(nn.Module):
    """Small convolutional classifier: two conv/ReLU/max-pool stages and a linear readout.

    Neither convolution uses padding, so each 5x5 convolution shrinks the
    spatial size by 4 and each 2x2 max-pool halves it. For 28x28
    single-channel inputs (the MNIST image size) the shapes are:

        input      (N,  1, 28, 28)
        cnn1       (N, 16, 24, 24)
        maxpool1   (N, 16, 12, 12)
        cnn2       (N, 32,  8,  8)
        maxpool2   (N, 32,  4,  4)
        flatten    (N, 32 * 4 * 4) = (N, 512)
        fc1        (N, 10)
    """

    def __init__(self):
        super().__init__()

        # Convolution 1: 1 -> 16 channels, 5x5 kernel, no padding
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, stride=1, padding=0)
        self.relu1 = nn.ReLU()

        # Max pool 1: halves height and width
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)

        # Convolution 2: 16 -> 32 channels, 5x5 kernel, no padding
        self.cnn2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=0)
        self.relu2 = nn.ReLU()

        # Max pool 2: halves height and width
        self.maxpool2 = nn.MaxPool2d(kernel_size=2)

        # Fully connected 1 (readout): 32 feature maps of 4x4 -> 10 scores
        self.fc1 = nn.Linear(32 * 4 * 4, 10)

    def forward(self, x):
        """Return the (N, 10) class scores (logits) for a batch ``x`` of shape (N, 1, 28, 28)."""
        # Convolution 1
        out = self.cnn1(x)
        out = self.relu1(out)

        # Max pool 1
        out = self.maxpool1(out)

        # Convolution 2
        out = self.cnn2(out)
        out = self.relu2(out)

        # Max pool 2
        out = self.maxpool2(out)

        # Flatten every sample to a vector while keeping the batch dimension:
        # (N, 32, 4, 4) -> (N, 32 * 4 * 4)
        out = out.view(out.size(0), -1)

        # Linear function (readout)
        out = self.fc1(out)

        return out



## STEP 4: INSTANTIATE MODEL CLASS

In [12]:
model = CNNModel()

# Device selection. Set `use_gpu` to True to run on the first CUDA device
# when one is available; it defaults to False so that this cell keeps
# selecting the CPU, as the previous unconditional override did.
use_gpu = False
device = torch.device("cuda:0" if use_gpu and torch.cuda.is_available() else "cpu")

print(device)
# Move the parameters to the selected device; as the last expression of the
# cell this also displays the model summary.
model.to(device)


cpu


CNNModel(
  (cnn1): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (cnn2): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
  (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=512, out_features=10, bias=True)
)

In [6]:
# Display the module hierarchy: each sub-layer's name and its settings.
print(model)

CNNModel(
  (cnn1): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (cnn2): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
  (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=512, out_features=10, bias=True)
)


## STEP 5: INSTANTIATE LOSS CLASS

In [0]:
# Cross-entropy loss; the training loop applies it to the model's raw
# outputs (logits) and the integer class labels.
criterion = nn.CrossEntropyLoss()



## STEP 6: INSTANTIATE OPTIMIZER CLASS

In [0]:
# Step size used by the SGD optimizer below.
learning_rate = 0.01

# Plain stochastic gradient descent over every parameter of `model`.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# The parameters live on whichever device `model` was moved to before this
# cell ran (the model cell above prints "cpu" in the recorded run).


In [15]:
# NOTE(review): `model.parameters` is not called here, so this prints the repr
# of the bound method (which embeds the module summary), not the parameter
# tensors. `model.parameters()` / `model.named_parameters()` was probably
# intended — confirm before changing, since the recorded output below
# reflects the current behavior.
print(model.parameters)

<bound method Module.parameters of CNNModel(
  (cnn1): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (cnn2): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
  (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=512, out_features=10, bias=True)
)>


Function to compute the accuracy on the test set

### Question: modify the following code so that it can run on either the GPU or the CPU

In [0]:
def test_model(test_loader, model, device):
  # Calculate Accuracy         
  correct = 0
  total = 0
  # Iterate through test dataset
  for images, labels in test_loader:
    images = images.requires_grad_().to(device)
    labels = labels.to(device)

    # Forward pass only to get logits/output
    outputs = model(images)

    # Get predictions from the maximum value
    _, predicted = torch.max(outputs.data, 1)

    # Total number of labels
    total += labels.size(0)

    # Total correct predictions
    correct += (predicted == labels).sum()

  accuracy = 100 * correct / total
  print(accuracy)
    
  return accuracy

## STEP 7: TRAIN THE MODEL

### Question: modify the following code to exploit the GPU instead of the CPU

In [11]:
%%time
# Time execution of a Python statement or expression.
# wall time is the actual time taken from the start of a computer program to the end

# Number of optimizer steps taken so far, across all epochs (named so that it
# does not shadow the builtin `iter`).
iteration = 0
for epoch in range(num_epochs):
    for images, labels in train_loader:  # batches are produced on the CPU

        # Transfer the batch to the device the model lives on. Gradients
        # with respect to the input images are not needed for training, so
        # they are not requested.
        images = images.to(device)
        labels = labels.to(device)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        outputs = model(images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iteration += 1

        if iteration % 500 == 0:
            # Calculate Accuracy on the test set
            accuracy = test_model(test_loader, model, device)

            # Print the loss of the current batch and the test accuracy
            print('Iteration: {}. Loss: {}. Accuracy on test set: {}'.format(iteration, loss.item(), accuracy))



tensor(89, device='cuda:0')
Iteration: 500. Loss: 0.31570589542388916. Accuracy on test set: 89
tensor(92, device='cuda:0')
Iteration: 1000. Loss: 0.25182855129241943. Accuracy on test set: 92
CPU times: user 19.7 s, sys: 1.92 s, total: 21.6 s
Wall time: 21.7 s


### Question: compare the wall time on GPU to the wall time on CPU

In [17]:
%%time
# Time execution of a Python statement or expression.
# wall time is the actual time taken from the start of a computer program to the end

# Number of optimizer steps taken so far, across all epochs (named so that it
# does not shadow the builtin `iter`).
iteration = 0
for epoch in range(num_epochs):
    for images, labels in train_loader:  # batches are produced on the CPU

        # Keep the batch on the same device as the model. When `device` is
        # the CPU these calls are no-ops, so the timing of a CPU run is
        # unaffected. Gradients with respect to the input images are not
        # needed for training, so they are not requested.
        images = images.to(device)
        labels = labels.to(device)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        outputs = model(images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iteration += 1

        if iteration % 500 == 0:
            # Calculate Accuracy on the test set
            accuracy = test_model(test_loader, model, device)

            # Print the loss of the current batch and the test accuracy
            print('Iteration: {}. Loss: {}. Accuracy on test set: {}'.format(iteration, loss.item(), accuracy))

tensor(87)
Iteration: 500. Loss: 0.39787793159484863. Accuracy on test set: 87
tensor(92)
Iteration: 1000. Loss: 0.2556685209274292. Accuracy on test set: 92
CPU times: user 1min 7s, sys: 1.18 s, total: 1min 9s
Wall time: 1min 9s


### Question: increase the number of epochs to 5 to see whether the average accuracy improves