## Implementing TensorBoard

TensorBoard is a dashboard for all kinds of metrics of the network we are creating. It also helps with hyperparameter tuning, because the results of different runs can be compared with each other.

In [16]:
from __future__ import print_function

import numpy as np

import torch
import torch.optim as optim 
import torch.nn as nn
import torch.nn.functional as F

import torchvision
import torchvision.transforms as transforms

from torch.utils.tensorboard import SummaryWriter

from itertools import product

In [2]:
#Start TensorBoard on localhost by running this in a terminal:
#tensorboard --logdir=runs

#Logging creates a "runs" folder, which is the folder the command above points to

In [3]:
def get_num_correct(predictions, labels):
    """Return how many predictions match their label.

    For every row of `predictions` the index of the largest value along
    dim 1 is taken as the predicted class and compared element-wise with
    `labels`; the number of matches is returned as a plain Python number.
    """
    predicted_classes = predictions.argmax(dim=1)
    matches = predicted_classes.eq(labels)
    return matches.sum().item()

In [4]:
class Network(nn.Module):
    """Small CNN: two 5x5 convolutions (each followed by ReLU and 2x2 max
    pooling) and three linear layers producing 10 output values per sample.
    """

    def __init__(self):
        super().__init__()

        # Convolutional part: 1 -> 6 -> 12 channels, 5x5 kernels.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

        # Table and formula for how the layers change the image size:
        # https://deeplizard.com/learn/video/cin4YcGBh3Q
        # For a 28x28 single-channel image the feature map reaching the first
        # linear layer has shape (1, 12, 4, 4); it is flattened before the
        # layer, hence in_features = 12 * 4 * 4.
        self.fc1 = nn.Linear(in_features=12 * 4 * 4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)

    def forward(self, t):
        """Run the batch `t` through the network and return the raw outputs."""
        # Two rounds of convolution -> ReLU -> 2x2 max pooling.
        t = F.max_pool2d(F.relu(self.conv1(t)), kernel_size=2, stride=2)
        t = F.max_pool2d(F.relu(self.conv2(t)), kernel_size=2, stride=2)

        # Flatten each sample to 12 * 4 * 4 features for the linear layers.
        flat = t.reshape(-1, 12 * 4 * 4)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))

        # No softmax here: it is implicitly included in the cross-entropy loss.
        return self.out(hidden)

In [5]:
# FashionMNIST training split (train=True), kept under ./data/FashionMNIST.
# download=True lets torchvision fetch the data; ToTensor converts each image
# into a tensor.
train_set = torchvision.datasets.FashionMNIST(
    root="./data/FashionMNIST",
    train=True,
    download=True,
    transform=transforms.Compose([transforms.ToTensor()]),
)

In [6]:
# Loader over the training set: batches of 100 samples, shuffling enabled.
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=100,
    shuffle=True,
)

In [7]:
#Starting with Tensorboard:
tb = SummaryWriter() #creating a new instance

network = Network()
images, labels = next(iter(train_loader))

#create a grid of images
grid = torchvision.utils.make_grid(images)

#pass a tag and the data
tb.add_image('images', grid)

#pass our network and images tensor
tb.add_graph(network, images)

#close SummaryWriter
tb.close()

#### Training Process with TB

In [19]:
# Hyperparameter grid: each key maps to the list of values to try.
parameters = {
    'lr': [.01, .001],
    'batch_size': [10, 100, 1000],
    'shuffle': [True, False],
    'epochs': [1],
}

# One list of candidate values per hyperparameter, in the dict's key order.
param_values = list(parameters.values())
param_values

[[0.01, 0.001], [10, 100, 1000], [True, False], [1]]

In [20]:
#run counter, how many have been done
runs = 0

#make our life easy - now we can add as many hyperparams to our dict as we want
for lr, batch_size, shuffle, epochs in product(*param_values): #creates kartesian product of the params - every with every
    
    network = Network()
    train_loader = torch.utils.data.DataLoader(train_set, batch_size = batch_size, shuffle = True)
    optimizer = optim.Adam(network.parameters(), lr = lr)

    images, labels = next(iter(train_loader))
    grid = torchvision.utils.make_grid(images)

    #add TB stuff
    #uniquely identify this run, as we name this run
    comment = f' batch_size = {batch_size} lr = {lr} shuffle = {shuffle}'
    tb = SummaryWriter(comment = comment) #creating a new instance
    tb.add_image('images', grid)
    tb.add_graph(network, images)

    for epoch in range(epochs): #how many epochs
        total_loss = 0
        total_correct = 0

        for batch in train_loader: #get the batch
            images, labels = batch

            preds = network(images) #pass batch to network
            loss = F.cross_entropy(preds, labels) #calculate loss

            optimizer.zero_grad() #make gradients zero
            loss.backward() #calculate gradients
            optimizer.step() #update weights

            total_loss += loss.item() * batch_size #multiply by batchsize for comparability in tensorboard
            total_correct += get_num_correct(preds, labels)

        #Add tensorboard KPIs
        #name of value, which scalar, epoch for when the value is occuring
        tb.add_scalar('Loss', total_loss, epoch)
        tb.add_scalar('Number Correct', total_correct, epoch)
        tb.add_scalar('Accuracy', total_correct / len(train_set), epoch)

        #name of value, set of values
        #tb.add_histogram('conv1.bias', network.conv1.bias, epoch)
        #tb.add_histogram('conv1.weight', network.conv1.weight, epoch)
        #tb.add_histogram('conv1.weight.grad', network.conv1.weight.grad, epoch)

        #this can be done better:

        for name, weight in network.named_parameters(): #gives parameter and parameter name as tuple
            tb.add_histogram(name, weight, epoch)
            tb.add_histogram(f'{name}.grad', weight.grad, epoch)
        
        runs += 1
        print('run', runs, 'epoch', epoch, 'total_correct', total_correct, 'loss', total_loss, 'accuracy%', total_correct / len(train_set) * 100)

    tb.close()

#With all these runs logged, the hyperparameters can easily be compared in TB, because it plots every run in the same diagrams.
#That makes it easy to choose the best hyperparameters. Use a regex in TB to filter for specific parameters.

amount of runs 1 epoch 0 total_correct 46030 loss 37573.828353344 accuracy% 76.71666666666667
amount of runs 2 epoch 0 total_correct 47244 loss 35031.88015679829 accuracy% 78.74
amount of runs 3 epoch 0 total_correct 46380 loss 35839.84170258045 accuracy% 77.3
amount of runs 4 epoch 0 total_correct 47801 loss 32675.0136628747 accuracy% 79.66833333333334
amount of runs 5 epoch 0 total_correct 36598 loss 60045.03780603409 accuracy% 60.99666666666666
amount of runs 6 epoch 0 total_correct 33124 loss 70813.6677145958 accuracy% 55.20666666666667
amount of runs 7 epoch 0 total_correct 46423 loss 35896.46309865173 accuracy% 77.37166666666667
amount of runs 8 epoch 0 total_correct 46602 loss 35320.90608856641 accuracy% 77.66999999999999
amount of runs 9 epoch 0 total_correct 42936 loss 45168.02119016647 accuracy% 71.56
amount of runs 10 epoch 0 total_correct 43087 loss 45432.783845067024 accuracy% 71.81166666666667
amount of runs 11 epoch 0 total_correct 27544 loss 99347.93251752853 accuracy% 

All of this can be done much more nicely with a RunBuilder class, which is in a separate notebook.