# Training a Pytorch CNN on the Intel Image Dataset

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch
import torchaudio
import torchaudio.transforms as atransforms
import torchvision
import torchvision.datasets as dset
import torchvision.transforms as vtransforms

# Set the GPU to device 0
gpu = torch.device('cuda:0')

print(f'PyTorch version= {torch.__version__}')
print(f'torchaudio version= {torchaudio.__version__}')
print(f'torchvision version= {torchvision.__version__}')
print(f'CUDA available= {torch.cuda.is_available()}')

PyTorch version= 2.1.1
torchaudio version= 2.1.1
torchvision version= 0.16.1
CUDA available= False


In [2]:
if torch.cuda.is_available():
    # CUDA Installation
    print('CUDA Version')
    !nvcc --version
    print()

    # CUDNN Installation
    print(f'CUDNN Version: {torch.backends.cudnn.version()}')
    print(f'Number of CUDA Devices: {torch.cuda.device_count()}')
    print(f'Active CUDA Device: {torch.cuda.current_device()}')
    print(f'Available devices: {torch.cuda.device_count()}, Name: {torch.cuda.get_device_name(0)}')
    print(f'Current CUDA device: {torch.cuda.current_device()}')

Notes
1. train data and test data in torch data loaders. Uses image size and batch size set at 28 and 5000 to compose a vtransforms which uses the image data, resizes it to the image size 28. Mnist image size is 28x28 already so this is just a confirmation. Turns the image data into tensor object and normalize it. Sets it into dataloader object after shuffling the data.
2. Fit on training data with info parameter set to true. Info is just to output so viewer can see progress. Initialize layers and send the out_class so the last layer has the same size as the number of output classes. Layers occur in sequential order:
    1. Conv2D passes the number of img_channels, number of neurons in the hidden layer, and the kernel size=5.
    2. ReLu activation layer
    3. Max Pool
    4. Batch Normalization in 2d
    5. Flatten
    6. Linear layer with 16(hidden layer size)*2*4*4, 1024 linear size, 512 in features, 1024 out features.
    7. Batch Normalization in 1d, 1024.
    8. Linear layer with 1024 features and 10 output features (classes)
3. Conduct through epochs, the optimizer is a Rprop:
    1. Optimizer zero_grad
    2. Training
    3. Loss_function
    4. Backpropogation
    5. Optimization step.

Let's do similar to the intel data imageset.

**Loading training, testing, and validation data as a data loader object with modifications**

In [3]:
from torchvision import models, datasets, transforms
from torch.utils.data.sampler import SubsetRandomSampler
import torch
import numpy as np

validation_split = .2
shuffle_dataset = True
random_seed= 42
BATCH_SIZE=5000

transforms = transforms.Compose([transforms.Resize((128,128)),
                                 transforms.RandomHorizontalFlip(),
                                 transforms.ToTensor(),
                                 transforms.Normalize((0.5,), (0.5,))
                                       ]) # could be augmentation

train_dir = '/home/yahya/Downloads/archive (4)/seg_train/seg_train/'
test_dir = '/home/yahya/Downloads/archive (4)/seg_test/seg_test/'

train_data = datasets.ImageFolder(train_dir, transform=transforms)
test_data = datasets.ImageFolder(test_dir, transform=transforms)

test_loader = torch.utils.data.DataLoader(test_data, batch_size = BATCH_SIZE)

# Creating data indices for training and validation splits:
dataset_size = len(train_data)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

train_loader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE, 
                                           sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE,
                                                sampler=valid_sampler)

In [4]:
print(len(train_loader), len(validation_loader), len(test_loader))

3 1 1


In [19]:
from sklearn.metrics import accuracy_score

IMG_SIZE = 128
IMG_CHANNEL= 3  # color channel

MLP_HIDDEN= 16  # Hidden layer size

LAST_LINEAR_SIZE = 1024  # 1024 is arbitrary
N_CLASSES = 6  # output layer size

BATCH_SIZE = 5000

class PyTorchCNN(torch.nn.Module):
    def __init__(self, epochs=10, eta=0.001, batch_size=1000, seed=0):
        super(PyTorchCNN, self).__init__()
        self.random = np.random.RandomState(seed)  # shuffle mini batches
        self.epochs = epochs  # number of iterations
        self.eta = eta  # learning rate
        self.batch_size = batch_size  # size of training batch - 1 would not work
        self.loss_func = torch.nn.CrossEntropyLoss()
        self.model = None

    def init_layers(self, _K):
        import torch.nn as nn
        self.model = nn.Sequential(
            # Conv -> (?, IMG_SIZE, IMG_SIZE, MLP_HIDDEN)
            # Pool -> (?, IMG_SIZE/2, IMG_SIZE/2, MLP_HIDDEN)
            nn.Conv2d(IMG_CHANNEL, MLP_HIDDEN, 5),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            #nn.BatchNorm2d(MLP_HIDDEN),

            nn.Conv2d(MLP_HIDDEN, MLP_HIDDEN*2, 5),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),

            nn.Flatten(start_dim=1),
            
            # 4*4 is computed in the above cell - findConv2dOutShape
            nn.Linear(MLP_HIDDEN*2 * 29*29, LAST_LINEAR_SIZE), #27k to 1k
            #nn.BatchNorm1d(LAST_LINEAR_SIZE),
            nn.Linear(LAST_LINEAR_SIZE, _K) #1k to 6
        )

    def predict(self, _X):
        _X = torch.FloatTensor(_X)
        assert self.model is not None
        with torch.no_grad():
            logits = self.model(_X)
        ypred = torch.argmax(logits, dim=1)
        return ypred.cpu().numpy()

    def fit(self, _train_dl, _valid_dl, info=False):
        import sys
        self.init_layers(N_CLASSES)

        optimizer = torch.optim.Rprop(self.model.parameters(), lr=self.eta)

        # The main training loop
        for e in range(self.epochs):
            for data in _train_dl:
                X, y = data[0], data[1]

                optimizer.zero_grad()
                net_out = self.model(X)
                loss = self.loss_func(net_out, y)
                loss.backward()
                optimizer.step()
                
            for vdata in _valid_dl:
                X_val, y_val = vdata[0], vdata[1]
                continue
                
            if info:
                with torch.no_grad():
                    acc = accuracy_score(data[1].tolist(), np.argmax(self.model(X).cpu(), axis=1).tolist())
                    val_acc = accuracy_score(vdata[1].tolist(), np.argmax(self.model(X_val).cpu(), axis=1).tolist())
                print(f"\r{e+1:02d}/{self.epochs:02d} | Loss: {loss:<6.2f} | Train/Valid Acc.: {acc*100:.2f}%/{val_acc*100:.2f}%")
                

In [20]:
cnn = PyTorchCNN(epochs=10, eta=0.001, batch_size=BATCH_SIZE)

# example - the fit function will override the NN configuration
cnn.init_layers(N_CLASSES)
print(cnn)

PyTorchCNN(
  (loss_func): CrossEntropyLoss()
  (model): Sequential(
    (0): Conv2d(3, 16, kernel_size=(5, 5), stride=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=26912, out_features=1024, bias=True)
    (8): Linear(in_features=1024, out_features=6, bias=True)
  )
)


In [21]:
# Compute H, W for image size 28 - see the nn.Sequential below
def findConv2dOutShape(_H, _W, _conv, _pool=2):
    kernel_size, stride, padding, dilation = _conv.kernel_size, _conv.stride, _conv.padding, _conv.dilation
    H = np.floor((_H+2*padding[0]-dilation[0]*(kernel_size[0]-1)-1)/stride[0]+1)
    W = np.floor((_W+2*padding[1]-dilation[1]*(kernel_size[1]-1)-1)/stride[1]+1)
    if _pool:
        H, W = H/_pool, W/_pool
    return int(H), int(W)

print(findConv2dOutShape(128, 128, torch.nn.Conv2d(3, 16, 5))) #image size 28, uses a conv2d layer in-channels:1 outchannels:16, 5 kernel size, stride:1, no-padding.
print(findConv2dOutShape(62, 62, torch.nn.Conv2d(32, 64, 5))) #image size 12, uses a conv2d layer in-channels:16, outchannels:32, 5 kernel size, stride:1, no-padding.

(62, 62)
(29, 29)


In [22]:
%%time

cnn.fit(train_loader, validation_loader, info=True) 

01/10 | Loss: 8.05   | Train/Valid Acc.: 26.63%/24.13%
02/10 | Loss: 2.57   | Train/Valid Acc.: 34.45%/33.64%
03/10 | Loss: 1.39   | Train/Valid Acc.: 44.87%/46.65%
04/10 | Loss: 1.18   | Train/Valid Acc.: 56.92%/51.75%
05/10 | Loss: 1.05   | Train/Valid Acc.: 57.65%/53.46%
06/10 | Loss: 0.98   | Train/Valid Acc.: 66.21%/60.98%
07/10 | Loss: 0.93   | Train/Valid Acc.: 66.69%/63.90%
08/10 | Loss: 0.93   | Train/Valid Acc.: 66.04%/65.79%
09/10 | Loss: 0.88   | Train/Valid Acc.: 69.63%/66.82%
10/10 | Loss: 0.79   | Train/Valid Acc.: 72.23%/67.75%
CPU times: user 38min 52s, sys: 8min 59s, total: 47min 51s
Wall time: 12min 34s


In [13]:
# Test performance
y_test, y_pred = [], []
for data in test_loader:
    X = data[0]
    y_test += data[1].tolist()

    with torch.no_grad():
        y_pred += cnn.predict(X).tolist()

accuracy_score(y_test, y_pred)

0.6876666666666666

### Adding dropout layers to CNN

Add regularization and/or drop-out features to your CNN. Report your model's best
performance. As the performance standard deviation decreases the model is deemed to be
more robust. Why?

In [15]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os

from sklearn.metrics import accuracy_score


IMG_SIZE = 128
IMG_CHANNEL= 3  # color channel

MLP_HIDDEN= 16  # Hidden layer size

LAST_LINEAR_SIZE = 1024  # 1024 is arbitrary
N_CLASSES = 6  # output layer size

BATCH_SIZE = 5000

class PyTorchCNN(torch.nn.Module):
    def __init__(self, epochs=10, eta=0.001, batch_size=1000, seed=0):
        super(PyTorchCNN, self).__init__()
        self.random = np.random.RandomState(seed)  # shuffle mini batches
        self.epochs = epochs  # number of iterations
        self.eta = eta  # learning rate
        self.batch_size = batch_size  # size of training batch - 1 would not work
        self.loss_func = torch.nn.CrossEntropyLoss()
        self.model = None

    def init_layers(self, _K):
        import torch.nn as nn
        self.model = nn.Sequential(
            # Conv -> (?, IMG_SIZE, IMG_SIZE, MLP_HIDDEN)
            # Pool -> (?, IMG_SIZE/2, IMG_SIZE/2, MLP_HIDDEN)
            nn.Conv2d(IMG_CHANNEL, MLP_HIDDEN, 5),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            #nn.BatchNorm2d(MLP_HIDDEN),

            nn.Conv2d(MLP_HIDDEN, MLP_HIDDEN*2, 5),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),

            nn.Flatten(start_dim=1),
            
            nn.Dropout(0.5, inplace=True),
            # 4*4 is computed in the above cell - findConv2dOutShape
            nn.Linear(MLP_HIDDEN*2 * 29*29, LAST_LINEAR_SIZE), #27k to 10k
            #nn.ReLU(inplace=True),
            #nn.BatchNorm1d(LAST_LINEAR_SIZE),
            nn.Dropout(0.5, inplace=True),
            nn.Linear(LAST_LINEAR_SIZE, _K) #1k to 6
        )
    def compute_l1_loss(self, w):
        return torch.abs(w).sum()

    def predict(self, _X):
        _X = torch.FloatTensor(_X)
        assert self.model is not None
        with torch.no_grad():
            logits = self.model(_X)
        ypred = torch.argmax(logits, dim=1)
        return ypred.cpu().numpy()

    def fit(self, _train_dl, _valid_dl, info=False):
        import sys
        import torch.nn as nn
        self.init_layers(N_CLASSES)

        optimizer = torch.optim.Rprop(self.model.parameters(), lr=self.eta) #this optimizer does not support regularization, instead it uses resilient back-propagation.

        # The main training loop
        for e in range(self.epochs):
            for data in _train_dl:
                X, y = data[0], data[1]

                optimizer.zero_grad()
                net_out = self.model(X)
                loss = self.loss_func(net_out, y)
                loss.backward()
                optimizer.step()
                
            for vdata in _valid_dl:
                X_val, y_val = vdata[0], vdata[1]
                continue
                
            if info:
                with torch.no_grad():
                    acc = accuracy_score(data[1].tolist(), np.argmax(self.model(X).cpu(), axis=1).tolist())
                    val_acc = accuracy_score(vdata[1].tolist(), np.argmax(self.model(X_val).cpu(), axis=1).tolist())
                print(f"\r{e+1:02d}/{self.epochs:02d} | Loss: {loss:<6.2f} | Train/Valid Acc.: {acc*100:.2f}%/{val_acc*100:.2f}%")


In [16]:
cnn = PyTorchCNN(epochs=10, eta=0.001, batch_size=BATCH_SIZE)

# example - the fit function will override the NN configuration
cnn.init_layers(N_CLASSES)
print(cnn)

PyTorchCNN(
  (loss_func): CrossEntropyLoss()
  (model): Sequential(
    (0): Conv2d(3, 16, kernel_size=(5, 5), stride=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Dropout(p=0.5, inplace=True)
    (8): Linear(in_features=26912, out_features=1024, bias=True)
    (9): Dropout(p=0.5, inplace=True)
    (10): Linear(in_features=1024, out_features=6, bias=True)
  )
)


In [17]:
%%time

cnn.fit(train_loader, validation_loader, info=True) 

01/10 | Loss: 6.27   | Train/Valid Acc.: 37.21%/35.53%
02/10 | Loss: 1.63   | Train/Valid Acc.: 34.85%/32.29%
03/10 | Loss: 1.34   | Train/Valid Acc.: 47.72%/47.18%
04/10 | Loss: 1.18   | Train/Valid Acc.: 57.65%/55.81%
05/10 | Loss: 1.04   | Train/Valid Acc.: 61.73%/59.48%
06/10 | Loss: 1.03   | Train/Valid Acc.: 64.33%/61.26%
07/10 | Loss: 0.97   | Train/Valid Acc.: 64.82%/63.19%
08/10 | Loss: 0.88   | Train/Valid Acc.: 66.86%/64.43%
09/10 | Loss: 0.88   | Train/Valid Acc.: 66.45%/65.86%
10/10 | Loss: 0.84   | Train/Valid Acc.: 69.71%/67.14%
CPU times: user 39min, sys: 9min 21s, total: 48min 21s
Wall time: 12min 21s


In [18]:
# Test performance
y_test, y_pred = [], []
for data in test_loader:
    X = data[0]
    y_test += data[1].tolist()

    with torch.no_grad():
        y_pred += cnn.predict(X).tolist()

accuracy_score(y_test, y_pred)

0.6796666666666666

Adding dropout helped the accuracy marginally, and the standard deviation in the reclassification accuracy was greatly helped across training runs. This could be improved by adding more dropout, however a 2% difference between training and reclassification accuracy is pretty neglegible to be honest. Our model seems have enjoyed better generalizability.

The results are expected since regularization reduces overfitting and this is manifested through the deviation between the training accuracy and the reclassification accuracy. We implemented Dropout layers in the fully connected layers of the CNN, this is common practice in the field, however it is to be noted that regularization was not crucially needed in our model so far. The model became more robust because it became more generalizeable.

Let's see if we can improve accuracy by adding batch normalization.

## Batch Normalization with Early Training Termination Features

Adding batch normalization and early stopping features to the pipeline and demonstrating their effectiveness.

In [23]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os

from sklearn.metrics import accuracy_score


IMG_SIZE = 128
IMG_CHANNEL= 3  # color channel

MLP_HIDDEN= 16  # Hidden layer size

LAST_LINEAR_SIZE = 1024  # 1024 is arbitrary
N_CLASSES = 6  # output layer size

BATCH_SIZE = 5000


class PyTorchCNN(torch.nn.Module):
    def __init__(self, epochs=10, eta=0.001, batch_size=1000, tolerance=4, min_delta=0.005, seed=0):
        super(PyTorchCNN, self).__init__()
        self.random = np.random.RandomState(seed)  # shuffle mini batches
        self.epochs = epochs  # number of iterations
        self.eta = eta  # learning rate
        self.batch_size = batch_size  # size of training batch - 1 would not work
        self.loss_func = torch.nn.CrossEntropyLoss()
        self.model = None
        self.counter = 0
        self.tolerance = tolerance
        self.min_delta = min_delta

    def init_layers(self, _K):
        import torch.nn as nn
        self.model = nn.Sequential(
            # Conv -> (?, IMG_SIZE, IMG_SIZE, MLP_HIDDEN)
            # Pool -> (?, IMG_SIZE/2, IMG_SIZE/2, MLP_HIDDEN)
            nn.Conv2d(IMG_CHANNEL, MLP_HIDDEN, 5),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(MLP_HIDDEN),

            nn.Conv2d(MLP_HIDDEN, MLP_HIDDEN*2, 5),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),

            nn.Flatten(start_dim=1),
            
            nn.Dropout(0.5, inplace=True),
            nn.Linear(MLP_HIDDEN*2 * 29*29, LAST_LINEAR_SIZE), #27k to 10k
            nn.BatchNorm1d(LAST_LINEAR_SIZE),
            nn.Dropout(0.5, inplace=True),
            nn.Linear(LAST_LINEAR_SIZE, _K) #1k to 6
        )
    def compute_l1_loss(self, w):
        return torch.abs(w).sum()

    def predict(self, _X):
        _X = torch.FloatTensor(_X)
        assert self.model is not None
        with torch.no_grad():
            logits = self.model(_X)
        ypred = torch.argmax(logits, dim=1)
        return ypred.cpu().numpy()

    def fit(self, _train_dl, _valid_dl, info=False):
        import sys
        import torch.nn as nn
        self.init_layers(N_CLASSES)

        optimizer = torch.optim.Rprop(self.model.parameters(), lr=self.eta) #this optimizer does not support regularization, instead it uses resilient back-propagation.
        
        # to track the training loss as the model trains
        train_losses = []
        # to track the validation loss as the model trains
        valid_losses = []

        # initialize the early_stopping object
        # The main training loop
        for e in range(self.epochs):
            for data in _train_dl:
                X, y = data[0], data[1]

                optimizer.zero_grad()
                net_out = self.model(X)
                loss = self.loss_func(net_out, y)
                loss.backward()
                optimizer.step()
            train_losses.append(loss.item())            
            if info:
                with torch.no_grad():
                    acc = accuracy_score(data[1].tolist(), np.argmax(self.model(X).cpu(), axis=1).tolist())
                    epoch_train_loss = np.average(train_losses)
                    for vdata in _valid_dl:
                        X_val, y_val = vdata[0], vdata[1]
                        output = self.model(X_val)
                        # calculate the loss
                        val_loss = self.loss_func(output, y_val)
                    
                    # record validation loss
                    valid_losses.append(val_loss.item())
                    val_acc = accuracy_score(vdata[1].tolist(), np.argmax(self.model(X_val).cpu(), axis=1).tolist())

                print(f"\r{e+1:02d}/{self.epochs:02d} | Loss: {loss:<6.2f} | Train/Valid Acc.: {acc*100:.2f}%/{val_acc*100:.2f}%")
                print('epoch_validate_loss',valid_losses[-1])
                print('epoch_train_loss',train_losses[-1])
                if (valid_losses[-1] - train_losses[-1]) > self.min_delta:
                    self.counter +=1
                    print(self.counter)
                    if self.counter >= self.tolerance:  
                        print(f'training terminated at epoch {e+1}')
                        break


In [24]:
cnn = PyTorchCNN(epochs=10, eta=0.001, batch_size=BATCH_SIZE)

# example - the fit function will override the NN configuration
cnn.init_layers(N_CLASSES)
print(cnn)

PyTorchCNN(
  (loss_func): CrossEntropyLoss()
  (model): Sequential(
    (0): Conv2d(3, 16, kernel_size=(5, 5), stride=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Flatten(start_dim=1, end_dim=-1)
    (8): Dropout(p=0.5, inplace=True)
    (9): Linear(in_features=26912, out_features=1024, bias=True)
    (10): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.5, inplace=True)
    (12): Linear(in_features=1024, out_features=6, bias=True)
  )
)


In [25]:
%%time

cnn.fit(train_loader, validation_loader, info=True) 

01/10 | Loss: 2.04   | Train/Valid Acc.: 52.69%/49.54%
epoch_validate_loss 1.8442014455795288
epoch_train_loss 2.038452386856079
02/10 | Loss: 1.32   | Train/Valid Acc.: 63.36%/61.40%
epoch_validate_loss 1.2950190305709839
epoch_train_loss 1.3248707056045532
03/10 | Loss: 1.17   | Train/Valid Acc.: 64.98%/65.11%
epoch_validate_loss 1.0313133001327515
epoch_train_loss 1.1695287227630615
04/10 | Loss: 0.92   | Train/Valid Acc.: 71.09%/68.64%
epoch_validate_loss 0.9020087122917175
epoch_train_loss 0.9207519888877869
05/10 | Loss: 0.76   | Train/Valid Acc.: 73.45%/69.17%
epoch_validate_loss 0.853967010974884
epoch_train_loss 0.7645508050918579
1
06/10 | Loss: 0.74   | Train/Valid Acc.: 73.37%/70.03%
epoch_validate_loss 0.8035445809364319
epoch_train_loss 0.7399264574050903
2
07/10 | Loss: 0.69   | Train/Valid Acc.: 76.55%/71.77%
epoch_validate_loss 0.7694196105003357
epoch_train_loss 0.6877537965774536
3
08/10 | Loss: 0.76   | Train/Valid Acc.: 74.59%/72.13%
epoch_validate_loss 0.766555786

In [26]:
# Test performance
y_test, y_pred = [], []
for data in test_loader:
    X = data[0]
    y_test += data[1].tolist()

    with torch.no_grad():
        y_pred += cnn.predict(X).tolist()

accuracy_score(y_test, y_pred)

0.7383333333333333

As you can see batch normalization drastically improved performance in accuracy. Batch normalization takes outputs from layers and normalizes them before passing them on to other layers. This normalization allows the optimizer to work more effectively to a local optimum, and it improves the rate of training. Batch Normalization is required due to a phenomenon called internal covariate shift, that is, change in the distribution of layer inputs due to updates in the previous layer; that continous change negatively impacts learning. So batch normalization mellows out the change by applying normalization. For our case batch normalization improved the accuracy, which means that our optimizer's convergence was more accurate.

Now in terms of early termination, we created an algorithm that works by comparing validation loss to training loss, and measures if the validation loss is greater than the training loss by a significant amount named min_delta, in this situation, further training would not be needed. So we provide a tolerance value which is a threshold count, when this threshold count is met then training terminates. For this example we set threshold to be 4, and it saved us an epoch of training.