<a href="https://colab.research.google.com/github/yuanhaobo0108/BATCH-NORMALIZATION-FOR-GRADIENT-VANISHING/blob/main/BN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Batch normalization for gradient vanishing**

In [None]:
import torch
import matplotlib.pyplot as plt
import torchvision
from torchvision import datasets, transforms
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np

# Preprocessing applied to every image: convert to a tensor, then normalise
# the single channel with mean 0.1307 and std 0.3081.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)

batch_size = 64

# Training split (downloaded to ./mnist on first use), shuffled by the loader.
trainset = datasets.MNIST(
    root='./mnist', train=True, download=True, transform=transform
)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=2
)

# Test split, iterated in a fixed order.
testset = datasets.MNIST(
    root='./mnist', train=False, download=True, transform=transform
)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=2
)

# Class names: the ten digits as strings.
classes = tuple(str(digit) for digit in range(10))

In [None]:
# functions to show an image
def imshow(img, mean=0.1307, std=0.3081):
    """Display a normalised image tensor with matplotlib.

    Args:
        img: image tensor with the channel axis first (it is transposed to
            channel-last before plotting), normalised as ``(x - mean) / std``.
        mean: mean that was subtracted during normalisation.
        std: standard deviation that was divided by during normalisation.
    """
    # Invert Normalize: x = x_norm * std + mean.  The earlier
    # ``img / 0.1307 + 0.3081`` swapped the constants and used a division,
    # so the displayed intensities were wrong.
    img = img * std + mean
    # Clip so floating-point round-off cannot push values just outside
    # [0, 1], the range matplotlib accepts for float RGB data.
    npimg = np.clip(img.numpy(), 0.0, 1.0)
    plt.imshow(np.transpose(npimg, (1, 2, 0)))  # C,H,W -> H,W,C
    plt.show()

# Fetch one (shuffled) mini-batch of training images.
dataiter = iter(trainloader)
# Use the built-in next(): DataLoader iterators implement __next__, and the
# Python 2-style ``.next()`` method is not available in current PyTorch.
images, labels = next(dataiter)

# Show the batch as a single image grid.
imshow(torchvision.utils.make_grid(images))

In [None]:
class Net(nn.Module):
    """Small CNN classifier without batch normalisation.

    ``fc1`` takes 1024 = 64 * 4 * 4 features, which is what a 1 x 28 x 28
    input produces after ``conv1``, ``conv2`` and the 2x2 max-pool.
    """

    def __init__(self):
        super().__init__()
        # 1 input channel -> 25 feature maps, 12x12 kernel, stride 2
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=25,
                               kernel_size=12, stride=2)
        # 25 -> 64 feature maps, 5x5 kernel, stride 1, padding 2 (keeps size)
        self.conv2 = nn.Conv2d(in_channels=25, out_channels=64,
                               kernel_size=5, padding=2)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(in_features=1024, out_features=64)
        # p=0 makes dropout a pass-through in this variant
        self.dp = nn.Dropout(p=0)
        self.fc2 = nn.Linear(in_features=64, out_features=10)

    def forward(self, x):
        """Return log-softmax scores over the 10 classes for batch ``x``."""
        features = F.relu(self.conv1(x))
        features = self.pool(F.relu(self.conv2(features)))
        hidden = F.relu(self.fc1(torch.flatten(features, 1)))
        scores = self.fc2(self.dp(hidden))
        return F.log_softmax(scores, 1)


net = Net()


WithoutBN+0.001LR

In [None]:
import torch.optim as optim

# Experiment: network without batch norm, SGD with lr=0.001, momentum=0.9.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
loss1 = []  # mean training loss of each 50-mini-batch window
acc1 = []   # test accuracy (%) measured at the end of each window
for epoch in range(10):  # loop over the dataset multiple times

    running_loss = 0.0

    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 50 == 49:    # report every 50 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 50))

            loss1.append(running_loss / 50)
            running_loss = 0.0

            correct = 0
            total = 0
            # Evaluate in eval mode so layers that behave differently during
            # training (dropout, batch norm) are in inference mode, and use
            # separate names so the training batch variables are not
            # overwritten by the test loop.
            net.eval()
            with torch.no_grad():  # no gradients needed for evaluation
                for test_images, test_labels in testloader:
                    test_outputs = net(test_images)
                    # the class with the highest score is the prediction
                    _, predicted = torch.max(test_outputs, 1)
                    total += test_labels.size(0)
                    correct += (predicted == test_labels).sum().item()
            net.train()  # back to training mode for the next mini-batch

            print('Accuracy of the network on the 10000 test images: %f %%' % (100 * correct / total))
            acc1.append(100 * correct / total)
print('Finished Training')


WithBN (batch normalization before activation function)

In [None]:
class Net(nn.Module):
    """Small CNN classifier with batch norm applied before each activation.

    ``fc1`` takes 1024 = 64 * 4 * 4 features, which is what a 1 x 28 x 28
    input produces after ``conv1``, ``conv2`` and the 2x2 max-pool.
    """

    def __init__(self):
        super().__init__()
        # 1 input channel -> 25 feature maps, 12x12 kernel, stride 2
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=25,
                               kernel_size=12, stride=2)
        # 25 -> 64 feature maps, 5x5 kernel, stride 1, padding 2 (keeps size)
        self.conv2 = nn.Conv2d(in_channels=25, out_channels=64,
                               kernel_size=5, padding=2)
        # one batch-norm layer per learnable layer, sized to its output
        self.bnorm1 = nn.BatchNorm2d(num_features=25)
        self.bnorm2 = nn.BatchNorm2d(num_features=64)
        self.bnorm3 = nn.BatchNorm1d(num_features=64)
        self.bnorm4 = nn.BatchNorm1d(num_features=10)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(in_features=1024, out_features=64)
        # p=0 makes dropout a pass-through in this variant
        self.dp = nn.Dropout(p=0)
        self.fc2 = nn.Linear(in_features=64, out_features=10)

    def forward(self, x):
        """Return log-softmax scores over the 10 classes for batch ``x``."""
        # conv -> BN -> ReLU, twice, then pool
        features = F.relu(self.bnorm1(self.conv1(x)))
        features = self.pool(F.relu(self.bnorm2(self.conv2(features))))
        # linear -> BN -> ReLU -> dropout
        hidden = torch.flatten(features, 1)
        hidden = self.dp(F.relu(self.bnorm3(self.fc1(hidden))))
        # linear -> BN -> log-softmax
        return F.log_softmax(self.bnorm4(self.fc2(hidden)), 1)


net = Net()

WithBN+0.001LR

In [None]:
import torch.optim as optim

# Experiment: batch-norm network, SGD with lr=0.001, momentum=0.9.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
loss3 = []  # mean training loss of each 50-mini-batch window
acc3 = []   # test accuracy (%) measured at the end of each window
for epoch in range(10):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 50 == 49:    # report every 50 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 50))

            loss3.append(running_loss / 50)
            running_loss = 0.0

            correct = 0
            total = 0
            # Evaluate in eval mode: in training mode the batch-norm layers
            # would normalise with the statistics of each test batch and
            # update their running statistics from test data.  Separate
            # names keep the training batch variables from being overwritten.
            net.eval()
            with torch.no_grad():  # no gradients needed for evaluation
                for test_images, test_labels in testloader:
                    test_outputs = net(test_images)
                    # the class with the highest score is the prediction
                    _, predicted = torch.max(test_outputs, 1)
                    total += test_labels.size(0)
                    correct += (predicted == test_labels).sum().item()
            net.train()  # back to training mode for the next mini-batch

            print('Accuracy of the network on the 10000 test images: %f %%' % (100 * correct / total))
            acc3.append(100 * correct / total)
print('Finished Training')


WithBN+0.01LR

In [None]:
import torch.optim as optim

# Experiment: batch-norm network, SGD with lr=0.01, momentum=0.9.
# Start from a freshly initialised network: without this the run would keep
# training the `net` already trained by the preceding lr=0.001 run, so the
# curves for the different learning rates would not be comparable.
net = Net()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
loss4 = []  # mean training loss of each 50-mini-batch window
acc4 = []   # test accuracy (%) measured at the end of each window
for epoch in range(10):  # loop over the dataset multiple times

    running_loss = 0.0

    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 50 == 49:    # report every 50 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 50))

            loss4.append(running_loss / 50)
            running_loss = 0.0

            correct = 0
            total = 0
            # Evaluate in eval mode: in training mode the batch-norm layers
            # would normalise with the statistics of each test batch and
            # update their running statistics from test data.  Separate
            # names keep the training batch variables from being overwritten.
            net.eval()
            with torch.no_grad():  # no gradients needed for evaluation
                for test_images, test_labels in testloader:
                    test_outputs = net(test_images)
                    # the class with the highest score is the prediction
                    _, predicted = torch.max(test_outputs, 1)
                    total += test_labels.size(0)
                    correct += (predicted == test_labels).sum().item()
            net.train()  # back to training mode for the next mini-batch

            print('Accuracy of the network on the 10000 test images: %f %%' % (100 * correct / total))
            acc4.append(100 * correct / total)
print('Finished Training')


In [None]:
import torch.optim as optim

# Experiment: batch-norm network, SGD with lr=0.1, momentum=0.9.
# Start from a freshly initialised network: without this the run would keep
# training the `net` left over from the preceding runs, so the curves for
# the different learning rates would not be comparable.
net = Net()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9)
loss2 = []  # mean training loss of each 50-mini-batch window
acc2 = []   # test accuracy (%) measured at the end of each window
for epoch in range(10):  # loop over the dataset multiple times

    running_loss = 0.0

    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 50 == 49:    # report every 50 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 50))

            loss2.append(running_loss / 50)
            running_loss = 0.0

            correct = 0
            total = 0
            # Evaluate in eval mode: in training mode the batch-norm layers
            # would normalise with the statistics of each test batch and
            # update their running statistics from test data.  Separate
            # names keep the training batch variables from being overwritten.
            net.eval()
            with torch.no_grad():  # no gradients needed for evaluation
                for test_images, test_labels in testloader:
                    test_outputs = net(test_images)
                    # the class with the highest score is the prediction
                    _, predicted = torch.max(test_outputs, 1)
                    total += test_labels.size(0)
                    correct += (predicted == test_labels).sum().item()
            net.train()  # back to training mode for the next mini-batch

            print('Accuracy of the network on the 10000 test images: %f %%' % (100 * correct / total))
            acc2.append(100 * correct / total)
print('Finished Training')


In [None]:
# Training-loss curves of the learning-rate / batch-norm runs, one point per
# 50-mini-batch window.
for series, tag in (
    (loss1, "withoutBN+0.001LR"),
    (loss3, "withBN+0.001LR"),
    (loss4, "withBN+0.01LR"),
    (loss2, "withBN+0.1LR"),
):
    plt.plot(series, label=tag)
plt.xlabel('Every 50 Steps')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Test-accuracy curves of the learning-rate / batch-norm runs, one point per
# 50-mini-batch window.
for series, tag in (
    (acc1, "withoutBN+0.001LR"),
    (acc3, "withBN+0.001LR"),
    (acc4, "withBN+0.01LR"),
    (acc2, "withBN+0.1LR"),
):
    plt.plot(series, label=tag)
plt.xlabel('Every 50 Steps')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

DP=0.2


In [None]:
class Net(nn.Module):
    """Batch-norm CNN classifier with dropout p=0.2 after the hidden layer.

    ``fc1`` takes 1024 = 64 * 4 * 4 features, which is what a 1 x 28 x 28
    input produces after ``conv1``, ``conv2`` and the 2x2 max-pool.
    """

    def __init__(self):
        super().__init__()
        # 1 input channel -> 25 feature maps, 12x12 kernel, stride 2
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=25,
                               kernel_size=12, stride=2)
        # 25 -> 64 feature maps, 5x5 kernel, stride 1, padding 2 (keeps size)
        self.conv2 = nn.Conv2d(in_channels=25, out_channels=64,
                               kernel_size=5, padding=2)
        # one batch-norm layer per learnable layer, sized to its output
        self.bnorm1 = nn.BatchNorm2d(num_features=25)
        self.bnorm2 = nn.BatchNorm2d(num_features=64)
        self.bnorm3 = nn.BatchNorm1d(num_features=64)
        self.bnorm4 = nn.BatchNorm1d(num_features=10)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(in_features=1024, out_features=64)
        self.dp = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(in_features=64, out_features=10)

    def forward(self, x):
        """Return log-softmax scores over the 10 classes for batch ``x``."""
        # conv -> BN -> ReLU, twice, then pool
        features = F.relu(self.bnorm1(self.conv1(x)))
        features = self.pool(F.relu(self.bnorm2(self.conv2(features))))
        # linear -> BN -> ReLU -> dropout
        hidden = torch.flatten(features, 1)
        hidden = self.dp(F.relu(self.bnorm3(self.fc1(hidden))))
        # linear -> BN -> log-softmax
        return F.log_softmax(self.bnorm4(self.fc2(hidden)), 1)


net = Net()

In [None]:
import torch.optim as optim

# Experiment: batch-norm network with dropout p=0.2, SGD with lr=0.01.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
loss5 = []  # mean training loss of each 50-mini-batch window
acc5 = []   # test accuracy (%) measured at the end of each window
for epoch in range(10):  # loop over the dataset multiple times

    running_loss = 0.0

    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 50 == 49:    # report every 50 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 50))

            loss5.append(running_loss / 50)
            running_loss = 0.0

            correct = 0
            total = 0
            # Evaluate in eval mode: in training mode dropout would randomly
            # zero activations at test time and the batch-norm layers would
            # use (and learn from) test-batch statistics.  Separate names
            # keep the training batch variables from being overwritten.
            net.eval()
            with torch.no_grad():  # no gradients needed for evaluation
                for test_images, test_labels in testloader:
                    test_outputs = net(test_images)
                    # the class with the highest score is the prediction
                    _, predicted = torch.max(test_outputs, 1)
                    total += test_labels.size(0)
                    correct += (predicted == test_labels).sum().item()
            net.train()  # back to training mode for the next mini-batch

            print('Accuracy of the network on the 10000 test images: %f %%' % (100 * correct / total))
            acc5.append(100 * correct / total)
print('Finished Training')

In [None]:
# Marks the start of the dropout p=0.5 experiment. This variable is not read
# anywhere in the visible notebook; the Net defined next hard-codes p=0.5.
Dp=0.5

In [None]:
class Net(nn.Module):
    """Batch-norm CNN classifier with dropout p=0.5 after the hidden layer.

    ``fc1`` takes 1024 = 64 * 4 * 4 features, which is what a 1 x 28 x 28
    input produces after ``conv1``, ``conv2`` and the 2x2 max-pool.
    """

    def __init__(self):
        super().__init__()
        # 1 input channel -> 25 feature maps, 12x12 kernel, stride 2
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=25,
                               kernel_size=12, stride=2)
        # 25 -> 64 feature maps, 5x5 kernel, stride 1, padding 2 (keeps size)
        self.conv2 = nn.Conv2d(in_channels=25, out_channels=64,
                               kernel_size=5, padding=2)
        # one batch-norm layer per learnable layer, sized to its output
        self.bnorm1 = nn.BatchNorm2d(num_features=25)
        self.bnorm2 = nn.BatchNorm2d(num_features=64)
        self.bnorm3 = nn.BatchNorm1d(num_features=64)
        self.bnorm4 = nn.BatchNorm1d(num_features=10)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(in_features=1024, out_features=64)
        self.dp = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(in_features=64, out_features=10)

    def forward(self, x):
        """Return log-softmax scores over the 10 classes for batch ``x``."""
        # conv -> BN -> ReLU, twice, then pool
        features = F.relu(self.bnorm1(self.conv1(x)))
        features = self.pool(F.relu(self.bnorm2(self.conv2(features))))
        # linear -> BN -> ReLU -> dropout
        hidden = torch.flatten(features, 1)
        hidden = self.dp(F.relu(self.bnorm3(self.fc1(hidden))))
        # linear -> BN -> log-softmax
        return F.log_softmax(self.bnorm4(self.fc2(hidden)), 1)


net = Net()

In [None]:
import torch.optim as optim

# Experiment: batch-norm network with dropout p=0.5, SGD with lr=0.01.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
loss6 = []  # mean training loss of each 50-mini-batch window
acc6 = []   # test accuracy (%) measured at the end of each window
for epoch in range(10):  # loop over the dataset multiple times

    running_loss = 0.0

    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 50 == 49:    # report every 50 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 50))

            loss6.append(running_loss / 50)
            running_loss = 0.0

            correct = 0
            total = 0
            # Evaluate in eval mode: in training mode dropout would randomly
            # zero activations at test time and the batch-norm layers would
            # use (and learn from) test-batch statistics.  Separate names
            # keep the training batch variables from being overwritten.
            net.eval()
            with torch.no_grad():  # no gradients needed for evaluation
                for test_images, test_labels in testloader:
                    test_outputs = net(test_images)
                    # the class with the highest score is the prediction
                    _, predicted = torch.max(test_outputs, 1)
                    total += test_labels.size(0)
                    correct += (predicted == test_labels).sum().item()
            net.train()  # back to training mode for the next mini-batch

            print('Accuracy of the network on the 10000 test images: %f %%' % (100 * correct / total))
            acc6.append(100 * correct / total)
print('Finished Training')

In [None]:
# Training-loss curves of the dropout runs, one point per 50-mini-batch
# window.
for series, tag in (
    (loss4, "withBN+0Dropout"),
    (loss5, "withBN+0.2Dropout"),
    (loss6, "withBN+0.5Dropout"),
):
    plt.plot(series, label=tag)
plt.xlabel('Every 50 Steps')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Test-accuracy curves of the dropout runs, one point per 50-mini-batch
# window.
for series, tag in (
    (acc4, "withBN+0Dropout"),
    (acc5, "withBN+0.2Dropout"),
    (acc6, "withBN+0.5Dropout"),
):
    plt.plot(series, label=tag)
plt.xlabel('Every 50 Steps')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

BN after activation function

In [None]:
class Net(nn.Module):
    """CNN classifier with batch norm applied after each activation.

    ``fc1`` takes 1024 = 64 * 4 * 4 features, which is what a 1 x 28 x 28
    input produces after ``conv1``, ``conv2`` and the 2x2 max-pool.
    """

    def __init__(self):
        super().__init__()
        # 1 input channel -> 25 feature maps, 12x12 kernel, stride 2
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=25,
                               kernel_size=12, stride=2)
        # 25 -> 64 feature maps, 5x5 kernel, stride 1, padding 2 (keeps size)
        self.conv2 = nn.Conv2d(in_channels=25, out_channels=64,
                               kernel_size=5, padding=2)
        # one batch-norm layer per learnable layer, sized to its output
        self.bnorm1 = nn.BatchNorm2d(num_features=25)
        self.bnorm2 = nn.BatchNorm2d(num_features=64)
        self.bnorm3 = nn.BatchNorm1d(num_features=64)
        self.bnorm4 = nn.BatchNorm1d(num_features=10)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(in_features=1024, out_features=64)
        # p=0 makes dropout a pass-through in this variant
        self.dp = nn.Dropout(p=0)
        self.fc2 = nn.Linear(in_features=64, out_features=10)

    def forward(self, x):
        """Return batch-normalised log-softmax scores for batch ``x``.

        The final batch norm runs after ``log_softmax``, so the returned
        values are not guaranteed to be normalised log-probabilities.
        """
        # conv -> ReLU -> BN, twice, then pool
        features = self.bnorm1(F.relu(self.conv1(x)))
        features = self.pool(self.bnorm2(F.relu(self.conv2(features))))
        # linear -> ReLU -> BN -> dropout
        hidden = torch.flatten(features, 1)
        hidden = self.dp(self.bnorm3(F.relu(self.fc1(hidden))))
        # linear -> log-softmax -> BN
        return self.bnorm4(F.log_softmax(self.fc2(hidden), 1))


net = Net()

In [None]:
import torch.optim as optim

# Experiment: network with batch norm after the activations, SGD lr=0.01.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
loss7 = []  # mean training loss of each 50-mini-batch window
acc7 = []   # test accuracy (%) measured at the end of each window
for epoch in range(10):  # loop over the dataset multiple times

    running_loss = 0.0

    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 50 == 49:    # report every 50 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 50))

            loss7.append(running_loss / 50)
            running_loss = 0.0

            correct = 0
            total = 0
            # Evaluate in eval mode: in training mode the batch-norm layers
            # would normalise with the statistics of each test batch and
            # update their running statistics from test data.  Separate
            # names keep the training batch variables from being overwritten.
            net.eval()
            with torch.no_grad():  # no gradients needed for evaluation
                for test_images, test_labels in testloader:
                    test_outputs = net(test_images)
                    # the class with the highest score is the prediction
                    _, predicted = torch.max(test_outputs, 1)
                    total += test_labels.size(0)
                    correct += (predicted == test_labels).sum().item()
            net.train()  # back to training mode for the next mini-batch

            print('Accuracy of the network on the 10000 test images: %f %%' % (100 * correct / total))
            acc7.append(100 * correct / total)
print('Finished Training')

In [None]:
# Training-loss curves comparing batch norm before vs. after the activation,
# one point per 50-mini-batch window.
for series, tag in (
    (loss4, "withBN+before Activation"),
    (loss7, "withBN+after Activation"),
):
    plt.plot(series, label=tag)
plt.xlabel('Every 50 Steps')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Test-accuracy curves comparing batch norm before vs. after the activation,
# one point per 50-mini-batch window.
for series, tag in (
    (acc4, "withBN+before Activation"),
    (acc7, "withBN+after Activation"),
):
    plt.plot(series, label=tag)
plt.xlabel('Every 50 Steps')
plt.ylabel('Accuracy')
plt.legend()
plt.show()