# Lecture 44: Transfer Learning GoogLeNet

In [None]:
%matplotlib inline
import copy
import time
import tqdm
import torch
import numpy as np
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torchvision import transforms,datasets, models

# Show the installed PyTorch version (this notebook was last updated for PyTorch 1.0.0)
print(torch.__version__)

## Load Data:

In [None]:
# inception_v3 in PyTorch requires inputs of size 3x299x299, so every image is
# resized to 299 before being converted to a tensor.
# The ImageNet-pretrained weights loaded further down additionally expect inputs
# normalised with the ImageNet channel statistics, hence the Normalize step.
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]
apply_transform = transforms.Compose([
    transforms.Resize(299),
    transforms.ToTensor(),
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
])
# Batchsize > 1 (presumably because batch-norm layers in training mode need more
# than one sample per batch -- TODO confirm)
BatchSize = 2

# Training split: reshuffled every epoch
trainset = datasets.CIFAR10(root='./CIFAR10', train=True, download=True, transform=apply_transform)
trainLoader = torch.utils.data.DataLoader(trainset, batch_size=BatchSize,
                                          shuffle=True, num_workers=4) # Creating dataloader

# Test split: fixed order
testset = datasets.CIFAR10(root='./CIFAR10', train=False, download=True, transform=apply_transform)
testLoader = torch.utils.data.DataLoader(testset, batch_size=BatchSize,
                                         shuffle=False, num_workers=4) # Creating dataloader

In [None]:
# Report how many samples each split holds
for split_name, split_loader in (('train', trainLoader), ('test', testLoader)):
    print('No. of samples in {} set: {}'.format(split_name, len(split_loader.dataset)))

## Define network architecture

In [None]:
# Three Inception-v3 instances, one per training strategy compared in this notebook
build_inception = models.inception_v3

net1 = build_inception()                 # Training from scratch
net2 = build_inception(pretrained=True)  # End-to-end fine-tuning
net3 = build_inception(pretrained=True)  # Training only the last layer

# Dump the layer structure of the architecture
print(net1)

In [None]:
# Counting the parameters of net1: every entry of named_parameters() is included,
# and no parameter of net1 has been frozen at this point.
totalParams = 0
for name,params in net1.named_parameters():
    print(name,'-->',params.size())
    totalParams += params.numel() # number of scalar entries in this tensor
print('Total number of parameters: '+str(totalParams))

In [None]:
# Swap the final fully-connected layers (main head and auxiliary head) of every
# network for freshly initialised 10-class layers.
numClasses = 10
for model in (net1, net2, net3):
    model.AuxLogits.fc = nn.Linear(768, numClasses)  # auxiliary classifier head
    model.fc = nn.Linear(2048, numClasses)           # main classifier head

In [None]:
# Copying initial weights for visualization.
# The copies are independent of the networks (deepcopy), so later training does
# not alter them. They are taken before the networks are moved to the compute
# device in a later cell.
# Model 1
init_weightConv1_1 = copy.deepcopy(net1.Conv2d_1a_3x3.conv.weight.data) # 1st conv layer
init_weightConv2_1 = copy.deepcopy(net1.Conv2d_2a_3x3.conv.weight.data) # 2nd conv layer
# Model 2
init_weightConv1_2 = copy.deepcopy(net2.Conv2d_1a_3x3.conv.weight.data) # 1st conv layer
init_weightConv2_2 = copy.deepcopy(net2.Conv2d_2a_3x3.conv.weight.data) # 2nd conv layer
# Model 3
init_weightConv1_3 = copy.deepcopy(net3.Conv2d_1a_3x3.conv.weight.data) # 1st conv layer
init_weightConv2_3 = copy.deepcopy(net3.Conv2d_2a_3x3.conv.weight.data) # 2nd conv layer

In [None]:
# Pick the compute device: GPU when one is available, CPU otherwise

use_gpu = torch.cuda.is_available()
# use_gpu = False # Uncomment in case of GPU memory error
print('GPU is available!' if use_gpu else 'GPU is not available!')
device = "cuda" if use_gpu else "cpu"

# Move all three networks to the selected device
net1, net2, net3 = net1.to(device), net2.to(device), net3.to(device)

# Model 3 trains only its main classifier: switch gradients off for every
# parameter (saves memory and computation) ...
for frozen_param in net3.parameters():
    frozen_param.requires_grad = False
# ... then switch them back on for the final fully-connected layer
for head_param in net3.fc.parameters():
    head_param.requires_grad = True

## Define loss function and optimizer

In [None]:
learningRate = 1e-4 # shared by the three Adam optimizers

# Negative log-likelihood; it is applied to log-softmax outputs in the training cell
criterion = nn.NLLLoss()

# Models 1 and 2 update every parameter; model 3 updates only its last fc layer
optimizer1 = optim.Adam(net1.parameters(), lr=learningRate)
optimizer2 = optim.Adam(net2.parameters(), lr=learningRate)
optimizer3 = optim.Adam(net3.fc.parameters(), lr=learningRate)

## Train the network

In [None]:
iterations = 10 # Number of training epochs

# Normalisers for the per-epoch averages. They are read from the loaders instead
# of being hard-coded, so the averages stay correct if the dataset or BatchSize
# is changed.
numTrainSamples = len(trainLoader.dataset)
numTrainBatches = len(trainLoader)
numTestSamples = len(testLoader.dataset)
numTestBatches = len(testLoader)

# Per-epoch histories (several of these lists are plotted by later cells)
# Model 1
trainLoss1 = [] # List for saving main loss per epoch
trainAuxLoss1 = [] # List for saving auxiliary loss per epoch
trainTotalLoss1 = [] # List for saving total loss per epoch
trainAcc1 = [] # List for saving training accuracy per epoch
testLoss1 = [] # List for saving testing loss per epoch
testAcc1 = [] # List for saving testing accuracy per epoch
# Model 2
trainLoss2 = [] # List for saving main loss per epoch
trainAuxLoss2 = [] # List for saving auxiliary loss per epoch
trainTotalLoss2 = [] # List for saving total loss per epoch
trainAcc2 = [] # List for saving training accuracy per epoch
testLoss2 = [] # List for saving testing loss per epoch
testAcc2 = [] # List for saving testing accuracy per epoch
# Model 3
trainTotalLoss3 = [] # List for saving total loss per epoch
trainAcc3 = [] # List for saving training accuracy per epoch
testLoss3 = [] # List for saving testing loss per epoch
testAcc3 = [] # List for saving testing accuracy per epoch


def _train_step(net, optimizer, inputs, labels, use_aux):
    """Run one optimisation step of `net` on a single batch.

    Args:
        net: network in training mode; its forward pass must return
            (main outputs, auxiliary outputs).
        optimizer: optimizer holding the parameters to update.
        inputs, labels: one batch, already on the compute device.
        use_aux: when True the auxiliary-head loss is added to the main loss
            before back-propagation; when False only the main loss is used.

    Returns:
        (main loss, auxiliary loss, total loss, number of correct predictions)
        as Python numbers. The auxiliary loss is 0.0 when `use_aux` is False.
    """
    optimizer.zero_grad() # Initialize gradients to zero
    outputs, aux_outputs = net(inputs) # Feed-forward
    loss = criterion(F.log_softmax(outputs, dim=1), labels)
    if use_aux:
        aux_loss = criterion(F.log_softmax(aux_outputs, dim=1), labels)
        total_loss = loss + aux_loss
        aux_value = aux_loss.item()
    else:
        total_loss = loss
        aux_value = 0.0
    total_loss.backward() # Backpropagate loss and compute gradients
    optimizer.step() # Update the network parameters
    num_correct = (outputs.argmax(dim=1) == labels).sum().item()
    return loss.item(), aux_value, total_loss.item(), num_correct


def _eval_step(net, inputs, labels):
    """Return (loss, number of correct predictions) of `net` on one batch.

    `net` must be in evaluation mode, where its forward pass returns only the
    main outputs. The caller is responsible for disabling gradients.
    """
    outputs = net(inputs)
    loss = criterion(F.log_softmax(outputs, dim=1), labels)
    num_correct = (outputs.argmax(dim=1) == labels).sum().item()
    return loss.item(), num_correct


def _plot_history(fig_num, curves, ylabel, first_epoch):
    """Draw per-epoch curves on figure `fig_num` and return the figure.

    `curves` is a sequence of (values, format string, label). The legend and
    axis labels are only added when `first_epoch` is True, because the same
    figure is drawn on again after every epoch.
    """
    fig = plt.figure(fig_num)
    for values, fmt, label in curves:
        plt.plot(range(len(values)), values, fmt, label=label)
    if first_epoch:
        plt.legend(loc='upper left')
        plt.xlabel('Epochs')
        plt.ylabel(ylabel)
    return fig


start = time.time()
for epoch in range(iterations):
    epochStart = time.time()
    # Running sums over the training batches of this epoch
    runningLoss1 = runningAuxLoss1 = runningTotalLoss1 = 0.0 # Model 1
    runningLoss2 = runningAuxLoss2 = runningTotalLoss2 = 0.0 # Model 2
    runningLoss3 = runningTotalLoss3 = 0.0                   # Model 3
    running_correct1 = running_correct2 = running_correct3 = 0

    net1.train() # For training
    net2.train()
    # NOTE(review): net3 is also put in training mode although only its fc layer
    # is optimised, so its batch-norm/dropout layers behave as in training --
    # confirm this is intended.
    net3.train()

    for inputs, labels in tqdm.tqdm_notebook(trainLoader):
        inputs, labels = inputs.to(device), labels.to(device)

        # Each model does forward/backward/update in turn, so only one autograd
        # graph is alive at a time.
        # Model 1: main + auxiliary loss
        loss1, aux_loss1, total_loss1, correct1 = _train_step(net1, optimizer1, inputs, labels, True)
        # Model 2: main + auxiliary loss
        loss2, aux_loss2, total_loss2, correct2 = _train_step(net2, optimizer2, inputs, labels, True)
        # Model 3: training only the last fc layer, main loss only
        loss3, aux_loss3, total_loss3, correct3 = _train_step(net3, optimizer3, inputs, labels, False)

        # Accumulate loss and correct predictions per batch
        runningLoss1 += loss1
        runningAuxLoss1 += aux_loss1
        runningTotalLoss1 += total_loss1
        running_correct1 += correct1
        #--------------------------------------
        runningLoss2 += loss2
        runningAuxLoss2 += aux_loss2
        runningTotalLoss2 += total_loss2
        running_correct2 += correct2
        #--------------------------------------
        runningLoss3 += loss3
        runningTotalLoss3 += total_loss3
        running_correct3 += correct3

    # Training averages: accuracy per sample, losses per batch
    avgTrainAcc1 = 100*float(running_correct1)/numTrainSamples
    avgTrainLoss1 = runningLoss1/numTrainBatches
    avgAuxLoss1 = runningAuxLoss1/numTrainBatches
    avgTotalLoss1 = runningTotalLoss1/numTrainBatches
    trainAcc1.append(avgTrainAcc1)
    trainLoss1.append(avgTrainLoss1)
    trainAuxLoss1.append(avgAuxLoss1)
    trainTotalLoss1.append(avgTotalLoss1)
    #------------------------------------
    avgTrainAcc2 = 100*float(running_correct2)/numTrainSamples
    avgTrainLoss2 = runningLoss2/numTrainBatches
    avgAuxLoss2 = runningAuxLoss2/numTrainBatches
    avgTotalLoss2 = runningTotalLoss2/numTrainBatches
    trainAcc2.append(avgTrainAcc2)
    trainLoss2.append(avgTrainLoss2)
    trainAuxLoss2.append(avgAuxLoss2)
    trainTotalLoss2.append(avgTotalLoss2)
    #------------------------------------
    avgTrainAcc3 = 100*float(running_correct3)/numTrainSamples
    avgTotalLoss3 = runningTotalLoss3/numTrainBatches
    trainAcc3.append(avgTrainAcc3)
    trainTotalLoss3.append(avgTotalLoss3)

    # Evaluating performance on test set for each epoch
    net1.eval() # For testing [Affects batch-norm and dropout layers (if any)]
    net2.eval()
    net3.eval()
    runningLoss1 = runningLoss2 = runningLoss3 = 0.0
    running_correct1 = running_correct2 = running_correct3 = 0

    with torch.no_grad():
        for inputs, labels in tqdm.tqdm_notebook(testLoader):
            inputs, labels = inputs.to(device), labels.to(device)
            # Model 1
            loss1, correct1 = _eval_step(net1, inputs, labels)
            runningLoss1 += loss1
            running_correct1 += correct1
            # Model 2
            loss2, correct2 = _eval_step(net2, inputs, labels)
            runningLoss2 += loss2
            running_correct2 += correct2
            # Model 3
            loss3, correct3 = _eval_step(net3, inputs, labels)
            runningLoss3 += loss3
            running_correct3 += correct3

    # Test averages: accuracy per sample, loss per batch
    avgTestLoss1 = runningLoss1/numTestBatches
    avgTestAcc1 = 100*float(running_correct1)/numTestSamples
    testLoss1.append(avgTestLoss1)
    testAcc1.append(avgTestAcc1)
    #---------------------------------------
    avgTestLoss2 = runningLoss2/numTestBatches
    avgTestAcc2 = 100*float(running_correct2)/numTestSamples
    testLoss2.append(avgTestLoss2)
    testAcc2.append(avgTestAcc2)
    #---------------------------------------
    avgTestLoss3 = runningLoss3/numTestBatches
    avgTestAcc3 = 100*float(running_correct3)/numTestSamples
    testLoss3.append(avgTestLoss3)
    testAcc3.append(avgTestAcc3)

    firstEpoch = (epoch == 0)
    # Plotting auxiliary training loss vs Epochs: Model 1 against Model 2
    fig1 = _plot_history(1, [(trainAuxLoss1, 'r-', 'Model 1'),
                             (trainAuxLoss2, 'g-', 'Model 2')], 'Auxilalry loss', firstEpoch)
    # Plotting training/testing loss and accuracy vs Epochs: Model 1
    fig2 = _plot_history(2, [(trainTotalLoss1, 'r-', 'train'),
                             (testLoss1, 'g-', 'test')], 'Loss', firstEpoch)
    fig3 = _plot_history(3, [(trainAcc1, 'r-', 'train'),
                             (testAcc1, 'g-', 'test')], 'Accuracy', firstEpoch)
    # Plotting training/testing loss and accuracy vs Epochs: Model 2
    fig4 = _plot_history(4, [(trainTotalLoss2, 'r-', 'train'),
                             (testLoss2, 'g-', 'test')], 'Loss', firstEpoch)
    fig5 = _plot_history(5, [(trainAcc2, 'r-', 'train'),
                             (testAcc2, 'g-', 'test')], 'Accuracy', firstEpoch)
    # Plotting training/testing loss and accuracy vs Epochs: Model 3
    fig6 = _plot_history(6, [(trainTotalLoss3, 'r-', 'train'),
                             (testLoss3, 'g-', 'test')], 'Loss', firstEpoch)
    fig7 = _plot_history(7, [(trainAcc3, 'r-', 'train'),
                             (testAcc3, 'g-', 'test')], 'Accuracy', firstEpoch)

    epochEnd = time.time()-epochStart
    print('Iteration: {:.0f} /{:.0f} Model 1  ;  Training Loss: {:.6f} ; Testing Acc: {:.3f}'\
          .format(epoch + 1,iterations,avgTrainLoss1,avgTestAcc1))
    print('Iteration: {:.0f} /{:.0f} Model 2  ;  Training Loss: {:.6f} ; Testing Acc: {:.3f} '\
          .format(epoch + 1,iterations,avgTrainLoss2,avgTestAcc2))
    print('Iteration: {:.0f} /{:.0f} Model 3  ;  Training Loss: {:.6f} ; Testing Acc: {:.3f} '\
          .format(epoch + 1,iterations,avgTotalLoss3,avgTestAcc3))
    print('Time consumed: {:.0f}m {:.0f}s'.format(epochEnd//60,epochEnd%60))
end = time.time()-start
print('Training completed in {:.0f}m {:.0f}s'.format(end//60,end%60))


In [None]:
# Side-by-side comparison of the three models over all completed epochs
epochAxis = range(epoch+1)
legendSpot = 'upper left'

# Training loss (main loss for models 1 and 2, total loss for model 3)
fig8 = plt.figure(8)
for history, lineFmt, tag in ((trainLoss1, 'r-', 'model1'),
                              (trainLoss2, 'g-', 'model2'),
                              (trainTotalLoss3, 'b-', 'model3')):
    plt.plot(epochAxis, history, lineFmt, label=tag)
plt.legend(loc=legendSpot)
plt.xlabel('Epochs')
plt.ylabel('Train Loss')

# Test loss
fig9 = plt.figure(9)
for history, lineFmt, tag in ((testLoss1, 'r-', 'model1'),
                              (testLoss2, 'g-', 'model2'),
                              (testLoss3, 'b-', 'model3')):
    plt.plot(epochAxis, history, lineFmt, label=tag)
plt.legend(loc=legendSpot)
plt.xlabel('Epochs')
plt.ylabel('Test Loss')

# Test accuracy
fig10 = plt.figure(10)
for history, lineFmt, tag in ((testAcc1, 'r-', 'model1'),
                              (testAcc2, 'g-', 'model2'),
                              (testAcc3, 'b-', 'model3')):
    plt.plot(epochAxis, history, lineFmt, label=tag)
plt.legend(loc=legendSpot)
plt.xlabel('Epochs')
plt.ylabel('Test Accuracy')

In [None]:
# Snapshot the trained kernels of the first two conv layers of every model
def _snapshot_kernel(conv_block):
    """Deep-copy the conv kernel of `conv_block`; the copy is moved to the CPU when the GPU is in use."""
    kernel = copy.deepcopy(conv_block.conv.weight.data)
    return kernel.cpu() if use_gpu else kernel

# Model 1
trained_weightConv1_1 = _snapshot_kernel(net1.Conv2d_1a_3x3)
trained_weightConv2_1 = _snapshot_kernel(net1.Conv2d_2a_3x3)
# Model 2
trained_weightConv1_2 = _snapshot_kernel(net2.Conv2d_1a_3x3)
trained_weightConv2_2 = _snapshot_kernel(net2.Conv2d_2a_3x3)
# Model 3
trained_weightConv1_3 = _snapshot_kernel(net3.Conv2d_1a_3x3)
trained_weightConv2_3 = _snapshot_kernel(net3.Conv2d_2a_3x3)

## Visualization of weights

In [None]:
# Helper used by the weight-visualisation cells
def imshow(img, strlabel):
    """Show the absolute values of an image tensor in a new, titled figure.

    Args:
        img: tensor convertible with ``.numpy()``; its first axis is moved last
            before display, so it is treated as channels-first with three axes.
        strlabel: title placed above the figure.

    Side effect: sets the global matplotlib default figure size to 10 x 10.
    """
    magnitudes = np.abs(img.numpy())
    # Update the default figure size before the figure is created
    default_size = plt.rcParams["figure.figsize"]
    default_size[:2] = [10, 10]
    plt.rcParams["figure.figsize"] = default_size
    plt.figure()
    plt.title(strlabel)
    # matplotlib wants channels last
    plt.imshow(np.transpose(magnitudes, (1, 2, 0)))

In [None]:
# Model 1, 1st convolutional layer: kernels before training, after training, and their difference
gridOptions = dict(nrow=8, normalize=True)
for kernels, caption in ((init_weightConv1_1, 'Initial weights: conv1'),
                         (trained_weightConv1_1, 'Trained weights: conv1'),
                         (init_weightConv1_1-trained_weightConv1_1, 'Difference of weights: conv1')):
    imshow(torchvision.utils.make_grid(kernels, **gridOptions), caption)

In [None]:
# Model 1, 2nd convolutional layer: only the first filter is shown, with each of
# its input-channel slices drawn as a separate single-channel tile
gridOptions = dict(nrow=8, normalize=True)
initSlices = init_weightConv2_1[0].unsqueeze(1)
trainedSlices = trained_weightConv2_1[0].unsqueeze(1)
for kernels, caption in ((initSlices, 'Initial weights: conv2'),
                         (trainedSlices, 'Trained weights: conv2'),
                         (initSlices-trainedSlices, 'Difference of weights: conv2')):
    imshow(torchvision.utils.make_grid(kernels, **gridOptions), caption)

In [None]:
# Model 2, 1st convolutional layer: kernels before training, after training, and their difference
gridOptions = dict(nrow=8, normalize=True)
for kernels, caption in ((init_weightConv1_2, 'Initial weights: conv1'),
                         (trained_weightConv1_2, 'Trained weights: conv1'),
                         (init_weightConv1_2-trained_weightConv1_2, 'Difference of weights: conv1')):
    imshow(torchvision.utils.make_grid(kernels, **gridOptions), caption)

In [None]:
# Model 2, 2nd convolutional layer: only the first filter is shown, with each of
# its input-channel slices drawn as a separate single-channel tile
gridOptions = dict(nrow=8, normalize=True)
initSlices = init_weightConv2_2[0].unsqueeze(1)
trainedSlices = trained_weightConv2_2[0].unsqueeze(1)
for kernels, caption in ((initSlices, 'Initial weights: conv2'),
                         (trainedSlices, 'Trained weights: conv2'),
                         (initSlices-trainedSlices, 'Difference of weights: conv2')):
    imshow(torchvision.utils.make_grid(kernels, **gridOptions), caption)

In [None]:
# Model 3, 1st convolutional layer: kernels before training, after training, and their difference
gridOptions = dict(nrow=8, normalize=True)
for kernels, caption in ((init_weightConv1_3, 'Initial weights: conv1'),
                         (trained_weightConv1_3, 'Trained weights: conv1'),
                         (init_weightConv1_3-trained_weightConv1_3, 'Difference of weights: conv1')):
    imshow(torchvision.utils.make_grid(kernels, **gridOptions), caption)

In [None]:
# Model 3, 2nd convolutional layer: only the first filter is shown, with each of
# its input-channel slices drawn as a separate single-channel tile
gridOptions = dict(nrow=8, normalize=True)
initSlices = init_weightConv2_3[0].unsqueeze(1)
trainedSlices = trained_weightConv2_3[0].unsqueeze(1)
for kernels, caption in ((initSlices, 'Initial weights: conv2'),
                         (trainedSlices, 'Trained weights: conv2'),
                         (initSlices-trainedSlices, 'Difference of weights: conv2')):
    imshow(torchvision.utils.make_grid(kernels, **gridOptions), caption)