In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import cv2
import time
import math

In [3]:
from matplotlib.lines import Line2D      

def plot_grad_flow(named_parameters, epoch, loss, acc, savepath):
    '''Plots the gradients flowing through different layers in the net during training.
    Can be used for checking for possible gradient vanishing / exploding problems.
    
    Usage: Plug this function in Trainer class after loss.backwards() as 
    "plot_grad_flow(self.model.named_parameters())" to visualize the gradient flow'''
    ave_grads = []
    max_grads= []
    layers = []
    for n, p in named_parameters:
        if(p.requires_grad) and ("bias" not in n):

            try:
                ave_grads.append(p.grad.abs().mean())
                max_grads.append(p.grad.abs().max())
                layers.append(n)
            except:
                continue
#                 ave_grads.append(0.)
#                 max_grads.append(0.)
#                 layers.append(n)
                
    plt.figure(figsize=(10,10))
    plt.bar(np.arange(len(max_grads)), max_grads, alpha=0.5, lw=1, color="c")
    plt.bar(np.arange(len(max_grads)), ave_grads, alpha=0.5, lw=1, color="b")
    plt.hlines(0, 0, len(ave_grads)+1, lw=2, color="k" )
    plt.xticks(range(0,len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(left=0, right=len(ave_grads))
    plt.ylim(bottom = -0.001, top=0.02) # zoom in on the lower gradient regions
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow | Epoch " + str(epoch) + " | ls: " + str(loss) + " | ac: " + str(acc))
    plt.tight_layout()
    plt.grid(True)
    plt.legend([Line2D([0], [0], color="c", lw=4),
                Line2D([0], [0], color="b", lw=4),
                Line2D([0], [0], color="k", lw=4)], ['max-gradient', 'mean-gradient', 'zero-gradient'])
    
    if not savepath is None:
        plt.savefig(savepath)
        
    plt.close()
    
    

In [4]:
import torch
from torchvision import transforms, datasets

train_data_transform = transforms.Compose([
        
#         transforms.Lambda(lbp_transform),
#         transforms.Lambda(combine_lbp_hog),
#         transforms.ToPILImage(),
        # transforms.RandomSizedCrop(224),
        transforms.RandomHorizontalFlip(),
#         transforms.RandomVerticalFlip(),
        transforms.RandomRotation(degrees=150),
        transforms.Grayscale(num_output_channels=1),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])

test_data_transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])

train_fer = datasets.ImageFolder(root='/tf/data/Quan/fer2013/cleaned_data/fer2013-clean/Training', transform=train_data_transform)
val_fer = datasets.ImageFolder(root='/tf/data/Quan/fer2013/cleaned_data/fer2013-clean/PublicTest', transform=test_data_transform)
test_fer = datasets.ImageFolder(root='/tf/data/Quan/fer2013/cleaned_data/fer2013-clean/PrivateTest/', transform=test_data_transform)

batch_size = 128
train_loader = torch.utils.data.DataLoader(train_fer,
                                             batch_size=batch_size, shuffle=True,
                                             num_workers=8)
val_loader = torch.utils.data.DataLoader(val_fer,
                                             batch_size=batch_size, shuffle=True,
                                             num_workers=8)
test_loader = torch.utils.data.DataLoader(test_fer,
                                             batch_size=batch_size, shuffle=True,
                                             num_workers=8)

In [5]:
class Attention(nn.Module):
    
    def __init__(self, feature_depth, target_depth, emb_dim):
        super(Attention, self).__init__()
        self.emb_dim = emb_dim
        self.iconv1 = nn.Conv2d(feature_depth, emb_dim, kernel_size=1)
        self.iconv2 = nn.Conv2d(target_depth, emb_dim, kernel_size=1)
        self.iconv3 = nn.Conv2d(target_depth, emb_dim, kernel_size=1)
        self.iconv_out = nn.Conv2d(emb_dim, target_depth, kernel_size=1)
        self.bn_out = nn.BatchNorm2d(target_depth)
        
        # ensure the init stage is the indentical mapping
#         nn.init.zeros_(self.iconv_out.weight)
#         nn.init.zeros_(self.iconv_out.bias)
#         nn.init.zeros_(self.bn_out.weight)
#         nn.init.zeros_(self.bn_out.bias)
        
    def forward(self, x1, x2):
        x1_emb = self.iconv1(x1)
        _x1 = nn.Flatten(-2)(x1_emb)
        
        x2_emb = self.iconv2(x2)
        _x2 = nn.Flatten(-2)(x2_emb)
        
        x3_emb = self.iconv3(x2)
        _x3 = nn.Flatten(-2)(x3_emb)
        
        QK = torch.matmul(_x1, _x2.permute(0,2,1))
        QK = QK - QK.mean()
        QK = QK / QK.std()
#         QK = QK / math.sqrt(_x1.size(2))
#         QK = nn.Softmax()(QK)
        
#         if self.training is False:
#             print(QK.mean(), QK.var())

        x_out = torch.matmul(QK, _x3)
        x_out = x_out.reshape(x3_emb.size())
        
        x_out = self.iconv_out(x_out)
        x_out = self.bn_out(x_out)
        x_out = nn.LeakyReLU()(x_out)
        
        x_out = x_out + x2 #residual
        
        return x_out

In [6]:
class SVGG(nn.Module):
    
    def __init__(self, **kwargs):
        super(SVGG, self).__init__()

        self.device = torch.device('cpu') if not 'device' in kwargs else kwargs['device']

        self.sv0 = SVblock(1, 64).to(self.device)
        self.sv1 = SVblock(64, 128).to(self.device)
        self.sv2 = SVblock(128, 256).to(self.device)
        self.sv3 = SVblock(256, 512).to(self.device)
        self.sv4 = SVblock(512, 1024).to(self.device)

        self.wf0 = Attention(64, 64, emb_dim=32).to(self.device)
        self.wf1 = Attention(128, 128, emb_dim=64).to(self.device)
        self.wf2 = Attention(256, 256, emb_dim=128).to(self.device)
        self.wf3 = Attention(512, 512, emb_dim=256).to(self.device)
        self.wf4 = Attention(1024, 1024, emb_dim=512).to(self.device)
        
        self.iconv_out = nn.Conv2d(1024, 1024, kernel_size=1)
        self.bn_out = nn.BatchNorm2d(1024)

        self.fc0 = nn.Linear(2560, 2560)
        self.bn0 = nn.BatchNorm1d(2560)
        self.fc1 = nn.Linear(2560, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 7)
        self.bn2 = nn.BatchNorm1d(7)
        
    
    def forward(self, x):
        x0 = self.sv0(x)
        x_wf0 = self.wf0(x0, x0)

        x1 = self.sv1(x_wf0)
        x_wf1 = self.wf1(x1, x1)

        x2 = self.sv2(x_wf1)
        x_wf2 = self.wf2(x2, x2)

        x3 = self.sv3(x_wf2)
        x_wf3 = self.wf3(x3, x3)

        x4 = self.sv4(x_wf3)
        x_wf4 = self.wf4(x4, x4)
        
        x_wf4 = self.iconv_out(x_wf4)
        x_wf4 = self.bn_out(x_wf4)

        x_wf4 = nn.AvgPool2d(kernel_size=(x_wf4.size(-2), x_wf4.size(-1)))(x_wf4)
        x_wf4 = nn.Flatten()(x_wf4)
        # additional output from sv3 and sv4
        x_sub_3 = nn.AvgPool2d(kernel_size=(x3.size(-2), x3.size(-1)))(x3)
        x_sub_3 = nn.Flatten()(x_sub_3)
        x_sub_4 = nn.AvgPool2d(kernel_size=(x4.size(-2), x4.size(-1)))(x4)
        x_sub_4 = nn.Flatten()(x_sub_4)
        
        x = torch.cat([x_wf4, x_sub_3, x_sub_4], dim=1)

        x = self.fc0(x)
        x = self.bn0(x)
        x = nn.LeakyReLU()(x)
        x = self.fc1(x)
        x = self.bn1(x)
        x = nn.LeakyReLU()(x)
        x = self.fc2(x)
        x = self.bn2(x)
        
        return x

In [7]:
class SVGG(nn.Module):
    
    def __init__(self, **kwargs):
        super(SVGG, self).__init__()

        self.device = torch.device('cpu') if not 'device' in kwargs else kwargs['device']

        self.sv0 = SVblock(1, 64).to(self.device)
        self.sv1 = SVblock(64, 128).to(self.device)
        self.sv2 = SVblock(128, 256).to(self.device)
        self.sv3 = SVblock(256, 512).to(self.device)
        self.sv4 = SVblock(512, 1024).to(self.device)

        self.wf0 = Attention(64, 64, emb_dim=32).to(self.device)
        self.wf1 = Attention(128, 128, emb_dim=64).to(self.device)
        self.wf2 = Attention(256, 256, emb_dim=128).to(self.device)
        self.wf3 = Attention(512, 512, emb_dim=256).to(self.device)
        self.wf4 = Attention(1024, 1024, emb_dim=512).to(self.device)
        
        self.iconv_out = nn.Conv2d(1024, 1024, kernel_size=1)
        self.bn_out = nn.BatchNorm2d(1024)

        self.fc0 = nn.Linear(2560, 2560)
        self.bn0 = nn.BatchNorm1d(2560)
        self.fc1 = nn.Linear(2560, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 7)
        self.bn2 = nn.BatchNorm1d(7)
        
    
    def forward(self, x):
        x0 = self.sv0(x)
        x_wf0 = self.wf0(x0, x0)

        x1 = self.sv1(x_wf0)
        x_wf1 = self.wf1(x1, x1)

        x2 = self.sv2(x_wf1)
        x_wf2 = self.wf2(x2, x2)

        x3 = self.sv3(x_wf2)
        x_wf3 = self.wf3(x3, x3)

        x4 = self.sv4(x_wf3)
        x_wf4 = self.wf4(x4, x4)
        
        x_wf4 = self.iconv_out(x_wf4)
        x_wf4 = self.bn_out(x_wf4)

        x_wf4 = nn.AvgPool2d(kernel_size=(x_wf4.size(-2), x_wf4.size(-1)))(x_wf4)
        x_wf4 = nn.Flatten()(x_wf4)
        # additional output from sv3 and sv4
        x_sub_3 = nn.AvgPool2d(kernel_size=(x3.size(-2), x3.size(-1)))(x3)
        x_sub_3 = nn.Flatten()(x_sub_3)
        x_sub_4 = nn.AvgPool2d(kernel_size=(x4.size(-2), x4.size(-1)))(x4)
        x_sub_4 = nn.Flatten()(x_sub_4)
        
        x = torch.cat([x_wf4, x_sub_3, x_sub_4], dim=1)

        x = self.fc0(x)
        x = self.bn0(x)
        x = nn.LeakyReLU()(x)
        x = self.fc1(x)
        x = self.bn1(x)
        x = nn.LeakyReLU()(x)
        x = self.fc2(x)
        x = self.bn2(x)
        
        return x

In [8]:
device = torch.device('cuda')
print(device)

model = SVGG(device=device)

criterion = nn.CrossEntropyLoss()

learning_rate = 0.008
reduce_factor = 5
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)


cuda


### Transfer learning from VGG

In [8]:
from torchvision import models

In [7]:
VGG19 = models.vgg19(pretrained=True)

In [9]:
vgg19bn = models.vgg19_bn(pretrained=True)

In [10]:
import copy
vgg19bn_clone = copy.deepcopy(vgg19bn)

In [17]:
vgg19bn

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU(inplace=True)
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(128, 256

In [11]:
class SVGG(nn.Module):
    
    def __init__(self, **kwargs):
        super(SVGG, self).__init__()

        self.device = torch.device('cpu') if not 'device' in kwargs else kwargs['device']

        self.sv0 = vgg19bn_clone.features[:6]
        self.sv0[0].in_channels = 1
        self.sv0[0].weight = nn.Parameter(vgg19bn.features[0].weight.mean(1).unsqueeze(1))
        
        self.sv1 = vgg19bn_clone.features[7:14]
        self.sv2 = vgg19bn_clone.features[14:27]
        self.sv3 = vgg19bn_clone.features[27:53]

        self.wf0 = Attention(64, 64, emb_dim=32).to(self.device)
        self.wf1 = Attention(128, 128, emb_dim=64).to(self.device)
        self.wf2 = Attention(256, 256, emb_dim=128).to(self.device)
        self.wf3 = Attention(512, 512, emb_dim=256).to(self.device)
        
        self.iconv_out = nn.Conv2d(512, 512, kernel_size=1)
        self.bn_out = nn.BatchNorm2d(512)


        self.fc1 = nn.Linear(512, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 7)
        self.bn2 = nn.BatchNorm1d(7)
        
    
    def forward(self, x):
        x0 = self.sv0(x)
        x_wf0 = self.wf0(x0, x0)
        
        x1 = self.sv1(x0)
        x_wf1 = self.wf1(x1, x1)

        x2 = self.sv2(x_wf1)
        x_wf2 = self.wf2(x2, x2)

        x3 = self.sv3(x_wf2)
        x_wf3 = self.wf3(x3, x3)

        x_wf3 = self.iconv_out(x_wf3)
        x_wf3 = self.bn_out(x_wf3)

        x_wf3 = nn.AvgPool2d(kernel_size=(x_wf3.size(-2), x_wf3.size(-1)))(x_wf3)
    
        x = nn.Flatten()(x_wf3)

        x = self.fc1(x)
        x = self.bn1(x)
        x = nn.LeakyReLU()(x)
        x = self.fc2(x)
        x = self.bn2(x)


        return x

In [12]:
device = torch.device('cuda')
print(device)

model = SVGG(device=device)

criterion = nn.CrossEntropyLoss()

learning_rate = 0.008
reduce_factor = 5
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)


cuda


In [13]:
model = model.to(device)

model_folder = '/tf/data/Quan/fer2013/backtobasics/attention_TF/'
model_name = 'svgg_vgg19'
model_path = os.path.join(model_folder, model_name + '.pt')

best_acc = 0.0
curloss = 0.0
hist = []

for epoch in range(300):  # loop over the dataset multiple times
    
    
    
#     if (epoch % 60) == 0 and epoch != 0:
#         learning_rate /= reduce_factor
#         print('Decrese learning rate to: ', learning_rate)
#         optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    
    print('Epoch: ', epoch + 1)
    running_loss = 0.0
    running_acc = 0.0

    # TRAIN
    model.train()
    for i, data in enumerate(train_loader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        acc = float((torch.argmax(outputs, dim=1) == labels).float().sum()/labels.size(0))
        loss = criterion(outputs, labels)
        loss.backward()
        
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        running_acc += acc
#         print('\t - Step %d: loss: %.3f acc: %.3f' % (i+1, loss.item(), acc))

    print('- Avg.loss: %.3f  | Avg.acc: %.3f' % (running_loss / (i+1), running_acc / (i+1)))
    avgloss = running_loss / (i+1)
    avgacc = running_acc / (i+1)
    
    # print gradient flow figure
    plot_grad_flow(model.named_parameters(), epoch, avgloss, avgacc,
                   savepath=os.path.join(model_folder, model_name + '_gf' + '_' + str(epoch) + '.png'))

    # EVALUATE
    model.eval()
    running_valloss = 0.0
    running_valacc = 0.0
    for i,data in enumerate(val_loader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        acc = float((torch.argmax(outputs, dim=1) == labels).float().sum()/labels.size(0))
        loss = criterion(outputs, labels)

        running_valloss += loss.item()
        running_valacc += acc

    print('- Avg. val_loss: %.3f  | Avg. val_acc: %.3f' % (running_valloss / (i+1), running_valacc / (i+1)))

    avgvalloss = running_valloss / (i+1)
    avgvalcc = running_valacc / (i+1)

    hist.append([avgloss, avgvalloss, avgacc, avgvalcc])
    
    if best_acc < (running_valacc / (i+1)):
        best_acc = (running_valacc / (i+1))
        curloss = (running_valloss / (i+1))
        torch.save(model, model_path)
        print('* Update optimal model')

print('Finished Training')

Epoch:  1
- Avg.loss: 1.905  | Avg.acc: 0.236
- Avg. val_loss: 1.814  | Avg. val_acc: 0.261


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


* Update optimal model
Epoch:  2
- Avg.loss: 1.811  | Avg.acc: 0.255
- Avg. val_loss: 1.805  | Avg. val_acc: 0.258
Epoch:  3
- Avg.loss: 1.807  | Avg.acc: 0.256
- Avg. val_loss: 1.809  | Avg. val_acc: 0.262
* Update optimal model
Epoch:  4


KeyboardInterrupt: 

In [15]:
len(hist)

13

### Right way to custom forward on pre-trained model

In [None]:
class MyResnet50(models.resnet.ResNet):
    def __init__(self, pretrained=False):
        # Pass default resnet50 arguments to super init
        # https://github.com/pytorch/vision/blob/e130c6cca88160b6bf7fea9b8bc251601a1a75c5/torchvision/models/resnet.py#L260
        super(MyResnet50, self).__init__(models.resnet.Bottleneck, [3, 4, 6, 3])
        if pretrained:
            self.load_state_dict(models.resnet50(pretrained=True).state_dict())

    def _forward_impl(self, x):
        # See note [TorchScript super()]
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

    def forward(self, x):
        return self._forward_impl(x)


model = MyResnet50(pretrained=True)
x = torch.randn(2, 3, 224, 224)
output = model(x)