In [1]:
import os
# CUDA settings for this process. They are set here, before torch is imported
# in the next cell, so they are in place when CUDA is initialised.
#   CUDA_VISIBLE_DEVICES="0" -> only the first GPU is visible to the process.
#   CUDA_LAUNCH_BLOCKING="1" -> kernel launches run synchronously (easier to
#                               debug, slower to train).
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "CUDA_LAUNCH_BLOCKING": "1",
})

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import cv2
import time
import math
import os

In [3]:
import torch
from torchvision import transforms, datasets

# Training-time pipeline: random flip / rotation augmentation, conversion to a
# single grayscale channel, then scaling to [-1, 1] via Normalize(0.5, 0.5).
train_data_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=20),
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# Evaluation pipeline: same conversion and normalisation, no augmentation.
test_data_transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# One ImageFolder per split; class labels come from the sub-directory names.
train_fer = datasets.ImageFolder(root='/tf/data/Quan/fer2013/cleaned_data/fer2013-clean/Training', transform=train_data_transform)
val_fer = datasets.ImageFolder(root='/tf/data/Quan/fer2013/cleaned_data/fer2013-clean/PublicTest', transform=test_data_transform)
test_fer = datasets.ImageFolder(root='/tf/data/Quan/fer2013/cleaned_data/fer2013-clean/PrivateTest/', transform=test_data_transform)

batch_size = 128

# Only the training split is shuffled.
train_loader = torch.utils.data.DataLoader(train_fer,
                                           batch_size=batch_size, shuffle=True,
                                           num_workers=8)
# Evaluation loaders keep a fixed order: the training cells average metrics per
# batch, so shuffling would change which samples fall into the (smaller) final
# batch and make the reported validation numbers vary between passes.
val_loader = torch.utils.data.DataLoader(val_fer,
                                         batch_size=batch_size, shuffle=False,
                                         num_workers=8)
test_loader = torch.utils.data.DataLoader(test_fer,
                                          batch_size=batch_size, shuffle=False,
                                          num_workers=8)

In [4]:
class ResidualBlock(nn.Module):
    """Bottleneck residual unit using a BatchNorm -> ReLU -> Conv ordering.

    The main branch is 1x1 conv (to ``output_channels // 4``), 3x3 conv
    (carrying ``stride``), then 1x1 conv (to ``output_channels``). The shortcut
    is the raw input when the shape is preserved, otherwise a strided 1x1
    projection of the first activated tensor.

    Args:
        input_channels: channels of the incoming feature map.
        output_channels: channels of the produced feature map.
        stride: stride of the 3x3 conv and of the projection shortcut.
    """

    def __init__(self, input_channels, output_channels, stride=1):
        super().__init__()
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.stride = stride

        bottleneck = output_channels // 4

        # A single stateless ReLU is shared by all three stages.
        self.bn1 = nn.BatchNorm2d(input_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv2d(input_channels, bottleneck, kernel_size=1, stride=1, bias=False)

        self.bn2 = nn.BatchNorm2d(bottleneck)
        self.conv2 = nn.Conv2d(bottleneck, bottleneck, kernel_size=3, stride=stride, padding=1, bias=False)

        self.bn3 = nn.BatchNorm2d(bottleneck)
        self.conv3 = nn.Conv2d(bottleneck, output_channels, kernel_size=1, stride=1, bias=False)

        # Projection shortcut; always registered, only used when the shape changes.
        self.conv4 = nn.Conv2d(input_channels, output_channels, kernel_size=1, stride=stride, bias=False)

    def forward(self, x):
        """Return ``branch(x) + shortcut(x)``."""
        activated = self.relu(self.bn1(x))

        branch = self.conv1(activated)
        branch = self.conv2(self.relu(self.bn2(branch)))
        branch = self.conv3(self.relu(self.bn3(branch)))

        shape_changes = self.input_channels != self.output_channels or self.stride != 1
        shortcut = self.conv4(activated) if shape_changes else x

        branch += shortcut
        return branch

In [5]:
class ResidualMasking(nn.Module):
    """Small encoder-decoder that turns a feature map into a sigmoid mask.

    The input is encoded by two residual blocks with 2x max-pooling after each,
    then decoded by two (bilinear 2x upsample + residual block) stages with
    additive skip connections, and finally passed through a 1x1 conv,
    BatchNorm and a sigmoid.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels of the returned mask.
        emb_dim: channel width used inside the encoder/decoder; defaults to
            ``out_channels`` when ``None``.
        **kwargs: accepted and ignored.

    Notes:
        * ``forward`` adds the input to the decoder output, so ``in_channels``
          is expected to equal ``out_channels``.
        * The skip additions need each pooled size to be restored exactly by
          the 2x upsampling, i.e. spatial sizes divisible by 4.
    """

    def __init__(self, in_channels, out_channels, emb_dim=None, **kwargs):
        super(ResidualMasking, self).__init__()

        # Fall back to the output width when no embedding width is given.
        emb_dim = out_channels if emb_dim is None else emb_dim

        self.mpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) # s -> s/2

        # Encoder
        self.res1_down = ResidualBlock(in_channels, emb_dim)
        self.res2_down = ResidualBlock(emb_dim, emb_dim)

        # Decoder: each stage doubles the spatial size, then refines.
        self.res2_up = nn.Sequential(nn.UpsamplingBilinear2d(scale_factor=2),
                                     ResidualBlock(emb_dim, emb_dim))
        self.res1_up = nn.Sequential(nn.UpsamplingBilinear2d(scale_factor=2),
                                     ResidualBlock(emb_dim, out_channels))

        self.preconv_out = nn.Sequential(nn.Conv2d(out_channels, out_channels, 1, stride=1, padding=0),
                                         nn.BatchNorm2d(out_channels))
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid mask computed from ``x`` (spatial size ``s``)."""
        # s, in_channels -> s, emb_dim -> s / 2, emb_dim
        x1_down = self.res1_down(x)
        x1_down_pool = self.mpool(x1_down)

        # s / 2, emb_dim -> s / 4, emb_dim
        x2_down = self.res2_down(x1_down_pool)
        x2_down_pool = self.mpool(x2_down)

        # -- s / 4 (bottleneck) --

        # s / 4 -> s / 2, emb_dim, plus skip from the first pooled encoder stage
        x2_up = self.res2_up(x2_down_pool)
        x2_up = x2_up + x1_down_pool

        # s / 2 -> s, out_channels, plus skip from the block input
        x1_up = self.res1_up(x2_up)
        x1_up = x1_up + x

        # 1x1 conv + BatchNorm, then squash to a mask
        x_pre_out = self.preconv_out(x1_up)
        x_sigmoid = self.sigmoid(x_pre_out)

        return x_sigmoid

In [6]:
class AttentionBlock(nn.Module):
    """Trunk branch whose output is re-weighted by a learned sigmoid mask.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels of the output feature map.
        emb_dim: intermediate width of the trunk and mask branches; defaults
            to ``out_channels`` when ``None``.
        **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels, out_channels, emb_dim=None, **kwargs):
        super(AttentionBlock, self).__init__()

        emb_dim = out_channels if emb_dim is None else emb_dim

        # 2 branches: trunk and residual masking

        ## trunk branch: in_channels -> emb_dim -> out_channels
        self.trunk_branch = nn.Sequential(ResidualBlock(in_channels, emb_dim),
                                          ResidualBlock(emb_dim, out_channels))

        ## residual masking
        # The mask branch is applied to the trunk output (see forward), so its
        # input width is out_channels, not in_channels. The two coincide for
        # every block built with in_channels == out_channels.
        self.residual_masking = ResidualMasking(out_channels, out_channels, emb_dim)

    def forward(self, x):
        """Return the mask-modulated trunk features for ``x``."""
        x_trunk = self.trunk_branch(x)
        x_sigmoid = self.residual_masking(x_trunk)

        x_out = (1 + x_sigmoid) * x_trunk

        # NOTE(review): together with the line above this yields
        # (2 + mask) * trunk; confirm the additional skip is intentional.
        x_out = x_out + x_trunk

        return x_out

In [7]:
class ResidualMaskingModel(nn.Module):
    """Classifier built from residual blocks and mask-based attention blocks.

    Pipeline: 7x7 stride-2 stem -> max-pool -> three (ResidualBlock,
    AttentionBlock) stages at 128 / 256 / 512 channels -> two residual blocks
    at 1024 channels -> BatchNorm + ReLU -> max-pool -> global average pool ->
    two-layer classifier producing 7 outputs.
    """

    def __init__(self):
        super(ResidualMaskingModel, self).__init__()

        # Stem: single-channel input, spatial size halved by the stride-2 conv.
        self.preconv_in = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
                                        nn.BatchNorm2d(64),
                                        nn.ReLU(inplace=True))

        # Shared (parameter-free) pooling, used after the stem and before the head.
        self.mpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.resblock1 = ResidualBlock(64, 128)
        self.att1 = AttentionBlock(128, 128, 64)

        self.resblock2 = ResidualBlock(128, 256)
        self.att2 = AttentionBlock(256, 256, 128)

        self.resblock3 = ResidualBlock(256, 512)
        self.att3 = AttentionBlock(512, 512, 256)

        self.resblock4 = ResidualBlock(512, 1024)
        self.resblock5 = ResidualBlock(1024, 1024)

        self.preconv_out = nn.Sequential(nn.BatchNorm2d(1024),
                                         nn.ReLU(inplace=True))

        # Head: flatten the pooled 1024-d vector and map it to 7 outputs.
        # The final BatchNorm1d is applied to the 7 outputs themselves.
        self.classifier = nn.Sequential(nn.Flatten(),
                                        nn.Linear(1024, 512),
                                        nn.BatchNorm1d(512),
                                        nn.ReLU(inplace=True),
                                        nn.Linear(512,7),
                                        nn.BatchNorm1d(7))

    def forward(self, x):
        """Map a batch of single-channel images to a ``(batch, 7)`` score tensor."""
        x = self.preconv_in(x)
        x = self.mpool(x)

        x = self.resblock1(x)
        x = self.att1(x)

        x = self.resblock2(x)
        x = self.att2(x)

        x = self.resblock3(x)
        x = self.att3(x)

        x = self.resblock4(x)
        x = self.resblock5(x)

        x = self.preconv_out(x)
        x = self.mpool(x)

        # Global average pooling down to 1x1. The functional adaptive form
        # avoids building a new pooling module on every call and also works
        # when the feature map is not square.
        x_out = F.adaptive_avg_pool2d(x, 1)
        x_out = self.classifier(x_out)

        return x_out

In [8]:
from torch.optim import lr_scheduler

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook can still be executed (slowly) on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Move the model to its device before the optimizer is constructed, so the
# optimizer is built over the parameters in their final location.
model = ResidualMaskingModel().to(device)

criterion = nn.CrossEntropyLoss()

learning_rate = 0.004
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
# Halve the learning rate when the value passed to scheduler.step() (the
# average validation loss in the training cell) has not improved for more
# than `patience` epochs.
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=8, verbose=True)

cuda


In [9]:
# Train for 100 epochs, validating after each one. The checkpoint with the best
# validation accuracy is saved to `model_path`, and the scheduler is stepped on
# the average validation loss.
model = model.to(device)

model_folder = '/tf/data/Quan/fer2013/backtobasics_cleaned_data/residual_masking/'
model_name = 'rs50_1'
model_path = os.path.join(model_folder, model_name + '.pt')
# torch.save does not create missing directories; make sure the target exists
# up front instead of failing after the first full epoch.
os.makedirs(model_folder, exist_ok=True)

best_acc = 0.0   # best average validation accuracy seen so far
curloss = 0.0    # average validation loss of that best epoch
hist = []        # per-epoch [train loss, val loss, train acc, val acc]

for epoch in range(100):

    print('Epoch: ', epoch + 1)
    running_loss = 0.0
    running_acc = 0.0

    # TRAIN
    model.train()
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # accumulate per-batch statistics
        running_loss += loss.item()
        running_acc += (torch.argmax(outputs, dim=1) == labels).float().mean().item()

    # Averages are means over batches; guard against an empty loader.
    num_train_batches = max(len(train_loader), 1)
    avgloss = running_loss / num_train_batches
    avgacc = running_acc / num_train_batches
    print('- Avg.loss: %.3f  | Avg.acc: %.3f' % (avgloss, avgacc))

    # EVALUATE
    model.eval()
    running_valloss = 0.0
    running_valacc = 0.0
    # No gradients are needed for validation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            running_valloss += loss.item()
            running_valacc += (torch.argmax(outputs, dim=1) == labels).float().mean().item()

    num_val_batches = max(len(val_loader), 1)
    avgvalloss = running_valloss / num_val_batches
    avgvalcc = running_valacc / num_val_batches
    print('- Avg. val_loss: %.3f  | Avg. val_acc: %.3f' % (avgvalloss, avgvalcc))

    hist.append([avgloss, avgvalloss, avgacc, avgvalcc])

    # Keep the checkpoint with the highest validation accuracy.
    # NOTE: this pickles the whole module, so loading it later requires the
    # model classes to be importable under the same names.
    if best_acc < avgvalcc:
        best_acc = avgvalcc
        curloss = avgvalloss
        torch.save(model, model_path)
        print('* Update optimal model')

    scheduler.step(avgvalloss)

print('Finished Training')

Epoch:  1
- Avg.loss: 1.889  | Avg.acc: 0.239
- Avg. val_loss: 1.803  | Avg. val_acc: 0.261
* Update optimal model
Epoch:  2


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


- Avg.loss: 1.733  | Avg.acc: 0.298
- Avg. val_loss: 1.693  | Avg. val_acc: 0.352
* Update optimal model
Epoch:  3
- Avg.loss: 1.561  | Avg.acc: 0.385
- Avg. val_loss: 1.581  | Avg. val_acc: 0.367
* Update optimal model
Epoch:  4
- Avg.loss: 1.464  | Avg.acc: 0.432
- Avg. val_loss: 1.435  | Avg. val_acc: 0.444
* Update optimal model
Epoch:  5
- Avg.loss: 1.401  | Avg.acc: 0.457
- Avg. val_loss: 1.469  | Avg. val_acc: 0.439
Epoch:  6
- Avg.loss: 1.365  | Avg.acc: 0.474
- Avg. val_loss: 1.507  | Avg. val_acc: 0.405
Epoch:  7
- Avg.loss: 1.329  | Avg.acc: 0.488
- Avg. val_loss: 1.428  | Avg. val_acc: 0.457
* Update optimal model
Epoch:  8
- Avg.loss: 1.310  | Avg.acc: 0.495
- Avg. val_loss: 1.402  | Avg. val_acc: 0.467
* Update optimal model
Epoch:  9
- Avg.loss: 1.290  | Avg.acc: 0.502
- Avg. val_loss: 1.504  | Avg. val_acc: 0.430
Epoch:  10
- Avg.loss: 1.276  | Avg.acc: 0.510
- Avg. val_loss: 1.449  | Avg. val_acc: 0.423
Epoch:  11
- Avg.loss: 1.265  | Avg.acc: 0.511
- Avg. val_loss: 1.

- Avg.loss: 0.867  | Avg.acc: 0.675
- Avg. val_loss: 1.048  | Avg. val_acc: 0.599
Epoch:  86
- Avg.loss: 0.870  | Avg.acc: 0.675
- Avg. val_loss: 0.986  | Avg. val_acc: 0.644
* Update optimal model
Epoch:  87
- Avg.loss: 0.865  | Avg.acc: 0.673
- Avg. val_loss: 1.006  | Avg. val_acc: 0.609
Epoch:  88
- Avg.loss: 0.867  | Avg.acc: 0.678
- Avg. val_loss: 0.982  | Avg. val_acc: 0.640
Epoch:  89
- Avg.loss: 0.865  | Avg.acc: 0.677
- Avg. val_loss: 1.015  | Avg. val_acc: 0.616
Epoch    89: reducing learning rate of group 0 to 5.0000e-04.
Epoch:  90
- Avg.loss: 0.832  | Avg.acc: 0.690
- Avg. val_loss: 1.000  | Avg. val_acc: 0.631
Epoch:  91
- Avg.loss: 0.826  | Avg.acc: 0.692
- Avg. val_loss: 1.002  | Avg. val_acc: 0.630
Epoch:  92
- Avg.loss: 0.824  | Avg.acc: 0.694
- Avg. val_loss: 0.992  | Avg. val_acc: 0.646
* Update optimal model
Epoch:  93
- Avg.loss: 0.820  | Avg.acc: 0.698
- Avg. val_loss: 0.997  | Avg. val_acc: 0.629
Epoch:  94
- Avg.loss: 0.817  | Avg.acc: 0.696
- Avg. val_loss: 0.

In [10]:
# Continue training for epochs 101-150.
# This cell deliberately does NOT reset `best_acc`, `curloss` or `hist`; it
# relies on those, and on `optimizer` / `scheduler`, as left by the previous
# training cell, so it must be run after that cell in the same kernel.
model = model.to(device)

model_folder = '/tf/data/Quan/fer2013/backtobasics_cleaned_data/residual_masking/'
model_name = 'rs50_1'
model_path = os.path.join(model_folder, model_name + '.pt')
# torch.save does not create missing directories.
os.makedirs(model_folder, exist_ok=True)

for epoch in range(100, 150):

    print('Epoch: ', epoch + 1)
    running_loss = 0.0
    running_acc = 0.0

    # TRAIN
    model.train()
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # accumulate per-batch statistics
        running_loss += loss.item()
        running_acc += (torch.argmax(outputs, dim=1) == labels).float().mean().item()

    # Averages are means over batches; guard against an empty loader.
    num_train_batches = max(len(train_loader), 1)
    avgloss = running_loss / num_train_batches
    avgacc = running_acc / num_train_batches
    print('- Avg.loss: %.3f  | Avg.acc: %.3f' % (avgloss, avgacc))

    # EVALUATE
    model.eval()
    running_valloss = 0.0
    running_valacc = 0.0
    # No gradients are needed for validation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            running_valloss += loss.item()
            running_valacc += (torch.argmax(outputs, dim=1) == labels).float().mean().item()

    num_val_batches = max(len(val_loader), 1)
    avgvalloss = running_valloss / num_val_batches
    avgvalcc = running_valacc / num_val_batches
    print('- Avg. val_loss: %.3f  | Avg. val_acc: %.3f' % (avgvalloss, avgvalcc))

    hist.append([avgloss, avgvalloss, avgacc, avgvalcc])

    # Keep the checkpoint with the highest validation accuracy seen across
    # both training cells.
    if best_acc < avgvalcc:
        best_acc = avgvalcc
        curloss = avgvalloss
        torch.save(model, model_path)
        print('* Update optimal model')

    scheduler.step(avgvalloss)

print('Finished Training')

Epoch:  101
- Avg.loss: 0.800  | Avg.acc: 0.705
- Avg. val_loss: 0.986  | Avg. val_acc: 0.643
Epoch:  102
- Avg.loss: 0.803  | Avg.acc: 0.701
- Avg. val_loss: 0.996  | Avg. val_acc: 0.626
Epoch:  103
- Avg.loss: 0.803  | Avg.acc: 0.703
- Avg. val_loss: 0.987  | Avg. val_acc: 0.647
Epoch:  104
- Avg.loss: 0.799  | Avg.acc: 0.704
- Avg. val_loss: 0.991  | Avg. val_acc: 0.627
Epoch:  105
- Avg.loss: 0.792  | Avg.acc: 0.707
- Avg. val_loss: 1.069  | Avg. val_acc: 0.629
Epoch:  106
- Avg.loss: 0.793  | Avg.acc: 0.704
- Avg. val_loss: 0.992  | Avg. val_acc: 0.649
Epoch:  107
- Avg.loss: 0.788  | Avg.acc: 0.707
- Avg. val_loss: 0.999  | Avg. val_acc: 0.637
Epoch:  108
- Avg.loss: 0.796  | Avg.acc: 0.703
- Avg. val_loss: 0.986  | Avg. val_acc: 0.650
Epoch   108: reducing learning rate of group 0 to 2.5000e-04.
Epoch:  109
- Avg.loss: 0.780  | Avg.acc: 0.710
- Avg. val_loss: 1.041  | Avg. val_acc: 0.633
Epoch:  110
- Avg.loss: 0.770  | Avg.acc: 0.715
- Avg. val_loss: 1.047  | Avg. val_acc: 0.61