In [16]:
import torch

from torch.utils.data import DataLoader
from torchvision import transforms

import argparse
import glob
import numpy as np
import yaml
import tqdm

from addict import Dict

from dataset import PartAffordanceDataset, ToTensor, CenterCrop, Normalize
from dataset import Resize, RandomFlip, RandomRotate, RandomCrop, reverse_normalize
from model.drn import drn_c_58
from model.drn_max import drn_c_58_max
from utils.cam import CAM, GradCAM
from torchvision.utils import save_image

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

""" 
For details of SegNet, please refer to this paper:

Badrinarayanan, V., Kendall, A., & Cipolla, R. (2017). 
SegNet: A Deep Convolutional Encoder-Decoder Architecture for Image Segmentation.
IEEE Transactions on Pattern Analysis and Machine Intelligence, 39(12), 2481–2495. 
https://doi.org/10.1109/TPAMI.2016.2644615


SegNet Basic is a smaller version of SegNet.
For SegNet Basic, please refer to this repository:
https://github.com/0bserver07/Keras-SegNet-Basic

"""


class Encoder(nn.Module):
    """One encoder stage: 3x3 convolution, batch norm, ReLU, then 2x2 max pooling.

    ``forward`` returns the pooled feature map together with the pooling
    indices, so a later ``F.max_unpool2d`` call can place the activations
    back at the positions they were taken from.
    """

    def __init__(self, in_channel, out_channel):
        super().__init__()

        # The attribute names `conv` and `bn` become part of the state_dict
        # keys, so they must stay as they are for saved checkpoints to load.
        self.conv = nn.Conv2d(in_channel, out_channel, kernel_size=3, padding=1)
        self.bn = nn.BatchNorm2d(out_channel)

    def forward(self, x):
        """Return ``(pooled, indices)`` for the input batch ``x``."""
        activated = F.relu(self.bn(self.conv(x)))
        pooled, indices = F.max_pool2d(
            activated, kernel_size=2, stride=2, return_indices=True)
        return pooled, indices


class Decoder(nn.Module):
    """One decoder stage: 3x3 convolution, batch norm and ReLU.

    The spatial size is left unchanged (kernel 3, padding 1); only the number
    of channels changes from ``in_channel`` to ``out_channel``.
    """

    def __init__(self, in_channel, out_channel):
        super().__init__()

        # The attribute names `conv` and `bn` become part of the state_dict
        # keys, so they must stay as they are for saved checkpoints to load.
        self.conv = nn.Conv2d(in_channel, out_channel, kernel_size=3, padding=1)
        self.bn = nn.BatchNorm2d(out_channel)

    def forward(self, x):
        """Apply convolution, batch norm and ReLU to ``x`` and return the result."""
        return F.relu(self.bn(self.conv(x)))


class SegNetBasic(nn.Module):
    """Four-stage encoder/decoder network built from ``Encoder`` and ``Decoder``.

    Each encoder stage halves the spatial size with max pooling and remembers
    the pooling indices; the decoder path undoes the pooling stage by stage
    with ``F.max_unpool2d`` using those indices, in reverse order.

    Note that the last decoder stage is an ordinary ``Decoder``, so the
    returned map has also gone through batch norm and ReLU.
    """

    def __init__(self, in_channel, out_channel):
        super().__init__()

        # Encoder path: in_channel -> 64 -> 80 -> 96 -> 128 channels.
        self.encoder1 = Encoder(in_channel, 64)
        self.encoder2 = Encoder(64, 80)
        self.encoder3 = Encoder(80, 96)
        self.encoder4 = Encoder(96, 128)

        # Decoder path mirrors the encoder: 128 -> 96 -> 80 -> 64 -> out_channel.
        self.decoder1 = Decoder(128, 96)
        self.decoder2 = Decoder(96, 80)
        self.decoder3 = Decoder(80, 64)
        self.decoder4 = Decoder(64, out_channel)

    def forward(self, x):
        """Run ``x`` through the four encoder stages and back up through the decoders."""
        encoder_stages = (self.encoder1, self.encoder2, self.encoder3, self.encoder4)
        decoder_stages = (self.decoder1, self.decoder2, self.decoder3, self.decoder4)

        # For every encoder stage keep the size of its input and the pooling
        # indices it produced; both are needed to undo the pooling later.
        pooling_trace = []
        for encode in encoder_stages:
            size_before_pool = x.size()
            x, pool_indices = encode(x)
            pooling_trace.append((size_before_pool, pool_indices))

        # Walk the trace backwards: the deepest pooling is undone first.
        for decode, (target_size, pool_indices) in zip(decoder_stages, reversed(pooling_trace)):
            x = F.max_unpool2d(
                x, pool_indices, kernel_size=2, stride=2, output_size=target_size)
            x = decode(x)

        return x

In [8]:
# 3 input channels; 8 output channels, one per class.
model = SegNetBasic(in_channel=3, out_channel=8)

In [9]:
# Load the saved parameters onto the CPU, whatever device they were saved from.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
state_dict = torch.load('final_model.prm', map_location='cpu')

In [10]:
# Copy the loaded parameters into `model`.
model.load_state_dict(state_dict)

In [28]:
""" normalize images """
class Normalize(object):
    """Normalize the ``'image'`` entry of a sample dict channel-wise.

    Args:
        mean: per-channel means. ``None`` (the default) selects
            ``[0.2191, 0.2349, 0.3598]``.
        std: per-channel standard deviations. ``None`` (the default) selects
            ``[0.1243, 0.1171, 0.0748]``.
    """

    def __init__(self, mean=None, std=None):
        # Build the default lists per instance. With list literals in the
        # signature every instance shared the same two list objects, so
        # mutating one instance's mean/std silently changed all the others.
        self.mean = [0.2191, 0.2349, 0.3598] if mean is None else mean
        self.std = [0.1243, 0.1171, 0.0748] if std is None else std

    def __call__(self, sample):
        """Replace ``sample['image']`` with its normalized version and return ``sample``.

        The dict is modified in place and the same object is returned.
        """
        sample['image'] = transforms.functional.normalize(
            sample['image'], self.mean, self.std)
        return sample

In [29]:
# Read the experiment configuration. The context manager closes the file
# handle, which the bare open() call used to leak.
with open('./result/drn_c_58_max/config.yaml') as config_file:
    CONFIG = Dict(yaml.safe_load(config_file))

# DataLoader
# Each sample is converted to tensors and then normalized.
train_transform = transforms.Compose([
    ToTensor(CONFIG),
    Normalize()
])

# NOTE: despite the train_* names, the dataset is constructed with mode='test'.
train_data = PartAffordanceDataset(
    CONFIG.train_data, config=CONFIG, transform=train_transform, mode='test', make_cam_label=True)

train_loader = DataLoader(
    train_data, batch_size=5, shuffle=True, num_workers=2)

In [30]:
# RGB colour (0-255) per class id; the row index is the class id.
colors = torch.tensor([
    [0, 0, 0],          # class 0 'background'  black
    [255, 0, 0],        # class 1 'grasp'       red
    [255, 255, 0],      # class 2 'cut'         yellow
    [0, 255, 0],        # class 3 'scoop'       green
    [0, 255, 255],      # class 4 'contain'     sky blue
    [0, 0, 255],        # class 5 'pound'       blue
    [255, 0, 255],      # class 6 'support'     purple
    [255, 255, 255],    # class 7 'wrap grasp'  white
])


def class_to_mask(cls):
    """Convert a batch of class-id maps into RGB colour masks.

    Indexing ``colors`` with a 3-D integer tensor of shape ``(N, H, W)`` gives
    ``(N, H, W, 3)``; the colour axis is then moved to position 1, so the
    result has shape ``(N, 3, H, W)`` with integer values in 0-255.
    """
    rgb_last = colors[cls]
    return rgb_last.permute(0, 3, 1, 2)

In [38]:
import numpy as np
import pydensecrf.densecrf as dcrf
import pydensecrf.utils as utils
from joblib import Parallel, delayed

class DenseCRF(object):
    """Callable wrapper around ``pydensecrf``'s ``DenseCRF2D``.

    Args:
        iter_max: number of inference iterations.
        pos_w: ``compat`` value of the Gaussian pairwise term.
        pos_xy_std: ``sxy`` value of the Gaussian pairwise term.
        bi_w: ``compat`` value of the bilateral pairwise term.
        bi_xy_std: ``sxy`` value of the bilateral pairwise term.
        bi_rgb_std: ``srgb`` value of the bilateral pairwise term.
    """

    def __init__(self, iter_max=10, pos_w=3, pos_xy_std=1, bi_w=4, bi_xy_std=67, bi_rgb_std=3):
        self.iter_max = iter_max
        # Gaussian pairwise term.
        self.pos_w, self.pos_xy_std = pos_w, pos_xy_std
        # Bilateral pairwise term.
        self.bi_w, self.bi_xy_std, self.bi_rgb_std = bi_w, bi_xy_std, bi_rgb_std

    def __call__(self, image, probmap):
        """Refine ``probmap`` (shape ``(C, H, W)``) using ``image`` as the bilateral reference.

        Returns the inference result as an array reshaped to ``(C, H, W)``.
        """
        n_classes, height, width = probmap.shape

        # Both arrays are made contiguous before being handed to pydensecrf.
        unary = np.ascontiguousarray(utils.unary_from_softmax(probmap))
        reference = np.ascontiguousarray(image)

        # DenseCRF2D takes the width before the height.
        dense_crf = dcrf.DenseCRF2D(width, height, n_classes)
        dense_crf.setUnaryEnergy(unary)
        dense_crf.addPairwiseGaussian(sxy=self.pos_xy_std, compat=self.pos_w)
        dense_crf.addPairwiseBilateral(
            sxy=self.bi_xy_std, srgb=self.bi_rgb_std, rgbim=reference, compat=self.bi_w
        )

        marginals = dense_crf.inference(self.iter_max)
        return np.array(marginals).reshape((n_classes, height, width))


crf = DenseCRF()

In [42]:
# Inference only: switch BatchNorm to its running statistics. Without this the
# network normalizes with the statistics of the current batch and keeps
# updating its running averages while we are only visualising.
model.eval()

# Parameters of the Normalize() step (last entry of train_transform), shaped
# to broadcast over (N, C, h, w) so the normalization can be undone below.
normalizer = train_transform.transforms[-1]
norm_mean = torch.tensor(normalizer.mean).view(1, -1, 1, 1)
norm_std = torch.tensor(normalizer.std).view(1, -1, 1, 1)

for sample in train_loader:
    img, y = sample['image'], sample['label']

    # No gradients are needed for visualisation.
    with torch.no_grad():
        logits = model(img)
        prob = torch.softmax(logits, 1)    # shape => (N, C, h, w)
    _, h = prob.max(1)

    true_mask = class_to_mask(y).to('cpu')
    pred_mask = class_to_mask(h).to('cpu')

    # class_to_mask yields integer colours in 0-255, while save_image expects
    # floating point values in [0, 1].
    save_image(true_mask.float() / 255, 'true_mask.jpg')
    save_image(pred_mask.float() / 255, 'pred_mask.jpg')

    # The CRF's bilateral term needs real 8-bit RGB values, so undo the
    # normalization first; scaling the normalized tensor directly produced
    # negative values that wrapped around in the uint8 cast.
    # NOTE(review): assumes ToTensor scales pixels to [0, 1] - confirm.
    img = (img * norm_std + norm_mean).clamp(0, 1) * 255
    img = img.to('cpu').numpy().astype(np.uint8).transpose(0, 2, 3, 1)    # (N, h, w, 3)
    probmap = prob.numpy()

    # CRF, one image per joblib task
    Q = Parallel(n_jobs=-2)([
        delayed(crf)(*pair) for pair in zip(img, probmap)
    ])
    # Stack into a single array first; building a tensor from a list of
    # arrays is very slow.
    Q = torch.from_numpy(np.stack(Q))    # shape => (N, C, h, w)
    _, Q = Q.max(1)

    Q = class_to_mask(Q)
    save_image(Q.float() / 255, 'crf.jpg')

    # Only the first batch is visualised.
    break

torch.Size([5, 8, 480, 640])


### CAM

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms

import argparse
import glob
import numpy as np
import yaml
import tqdm

from addict import Dict
from PIL import Image

from dataset import PartAffordanceDataset, ToTensor, CenterCrop, Normalize
from dataset import Resize, RandomFlip, RandomRotate, RandomCrop, reverse_normalize
from model.drn import drn_c_58
from model.drn_max import drn_c_58_max, drn_d_105_max
from utils.cam import CAM, GradCAM

In [12]:
# Read the experiment configuration. The context manager closes the file
# handle, which the bare open() call used to leak.
with open('./result/drn_c_58_max/config.yaml') as config_file:
    CONFIG = Dict(yaml.safe_load(config_file))

# DataLoader
train_transform = transforms.Compose([
    ToTensor(CONFIG),
    Normalize()
])

# NOTE: despite the train_* names, the dataset is constructed with mode='test'.
train_data = PartAffordanceDataset(
    CONFIG.train_data, config=CONFIG, transform=train_transform, mode='test', make_cam_label=True)

# batch_size=1: one image at a time is inspected in the cells below.
train_loader = DataLoader(
    train_data, batch_size=1, shuffle=True, num_workers=2)

model = drn_c_58_max(
        pretrained=False, num_obj=CONFIG.obj_classes, num_aff=CONFIG.aff_classes)

# The identity map_location keeps the loaded storages on the CPU.
state_dict = torch.load(CONFIG.result_path + '/best_accuracy_model.prm',
                        map_location=lambda storage, loc: storage)
model.load_state_dict(state_dict)
model.eval()

# Layers handed to the CAM wrapper for the object and affordance branches.
target_layer_obj = model.obj_conv
target_layer_aff = model.aff_conv

# choose CAM or GradCAM
wrapped_model = CAM(model, target_layer_obj, target_layer_aff)
# wrapped_model = GradCAM(model, target_layer_obj, target_layer_aff)

In [13]:
# Create an iterator over train_loader so batches can be pulled one at a time with next().
train_iter = iter(train_loader)

In [14]:
# Fetch the next batch; re-run this cell to move on to another sample.
sample = next(train_iter)

In [42]:
# Unpack the image and the object / affordance labels from the batch dict.
x, obj_label, aff_label = sample['image'], sample['obj_label'], sample['aff_label']

In [43]:
_, _, H, W = x.shape
# The forward pass fills wrapped_model.values_obj / values_aff, whose
# activations are read below.
pred_obj, pred_aff = wrapped_model(x)

# First parameter of each classification head, detached from autograd.
# NOTE(review): presumably the weight matrix of a linear layer - confirm.
weight_fc_obj = list(
    wrapped_model.model._modules.get('obj_fc').parameters())[0].detach().to('cpu')
weight_fc_aff = list(
    wrapped_model.model._modules.get('aff_fc').parameters())[0].detach().to('cpu')

# Apply the head weights as a 1x1 convolution over the recorded activations:
# the trailing [None, None] axes turn the 2-D weight into a 1x1 kernel.
cam_obj = F.conv2d(
    wrapped_model.values_obj.activations, weight=weight_fc_obj[:, :, None, None])
cam_aff = F.conv2d(
    wrapped_model.values_aff.activations, weight=weight_fc_aff[:, :, None, None])

# Resize to the input resolution. align_corners=False is what bilinear
# interpolation already did by default; stating it explicitly removes the
# UserWarning this cell used to emit.
cam_obj = F.interpolate(
    cam_obj, (H, W), mode='bilinear', align_corners=False).view(-1, H, W)
cam_aff = F.interpolate(
    cam_aff, (H, W), mode='bilinear', align_corners=False).view(-1, H, W)

predicted object ids tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]],
       grad_fn=<IndexPutBackward>)
predicted affordance ids tensor([[1., 1., 1., 0., 0., 0., 0., 0.]], grad_fn=<IndexPutBackward>)


  "See the documentation of nn.Upsample for details.".format(mode))


In [44]:
# Drop channel 0 and keep the rest (presumably channel 0 is the background class - confirm).
# NOTE: this reassigns cam_aff, so re-running the cell drops one more channel each time.
cam_aff = cam_aff[1:]

In [45]:
cam_aff.shape

torch.Size([7, 480, 640])

In [46]:
# Drop channel 0 and keep the rest (presumably channel 0 is the background class - confirm).
# NOTE: this reassigns cam_obj, so re-running the cell drops one more channel each time.
cam_obj = cam_obj[1:]

In [47]:
cam_obj.shape

torch.Size([17, 480, 640])

In [48]:
# Convert both CAM stacks to NumPy arrays, detached from autograd.
cam_aff, cam_obj = cam_aff.detach().numpy(), cam_obj.detach().numpy()

In [49]:
# Take the first row of each label tensor, drop its first entry and convert to NumPy.
# NOTE: the names are reassigned, so this cell cannot be re-run without re-creating the labels.
obj_label, aff_label = obj_label[0, 1:].numpy(), aff_label[0, 1:].numpy()

IndexError: arrays used as indices must be of integer (or boolean) type

In [1]:
import torch

In [2]:
# Scratch check: a single all-zero 256x256 plane with a leading axis of size 1.
a = torch.zeros((1, 256, 256))

In [3]:
# Scratch check: eight random 256x256 planes.
b = torch.randn((8, 256, 256))

In [4]:
# Concatenate along the existing first axis: (1, 256, 256) + (8, 256, 256) -> (9, 256, 256).
c = torch.cat([a, b], dim=0)

In [5]:
c

tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.2858, -0.7670, -0.5304,  ...,  0.1446, -0.3225,  1.4257],
         [ 0.5527,  1.3963, -0.5061,  ...,  1.5962, -0.0037, -2.5691],
         [ 0.4276, -1.6818, -0.3491,  ..., -0.9040,  0.8153,  0.4837],
         ...,
         [ 2.2317,  0.0387, -2.0376,  ...,  2.0216, -1.4848, -0.2504],
         [-0.4412,  0.4135, -0.1002,  ..., -0.8318, -1.2113,  0.1461],
         [ 1.5334, -1.1749, -0.3043,  ...,  0.2135, -1.7319, -0.3815]],

        [[ 0.2128,  0.4446,  0.1813,  ..., -0.4390, -0.9295, -0.9911],
         [ 1.1174, -1.1594,  0.0392,  ..., -1

In [6]:
# Scratch check: two 2-D all-zero tensors of the same shape.
a = torch.zeros((20, 20))
b = torch.zeros((20, 20))

In [7]:
# Unlike torch.cat, torch.stack adds a new leading axis: two (20, 20) tensors -> (2, 20, 20).
c = torch.stack([a, b])

In [9]:
c.shape

torch.Size([2, 20, 20])