In [1]:
import numpy as np
import os
import copy
import itertools
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from main.head import ASPPHeadNode
from data.nyuv2_dataloader_adashare import NYU_v2
from data.pixel2pixel_loss import NYUCriterions
from data.pixel2pixel_metrics import NYUMetrics

In [12]:
!python two_task_exp.py --exp_dir='exp' --data='Taskonomy' --batch_size=64 --backbone='resnet34' --branch=0 --two_task 'segment_semantic' 'normal' --total_iters=50000 --lr=0.0001 --decay_lr_freq=10000 --decay_lr_rate=0.3 --print_iters=10 --save_iters=500 --val_iters=20

Namespace(backbone='resnet34', branch=0, bz=64, ckpt_dir='checkpoint/', data='Taskonomy', dataroot='/mnt/nfs/work1/huiguan/lijunzhang/policymtl/data/', decay_lr_freq=10000, decay_lr_rate=0.3, exp_dir='exp', loss_lambda=[1, 1], lr=0.0001, print_iters=10, projectroot='/mnt/nfs/work1/huiguan/lijunzhang/multibranch/', reload_ckpt=None, save_iters=500, seed=10, total_iters=50000, two_task=['segment_semantic', 'normal'], val_iters=20)
[Iter 10 Task segm] Train Loss: 1.7091
[Iter 10 Task norm] Train Loss: 0.4213
[Iter 10 Total] Train Loss: 2.1304
[Iter 20 Task segm] Train Loss: 1.6368
[Iter 20 Task norm] Train Loss: 0.3136
[Iter 20 Total] Train Loss: 1.9503
^C
Error in loading mcdade/rgb/point_19_view_11_domain_rgb.png


# Deeplab Resnet with Branch

In [2]:
affine_par = True

def conv3x3(in_channels, out_channels, stride=1, dilation=1):
    "3x3 convolution with padding"

    kernel_size = np.asarray((3, 3))

    # Compute the size of the upsampled filter with
    # a specified dilation rate.
    upsampled_kernel_size = (kernel_size - 1) * (dilation - 1) + kernel_size

    # Determine the padding that is necessary for full padding,
    # meaning the output spatial size is equal to input spatial size
    full_padding = (upsampled_kernel_size - 1) // 2

    # Conv2d doesn't accept numpy arrays as arguments
    full_padding, kernel_size = tuple(full_padding), tuple(kernel_size)

    return nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride,
                      padding=full_padding, dilation=dilation, bias=False)

# No projection: identity shortcut
# conv -> bn -> relu -> conv -> bn
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, dilation=1):
        super(BasicBlock, self).__init__() 
        self.conv1 = conv3x3(inplanes, planes, stride, dilation=dilation)
        self.bn1 = nn.BatchNorm2d(planes, affine = affine_par)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes, dilation=dilation)
        self.bn2 = nn.BatchNorm2d(planes, affine = affine_par)

    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        y = self.bn2(out)
        return y
    
# Add residual projection
class ResidualBlock(nn.Module):
    def __init__(self, block, ds):
        super(ResidualBlock, self).__init__() 
        self.block = block
        self.ds = ds
        
    def forward(self, x):
        residual = self.ds(x) if self.ds is not None else x
        x = F.relu(residual + self.block(x))
        return x

In [3]:
class Deeplab_ResNet_Backbone_Branch(nn.Module):
    def __init__(self, block, layers, branch=None, task_num=2):
        super(Deeplab_ResNet_Backbone_Branch, self).__init__()
        
        self.inplanes = 64
        self.branch = branch
        self.task_num = task_num

        strides = [1, 2, 1, 1]
        dilations = [1, 1, 2, 4]
        filt_sizes = [64, 128, 256, 512]
        self.shared_blocks, self.separate_blocks = [], []
        self.layer_config = layers
        
        branch_cnt = 0
        seed = self._make_seed()
        self.__add_to_share_or_separate(branch_cnt, seed)
        branch_cnt += 1
        
        for segment, num_blocks in enumerate(self.layer_config):
            filt_size, num_blocks, stride, dilation = filt_sizes[segment],layers[segment],strides[segment],dilations[segment]
            for b_idx in range(num_blocks):
                blocklayer = self._make_blocklayer(b_idx, block, filt_size, stride=stride, dilation=dilation)
                self.__add_to_share_or_separate(branch_cnt, blocklayer)
                branch_cnt += 1

        self.shared_blocks = nn.ModuleList(self.shared_blocks)
        self.separate_blocks = nn.ModuleList(self.separate_blocks) 

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, 0.01)
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_seed(self):
        seed = nn.Sequential(nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
                             nn.BatchNorm2d(64, affine=affine_par),
                             nn.ReLU(inplace=True),
                             nn.MaxPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=True)) 
        return seed
    
    def _make_downsample(self, block, inplanes, planes, stride=1, dilation=1):
        downsample = None
        if stride != 1 or inplanes != planes * block.expansion or dilation == 2 or dilation == 4:
            downsample = nn.Sequential(nn.Conv2d(inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                                       nn.BatchNorm2d(planes * block.expansion, affine = affine_par))
        return downsample
    
    def _make_blocklayer(self, block_idx, block, planes, stride=1, dilation=1):
        ds = None
        if block_idx == 0:
            basic_block = block(self.inplanes, planes, stride, dilation=dilation)
            ds = self._make_downsample(block, self.inplanes, planes, stride=stride, dilation=dilation)
            self.inplanes = planes * block.expansion
        else:
            basic_block = block(self.inplanes, planes, dilation=dilation)
            
        blocklayer = ResidualBlock(basic_block, ds)
        return blocklayer
    
    def __add_to_share_or_separate(self, branch_cnt, block):
        if self.branch is None or branch_cnt < self.branch:
            self.shared_blocks.append(block)
        else:
            multiple_blocks = []
            for i in range(self.task_num):
                multiple_blocks.append(copy.deepcopy(block))
            self.separate_blocks.append(nn.ModuleList(multiple_blocks))
        return

    def forward(self, x): 
        for block in self.shared_blocks:
            x = block(x)
#         return x
        output = [x] * self.task_num
        for multiple_blocks in self.separate_blocks:
            for i in range(self.task_num):
                output[i] = multiple_blocks[i](output[i])
        return output

In [4]:
backbone = Deeplab_ResNet_Backbone_Branch(BasicBlock, [3, 4, 6, 3], None)

In [5]:
print(backbone)

Deeplab_ResNet_Backbone_Branch(
  (shared_blocks): ModuleList(
    (0): Sequential(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=True)
    )
    (1): ResidualBlock(
      (block): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): ResidualBlock(
      (block): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bi

In [14]:
from graphviz import Digraph
from torch.autograd import Variable

In [15]:
def make_dot(var, params=None):
    if params is not None:
        assert isinstance(params.values()[0], Variable)
        param_map = {id(v): k for k, v in params.items()}

    node_attr = dict(style="filled", shape="box", align="left", fontsize="12", ranksep="0.1", height="0.2")
    dot = Digraph(node_attr=node_attr, graph_attr=dict(size="12,12"))
    seen = set()

    def size_to_str(size):
        return "(" + (", ").join(["%d" % v for v in size]) + ")"

    def add_nodes(var):
        if var not in seen:
            if torch.is_tensor(var):
                dot.node(str(id(var)), size_to_str(var.size()), fillcolor="orange")
                dot.edge(str(id(var.grad_fn)), str(id(var)))
                var = var.grad_fn
            if hasattr(var, "variable"):
                u = var.variable
                name = param_map[id(u)] if params is not None else ""
                node_name = "%s\n %s" % (name, size_to_str(u.size()))
#                 print(node_name)
                
                dot.node(str(id(var)), node_name, fillcolor="lightblue")
            else:
                print(type(var).__name__)
                
                dot.node(str(id(var)), str(type(var).__name__))
            seen.add(var)
            if hasattr(var, "next_functions"):
                for u in var.next_functions:
                    if u[0] is not None:
                        dot.edge(str(id(u[0])), str(id(var)))
                        add_nodes(u[0])
            if hasattr(var, "saved_tensors"):
                for t in var.saved_tensors:
                    dot.edge(str(id(t)), str(id(var)))
                    add_nodes(t)

    add_nodes(var)
    return dot

In [16]:
inputs = torch.randn(1, 3, 224, 224)
y = backbone(inputs)
g = make_dot(y)
g.view()

ReluBackward0
AddBackward0
ReluBackward0
AddBackward0
ReluBackward0
AddBackward0
NativeBatchNormBackward
MkldnnConvolutionBackward
ReluBackward0
AddBackward0
ReluBackward0
AddBackward0
ReluBackward0
AddBackward0
ReluBackward0
AddBackward0
ReluBackward0
AddBackward0
ReluBackward0
AddBackward0
NativeBatchNormBackward
MkldnnConvolutionBackward
ReluBackward0
AddBackward0
ReluBackward0
AddBackward0
ReluBackward0
AddBackward0
ReluBackward0
AddBackward0
NativeBatchNormBackward
MkldnnConvolutionBackward
ReluBackward0
AddBackward0
ReluBackward0
AddBackward0
ReluBackward0
AddBackward0
MaxPool2DWithIndicesBackward
ReluBackward1
NativeBatchNormBackward
MkldnnConvolutionBackward
NativeBatchNormBackward
MkldnnConvolutionBackward
ReluBackward1
NativeBatchNormBackward
MkldnnConvolutionBackward
NativeBatchNormBackward
MkldnnConvolutionBackward
ReluBackward1
NativeBatchNormBackward
MkldnnConvolutionBackward
NativeBatchNormBackward
MkldnnConvolutionBackward
ReluBackward1
NativeBatchNormBackward
MkldnnCon

'Digraph.gv.pdf'

# Model with Branch

In [4]:
class Deeplab_ASPP_Branch(nn.Module):
    def __init__(self, branch, cls_num):
        super(Deeplab_ASPP_Branch, self).__init__()
        self.branch = branch
        self.backbone = Deeplab_ResNet_Backbone_Branch(BasicBlock, [3, 4, 6, 3], branch, len(cls_num))
        self.heads = nn.ModuleDict()
        for task in cls_num:
            self.heads[task] = ASPPHeadNode(512, cls_num[task])
        
    def forward(self, x):
        features = self.backbone(x)
        output = {}
        idx = 0
        for task in self.heads:
            output[task] = self.heads[task](features[idx])
            idx += 1
        return output

# Train on NYUv2

In [5]:
class Trainer():
    def __init__(self, model, two_task, train_dataloader, val_dataloader, criterion_dict, metric_dict, 
                 lr=0.001, decay_lr_freq=4000, decay_lr_rate=0.5,
                 print_iters=50, val_iters=200, save_iters=200):
        super(Trainer, self).__init__()
        self.model = model
        self.startIter = 0
        self.optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, self.model.parameters()), lr=lr, betas=(0.5, 0.999), weight_decay=0.0001)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=decay_lr_freq, gamma=decay_lr_rate)
        
        self.two_task = two_task
        
        self.train_dataloader = train_dataloader
        self.train_iter = iter(self.train_dataloader)
        self.val_dataloader = val_dataloader
        self.criterion_dict = criterion_dict
        self.metric_dict = metric_dict
        
        self.loss_list = {}
        self.set_train_loss()
        
        self.print_iters = print_iters
        self.val_iters = val_iters
        self.save_iters = save_iters
    
    def train(self, iters, loss_lambda, savePath=None, reload=None):
        self.model.train()
        if reload is not None and savePath is not None:
            self.load_model(savePath, reload)

        for i in range(self.startIter, iters):
            self.train_step(loss_lambda)

            if (i+1) % self.print_iters == 0:
                self.print_train_loss(i)
                self.set_train_loss()
            if (i+1) % self.val_iters == 0:
                self.validate(i)
            if (i+1) % self.save_iters == 0:
                if savePath is not None:
                    self.save_model(i, savePath)
            
        # Reset loss list and the data iters
        self.set_train_loss()
        return
    
    def train_step(self, loss_lambda):
        self.model.train()
        try:
            data = next(self.train_iter)
        except StopIteration:
            self.train_iter = iter(self.train_dataloader)
            data = next(self.train_iter)
            
        x = data['input'].cuda()
        self.optimizer.zero_grad()
        output = self.model(x)
        
        loss = 0
        for task in self.two_task:
            y = data[task].cuda()
            if task + '_mask' in data:
                tloss = self.criterion_dict[task](output[task], y, data[task + '_mask'].cuda())
            else:
                tloss = self.criterion_dict[task](output[task], y)
                
            self.loss_list[task].append(tloss.item())
            loss += loss_lambda[task] * tloss
        self.loss_list['total'].append(loss.item())
        
        loss.backward()
        self.optimizer.step()
        
        if self.scheduler is not None:
            self.scheduler.step()
        return
    
    def validate(self, it):
        self.model.eval()
        loss_list = {}
        for task in self.two_task:
            loss_list[task] = []
        
        for i, data in enumerate(self.val_dataloader):
            x = data['input'].cuda()
            output = self.model(x)

            for task in self.two_task:
                y = data[task].cuda()
                if task + '_mask' in data:
                    tloss = self.criterion_dict[task](output[task], y, data[task + '_mask'].cuda())
                    self.metric_dict[task](output[task], y, data[task + '_mask'].cuda())
                else:
                    tloss = self.criterion_dict[task](output[task], y)
                    self.metric_dict[task](output[task], y)
                loss_list[task].append(tloss.item())

        for task in self.two_task:
            val_results = self.metric_dict[task].val_metrics()
            print('[Iter {} Task {}] Val Loss: {:.4f}'.format((it+1), task[:4], np.mean(loss_list[task])), flush=True)
            print(val_results, flush=True)
        print('======================================================================', flush=True)
        return
    
    # helper functions
    def set_train_loss(self):
        for task in self.two_task:
            self.loss_list[task] = []
        self.loss_list['total'] = []
        return
    
    def load_model(self, savePath, reload):
        state = torch.load(savePath + reload)
        if self.two_task[0] in reload and self.two_task[1] in reload:
            self.startIter = state['iter'] + 1
            self.model.load_state_dict(state['state_dict'])
            self.optimizer.load_state_dict(state['optimizer'])
            self.scheduler.load_state_dict(state['scheduler'])
        else:
            print('Cannot load from models trained from different tasks.')
            exit()
        return
    
    def save_model(self, it, savePath):
        state = {'iter': it,
                'state_dict': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'scheduler': self.scheduler.state_dict()}
        branch = '_allshare' if self.model.branch is None else '_b'+ str(self.model.branch)
        torch.save(state, savePath + self.two_task[0] + '_' + self.two_task[1] + branch + '.model')
        return
    
    def print_train_loss(self, it):
        # Function: Print loss for each task
        for task in self.two_task:
            if self.loss_list[task]:
                avg_loss = np.mean(self.loss_list[task])
            else:
                continue
            print('[Iter {} Task {}] Train Loss: {:.4f}'.format((it+1), task[:4], avg_loss), flush=True)
        print('[Iter {} Total] Train Loss: {:.4f}'.format((it+1), np.mean(self.loss_list['total'])), flush=True)
        print('======================================================================', flush=True)
        return

In [6]:
dataroot = '/mnt/nfs/work1/huiguan/lijunzhang/policymtl/data/NYUv2/'
tasks = ('segment_semantic','normal','depth_zbuffer')
task_cls_num = {'segment_semantic': 40, 'normal':3, 'depth_zbuffer': 1}


criterionDict = {}
metricDict = {}
clsNum = {}
two_task = ['segment_semantic','depth_zbuffer']
dataset = NYU_v2(dataroot, 'train', crop_h=321, crop_w=321)
trainDataloader = DataLoader(dataset, 16, shuffle=True)

dataset = NYU_v2(dataroot, 'test', crop_h=321, crop_w=321)
valDataloader = DataLoader(dataset, 16, shuffle=True)

for task in two_task:
    criterionDict[task] = NYUCriterions(task)
    metricDict[task] = NYUMetrics(task)
    clsNum[task] = task_cls_num[task]

In [7]:
for b in range(0,1): # in shell
    model = Deeplab_ASPP_Branch(b, clsNum)
model = model.cuda()

In [None]:
# bs = 16, Adam
loss_lambda = {'segment_semantic': 1, 'normal':1, 'depth_zbuffer': 1}
checkpoint = '/mnt/nfs/work1/huiguan/lijunzhang/multibranch/checkpoint/NYUv2/exp/'

trainer = Trainer(model, two_task, trainDataloader, valDataloader, criterionDict, metricDict)
trainer.train(20000, loss_lambda, checkpoint)

[Iter 50 Task segm] Train Loss: 3.4915
[Iter 50 Task dept] Train Loss: 2.5242
[Iter 50 Total] Train Loss: 6.0157
[Iter 100 Task segm] Train Loss: 2.7859
[Iter 100 Task dept] Train Loss: 1.1379
[Iter 100 Total] Train Loss: 3.9238
[Iter 150 Task segm] Train Loss: 2.7230
[Iter 150 Task dept] Train Loss: 1.0866
[Iter 150 Total] Train Loss: 3.8096
[Iter 200 Task segm] Train Loss: 2.6157
[Iter 200 Task dept] Train Loss: 1.0742
[Iter 200 Total] Train Loss: 3.6899
[Iter 200 Task segm] Val Loss: 2.5310
{'mIoU': 0.1232, 'Pixel Acc': 0.3489}
[Iter 200 Task dept] Val Loss: 1.0999
{'abs_err': 1.0833, 'rel_err': 0.3499, 'sigma_1.25': 28.6498, 'sigma_1.25^2': 57.9327, 'sigma_1.25^3': 80.7769}
[Iter 250 Task segm] Train Loss: 2.5960
[Iter 250 Task dept] Train Loss: 1.0750
[Iter 250 Total] Train Loss: 3.6710
[Iter 300 Task segm] Train Loss: 2.6063
[Iter 300 Task dept] Train Loss: 1.0867
[Iter 300 Total] Train Loss: 3.6930
[Iter 350 Task segm] Train Loss: 2.5760
[Iter 350 Task dept] Train Loss: 1.1205
[

{'abs_err': 0.9491, 'rel_err': 0.3347, 'sigma_1.25': 37.5332, 'sigma_1.25^2': 68.5595, 'sigma_1.25^3': 86.4294}
[Iter 1650 Task segm] Train Loss: 2.0657
[Iter 1650 Task dept] Train Loss: 1.0116
[Iter 1650 Total] Train Loss: 3.0773
[Iter 1700 Task segm] Train Loss: 2.0888
[Iter 1700 Task dept] Train Loss: 1.0177
[Iter 1700 Total] Train Loss: 3.1065
[Iter 1750 Task segm] Train Loss: 2.0309
[Iter 1750 Task dept] Train Loss: 1.0119
[Iter 1750 Total] Train Loss: 3.0428
[Iter 1800 Task segm] Train Loss: 2.0348
[Iter 1800 Task dept] Train Loss: 1.0511
[Iter 1800 Total] Train Loss: 3.0860
[Iter 1800 Task segm] Val Loss: 2.5144
{'mIoU': 0.0968, 'Pixel Acc': 0.383}
[Iter 1800 Task dept] Val Loss: 0.8844
{'abs_err': 0.8771, 'rel_err': 0.3523, 'sigma_1.25': 42.7551, 'sigma_1.25^2': 72.0901, 'sigma_1.25^3': 88.4133}
[Iter 1850 Task segm] Train Loss: 2.0320
[Iter 1850 Task dept] Train Loss: 1.0121
[Iter 1850 Total] Train Loss: 3.0440
[Iter 1900 Task segm] Train Loss: 2.0154
[Iter 1900 Task dept] Tra

[Iter 3200 Task dept] Train Loss: 0.9738
[Iter 3200 Total] Train Loss: 2.6174
[Iter 3200 Task segm] Val Loss: 1.8375
{'mIoU': 0.1634, 'Pixel Acc': 0.4861}
[Iter 3200 Task dept] Val Loss: 0.8653
{'abs_err': 0.8598, 'rel_err': 0.3718, 'sigma_1.25': 44.4399, 'sigma_1.25^2': 72.4958, 'sigma_1.25^3': 88.6351}
[Iter 3250 Task segm] Train Loss: 1.6363
[Iter 3250 Task dept] Train Loss: 0.9772
[Iter 3250 Total] Train Loss: 2.6135
[Iter 3300 Task segm] Train Loss: 1.6454
[Iter 3300 Task dept] Train Loss: 1.0154
[Iter 3300 Total] Train Loss: 2.6608
[Iter 3350 Task segm] Train Loss: 1.6443
[Iter 3350 Task dept] Train Loss: 0.9918
[Iter 3350 Total] Train Loss: 2.6361
[Iter 3400 Task segm] Train Loss: 1.6687
[Iter 3400 Task dept] Train Loss: 0.9576
[Iter 3400 Total] Train Loss: 2.6263
[Iter 3400 Task segm] Val Loss: 1.9532
{'mIoU': 0.1414, 'Pixel Acc': 0.4555}
[Iter 3400 Task dept] Val Loss: 0.8732
{'abs_err': 0.8646, 'rel_err': 0.3417, 'sigma_1.25': 42.5594, 'sigma_1.25^2': 73.0343, 'sigma_1.25^3':

[Iter 4750 Task segm] Train Loss: 1.2529
[Iter 4750 Task dept] Train Loss: 0.9404
[Iter 4750 Total] Train Loss: 2.1933
[Iter 4800 Task segm] Train Loss: 1.2349
[Iter 4800 Task dept] Train Loss: 0.9202
[Iter 4800 Total] Train Loss: 2.1551
[Iter 4800 Task segm] Val Loss: 1.6546
{'mIoU': 0.2051, 'Pixel Acc': 0.5191}
[Iter 4800 Task dept] Val Loss: 0.8395
{'abs_err': 0.8361, 'rel_err': 0.3689, 'sigma_1.25': 45.6967, 'sigma_1.25^2': 74.2456, 'sigma_1.25^3': 89.0871}
[Iter 4850 Task segm] Train Loss: 1.2198
[Iter 4850 Task dept] Train Loss: 0.9252
[Iter 4850 Total] Train Loss: 2.1450
[Iter 4900 Task segm] Train Loss: 1.2258
[Iter 4900 Task dept] Train Loss: 0.9013
[Iter 4900 Total] Train Loss: 2.1271
[Iter 4950 Task segm] Train Loss: 1.1846
[Iter 4950 Task dept] Train Loss: 0.9313
[Iter 4950 Total] Train Loss: 2.1159
[Iter 5000 Task segm] Train Loss: 1.2448
[Iter 5000 Task dept] Train Loss: 0.9111
[Iter 5000 Total] Train Loss: 2.1559
[Iter 5000 Task segm] Val Loss: 1.6400
{'mIoU': 0.217, 'Pi

[Iter 6300 Task segm] Train Loss: 0.9967
[Iter 6300 Task dept] Train Loss: 0.9136
[Iter 6300 Total] Train Loss: 1.9103
[Iter 6350 Task segm] Train Loss: 1.0498
[Iter 6350 Task dept] Train Loss: 0.8998
[Iter 6350 Total] Train Loss: 1.9495
[Iter 6400 Task segm] Train Loss: 1.0077
[Iter 6400 Task dept] Train Loss: 0.8814
[Iter 6400 Total] Train Loss: 1.8891
[Iter 6400 Task segm] Val Loss: 1.5615
{'mIoU': 0.2235, 'Pixel Acc': 0.5423}
[Iter 6400 Task dept] Val Loss: 0.8046
{'abs_err': 0.801, 'rel_err': 0.3443, 'sigma_1.25': 47.3838, 'sigma_1.25^2': 76.1108, 'sigma_1.25^3': 90.3446}
[Iter 6450 Task segm] Train Loss: 0.9913
[Iter 6450 Task dept] Train Loss: 0.8721
[Iter 6450 Total] Train Loss: 1.8634
[Iter 6500 Task segm] Train Loss: 0.9993
[Iter 6500 Task dept] Train Loss: 0.8891
[Iter 6500 Total] Train Loss: 1.8884
[Iter 6550 Task segm] Train Loss: 1.0759
[Iter 6550 Task dept] Train Loss: 0.9081
[Iter 6550 Total] Train Loss: 1.9840
[Iter 6600 Task segm] Train Loss: 0.9971
[Iter 6600 Task de

In [8]:
# bs = 16, Adam, reload
loss_lambda = {'segment_semantic': 1, 'normal':1, 'depth_zbuffer': 1}
checkpoint = '/mnt/nfs/work1/huiguan/lijunzhang/multibranch/checkpoint/NYUv2/exp/'

trainer = Trainer(model, two_task, trainDataloader, valDataloader, criterionDict, metricDict)
trainer.train(20000, loss_lambda, checkpoint, reload='segment_semantic_depth_zbuffer_b0.model')

[Iter 7250 Task segm] Train Loss: 0.9060
[Iter 7250 Task dept] Train Loss: 0.8417
[Iter 7250 Total] Train Loss: 1.7477
[Iter 7300 Task segm] Train Loss: 0.9274
[Iter 7300 Task dept] Train Loss: 0.8713
[Iter 7300 Total] Train Loss: 1.7987
[Iter 7350 Task segm] Train Loss: 0.9272
[Iter 7350 Task dept] Train Loss: 0.8598
[Iter 7350 Total] Train Loss: 1.7870
[Iter 7400 Task segm] Train Loss: 0.8955
[Iter 7400 Task dept] Train Loss: 0.8578
[Iter 7400 Total] Train Loss: 1.7533
[Iter 7400 Task segm] Val Loss: 1.5620
{'mIoU': 0.232, 'Pixel Acc': 0.5459}
[Iter 7400 Task dept] Val Loss: 0.7815
{'abs_err': 0.7787, 'rel_err': 0.3372, 'sigma_1.25': 48.2351, 'sigma_1.25^2': 77.511, 'sigma_1.25^3': 90.9162}
[Iter 7450 Task segm] Train Loss: 0.9142
[Iter 7450 Task dept] Train Loss: 0.8479
[Iter 7450 Total] Train Loss: 1.7621
[Iter 7500 Task segm] Train Loss: 0.9131
[Iter 7500 Task dept] Train Loss: 0.8602
[Iter 7500 Total] Train Loss: 1.7733
[Iter 7550 Task segm] Train Loss: 0.9008
[Iter 7550 Task dep

{'mIoU': 0.2426, 'Pixel Acc': 0.5557}
[Iter 8800 Task dept] Val Loss: 0.7611
{'abs_err': 0.7553, 'rel_err': 0.297, 'sigma_1.25': 49.1501, 'sigma_1.25^2': 78.7395, 'sigma_1.25^3': 92.0939}
[Iter 8850 Task segm] Train Loss: 0.6792
[Iter 8850 Task dept] Train Loss: 0.7771
[Iter 8850 Total] Train Loss: 1.4562
[Iter 8900 Task segm] Train Loss: 0.7038
[Iter 8900 Task dept] Train Loss: 0.8042
[Iter 8900 Total] Train Loss: 1.5080
[Iter 8950 Task segm] Train Loss: 0.7104
[Iter 8950 Task dept] Train Loss: 0.7848
[Iter 8950 Total] Train Loss: 1.4952
[Iter 9000 Task segm] Train Loss: 0.6969
[Iter 9000 Task dept] Train Loss: 0.7623
[Iter 9000 Total] Train Loss: 1.4592
[Iter 9000 Task segm] Val Loss: 1.5217
{'mIoU': 0.2502, 'Pixel Acc': 0.5659}
[Iter 9000 Task dept] Val Loss: 0.7743
{'abs_err': 0.7669, 'rel_err': 0.2859, 'sigma_1.25': 47.6748, 'sigma_1.25^2': 78.3512, 'sigma_1.25^3': 92.4897}
[Iter 9050 Task segm] Train Loss: 0.6665
[Iter 9050 Task dept] Train Loss: 0.7893
[Iter 9050 Total] Train Lo

[Iter 10400 Task segm] Train Loss: 0.6328
[Iter 10400 Task dept] Train Loss: 0.7328
[Iter 10400 Total] Train Loss: 1.3657
[Iter 10400 Task segm] Val Loss: 1.6136
{'mIoU': 0.2353, 'Pixel Acc': 0.5546}
[Iter 10400 Task dept] Val Loss: 0.7830
{'abs_err': 0.7739, 'rel_err': 0.2837, 'sigma_1.25': 46.9386, 'sigma_1.25^2': 77.5763, 'sigma_1.25^3': 92.4057}
[Iter 10450 Task segm] Train Loss: 0.5810
[Iter 10450 Task dept] Train Loss: 0.7317
[Iter 10450 Total] Train Loss: 1.3126
[Iter 10500 Task segm] Train Loss: 0.5918
[Iter 10500 Task dept] Train Loss: 0.7347
[Iter 10500 Total] Train Loss: 1.3266
[Iter 10550 Task segm] Train Loss: 0.6087
[Iter 10550 Task dept] Train Loss: 0.7366
[Iter 10550 Total] Train Loss: 1.3453
[Iter 10600 Task segm] Train Loss: 0.6220
[Iter 10600 Task dept] Train Loss: 0.7491
[Iter 10600 Total] Train Loss: 1.3711
[Iter 10600 Task segm] Val Loss: 1.5942
{'mIoU': 0.2454, 'Pixel Acc': 0.567}
[Iter 10600 Task dept] Val Loss: 0.7851
{'abs_err': 0.7794, 'rel_err': 0.3003, 'sig

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/lijunzhang/anaconda3/envs/multitask/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-25ad36a14b60>", line 6, in <module>
    trainer.train(20000, loss_lambda, checkpoint, reload='segment_semantic_depth_zbuffer_b0.model')
  File "<ipython-input-5-3958f6b9c3a9>", line 32, in train
    self.train_step(loss_lambda)
  File "<ipython-input-5-3958f6b9c3a9>", line 72, in train_step
    self.optimizer.step()
  File "/home/lijunzhang/anaconda3/envs/multitask/lib/python3.6/site-packages/torch/optim/lr_scheduler.py", line 67, in wrapper
    return wrapped(*args, **kwargs)
  File "/home/lijunzhang/anaconda3/envs/multitask/lib/python3.6/site-packages/torch/autograd/grad_mode.py", line 15, in decorate_context
    return func(*args, **kwargs)
  File "/home/lijunzhang/anaconda3/envs/multitask/lib/python3.6/site-packages/torch/optim/adam.py"

KeyboardInterrupt: 

In [8]:
# bs = 16, Adam, reload
loss_lambda = {'segment_semantic': 1, 'normal':1, 'depth_zbuffer': 1}
checkpoint = '/mnt/nfs/work1/huiguan/lijunzhang/multibranch/checkpoint/NYUv2/exp/'

trainer = Trainer(model, two_task, trainDataloader, valDataloader, criterionDict, metricDict)
trainer.train(20000, loss_lambda, checkpoint, reload='segment_semantic_depth_zbuffer_b0.model')

[Iter 10850 Task segm] Train Loss: 0.6160
[Iter 10850 Task dept] Train Loss: 0.7187
[Iter 10850 Total] Train Loss: 1.3347
[Iter 10900 Task segm] Train Loss: 0.6028
[Iter 10900 Task dept] Train Loss: 0.7065
[Iter 10900 Total] Train Loss: 1.3092
[Iter 10950 Task segm] Train Loss: 0.6142
[Iter 10950 Task dept] Train Loss: 0.7414
[Iter 10950 Total] Train Loss: 1.3557
[Iter 11000 Task segm] Train Loss: 0.5974
[Iter 11000 Task dept] Train Loss: 0.7149
[Iter 11000 Total] Train Loss: 1.3123
[Iter 11000 Task segm] Val Loss: 1.5378
{'mIoU': 0.2508, 'Pixel Acc': 0.5752}
[Iter 11000 Task dept] Val Loss: 0.7344
{'abs_err': 0.7276, 'rel_err': 0.2768, 'sigma_1.25': 51.0981, 'sigma_1.25^2': 80.4403, 'sigma_1.25^3': 93.4942}
[Iter 11050 Task segm] Train Loss: 0.5802
[Iter 11050 Task dept] Train Loss: 0.7121
[Iter 11050 Total] Train Loss: 1.2923
[Iter 11100 Task segm] Train Loss: 0.6012
[Iter 11100 Task dept] Train Loss: 0.7325
[Iter 11100 Total] Train Loss: 1.3337
[Iter 11150 Task segm] Train Loss: 0.5

[Iter 12400 Task segm] Val Loss: 1.5733
{'mIoU': 0.2621, 'Pixel Acc': 0.577}
[Iter 12400 Task dept] Val Loss: 0.6986
{'abs_err': 0.6961, 'rel_err': 0.2797, 'sigma_1.25': 53.0811, 'sigma_1.25^2': 81.9836, 'sigma_1.25^3': 93.7027}
[Iter 12450 Task segm] Train Loss: 0.4889
[Iter 12450 Task dept] Train Loss: 0.6520
[Iter 12450 Total] Train Loss: 1.1409
[Iter 12500 Task segm] Train Loss: 0.4627
[Iter 12500 Task dept] Train Loss: 0.6489
[Iter 12500 Total] Train Loss: 1.1116
[Iter 12550 Task segm] Train Loss: 0.4770
[Iter 12550 Task dept] Train Loss: 0.6439
[Iter 12550 Total] Train Loss: 1.1209
[Iter 12600 Task segm] Train Loss: 0.4803
[Iter 12600 Task dept] Train Loss: 0.6673
[Iter 12600 Total] Train Loss: 1.1476
[Iter 12600 Task segm] Val Loss: 1.5824
{'mIoU': 0.262, 'Pixel Acc': 0.5786}
[Iter 12600 Task dept] Val Loss: 0.7108
{'abs_err': 0.7075, 'rel_err': 0.2844, 'sigma_1.25': 51.7958, 'sigma_1.25^2': 81.2837, 'sigma_1.25^3': 93.6387}
[Iter 12650 Task segm] Train Loss: 0.4700
[Iter 12650 

[Iter 13950 Task segm] Train Loss: 0.4461
[Iter 13950 Task dept] Train Loss: 0.6095
[Iter 13950 Total] Train Loss: 1.0556
[Iter 14000 Task segm] Train Loss: 0.4260
[Iter 14000 Task dept] Train Loss: 0.6098
[Iter 14000 Total] Train Loss: 1.0358
[Iter 14000 Task segm] Val Loss: 1.6324
{'mIoU': 0.2539, 'Pixel Acc': 0.5741}
[Iter 14000 Task dept] Val Loss: 0.6990
{'abs_err': 0.6929, 'rel_err': 0.2659, 'sigma_1.25': 53.2549, 'sigma_1.25^2': 82.315, 'sigma_1.25^3': 94.3209}
[Iter 14050 Task segm] Train Loss: 0.4542
[Iter 14050 Task dept] Train Loss: 0.6005
[Iter 14050 Total] Train Loss: 1.0547
[Iter 14100 Task segm] Train Loss: 0.4327
[Iter 14100 Task dept] Train Loss: 0.6156
[Iter 14100 Total] Train Loss: 1.0483
[Iter 14150 Task segm] Train Loss: 0.4351
[Iter 14150 Task dept] Train Loss: 0.6135
[Iter 14150 Total] Train Loss: 1.0486
[Iter 14200 Task segm] Train Loss: 0.4307
[Iter 14200 Task dept] Train Loss: 0.6203
[Iter 14200 Total] Train Loss: 1.0510
[Iter 14200 Task segm] Val Loss: 1.6663

[Iter 15450 Task segm] Train Loss: 0.3971
[Iter 15450 Task dept] Train Loss: 0.5783
[Iter 15450 Total] Train Loss: 0.9754
[Iter 15500 Task segm] Train Loss: 0.3873
[Iter 15500 Task dept] Train Loss: 0.5890
[Iter 15500 Total] Train Loss: 0.9763
[Iter 15550 Task segm] Train Loss: 0.3974
[Iter 15550 Task dept] Train Loss: 0.5775
[Iter 15550 Total] Train Loss: 0.9748
[Iter 15600 Task segm] Train Loss: 0.3895
[Iter 15600 Task dept] Train Loss: 0.5878
[Iter 15600 Total] Train Loss: 0.9773
[Iter 15600 Task segm] Val Loss: 1.6061
{'mIoU': 0.2715, 'Pixel Acc': 0.5847}
[Iter 15600 Task dept] Val Loss: 0.7206
{'abs_err': 0.7143, 'rel_err': 0.2612, 'sigma_1.25': 51.8557, 'sigma_1.25^2': 81.1298, 'sigma_1.25^3': 93.8368}
[Iter 15650 Task segm] Train Loss: 0.4028
[Iter 15650 Task dept] Train Loss: 0.5856
[Iter 15650 Total] Train Loss: 0.9883
[Iter 15700 Task segm] Train Loss: 0.4119
[Iter 15700 Task dept] Train Loss: 0.5913
[Iter 15700 Total] Train Loss: 1.0032
[Iter 15750 Task segm] Train Loss: 0.3

[Iter 17000 Task dept] Train Loss: 0.5291
[Iter 17000 Total] Train Loss: 0.8823
[Iter 17000 Task segm] Val Loss: 1.6559
{'mIoU': 0.2635, 'Pixel Acc': 0.5822}
[Iter 17000 Task dept] Val Loss: 0.6584
{'abs_err': 0.6549, 'rel_err': 0.2578, 'sigma_1.25': 55.6321, 'sigma_1.25^2': 84.1517, 'sigma_1.25^3': 95.226}
[Iter 17050 Task segm] Train Loss: 0.3468
[Iter 17050 Task dept] Train Loss: 0.5385
[Iter 17050 Total] Train Loss: 0.8853
[Iter 17100 Task segm] Train Loss: 0.3393
[Iter 17100 Task dept] Train Loss: 0.5436
[Iter 17100 Total] Train Loss: 0.8829
[Iter 17150 Task segm] Train Loss: 0.3521
[Iter 17150 Task dept] Train Loss: 0.5530
[Iter 17150 Total] Train Loss: 0.9051
[Iter 17200 Task segm] Train Loss: 0.3468
[Iter 17200 Task dept] Train Loss: 0.5525
[Iter 17200 Total] Train Loss: 0.8993
[Iter 17200 Task segm] Val Loss: 1.6586
{'mIoU': 0.2639, 'Pixel Acc': 0.5805}
[Iter 17200 Task dept] Val Loss: 0.6899
{'abs_err': 0.6825, 'rel_err': 0.2546, 'sigma_1.25': 53.273, 'sigma_1.25^2': 82.7799,

[Iter 18500 Task dept] Train Loss: 0.5329
[Iter 18500 Total] Train Loss: 0.8552
[Iter 18550 Task segm] Train Loss: 0.3325
[Iter 18550 Task dept] Train Loss: 0.5380
[Iter 18550 Total] Train Loss: 0.8705
[Iter 18600 Task segm] Train Loss: 0.3219
[Iter 18600 Task dept] Train Loss: 0.5247
[Iter 18600 Total] Train Loss: 0.8466
[Iter 18600 Task segm] Val Loss: 1.6699
{'mIoU': 0.2666, 'Pixel Acc': 0.5832}
[Iter 18600 Task dept] Val Loss: 0.6596
{'abs_err': 0.6535, 'rel_err': 0.2482, 'sigma_1.25': 55.1234, 'sigma_1.25^2': 84.0872, 'sigma_1.25^3': 95.6125}
[Iter 18650 Task segm] Train Loss: 0.3263
[Iter 18650 Task dept] Train Loss: 0.5278
[Iter 18650 Total] Train Loss: 0.8541
[Iter 18700 Task segm] Train Loss: 0.3221
[Iter 18700 Task dept] Train Loss: 0.5371
[Iter 18700 Total] Train Loss: 0.8591
[Iter 18750 Task segm] Train Loss: 0.3268
[Iter 18750 Task dept] Train Loss: 0.5139
[Iter 18750 Total] Train Loss: 0.8407
[Iter 18800 Task segm] Train Loss: 0.3239
[Iter 18800 Task dept] Train Loss: 0.5



In [27]:
model

Deeplab_ASPP_Branch(
  (backbone): Deeplab_ResNet_Backbone_Branch(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=True)
    (shared_blocks): ModuleList(
      (0): ResidualBlock(
        (block): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU(inplace=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (1): ResidualBlock(
        (block): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=

# For Visualize

In [14]:
import torch.onnx

In [16]:
PATH = "models/backbone_b10.onnx"

x = torch.rand(1,3,224,224)
y = model(x)
torch.onnx.export(model,x,PATH)