In [65]:
import numpy as np
import os
import itertools
import sys
sys.path.append('/home/lijunzhang/multibranch/')
from pathlib import Path
from scipy import stats
from scipy.optimize import linear_sum_assignment
from collections import OrderedDict
from ptflops import get_model_complexity_info
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary
from torch.utils.data import DataLoader

from framework.layer_node import Conv2dNode, InputNode
from main.layout import Layout
from main.algorithms import enum_layout_wo_rdt, init_S, coarse_to_fined
from main.auto_models import MTSeqBackbone, MTSeqModel, ComputeBlock
from main.head import ASPPHeadNode
from main.trainer import Trainer
from main.algs_FMTL import simple_alignment, complex_alignment

from data.nyuv2_dataloader_adashare import NYU_v2
from data.pixel2pixel_loss import NYUCriterions
from data.pixel2pixel_metrics import NYUMetrics

In [2]:
# Fail fast when no GPU is visible: later cells call .cuda() on the model and on every batch.
assert torch.cuda.is_available()

In [25]:
import pickle
def save_obj(obj, name):
    """Pickle `obj` to ``./exp/<name>.pkl``.

    Args:
        obj: any picklable Python object.
        name: file stem (without the ``.pkl`` extension) under ``./exp/``.

    The target directory is created on demand so the call also works in a
    fresh working directory where ``./exp/`` does not exist yet.
    """
    path = './exp/' + name + '.pkl'
    # Without this, open() raises FileNotFoundError when ./exp/ is missing.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Read back the object stored in ``./exp/<name>.pkl``.

    Args:
        name: file stem (without the ``.pkl`` extension) under ``./exp/``.

    Returns:
        The unpickled object.
    """
    pkl_path = './exp/' + name + '.pkl'
    with open(pkl_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

# backbone and data

In [8]:
# backbone
# mobilenet
# `prototxt` is the network description handed to MTSeqBackbone / MTSeqModel below.
backbone_type = 'mobilenet'
prototxt = '../models/mobilenetv2.prototxt'
# Number of coarse blocks used by init_S / Layout when enumerating layouts.
D = coarse_B = 5
# Passed to coarse_to_fined(); presumably maps a coarse block index to the
# fine-grained block indices it covers -- TODO confirm against main.algorithms.
# NOTE(review): the dict has 6 keys (0..5) while coarse_B is 5 -- verify which is intended.
mapping = {0:[0,1,2,3,4,5,6], 1:[7,8,9,10,11,12,13,14,15,16,17], 2:[18,19,20,21,22], 
           3:[23,24,25,26,27,28,29,30], 4:[31], 5:[32]}

In [5]:
# data
# NYUv2
# NOTE(review): `data` is reused as the per-batch loop variable in later cells,
# so this string is overwritten once those cells run.
data = 'NYUv2'
# NOTE(review): absolute, cluster-specific path; adjust for the local environment.
dataroot = '/mnt/nfs/work1/huiguan/lijunzhang/policymtl/data/NYUv2/'
tasks = ['segment_semantic', 'normal', 'depth_zbuffer']
# Passed to MTSeqModel as `cls_num`; presumably the number of output channels per task head.
cls_num = {'segment_semantic': 40, 'normal':3, 'depth_zbuffer': 1}

# Train split uses 321x321 crops, test split 224x224; both loaders use batch size 32.
# `dataset` is rebound for the test split, so only the loaders keep a reference to the train set.
dataset = NYU_v2(dataroot, 'train', crop_h=321, crop_w=321)
trainDataloader = DataLoader(dataset, 32, shuffle=True)
dataset = NYU_v2(dataroot, 'test', crop_h=224, crop_w=224)
# NOTE(review): the test loader is shuffled as well; it is not used by any cell shown here.
valDataloader = DataLoader(dataset, 32, shuffle=True)

# One criterion and one metric object per task, keyed by task name.
criterionDict = {}
metricDict = {}
for task in tasks:
    print(task, flush=True)
    criterionDict[task] = NYUCriterions(task)
    metricDict[task] = NYUMetrics(task)

# Matches the training crop size above.
input_dim = (3,321,321)
# Number of tasks.
T = len(tasks)

segment_semantic
normal
depth_zbuffer


In [6]:
# ind. weights
# Checkpoint of the independently trained (one branch per task) model; it is read
# later with torch.load(weight_PATH)['state_dict'].
# NOTE(review): absolute, cluster-specific paths; adjust for the local environment.
ckpt_PATH = '/mnt/nfs/work1/huiguan/lijunzhang/multibranch/checkpoint/'
weight_PATH = ckpt_PATH + 'NYUv2/ind/mobilenet/segment_semantic_normal_depth_zbuffer.model' # NYUv2 + MobileNetV2, from the same init
# weight_PATH = ckpt_PATH + 'NYUv2/baseline/WPreMobile/2/segment_semantic_normal_depth_zbuffer.model' # NYUv2 + MobileNetV2, from the same init

# load independent model weights

In [9]:
# Build a throw-away backbone only to read two sizes needed by the later cells.
with torch.no_grad():
    backbone = MTSeqBackbone(prototxt)
    # Number of fine-grained blocks in the backbone.
    fined_B = len(backbone.basic_blocks)
    # Size of dimension 1 of the backbone output for a random 1x3x224x224 input
    # (presumably the feature channel count expected by the heads -- TODO confirm).
    feature_dim = backbone(torch.rand(1,3,224,224)).shape[1]

In [10]:
# ind. layout
# For every fine-grained block, put each task in its own singleton set,
# i.e. no block is shared between tasks.
S = []
for i in range(fined_B):
    S.append([set([x]) for x in range(T)])
layout = Layout(T, fined_B, S) 
print('Ind. Layout:', flush=True)
print(layout, flush=True)

# model
# Instantiate the fully independent model and fill it with the trained checkpoint.
# The model stays on the CPU here; the r0 cell moves it to the GPU.
with torch.no_grad():
    model = MTSeqModel(prototxt, layout=layout, feature_dim=feature_dim, cls_num=cls_num)
#     model = model.cuda()

    # load ind. model weights
    model.load_state_dict(torch.load(weight_PATH)['state_dict'])

Ind. Layout:
[[{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}]]
Construct MTSeqModel from Layout:
[[{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}], [{0}, {1}, {2}],

In [13]:
# Compute r0: the mean per-task training loss of the independently trained model.
# It is the target loss that est_recov_n() later uses to estimate how many
# iterations each candidate layout needs to recover.
loss_lst = {task:[] for task in tasks}
model = model.cuda()
model.eval()
# Evaluation only: run without autograd so no graph is built for each batch.
with torch.no_grad():
    # NOTE: `data` (the dataset-name string defined earlier) is rebound to the batch dict here.
    for data in trainDataloader:
        x = data['input'].cuda()
        output = model(x)
        for task in tasks:
            y = data[task].cuda()
            # Some tasks ship a validity mask under '<task>_mask'; pass it to the criterion when present.
            if task + '_mask' in data:
                tloss = criterionDict[task](output[task], y, data[task + '_mask'].cuda())
            else:
                tloss = criterionDict[task](output[task], y)
            loss_lst[task].append(tloss.item())
            print('{}: {:.4f}'.format(task,tloss.item()))
        print('-'*30)

# Average the per-batch losses of each task and cache the result as ./exp/r0.pkl.
target = {task: np.mean(loss_lst[task]) for task in tasks}
print('r0: {}'.format(target))
save_obj(target, 'r0')

segment_semantic: 0.6924
------------------------------
normal: 0.0691
------------------------------
depth_zbuffer: 1.0278
------------------------------
segment_semantic: 0.7632
------------------------------
normal: 0.0823
------------------------------
depth_zbuffer: 0.9296
------------------------------
segment_semantic: 0.6722
------------------------------
normal: 0.0681
------------------------------
depth_zbuffer: 1.1606
------------------------------
segment_semantic: 0.7691
------------------------------
normal: 0.0812
------------------------------
depth_zbuffer: 1.0446
------------------------------
segment_semantic: 0.7071
------------------------------
normal: 0.0729
------------------------------
depth_zbuffer: 1.2141
------------------------------
segment_semantic: 0.6427
------------------------------
normal: 0.0636
------------------------------
depth_zbuffer: 0.8945
------------------------------
segment_semantic: 0.7339
------------------------------
normal: 0.0732

In [17]:
# Reload the cached r0 from ./exp/r0.pkl so the full pass over the training set need not be repeated.
target = load_obj('r0')

In [19]:
# Display the per-task target losses (r0).
target

{'segment_semantic': 0.7240976285934448,
 'normal': 0.07379929304122924,
 'depth_zbuffer': 1.1055510187149047}

# enum layouts and channel alignment

In [20]:
# enum layout
# Start from the initial coarse layout and let enum_layout_wo_rdt() extend the list.
# Its return value is ignored, so it presumably appends to `layout_list` in place -- TODO confirm.
# NOTE: `L` is rebound by the `for L in layout_list` loop in the estimation cell.
layout_list = [] 
S0 = init_S(T, coarse_B) # initial state
L = Layout(T, coarse_B, S0) # initial layout
layout_list.append(L)
enum_layout_wo_rdt(L, layout_list)

In [21]:
# Read twice: by the alignment cell (which alignment routine to run on `model`) and by the
# weight-merging code (which of in_ord / out_ord to apply before averaging conv weights).
align_choice = 2 # 0: no align; 1: simple align (use out_ord only); 2: complex align

In [23]:
# Alignment runs on the CPU copy of the independently trained model.
model = model.cpu()
# Map the configured choice to its routine; 0 (or any other value) means "no alignment".
align_fn = {1: simple_alignment, 2: complex_alignment}.get(align_choice)
if align_fn is not None:
    align_fn(model, tasks)

# Estimate convergence rate

In [24]:
def smooth(scalars, weight):  # Weight between 0 and 1
    """Exponentially smooth a sequence of scalars.

    Each output is ``previous_smoothed * weight + (1 - weight) * point``, with
    the running value seeded by the first element.

    Args:
        scalars: sequence of numbers (e.g. per-iteration losses).
        weight: smoothing factor; 0 returns the input unchanged, values
            closer to 1 smooth more heavily.

    Returns:
        A list with one smoothed value per input element; an empty list for
        empty input (previously an IndexError).
    """
    if len(scalars) == 0:
        return []
    smoothed = []
    last = scalars[0]  # Seed the running value with the first point
    for point in scalars:
        last = last * weight + (1 - weight) * point
        smoothed.append(last)
    return smoothed

In [159]:
# Short-training probe: for each candidate layout, build the multi-task model, initialise it
# from the independently trained `model`, train it for `short_iter` SGD steps, and extrapolate
# per-task convergence from a few smoothed loss samples.
# NOTE: compute_alpha, est_recov_n and est_final_loss are defined in cells further down this
# notebook; those cells must have been executed before this one.
# With short_iter=101, start=20, step=40 the slice below picks iterations 20, 60 and 100.
total_iter, short_iter, start, step = 20000, 101, 20, 40
# loss_slice = [x for x in range(start,short_iter,step)]
# if len(loss_slice) < 3:
#     sys.exit('Not enough loss samples!')
smooth_weight = 0.5

# One dict (task -> estimate) is appended per processed layout.
layout_est_iter = []
layout_est_loss = []

# For each layout
for L in layout_list:
    layout = coarse_to_fined(L, fined_B, mapping)
    print('Fined Layout:', flush=True)
    print(layout, flush=True)
    
    mtl_model = MTSeqModel(prototxt, layout=layout, feature_dim=feature_dim, cls_num=cls_num, verbose=False)
    
    # Step 1: create weight init state_dict
    # task_set / layer_idx / merge_flag are set when a ComputeBlock is visited and reused by the
    # Conv2dNode branch on later iterations; this presumably relies on named_modules() yielding a
    # block before the conv nodes it contains -- TODO confirm.
    mtl_init = OrderedDict()
    for name, module in mtl_model.named_modules():
        if isinstance(module, ComputeBlock):
            task_set = module.task_set
            layer_idx = module.layer_idx
            if len(task_set) > 1:
                merge_flag = True
            else:
                # Type 1: save the whole block weights from the corresponding ind. model when no merging
                merge_flag = False
                for block in model.backbone.mtl_blocks:
                    if task_set == block.task_set and block.layer_idx == layer_idx:
                        for ind_name, param in block.named_parameters():
                            mtl_init['.'.join([name, ind_name])] = param  
                        # for BN running mean and running var
                        for ind_name, param in block.named_buffers():
                            mtl_init['.'.join([name, ind_name])] = param

        # # Type 2: when the current block have merged operators, save mean weights for convs
        elif isinstance(module, Conv2dNode) and merge_flag: 
            task_convs = [] # store conv weights from task's ind. block
            for task in task_set:
                # identify task-corresponding block in the well-trained ind. models 
                for block in model.backbone.mtl_blocks:
                    if task in block.task_set and block.layer_idx == layer_idx:
                        # The last component of the module name is used as the index of the
                        # matching node inside the independent block.
                        task_module = block.compute_nodes[int(name.split('.')[-1])]  
                        temp_weight = task_module.basicOp.weight # no channel alignment or no align variable
                        # out_ord re-indexes dim 0 and in_ord dim 1 of the conv weight
                        # (presumably the channel orders computed by the alignment cell).
                        if align_choice == 1 and task_module.out_ord is not None: # simple alignment
                            temp_weight = temp_weight[task_module.out_ord]
                        elif align_choice == 2: # complex alignment
                            if task_module.in_ord is not None:
                                temp_weight = temp_weight[:,task_module.in_ord]
                            if task_module.out_ord is not None: 
                                temp_weight = temp_weight[task_module.out_ord]
                        task_convs.append(temp_weight)
            # Element-wise mean of the per-task conv weights. Only `basicOp.weight` is written for
            # merged blocks; anything else in those blocks is absent from mtl_init.
            weight_anchor = torch.mean(torch.stack(task_convs),dim=0)
            mtl_init[name+'.basicOp.weight'] = weight_anchor

        # Type 3: save heads' weights
        elif 'heads' in name and isinstance(module, ASPPHeadNode): 
            # Heads are looked up in the independent model by the last component of the module name.
            ind_head = model.heads[name.split('.')[-1]]
            for ind_name, param in ind_head.named_parameters():
                mtl_init['.'.join([name, ind_name])] = param
            for ind_name, param in ind_head.named_buffers():
                mtl_init['.'.join([name, ind_name])] = param
    # strict=False: entries missing from mtl_init are tolerated and keep their constructed values.
    mtl_model.load_state_dict(mtl_init,strict=False)
    print('Finish Weight Loading.', flush=True)

    # Step 2: Save short train loss list
    loss_lst = {task: [] for task in tasks}
    
    mtl_model = mtl_model.cuda()
    mtl_model.train()
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, mtl_model.parameters()), lr=0.0001)
    trainIter = iter(trainDataloader)
    for i in range(short_iter):
        # Restart the loader when an epoch is exhausted.
        try:
            data = next(trainIter)
        except StopIteration:
            trainIter = iter(trainDataloader)
            data = next(trainIter)
            
        x = data['input'].cuda()
        optimizer.zero_grad()
        output = mtl_model(x)

        # The optimised objective is the unweighted sum of the task losses; the per-task
        # values are recorded separately for the extrapolation below.
        loss = 0
        for task in tasks:
            y = data[task].cuda()
            if task + '_mask' in data:
                tloss = criterionDict[task](output[task], y, data[task + '_mask'].cuda())
            else:
                tloss = criterionDict[task](output[task], y)
            loss_lst[task].append(tloss.item())
            loss += tloss
        loss.backward()
        optimizer.step()
    
    # For each task
    est_iter ={}
    final_loss = {}
    for task in tasks:
        print('Task {}:'.format(task))
        
        # Step 3: Smooth and take loss samples
        sm_loss_lst = smooth(loss_lst[task], smooth_weight)
        loss_samples = sm_loss_lst[start:short_iter:step]
        print('loss samples: {}'.format(loss_samples))
        
        # Step 4: Compute convergence rate 
        alpha = compute_alpha(loss_samples)
        print('alpha: {}'.format(alpha))
        
        # Step 5: Estimate iters to reach target loss
        # n counts sampling intervals; convert it back to training iterations.
        n = est_recov_n(loss_samples, target[task], alpha)
        est_iter[task] = start + n*step
        print('est iter: {}'.format(est_iter[task]))
        
        # Step 6: Estimate final loss after 20000 iters
        # Number of sampling intervals between `start` and `total_iter`.
        n = (total_iter - start)//step
        final_loss[task] = est_final_loss(loss_samples, n, alpha)
        print('final loss: {}'.format(final_loss[task]))
        print('-'*50)
    layout_est_iter.append(est_iter)
    layout_est_loss.append(final_loss)
    print('='*50)

    # NOTE(review): this break stops after the first layout in layout_list, so only one entry is
    # appended to layout_est_iter / layout_est_loss -- looks like a debugging leftover; confirm.
    break

Fined Layout:
[[{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}], [{0, 1, 2}]]
Finish Weight Loading.
Task segment_semantic:
loss samples: [6.37286070721575, 4.903586675667633, 4.374546643914657]
alpha: 0.4356049592840212
est iter: nan
final loss: 4.005586908478982
--------------------------------------------------
Task normal:
loss samples: [0.07546639882031059, 0.07293267534603777, 0.0638933990429129]
alpha: 3.874606316806775
est iter: 51.242273300114135
final loss: 0.0
--------------------------------------------------
Task depth_zbuffer:
loss samples: [1.0968553205361786, 0.9696726088789933, 1.10702626406812]
alpha: -1.0748927959042853


  after removing the cwd from sys.path.


In [160]:
# Re-analysis only (no training): re-runs Steps 3-6 on the `loss_lst` left behind by the
# training cell, using a different sampling grid and smoothing weight.
# With short_iter=100, start=30, step=30 the slice below picks iterations 30, 60 and 90.
# These assignments overwrite the globals set by the training cell.
total_iter, short_iter, start, step = 20000, 100, 30, 30
smooth_weight = 0.6

est_iter ={}
final_loss = {}
for task in tasks:
    print('Task {}:'.format(task))

    # Step 3: Smooth and take loss samples
    sm_loss_lst = smooth(loss_lst[task], smooth_weight)
    loss_samples = sm_loss_lst[start:short_iter:step]
    print('loss samples: {}'.format(loss_samples))

    # Step 4: Compute convergence rate 
    alpha = compute_alpha(loss_samples)
    print('alpha: {}'.format(alpha))

    # Step 5: Estimate iters to reach target loss
    # n counts sampling intervals; convert it back to training iterations.
    n = est_recov_n(loss_samples, target[task], alpha)
    est_iter[task] = start + n*step
    print('est iter: {}'.format(est_iter[task]))

    # Step 6: Estimate final loss after 20000 iters
    n = (total_iter - start)//step
    final_loss[task] = est_final_loss(loss_samples, n, alpha)
    print('final loss: {}'.format(final_loss[task]))
    print('-'*50)
# NOTE(review): every run of this cell appends another entry for the same layout, so the
# result lists no longer hold one entry per layout.
layout_est_iter.append(est_iter)
layout_est_loss.append(final_loss)
print('='*50)

Task segment_semantic:
loss samples: [5.311834793688948, 4.940007649283774, 4.768060193262548]
alpha: 0.48817851583175814
est iter: nan
final loss: 4.6096353339477485
--------------------------------------------------
Task normal:
loss samples: [0.07025943309965477, 0.07313488226350928, 0.07168730783228883]
alpha: -0.49841175400357385
est iter: nan
final loss: 0.07216560125634357
--------------------------------------------------
Task depth_zbuffer:
loss samples: [1.0837086341480626, 0.9685742503090615, 0.9558088031202223]
alpha: 0.11812092076281999
est iter: 27.95579893488618
final loss: 0.9541117952291159
--------------------------------------------------


  after removing the cwd from sys.path.


In [155]:
# Scratch check on hand-written samples: compare the log-ratio estimate (compute_alpha, first
# three samples) with the difference-ratio estimate (compute_alpha2, all four samples).
# Overwrites the `loss_samples` and `alpha` globals left by the analysis cells.
loss_samples=[2.51,2.50,2.48,1]
alpha = compute_alpha(loss_samples)
alpha2 = compute_alpha2(loss_samples)

In [156]:
# Extrapolate the hand-written samples 10 sampling intervals ahead with the log-ratio alpha.
est_final_loss(loss_samples, 10, alpha)

0.0345584412168006

In [157]:
# Display the compute_alpha estimate for the hand-written samples.
alpha

2.0120563381154453

In [158]:
# Display the compute_alpha2 estimate for the hand-written samples.
alpha2

6.209453365628749

In [106]:
# NOTE(review): `task` is whatever value a previous `for task in tasks` loop left behind;
# the recorded result for this call is nan.
est_recov_n(loss_samples, target[task], alpha2)

  after removing the cwd from sys.path.


nan

In [100]:
def compute_alpha(loss_samples):
    """Estimate the convergence rate from three equally spaced loss samples.

    Returns ``log(l2 / l1) / log(l1 / l0)``, i.e. the ratio between two
    consecutive changes of the log-loss. Only the first three entries of
    `loss_samples` are used.
    """
    first, second, third = loss_samples[0], loss_samples[1], loss_samples[2]
    later_change = np.log(third / second)
    earlier_change = np.log(second / first)
    return later_change / earlier_change

def compute_alpha2(loss_samples):
    """Estimate the convergence rate from four equally spaced loss samples.

    Uses the absolute ratios of consecutive differences of the raw losses:
    ``log|d3 / d2| / log|d2 / d1|`` with ``d_k = l_k - l_(k-1)``.
    Only the first four entries of `loss_samples` are used.
    """
    delta_1 = loss_samples[1] - loss_samples[0]
    delta_2 = loss_samples[2] - loss_samples[1]
    delta_3 = loss_samples[3] - loss_samples[2]
    later_ratio = np.log(np.abs(delta_3 / delta_2))
    earlier_ratio = np.log(np.abs(delta_2 / delta_1))
    return later_ratio / earlier_ratio

In [46]:
def est_recov_n(loss_samples, target, alpha):
    """Estimate how many sampling intervals are needed to reach `target`.

    Assumes the change of the log-loss shrinks (or grows) by a factor `alpha`
    per interval, starting from the first two entries of `loss_samples`, and
    solves the resulting geometric sum for the interval count ``n``.

    Args:
        loss_samples: equally spaced loss samples; only the first two are used.
        target: loss value to reach.
        alpha: convergence rate, e.g. from compute_alpha().

    Returns:
        The (generally fractional) interval count. The result is nan when the
        argument of a logarithm is negative, as in the "est iter: nan" outputs
        recorded in this notebook.
    """
    first, second = loss_samples[0], loss_samples[1]
    numerator = np.log(target / first) * (alpha - 1)
    denominator = np.log(second / first)
    geometric_term = numerator / denominator + 1
    return np.log(geometric_term) / np.log(alpha)

In [89]:
def est_final_loss(loss_samples, n, alpha):
    """Extrapolate the loss `n` sampling intervals after the first sample.

    Works in log space: each step changes the log-loss by `alpha` times the
    previous change, starting from the first two entries of `loss_samples`.

    Args:
        loss_samples: equally spaced loss samples; only the first two are used.
        n: number of sampling intervals to extrapolate (non-negative int).
            ``n == 0`` yields the first sample and ``n == 1`` the second; the
            original code raised UnboundLocalError for both.
        alpha: convergence rate, e.g. from compute_alpha().

    Returns:
        The extrapolated loss value.

    Raises:
        ValueError: if `n` is negative.
    """
    if n < 0:
        raise ValueError('n must be non-negative, got {}'.format(n))
    x0, x1 = np.log(loss_samples[0]), np.log(loss_samples[1])
    if n == 0:
        return np.exp(x0)
    # After the loop x1 holds the log-loss at interval n (for n == 1 it is never entered).
    for _ in range(2, n + 1):
        x0, x1 = x1, alpha * (x1 - x0) + x1
    return np.exp(x1)

In [57]:
# sort
# NOTE(review): `rate` is not defined in any cell shown in this notebook; judging by its use it is
# a list of dicts (task -> rate), one per layout -- confirm where it comes from.
# The loop variable `alpha` is one of those dicts and stays bound to the last one afterwards.
rate_list = []
for alpha in rate:
    rate_list.append(np.mean([alpha[task] for task in alpha]))
# Layout indices ordered by mean rate across tasks, largest first.
rate_order = sorted(range(len(rate_list)), key=lambda k: rate_list[k],reverse=True)

In [77]:
# Display `alpha`; the recorded output is a per-task dict, not the scalar produced by compute_alpha.
alpha

{'segment_semantic': 0.9267602453218206,
 'normal': 0.3232763318751182,
 'depth_zbuffer': 0.2815663163084714}

In [75]:
# Display `alpha` again; the recorded output is a per-task dict with different values
# (presumably from another execution -- verify).
alpha

{'segment_semantic': 0.6359761891705582,
 'normal': 1.0929589100906791,
 'depth_zbuffer': 0.5271130297876526}

In [87]:
# NOTE(review): `train_error` is not defined in any cell shown in this notebook.
train_error

{'segment_semantic': [0.7516613602638245,
  0.7001743912696838,
  0.6683229207992554],
 'normal': [0.06778442859649658, 0.0672987699508667, 0.06692475080490112],
 'depth_zbuffer': [1.0590227842330933, 1.0507172346115112, 1.0439845323562622]}

In [48]:
# Display `target`; the recorded values differ from those shown by the earlier `target` cell
# (presumably from a different execution -- verify).
target

{'segment_semantic': 0.5405179977416993,
 'normal': 0.05407652854919434,
 'depth_zbuffer': 0.8496493339538574}