# Imports and Hyper-params

In [1]:
# Project root and output directory.
# Raw strings keep the Windows backslashes literal: sequences such as "\D" and
# "\o" are invalid escapes in a normal string literal and only work by accident.
path = r'D:\DLProject\DL_turnin'
output_path = r'D:\DLProject\DL_turnin\output'

%run tools/imports.py
%run -i tools/functions.py
%run -i tools/models.py
%run -i tools/phrasecut.py

# Make sure the output directory exists. exist_ok=True makes this a single,
# race-free call instead of a separate existence check followed by a create.
os.makedirs(output_path, exist_ok=True)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# All options live in one attribute-accessible dict so they can be stored in
# (and restored from) a checkpoint as a unit.
opt = dotdict({})

## General
opt.dataset = 'phrasecut'    # 'phrasecut'
opt.split = 'train'          # 'train' 'test'
opt.train_iter = 120000      # max iteration to train
opt.train_log_every = 25000  # log training info every N iterations
opt.checkpoint_every = 300   # save a checkpoint every N iterations
opt.load_checkpoint = None   # resume switch: truthy => continue from opt.checkpoint,
                             # falsy => start from the pre-trained DeepLab weights
# Path to the .pth file that is loaded when opt.load_checkpoint is set.
# Raw string: "\D", "\o" and "\c" are not valid escape sequences.
opt.checkpoint = r'D:\DLProject\DL_turnin\output\checkpoint_125000.pth'

## Train Hyperparams
opt.phrasecut_categories = ['c_coco'] # filter categories
opt.new_img_proc = True               # use improved image processing (scaling -> normalize)
opt.pos_weight = 2           # punish loss on positive labels
opt.weight_decay = 0.0005    # Weight decay parameter (0.0005 => 0.005 output became too negative)
opt.initial_lr = 0.00030     # initial learning rate (0.00025 => 0.00015)
opt.min_lr = 0.00015         # minimum learning rate after decay
opt.lr_decay = 0.9           # exponent of the polynomial learning-rate decay
opt.max_iter_lr = 500000     # iters to reach min_lr (should be around max_iter)
opt.batch_size = 1           # Batch size (only support 1)
opt.im_h = 320               # network input height
opt.im_w = 320               # network input width
opt.vf_h = 40
opt.vf_w = 40
opt.curr_iter = 0            # iteration the training loop starts from

## Deeplab Hyperparams
opt.base = 'v3'                  # backbone deeplab model
opt.n_blocks = [3, 4, 23, 3]     # Resnet-101 layers
opt.atrous_rates = [3, 6, 9, 12] # ASPP rates

## RRN Hyperparams
opt.embed_size = 1000        # embed vector size
opt.hidden_size = 1000       # hidden layer size
opt.num_layers = 1           # number of hidden layer
opt.minval = -0.08
opt.maxval = 0.08
opt.num_steps = 20           # sentence length
opt.rnn_size = 1000
opt.mlp_dim = 500
opt.vf_dim = 2048
opt.scale_factor = 8


# Training

In [None]:
# ---------------------------------------------------------------------------
# Training setup: metric history, model, optimizer, data loader, vocabulary.
# ---------------------------------------------------------------------------

# Metric history, one dict per metric, keyed by iteration. The initial
# {'iter': 'val'} entry ends up in the log file written with each checkpoint
# and labels what the remaining entries mean.
opt.train_loss = {'iter': 'val'}
opt.train_acc = {'iter': 'val'}
opt.train_acc_pos = {'iter': 'val'}
opt.train_acc_neg = {'iter': 'val'}
opt.train_iou = {'iter': 'val'}
opt.train_overall_iou = {'iter': 'val'}
opt.vocab_size = 8407

# Resolve the device first: it is needed for loading weights and for the loss.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize Model
model = Model(opt)

# Remember the user's choice now: `opt` is replaced by the checkpoint's copy
# below, and that copy carries whatever flag the *saved* run had.
resume_training = bool(opt.load_checkpoint)

# Load pre-trained model
if resume_training:
    # map_location lets a checkpoint written on a GPU machine load on CPU too.
    checkpoint = torch.load(opt.checkpoint, map_location=device)
    model.load_state_dict(checkpoint['model'])
    opt = dotdict(checkpoint['opt'])

    # Recompute the polynomially decayed learning rate for the resumed iteration.
    if opt.curr_iter >= opt.max_iter_lr:
        lr = opt.min_lr
    else:
        lr = (opt.initial_lr - opt.min_lr) * ((1 - opt.curr_iter / opt.max_iter_lr) ** (opt.lr_decay)) + opt.min_lr
else:
    # Fresh run: start from the DeepLab weights. strict=False because the file
    # is not expected to cover every parameter of this model.
    state_dict = torch.load(path + '/tools/deeplabv2_resnet101_msc-vocaug-20000.pth', map_location=device)
    model.load_state_dict(state_dict, strict=False)
    lr = opt.initial_lr

# Move the model before the optimizer is built and its state restored, so the
# optimizer state ends up on the same device as the parameters. Comparing a
# torch.device with the string 'cuda' is not a reliable test; .to(device) is
# simply a no-op when everything is already in place.
model.to(device)

# Disable deeplab training except aspp
for param in model.base.parameters():
    param.requires_grad = False

if opt.base == 'v3':
    for param in model.base.aspp_v3.parameters():
        param.requires_grad = True

# List of params to train
parameters = []
parameters.extend(model.layer5_feat.parameters())
parameters.extend(model.LSTM.parameters())
parameters.extend(model.RRN.parameters())
if opt.base == 'v3':
    parameters.extend(model.base.aspp_v3.parameters())

# Criterion and optimizer. pos_weight scales the loss on positive pixels.
pos_weight = torch.full([1], opt.pos_weight, dtype=torch.float32).to(device)
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
criterion.to(device)
optimizer = torch.optim.Adam(parameters, lr=lr, weight_decay=opt.weight_decay)

# Restore the optimizer state whenever we are resuming.
if resume_training:
    optimizer.load_state_dict(checkpoint['optimizer'])

# Data Loader
refvg_loader = RefVGLoader(split=opt.split)
# This first record is only consulted for its task count: task_i starts at -2
# so the first increment in the training loop yields -1, which forces a fresh
# get_img_ref_data() call there.
img_ref_data = refvg_loader.get_img_ref_data()
task_i = -2
print('Loaded phrasecut: %s images, %s tasks' % (len(refvg_loader.img_ids), refvg_loader.task_num))

# Init Vocab
with open(str(dataset_dir) + '/name_att_rel_count.json', 'r') as file:
    data = json.load(file)
corpus = Corpus()
corpus.split_and_add_words_to_vocab_from_data(data)

# Reusable input buffer for a batch of one image, channels last.
image_batch = np.zeros((1, opt.im_h, opt.im_w, 3), dtype=np.float32)

# Image preprocessing: either per-channel mean/std normalisation of [0, 1]
# values, or plain per-channel mean subtraction.
if opt.new_img_proc:
    preprocess = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
else:
    mu = np.array((104.00698793, 116.66876762, 122.67891434))

# Acc and loss initialize
running_acc, running_acc_pos, running_acc_neg, running_loss, running_iou = 0, 0, 0, 0, 0
I, U = 0, 0             # accumulated intersection / union for the overall IoU
n_batch, bw = 0, 0      # slot in image_batch / number of skipped grayscale samples
model.train()

# ---------------------------------------------------------------------------
# Training loop: one (image, phrase) task per iteration, batch size 1.
# ---------------------------------------------------------------------------

# Number of samples accumulated in the running metrics since the last log
# line. Averaging over this count stays correct for the first window (which
# also contains iteration 0), after a resume, and when samples are skipped.
n_logged = 0

# NOTE: `iter` shadows the builtin; the name is kept because it is a
# notebook-level variable that other cells may read.
for iter in tq.tqdm(range(opt.curr_iter, opt.train_iter)):

    ############## Load Data ##############

    # Advance to the next task that belongs to one of the wanted categories.
    match = False
    while not match:
        task_i += 1
        if (task_i >= len(img_ref_data['task_ids'])) or (task_i == -1):
            # All tasks of the current image are used up (or first pass): load the next image.
            img_ref_data = refvg_loader.get_img_ref_data()
            img_id = img_ref_data['image_id']
            img_p = str(img_fpath) + '/' + str(img_id) + '.jpg'
            # Close the file handle as soon as the resized copy exists.
            # PIL's resize takes (width, height), not (height, width).
            with Image.open(img_p) as img_file:
                img = img_file.resize((opt.im_w, opt.im_h))
            image = np.array(img).astype(np.float32)   # (im_h, im_w, C), or (im_h, im_w) for grayscale
            task_i = 0

        # get task and categories
        task_ids = img_ref_data['task_ids']
        task = task_ids[task_i]
        subsets_of_img = refvg_loader.get_task_subset(img_id, task)

        # Filter by category
        match = any(category in subsets_of_img for category in opt.phrasecut_categories)

    # Skip black/white images before doing any further work for this sample.
    # NOTE(review): `continue` also bypasses the logging and checkpoint code
    # below for this iteration, as it always did.
    if len(image.shape) == 2:
        bw += 1
        continue

    # extract phrase
    sentence = img_ref_data['phrases'][task_i]
    text_pass = corpus.tokenize_sentence(sentence).type(torch.LongTensor)

    # Ground truth mask, rasterised at the original image resolution.
    original_h = img_ref_data['height']
    original_w = img_ref_data['width']
    mask_up = np.zeros((original_h, original_w), dtype=np.float32)
    gt_Polygons = img_ref_data['gt_Polygons'][task_i]   # one polygon list per object
    for plg in gt_Polygons:
        mask_up += polygons_to_mask(plg, w=original_w, h=original_h)
    # Overlapping objects would sum to values above 1, which is not a valid
    # target for the binary cross-entropy loss.
    np.clip(mask_up, 0, 1, out=mask_up)
    mask_up = torch.from_numpy(mask_up).unsqueeze(0).unsqueeze(0)   # (1, 1, original_h, original_w)
    # Resize to the network resolution (bilinear, so edges become soft values).
    mask_pass = nn.functional.interpolate(
        mask_up, size=(opt.im_h, opt.im_w), mode='bilinear', align_corners=False).to(device)

    # processing image before pass
    image_flip = image[:, :, 0:3]          # drop an alpha channel if present
    image_flip = image_flip[:, :, ::-1]    # reverse the channel order
    # NOTE(review): with new_img_proc the Normalize mean/std below are applied
    # to this reversed channel order - confirm that is intended.
    if not opt.new_img_proc:
        # Out-of-place on purpose: image_flip is a view into `image`, which is
        # reused for every task of the same picture. An in-place subtraction
        # would remove the mean again on each subsequent task.
        image_flip = image_flip - mu

    # add batch_size dimension
    image_batch[n_batch, ...] = image_flip

    # turn into tensor, channels first
    image_pass = torch.from_numpy(image_batch).permute(0, 3, 1, 2).to(device)   # (1, 3, im_h, im_w)

    # normalize [0,1] => normalize mean, std
    if opt.new_img_proc:
        image_pass = preprocess(image_pass.view(3, opt.im_h, opt.im_w) / 255).view(1, 3, opt.im_h, opt.im_w)


    ############## Training Step ##############

    optimizer.zero_grad()

    # forward pass; output_up holds pre-activation scores (>= 0 means foreground)
    output_down, output_up = model(image_pass, text_pass)

    # loss and backpass (mean loss scaled back up by the number of pixels)
    loss = criterion(output_up, mask_pass) * opt.im_h * opt.im_w
    loss.backward()
    optimizer.step()

    # Polynomial learning-rate decay driven by the live iteration counter.
    # opt.curr_iter is only refreshed when a checkpoint is written, so it must
    # not be used here.
    if iter >= opt.max_iter_lr:
        lr = opt.min_lr
    else:
        lr = (opt.initial_lr - opt.min_lr) * ((1 - iter / opt.max_iter_lr) ** (opt.lr_decay)) + opt.min_lr

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


    ############## Log Training Info ##############

    # Accuracy and IoU
    acc, acc_pos, acc_neg = compute_accuracy(output_up.detach().cpu(), mask_pass.detach().cpu())
    iou, intersect, union = compute_iou(output_up.detach().cpu(), mask_pass.detach().cpu())
    running_acc += acc
    running_acc_pos += acc_pos
    running_acc_neg += acc_neg
    running_loss += loss.item()
    running_iou += iou
    I += intersect
    U += union
    n_logged += 1

    # log training info
    if iter % opt.train_log_every == 0 and iter != 0:
        avg_overall_iou = I / U if U else 0.0
        avg_iou = running_iou / n_logged
        avg_acc = running_acc / n_logged
        avg_acc_pos = running_acc_pos / n_logged
        avg_acc_neg = running_acc_neg / n_logged
        avg_loss = running_loss / n_logged
        opt.train_overall_iou[iter] = avg_overall_iou
        opt.train_iou[iter] = avg_iou
        opt.train_loss[iter] = avg_loss
        opt.train_acc[iter] = avg_acc
        opt.train_acc_pos[iter] = avg_acc_pos
        opt.train_acc_neg[iter] = avg_acc_neg
        print('\niter[%s]: train_loss=%.2f, lr=%.5f' % (iter, avg_loss, optimizer.param_groups[0]['lr']))
        print('mIoU=%.2f, overall_iou=%.2f, acc_pos=%.2f, acc_neg=%.2f' % (avg_iou, avg_overall_iou, avg_acc_pos, avg_acc_neg))
        # start a new averaging window
        running_acc, running_acc_pos, running_acc_neg, running_loss, running_iou = 0, 0, 0, 0, 0
        I, U = 0, 0
        bw = 0
        n_logged = 0


    # Periodic checkpoint, plus one on the final iteration.
    if iter != 0 and (iter % opt.checkpoint_every == 0 or iter == (opt.train_iter - 1)):

        # Save checkpoint at 'output_path/checkpoint_<iter>.pth'
        checkpoint_file = os.path.join(output_path, 'checkpoint_' + str(iter) + '.pth')
        opt.curr_iter = iter   # a resumed run starts its loop from this iteration
        checkpoint = {}
        checkpoint['opt'] = opt.copy()
        checkpoint['model'] = model.state_dict()
        checkpoint['optimizer'] = optimizer.state_dict()
        torch.save(checkpoint, checkpoint_file)

        # Save log at 'output_path/log_<iter>.log': one "key=value" entry per option
        log_file = os.path.join(output_path, 'log_' + str(iter) + '.log')
        with open(log_file, 'w') as file:
            for k, v in opt.items():
                file.write(str(k) + '=' + str(v) + '\n\n')

        print('iter[%s]: Checkpoint at %s' % (iter, checkpoint_file))

print('Done training')


# Test the output (for debug)


In [None]:
# Show the most recent training sample twice: once with its ground-truth mask
# and once with the thresholded prediction (score >= 0 counts as foreground).
for overlay, caption in (
    (mask_pass.detach().cpu(), ' - ground truth'),
    ((output_up>=0).detach().cpu(), ' - prediction'),
):
    visualize(torch.from_numpy(image), overlay, sentence + caption, show=True, save=None)