In [None]:
import os, time

In [None]:
# Label for this experiment; reused below as CONF['name'].
TRIAL_NAME = "trial"

In [None]:
# Central experiment configuration; every later cell reads its settings from here.
CONF = {
    # --- schedule ---
    'niter': 5,              # outer training iterations per trial
    'ntest': 50,             # passes used to fit the linear probe
    # --- hardware / batching ---
    'GPU': 0,                # exported as CUDA_VISIBLE_DEVICES
    'BS': 4,
    'test_BS': 256,
    'N_neg': 3,
    # --- bookkeeping ---
    'name': TRIAL_NAME,
    # --- embedding sizes ---
    'nz': 2048,
    'ng': 128,
    'seed': 10715,
    # --- data locations ---
    'data_dir': '/DataSet/COCO',  # Set it to where your COCO dataset is
    'dataType': 'train2017',
    'valType': 'val2017',
    'testType': 'test2017',
    # --- text encoder ---
    'max_len': 512,
    'hidden_size': 768,
    'bert_hdim': 3072,
    # --- objective / evaluation ---
    'LAMBDAs': 0.005,
    'use_super': False,
    'test_classes': 80,
    'n_trials': 5,
    # --- pretrained weights ---
    'bert_pretrained': True,    # use the pretrained "bert-base-uncased" model name
    'resnet_pretrained': True,  # passed to torchvision's resnet50(pretrained=...)
}

In [None]:
# Export the configured GPU index before the deep-learning libraries are imported.
os.environ["CUDA_VISIBLE_DEVICES"] = str(CONF['GPU'])

In [None]:
import texar.torch as tx
import random
import torch
import torch.nn as nn
import torch.nn.parallel
from torch.nn import functional as F
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision
from torch.utils.tensorboard import SummaryWriter
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
import numpy as np
from torch import autograd
import multiprocessing
from PIL import Image
from sklearn import metrics

In [None]:
# Pick the first visible GPU when CUDA is usable, otherwise fall back to the CPU.
# (The previous condition was the constant ``True``, so the CPU branch was dead.)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
import shutil

# Start from an empty TensorBoard directory for the run-level ("GLOBAL") summary.
# NOTE(review): the sleeps presumably give the filesystem / TensorBoard time to
# notice the removal -- confirm they are still needed.
time.sleep(2)
tb_dir = os.path.join('./runs', CONF['name'] + "_GLOBAL")
shutil.rmtree(tb_dir, ignore_errors=True)  # a missing directory is not an error
time.sleep(5)
global_writer = SummaryWriter(log_dir=tb_dir)

In [None]:
# Seed Python, PyTorch and NumPy generators from the single configured seed.
random.seed(CONF['seed'])
torch.manual_seed(CONF['seed'])
np.random.seed(CONF['seed'])

# Let cuDNN auto-tune its kernels.
cudnn.benchmark = True

In [None]:
# Number of positive (image, caption) groups per training step.
batch_size = CONF['BS']

In [None]:
# Augmenting pipeline used for the captioning (training) dataset.
T = transforms.Compose([
    transforms.RandomResizedCrop(
        (224, 224),
        scale=(0.3, 1.0),
        ratio=(3 / 4, 4 / 3),
    ),
    transforms.ColorJitter(
        brightness=0.1,
        contrast=0.05,
        saturation=0.05,
        hue=0.05,
    ),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Deterministic pipeline used for the classification / validation datasets.
T_test = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

In [None]:
from torchvision.datasets.vision import VisionDataset
class CocoClassification(VisionDataset):
    """Multi-label classification view of an MS COCO ``instances`` annotation file.

    Each sample is ``(image, targets)`` where ``targets`` is a float multi-hot
    vector with one entry per element of ``self.classes`` (1 when the image
    contains at least one annotation of that class, else 0).

    When ``CONF['use_super']`` is true the classes are the 12 COCO
    supercategory names; otherwise they are category ids chosen by
    :meth:`sample_class`.

    Args:
        root (string): Root directory where images are downloaded to.
        annFile (string): Path to json annotation file.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g, ``transforms.ToTensor``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        transforms (callable, optional): A function/transform that takes input sample and its target as entry
            and returns a transformed version.
    """

    def __init__(self, root, annFile, transform=None, target_transform=None, transforms=None):
        super(CocoClassification, self).__init__(root, transforms, transform, target_transform)
        # Imported lazily so the module can be loaded without pycocotools installed.
        from pycocotools.coco import COCO
        self.coco = COCO(annFile)
        # Start with every category selected.
        self.sample_class(len(self.coco.getCatIds()))

    def _image_ids_for(self, cat_ids):
        """Return the sorted, de-duplicated ids of images containing any of ``cat_ids``."""
        found = set()
        for cat_id in cat_ids:
            found.update(self.coco.getImgIds(catIds=[cat_id]))
        return sorted(found)

    def sample_class(self, k):
        """Select the label set and rebuild ``self.ids`` accordingly.

        Args:
            k (int): number of category ids to draw without replacement
                (ignored when ``CONF['use_super']`` is true).

        Sets ``self.classes``, ``self.class_description`` and ``self.ids``.
        """
        if CONF['use_super']:
            self.classes = np.array(["vehicle", "outdoor", "indoor", "person", "appliance", "furniture", "sports", "food", "kitchen", "accessory", "electronic", "animal"])
            self.class_description = ["vehicle", "outdoor", "indoor", "person", "appliance", "furniture", "sports", "food", "kitchen", "accessory", "electronic", "animal"]
            # Previously this branch returned without defining ``self.ids``,
            # which made ``__len__``/``__getitem__`` fail. Use every image that
            # has at least one annotated category, matching what the
            # non-super branch yields when all categories are selected.
            self.ids = self._image_ids_for(self.coco.getCatIds())
            return
        class_list = self.coco.getCatIds()
        self.classes = np.sort(np.random.choice(class_list, size=k, replace=False))
        self.class_description = self.coco.loadCats(self.classes)
        self.ids = self._image_ids_for(self.classes)

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: Tuple (image, targets). ``targets`` is a ``torch.FloatTensor``
            multi-hot vector aligned with ``self.classes``.
        """
        coco = self.coco
        img_id = self.ids[index]
        ann_ids = coco.getAnnIds(imgIds=img_id)
        cat_ids = [ann['category_id'] for ann in coco.loadAnns(ann_ids)]
        categories = coco.loadCats(cat_ids)

        # Labels present in this image, keyed the same way as ``self.classes``.
        label_key = 'supercategory' if CONF['use_super'] else 'id'
        present = {cat[label_key] for cat in categories}
        # ``tolist()`` yields native Python scalars so set membership is exact.
        targets = torch.FloatTensor([1 if (c in present) else 0 for c in self.classes.tolist()])

        path = coco.loadImgs(img_id)[0]['file_name']
        # ``convert`` returns a fully loaded copy, so the file can be closed right away.
        with Image.open(os.path.join(self.root, path)) as raw:
            img = raw.convert('RGB')
        if self.transforms is not None:
            img, targets = self.transforms(img, targets)

        return img, targets

    def __len__(self):
        return len(self.ids)


In [None]:
# Image/caption pairs used to train the encoder (augmented with T).
dataset = dset.CocoCaptions(
    root='{}/{}'.format(CONF['data_dir'], CONF['dataType']),
    annFile='{}/annotations/captions_{}.json'.format(CONF['data_dir'], CONF['dataType']),
    transform=T,
)

# Multi-label classification view of the training split, used to fit the linear probe.
clas_set = CocoClassification(
    root='{}/{}'.format(CONF['data_dir'], CONF['dataType']),
    annFile='{}/annotations/instances_{}.json'.format(CONF['data_dir'], CONF['dataType']),
    transform=T_test,
)

# Same view over the validation split, used to score the probe.
val_set = CocoClassification(
    root='{}/{}'.format(CONF['data_dir'], CONF['valType']),
    annFile='{}/annotations/instances_{}.json'.format(CONF['data_dir'], CONF['valType']),
    transform=T_test,
)

In [None]:
# Each step consumes batch_size groups of (1 + N_neg) images, hence the multiplied
# loader batch; drop_last keeps every batch an exact multiple of the group size.
train_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=batch_size * (CONF['N_neg'] + 1),
    shuffle=True,
    num_workers=2,
    pin_memory=True,
    drop_last=True,
)

In [None]:
# Fixed-order loader over the validation classification set.
val_loader = torch.utils.data.DataLoader(
    val_set,
    batch_size=CONF['test_BS'],
    shuffle=False,
    num_workers=8,
    pin_memory=False,
)

In [None]:
# Hyper-parameters handed to the texar BERT tokenizer.
hparams = {
    # vocabulary source
    "pretrained_model_name": "bert-base-uncased",
    "vocab_file": None,
    "max_len": CONF['max_len'],
    # special tokens
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
    # text normalisation switches
    "tokenize_chinese_chars": True,
    "do_lower_case": True,
    "do_basic_tokenize": True,
    "non_split_tokens": None,
    "name": "bert_tokenizer",
}

In [None]:
# Tokenizer used to turn caption strings into BERT input ids / segment ids.
tokenizer = tx.data.BERTTokenizer(
    hparams=hparams,
    pretrained_model_name='bert-base-uncased',
)

In [None]:
# Hyper-parameters for the texar BERT encoder that embeds captions.
# 'pretrained_model_name' is only set when CONF['bert_pretrained'] is true.
bert_hparams = {
    'embed': {'dim': CONF['hidden_size'], 'name': 'word_embeddings'},
    'vocab_size': 30522,
    'segment_embed': {'dim': CONF['hidden_size'], 'name': 'token_type_embeddings'},
    'type_vocab_size': 2,
    'position_embed': {'dim': CONF['hidden_size'], 'name': 'position_embeddings'},
    'position_size': CONF['max_len'],
    'encoder': {
        'dim': CONF['hidden_size'],
        'embedding_dropout': 0.1,
        'multihead_attention': {
            'dropout_rate': 0.1,
            'name': 'self',
            'num_heads': 6,
            'num_units': CONF['hidden_size'],
            'output_dim': CONF['hidden_size'],
            'use_bias': True,
        },
        'name': 'encoder',
        'num_blocks': 4,
        'poswise_feedforward': {
            'layers': [
                {
                    'kwargs': {
                        'in_features': CONF['hidden_size'],
                        'out_features': CONF['bert_hdim'],
                        'bias': True,
                    },
                    'type': 'Linear',
                },
                {'type': 'BertGELU'},
                {
                    'kwargs': {
                        'in_features': CONF['bert_hdim'],
                        'out_features': CONF['hidden_size'],
                        'bias': True,
                    },
                    'type': 'Linear',
                },
            ],
        },
        'residual_dropout': 0.1,
        'use_bert_config': True,
    },
    'hidden_size': CONF['hidden_size'],
    'initializer': None,
    'name': 'bert_encoder',
    'pretrained_model_name': "bert-base-uncased" if CONF['bert_pretrained'] else None,
}

In [None]:
class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one."""

    def forward(self, x):
        # Keep the leading (batch) dimension, merge the rest.
        leading = x.shape[0]
        return x.view(leading, -1)

In [None]:
class PC_Embedder(nn.Module):
    """Joint image / caption embedder trained with a contrastive objective.

    Sub-modules:
        bert: texar BERT encoder for captions.
        q:    linear map from the BERT pooled output to ``nz`` dimensions.
        p:    ResNet-50 with its last child module removed, followed by ``Flatten``.
        G:    two-layer MLP mapping the 2048-d output of ``p`` to ``nz`` dimensions.

    Args:
        k: number of negative captions per image; groups have ``k + 1`` members.
        nz: dimensionality of the shared embedding space.
        hparams: hyper-parameters forwarded to ``tx.modules.BERTEncoder``.
    """

    def __init__(self, k, nz, hparams):
        super().__init__()
        # Group size: one positive plus k negatives.
        self.k = k + 1

        # Text branch.
        self.bert = tx.modules.BERTEncoder(hparams=hparams)
        self.q = nn.Linear(CONF['hidden_size'], nz)

        # Image branch: drop the final child of ResNet-50 and flatten what remains.
        backbone = torchvision.models.resnet50(pretrained=CONF['resnet_pretrained'])
        trunk = list(backbone.children())[:-1]
        trunk.append(Flatten())
        self.p = nn.Sequential(*trunk)

        # Projection head applied on top of the image features.
        self.G = nn.Sequential(
            nn.Linear(2048, 512, bias=True),
            nn.ReLU(),
            nn.Linear(512, nz, bias=True),
        )

    def forward(self, v):
        """Return the flattened backbone features of the image batch ``v``."""
        return self.p(v)

    def get_loss(self, v, v1, LAMBDA):
        """Compute the contrastive loss for a batch of image/caption groups.

        Args:
            v: image batch whose first dimension is a multiple of ``self.k``;
                only the first image of every group of ``self.k`` is used.
            v1: tuple ``(inputs, segment_ids)`` fed to the BERT encoder.
            LAMBDA: weight of the squared-distance term.

        Returns:
            tuple ``(loss, l1, l2)``: the mean of ``-l1 + LAMBDA * l2`` over
            groups, the per-group log-softmax score of caption 0, and the
            per-group squared distance between the image embedding and the
            embedding of caption 0.
        """
        token_ids, segment_ids = v1
        _, pooled_text = self.bert(inputs=token_ids, segment_ids=segment_ids)

        group = self.k
        n_groups = v.shape[0] // group

        # Embed every image, then keep the first one of each group.
        img_emb = self.G(self.p(v))
        emb_dim = img_emb.shape[1]
        z = img_emb.squeeze().view(n_groups, group, emb_dim)[:, 0, :]

        # Repeat each kept image embedding across its group of captions.
        z_l = z.unsqueeze(1).expand(z.shape[0], group, z.shape[1]).contiguous()
        z_p = self.q(pooled_text).view(z_l.shape).contiguous()

        # Dot-product scores, normalised over the captions within a group.
        scores = torch.sum(z_l * z_p, dim=-1)
        l1 = F.log_softmax(scores, dim=1)[:, 0]

        # Squared distance between the image and caption 0 of its group.
        diff = z - z_p[:, 0, :]
        l2 = torch.sum(diff ** 2, dim=-1)

        total = torch.mean(-l1 + LAMBDA * l2, dim=0)
        return total, l1, l2

In [None]:
class Tracker(object):
    """Remember the best value observed so far for each named metric.

    ``var_dict`` maps a metric name to ``(best_value, rank)``; ``rank`` is 1
    when smaller values are better and 0 when larger values are better.
    """

    def __init__(self, VARS, ranks):
        self.var_dict = {}
        for i in range(len(VARS)):
            # Start from a sentinel that any realistic value will beat.
            start = 1000000.0 if ranks[i] else -1000000.0
            self.var_dict[VARS[i]] = (start, int(ranks[i]))

    def update(self, d):
        """Fold the metric values in ``d`` into the running bests."""
        for name, value in d.items():
            best, rank = self.var_dict[name]
            keep = np.minimum if rank else np.maximum
            self.var_dict[name] = (keep(value, best), rank)

    def return_dict(self):
        """Return ``{metric name: best value}`` without the rank flags."""
        return {name: best for name, (best, _) in self.var_dict.items()}

In [None]:
# Linear probe trained on frozen encoder features to score representation quality.
test_class = nn.Linear(
    in_features=CONF['nz'],
    out_features=CONF['test_classes'],
).cuda()

In [None]:
# Multi-label objective for the linear probe: sigmoid + binary cross-entropy on raw logits.
criterion = nn.BCEWithLogitsLoss(reduction='mean')

In [None]:
from tqdm import notebook

# Short aliases for the notebook-flavoured progress bars used by trial().
tnrange, tqdm_notebook = notebook.tnrange, notebook.tqdm

In [None]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
    https://stackoverflow.com/q/32239577/395857

    For every sample the score is |true AND pred| / |true OR pred| over the
    sets of active label indices, taken as 1 when both sets are empty. The
    mean over samples is returned. ``normalize`` and ``sample_weight`` are
    accepted but not used.
    '''
    def _sample_score(row_idx):
        # Indices of the labels that are switched on in each vector.
        truth = set(np.nonzero(y_true[row_idx])[0])
        guess = set(np.nonzero(y_pred[row_idx])[0])
        if not truth and not guess:
            return 1
        return len(truth & guess) / float(len(truth | guess))

    per_sample = [_sample_score(i) for i in range(y_true.shape[0])]
    return np.mean(per_sample)

In [None]:
# Names of the metrics produced for every evaluation round.
METRICS = ['train_avg', 'val_avg', 'subset_acc', 'hamming_loss', 'hamming_score',
           'micro_f1', 'macro_f1', 'micro_roc_auc', 'macro_roc_auc',
           'micro_precision', 'macro_precision', 'micro_recall', 'macro_recall']

# Metrics for which a smaller value is better. Only the Hamming loss qualifies;
# the recall metrics were previously flagged here by position, which made the
# tracker keep the *worst* recall while keeping the best precision and F1.
_LOWER_IS_BETTER = {'hamming_loss'}

# Rank flag per metric, aligned with METRICS: 1 = track the minimum, 0 = track the maximum.
RANKS = [int(name in _LOWER_IS_BETTER) for name in METRICS]

assert(len(METRICS)==len(RANKS))

In [None]:
def trial(enc, opt, LAMBDA, trial_number):
    """Run one training trial, linear-probing the encoder before every epoch.

    For each of ``CONF['niter']`` iterations this function:

    1. re-initialises ``test_class``, resamples ``CONF['test_classes']``
       classes on ``clas_set`` and caches ``enc`` features for that set;
    2. fits ``test_class`` on the cached features for ``CONF['ntest']`` passes;
    3. logs train / validation multi-label metrics to TensorBoard and feeds
       them to a ``Tracker``;
    4. trains ``enc`` for one pass over ``train_loader`` using ``enc.get_loss``;
    5. saves ``enc.state_dict()`` under ``./models``.

    Args:
        enc: model providing ``__call__`` (image features) and ``get_loss``.
        opt: optimizer used to step ``enc``.
        LAMBDA: weight forwarded to ``enc.get_loss``.
        trial_number: index used in the TensorBoard directory and checkpoint names.

    Returns:
        dict: best value seen for every entry of ``METRICS`` (see ``Tracker``).
    """
    c=0
    # Clear any logs from previous runs
    import shutil
    tb_dir=os.path.join('./runs', CONF['name'] + "_LAMBDA_{}_{}".format(LAMBDA, trial_number))
    shutil.rmtree(tb_dir, ignore_errors=True)
    time.sleep(5)
    # torch.save does not create missing directories, so make sure the target exists.
    os.makedirs('./models', exist_ok=True)
    tracker = Tracker(METRICS, RANKS)
    writer = SummaryWriter(log_dir=tb_dir)
    for it in tnrange(CONF['niter'], desc="training with LAMBDA:{}".format(LAMBDA)):

        # ---- 1. cache encoder features for a fresh linear probe ----
        test_class.reset_parameters()
        clas_set.sample_class(CONF['test_classes'])
        clas_loader = torch.utils.data.DataLoader(clas_set, batch_size=CONF['test_BS'],shuffle=True, num_workers=8, pin_memory=False)
        with torch.no_grad():
            enc.eval()
            test = []
            for img,lbl in tqdm_notebook(clas_loader,leave=False,desc="generating train set"):
                test.append((enc(img.cuda()).data.cpu(), lbl))
            enc.train()

        # ---- 2. fit the linear probe on the cached features ----
        opt_test = optim.Adam(test_class.parameters(), lr=1e-2)
        for test_it in tnrange(CONF['ntest'],leave=False,desc="training linear classifier"):
            random.shuffle(test)
            for z,lbl in test:
                pred = test_class(z.cuda())
                loss = criterion(pred,lbl.cuda())
                opt_test.zero_grad()
                loss.backward()
                opt_test.step()

        # ---- 3. score the probe on the cached train features and on validation ----
        with torch.no_grad():
            M = dict()
            count = 0
            corrects = torch.zeros(CONF['test_classes'])
            for z,lbl in test:
                count += z.shape[0]
                pred = torch.sigmoid(test_class(z.cuda()))>.5
                corrects += torch.sum(torch.eq(pred.cpu(), lbl), dim=0)
            acc = (corrects/float(count))
            writer.add_scalar("acc/train_avg", torch.mean(acc).item(), global_step=it)
            writer.add_histogram('acc_train', acc.data.cpu().numpy(), global_step=it)
            M['train_avg'] = torch.mean(acc).item()

            count = 0
            corrects = torch.zeros(CONF['test_classes'])
            LBL, Y = [], []
            # Validation features must be produced in eval mode, exactly like the
            # cached training features above; in train mode BatchNorm would use
            # batch statistics and update its running statistics on validation data.
            enc.eval()
            for img,lbl in val_loader:
                LBL.append(lbl.numpy())
                count += img.shape[0]
                z = enc(img.cuda())
                pred = torch.sigmoid(test_class(z))>.5
                Y.append(pred.data.cpu().numpy())
                corrects += torch.sum(torch.eq(pred.cpu(), lbl), dim=0)
            enc.train()
            acc = (corrects/float(count))
            writer.add_scalar("acc/val_avg", torch.mean(acc).item(), global_step=it)
            writer.add_histogram('acc_val', acc.data.cpu().numpy(), global_step=it)

            M['val_avg'] = torch.mean(acc).item()
            Y, LBL = np.concatenate(Y,axis=0), np.concatenate(LBL,axis=0)
            M['subset_acc'] = metrics.accuracy_score(LBL, Y, normalize=True)
            M['hamming_loss'] = metrics.hamming_loss(LBL, Y)
            M['hamming_score'] = hamming_score(LBL, Y)
            M['micro_f1'] = metrics.f1_score(LBL, Y, average='micro')
            M['macro_f1'] = metrics.f1_score(LBL, Y, average='macro')
            M['micro_roc_auc'] = metrics.roc_auc_score(LBL, Y, average='micro')
            M['macro_roc_auc'] = metrics.roc_auc_score(LBL, Y, average='macro')
            M['micro_precision'] = metrics.precision_score(LBL,Y,average='micro')
            M['macro_precision'] = metrics.precision_score(LBL,Y,average='macro')
            M['micro_recall'] = metrics.recall_score(LBL,Y,average='micro')
            M['macro_recall'] = metrics.recall_score(LBL,Y,average='macro')

            for k,v in M.items():
                writer.add_scalar("metrics/{}".format(k), v, global_step=it)
            tracker.update(M)
            writer.flush()

        # ---- 4. one pass of contrastive training over the caption dataset ----
        for img,ann in tqdm_notebook(train_loader, leave=False):
            # Pick one caption per image uniformly at random.
            # Assumes the collated ``ann`` is indexed as (caption slot, image) -- TODO confirm.
            # ``randint``'s upper bound is exclusive, so it must be ``len(ann)``
            # (``len(ann) - 1`` never selected the last caption and failed for a
            # single caption). Indexing row 0 instead of ``squeeze()`` keeps a
            # 1-D result even for a batch of one image.
            caption_slots = np.random.randint(0, len(ann), size=(1,img.shape[0]))
            ann = np.take_along_axis(np.array(ann), caption_slots, 0)[0]
            inputs, segment_ids = [],[]
            for s in ann:
                x,y,_ = tokenizer.encode_text(s)
                inputs.append(x)
                segment_ids.append(y)

            inputs = torch.LongTensor(inputs).cuda()
            segment_ids = torch.LongTensor(segment_ids).cuda()
            img = img.cuda()
            loss, l1, l2 = enc.get_loss(img,(inputs,segment_ids), LAMBDA)
            opt.zero_grad()
            loss.backward()
            opt.step()
            writer.add_scalar("loss/loss", loss.item(), global_step=c)
            writer.add_scalars("loss/parts", {'l1':- l1.data.mean().item(), 'l2':l2.data.mean().item()}, global_step=c)
            c+=1

        # ---- 5. checkpoint the encoder after every iteration ----
        torch.save(enc.state_dict(), "./models/{}.pth".format(CONF['name'] + "_trial_{}_LAMBDA_{}_it_{}".format(trial_number,LAMBDA,it)))
    writer.close()
    return tracker.return_dict()

In [None]:
# Run CONF['n_trials'] independent trials, each with a freshly built model and
# optimizer, and record the best metrics of every trial as one hparams entry.
# (The trial index ``t`` was previously never defined, so this cell raised a
# NameError on a fresh kernel.)
for t in range(CONF['n_trials']):
    enc = PC_Embedder(CONF['N_neg'], CONF['nz'], bert_hparams)
    enc.to(device)
    opt = optim.Adam(enc.parameters(), lr=1e-4)
    best_metrics = trial(enc, opt, CONF['LAMBDAs'], t)
    global_writer.add_hparams(hparam_dict={'lambda': CONF['LAMBDAs'], 'trial':t}, metric_dict=best_metrics)
global_writer.close()