In [1]:
import torch
import torch.nn as nn
import yaml
from layers import SimpleClassifier, WordEmbedding, VisualFeatEncoder, QuestionEmbedding
import torch.nn.functional as F
from dfaf import SingleBlock, Classifier
from torch.nn.utils.weight_norm import weight_norm
from basemodel import build_model

# Load the experiment configuration. The context manager guarantees the file
# handle is closed even if parsing fails (the original left it open).
with open('options/1/exp_1_8_3.yaml', 'rb') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [2]:
class LoRRA(nn.Module):
    """Fuses question, image and context features and scores them with two heads.

    The sub-modules are injected by the caller:

    * ``embedding``       -- maps token ids to embeddings.
    * ``qembedding``      -- must expose ``forward_all(text_embedding)``.
    * ``image_encoder``   -- called as ``image_encoder(image, bbox)``.
    * ``interaction_net`` -- called with the three feature tensors followed by
      their three masks; returns updated ``(v, q, c)``.
    * ``classifier``      -- indexable pair of heads; ``classifier[0]`` scores
      ``(v, q, image_mask, text_mask)`` and ``classifier[1]`` scores ``c``.

    Extra keyword arguments are accepted and ignored.
    """

    def __init__(self,
                 embedding,
                 qembedding,
                 image_encoder,
                 interaction_net,
                 classifier,
                 **arg):
        super().__init__()
        self.embedding = embedding
        self.qembedding = qembedding
        self.image_encoder = image_encoder

        self.interaction_net = interaction_net
        self.classifier = classifier

    def forward(self, image, bbox, text, context):
        """Run one forward pass.

        Args (shapes as noted by the original author):
            image:   (b, N, 2048)
            bbox:    (b, N, 4)
            text:    (b, T)
            context: (b, M, 300)

        Returns:
            The output of ``classifier[0]`` concatenated along dim 1 with the
            output of ``classifier[1]`` (after squeezing its dim 2).
        """
        text_embedding = self.embedding(text)
        text_encode, _ = self.qembedding.forward_all(text_embedding)
        image_encode = self.image_encoder(image, bbox)

        # Masks are 1.0 on real positions and 0.0 on all-zero (padding) rows.
        # The text mask is derived from the raw embedding, not the encoded sequence.
        text_mask = self._valid_mask(text_embedding)
        context_mask = self._valid_mask(context)
        image_mask = self._valid_mask(image_encode)

        v, q, c = self.interaction_net(
            image_encode, text_encode, context,
            image_mask, text_mask, context_mask,
        )
        score1 = self.classifier[0](v, q, image_mask, text_mask)
        score2 = self.classifier[1](c).squeeze(dim=2)
        return torch.cat((score1, score2), dim=1)

    def _valid_mask(self, x):
        """Float complement of ``get_mask``: 1.0 where a row has any non-zero entry."""
        return 1.0 - self.get_mask(x).float()

    def get_mask(self, x):
        """Return a boolean mask that is True where the feature row (dim 2) is all zeros."""
        return (x.abs().sum(dim=2) == 0)
    
def build_model(dataset, config):
    """Assemble a ``LoRRA`` model from a dataset and a model-attributes config.

    NOTE: this definition shadows the ``build_model`` imported from ``basemodel``
    at the top of the notebook.

    Args:
        dataset: object exposing ``dictionary`` (supports ``len()`` and has
            ``embedding_dim``) and ``answer_process.length``.
        config: mapping with the keys ``text_embeddings``, ``image_feature_dim``,
            ``image_feature_encodings``, ``interIntrablocks`` and ``classifier``.
            It is read only; the caller's mapping is not modified.

    Returns:
        A ``LoRRA`` instance wired with the freshly built sub-modules.
    """
    embedding = WordEmbedding(
        len(dataset.dictionary),
        dataset.dictionary.embedding_dim
    )

    qembedding = QuestionEmbedding(**config['text_embeddings'])

    image_encoder = VisualFeatEncoder(
        in_dim=config['image_feature_dim'],
        **config['image_feature_encodings']
    )

    interaction_net = SingleBlock(**config['interIntrablocks'])

    # Build the classifier kwargs on a copy so the caller's config dict is not
    # mutated as a side effect (the original wrote `out_features` into it).
    classifier_kwargs = dict(
        config["classifier"],
        out_features=dataset.answer_process.length,
    )
    classifier1 = Classifier(**classifier_kwargs)
    # Second head: one score per context item.
    # NOTE(review): the 512 input width is hard-coded; presumably it must match
    # the feature size produced by the interaction block -- confirm.
    classifier2 = weight_norm(nn.Linear(512, 1), dim=None)
    classifier = nn.ModuleList([classifier1, classifier2])

    modules = {
        'embedding': embedding,
        'qembedding': qembedding,
        'image_encoder': image_encoder,
        "interaction_net": interaction_net,
        'classifier': classifier
    }

    return LoRRA(**modules)

In [2]:
from dataset import Dictionary, TextVQA
from torch.utils.data import DataLoader
# Build the dictionary and request its GloVe initialisation from the saved .npy file.
dictionary = Dictionary()
embedding_weight = dictionary.create_glove_embedding_init(
    pre=True,
    pre_dir='../data/vocabs/embedding_weight.npy',
)
# train_dset = TextVQA('train', dictionary)

# Validation split only, served two samples at a time in a fixed order.
eval_dset = TextVQA('val', dictionary)
eval_loader = DataLoader(eval_dset, batch_size=2, shuffle=False, num_workers=1)

Total 5000 val samples.
Use 5000 val samples.
no existing answer 1345


In [3]:
# Build the network from the model section of the config, then wrap it for
# multi-GPU execution and move it onto the GPU.
model = nn.DataParallel(build_model(eval_dset, config["model_attributes"])).cuda()

In [7]:
# F.softmax(model.alpha,dim=0)

tensor([nan, nan, nan], device='cuda:0', grad_fn=<SoftmaxBackward>)

In [4]:
# Load the checkpoint onto the CPU first: without `map_location`, torch.load tries
# to restore tensors to the device they were saved from, which fails when that
# device is not available here. `load_state_dict` copies values onto the model's
# own device afterwards.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
model_parameter = torch.load("save/12/exp_1_8_3/model_best.pth", map_location="cpu")

In [5]:
# The checkpoint may either wrap the weights under 'model_state' or be the
# state dict itself; fall back to the whole object in the latter case.
state_dict = model_parameter.get('model_state', model_parameter)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [9]:
# Normalised view of the wrapped model's `alpha` parameter.
# NOTE(review): the LoRRA class defined in this notebook has no `alpha`
# attribute -- this presumably targets a different model version; confirm.
torch.softmax(model.module.alpha, dim=0)

tensor([0.4960, 0.3180, 0.1860], device='cuda:0', grad_fn=<SoftmaxBackward>)

In [4]:
# Optimise only the parameters that still require gradients.
optim = torch.optim.Adamax(
    (param for param in model.parameters() if param.requires_grad),
    lr=0.0015,
)

##### Run one forward pass

In [5]:
from torch.autograd import Variable
from utils import LogitBinaryCrossEntropy
lbce = LogitBinaryCrossEntropy()

# Smoke test: push only the first validation batch through the model and the loss.
# Tensors take part in autograd directly, so the deprecated (no-op)
# torch.autograd.Variable wrappers have been dropped.
for i, sample in enumerate(eval_loader):
    input_ids = sample["input_ids"]
    token_type_ids = sample["token_type_ids"]
    attention_mask = sample["attention_mask"]
    img = sample["img_feature"]
    context = sample["context_feature"]
    labels = sample['answer']
    bbox = sample['bbox']
    ocrbbox = sample['ocrbbox']
    # NOTE(review): seven inputs are passed here, while the LoRRA.forward defined
    # in this notebook takes (image, bbox, text, context) -- this call presumably
    # targets a different model version; confirm before relying on this cell.
    answer = model(
        img.cuda(),
        bbox.cuda(),
        input_ids.cuda(),
        token_type_ids.cuda(),
        attention_mask.cuda(),
        context.cuda(),
        ocrbbox.cuda(),
    )  # image, bbox, text, context, cbbox
    loss = lbce(labels.cuda(), answer)
    break

In [19]:
# Scratch learnable parameter: a 10x2 tensor filled with ones.
a = nn.Parameter(torch.full((10, 2), 1.0))

In [14]:
class VQAAccuracy(object):
    """
    Calculate VQAAccuracy. Find more information here_

    **Key**: ``vqa_accuracy``.

    .. _here: https://visualqa.org/evaluation.html
    """

    def __init__(self):
        super(VQAAccuracy, self).__init__()

    def _masked_unk_softmax(self, x, dim, mask_idx):
        """Softmax over ``dim`` with column ``mask_idx`` zeroed, renormalised over dim 1.

        The mask is applied by multiplication rather than by writing into the
        softmax output in place, so the result stays usable under autograd.
        """
        probs = torch.nn.functional.softmax(x, dim=dim)
        keep = torch.ones_like(probs)
        keep[:, mask_idx] = 0
        probs = probs * keep
        return probs / torch.sum(probs, dim=1, keepdim=True)

    def calculate(self, expected, output, *args, **kwargs):
        """Calculate vqa accuracy and return it back.
        Args:
            output : score
            expected : label
        Returns:
            torch.FloatTensor: VQA Accuracy

        """
        # Index 0 (the unknown answer) is masked out before picking the prediction.
        output = self._masked_unk_softmax(output, 1, 0)
        output = output.argmax(dim=1)

        # One-hot encode the prediction and read off the soft label score at that
        # index; the batch mean of those scores is the accuracy.
        one_hots = expected.new_zeros(*expected.size())
        one_hots.scatter_(1, output.view(-1, 1), 1)
        scores = one_hots * expected
        accuracy = torch.sum(scores) / expected.size(0)

        return accuracy

    def __call__(self, *args, **kwargs):
        return self.calculate(*args, **kwargs)


vqa_accuracy = VQAAccuracy()

In [6]:
import torch
# Toy batch: one row of 10 random scores and a random 0/1 label vector.
output = torch.randn((1, 10))
labels = torch.randn((1, 10)).gt(0).float()
output, labels

(tensor([[-0.4101,  0.6817,  1.1464,  1.4361,  0.1126, -0.3576,  0.1192,  0.0676,
           2.2384, -1.7476]]),
 tensor([[0., 0., 0., 1., 1., 0., 0., 1., 1., 1.]]))

#### trilinear interaction

In [1]:
from trilinear import TriAttention, TCNet
from fc import FCNet

In [2]:
import torch
# Random stand-ins for the three modalities: same batch (2) and feature width
# (789), differing only in sequence length (50 / 12 / 5).
v, q, a = (torch.randn(2, length, 789) for length in (50, 12, 5))

In [7]:
class Trilinear_Classifier(nn.Sequential):
    """Classifier head that fuses visual, question and context features trilinearly.

    A ``TriAttention`` module produces attention weights over the three inputs,
    a ``TCNet`` module turns them into a fused feature, and a two-layer MLP
    (dropout -> linear -> batch norm -> ReLU -> dropout -> linear) maps that
    feature to the output scores.

    Args:
        in_features: input width of the first linear layer.
        mid_features: hidden width (also the BatchNorm1d size).
        out_features: number of output scores.
        drop: dropout probability used before both linear layers.
    """

    def __init__(self, in_features, mid_features, out_features, drop=0.0):
        # Fixed: the original called super(Classifier, self), which names a
        # different class and raises TypeError on construction.
        super(Trilinear_Classifier, self).__init__()
        self.drop = nn.Dropout(drop)
        self.relu = nn.ReLU()
        self.lin1 = nn.Linear(in_features, mid_features)
        self.lin2 = nn.Linear(mid_features, out_features)
        self.bn = nn.BatchNorm1d(mid_features)

        # Hard-coded trilinear hyper-parameters.
        # NOTE(review): presumably `in_features` has to match the width of the
        # fused feature returned by TCNet -- confirm against the trilinear module.
        v_dim = 768
        q_dim = 768
        a_dim = 768
        h_mm = 768
        rank = 32
        gamma = 1
        k = 1
        h_out = 1
        # Fixed: these were plain locals, so forward() could not see them and the
        # modules were never registered. Storing them on `self` registers their
        # parameters with this module.
        self.t_att = TriAttention(v_dim, q_dim, a_dim, h_mm, 1, rank, gamma, k, dropout=[.2, .5])
        self.t_net = TCNet(v_dim, q_dim, a_dim, h_mm, h_out, rank, 1, dropout=[.2, .5], k=1)

    def forward(self, v, q, c, v_mask, q_mask):
        """
        v: visual feature      [batch, num_obj, 512]
        q: question            [batch, max_len, 512]
        c: context feature     (third input to the trilinear modules)
        v_mask                 [batch, num_obj]   (currently unused)
        q_mask                 [batch, max_len]   (currently unused)

        NOTE(review): the shapes above are the original author's notes; the
        constructor hard-codes 768-wide inputs -- confirm which is correct.
        """
        att, logits = self.t_att(v, q, c)  # b x v x q x a x g
        # Only the first slice along the last attention axis is used for fusion.
        fusion_f = self.t_net.forward_with_weights(v, q, c, att[:, :, :, :, 0])
        out = self.lin1(self.drop(fusion_f))
        out = self.lin2(self.drop(self.relu(self.bn(out))))
        return out

torch.Size([2, 1024])

In [46]:
import torch.nn.functional as F
# Toy inputs for comparing weighted and unweighted BCE-with-logits:
# a fixed two-class target, random logits, and equal per-class weights.
lable = torch.tensor([[0.0, 1.0]])
prediction = torch.rand((1, 2))
weight = torch.tensor([0.5, 0.5])
lable, prediction, weight

(tensor([[0., 1.]]), tensor([[0.8401, 0.4474]]), tensor([0.5000, 0.5000]))

In [47]:
# Unweighted loss on the toy logits.
F.binary_cross_entropy_with_logits(input=prediction, target=lable)

tensor(0.8466)

In [48]:
# Same loss with the per-class weights applied.
F.binary_cross_entropy_with_logits(input=prediction, target=lable, weight=weight)

tensor(0.4233)

In [60]:
# Three levels of random (v, q, c) feature triples with sequence lengths 100 / 14 / 50.
mlevelr = []
for l in range(3):
    v, q, c = (torch.randn(1, length, 512) for length in (100, 14, 50))
    mlevelr.append((v, q, c))

# Random binary masks (1.0 where the draw exceeds 0.5).
# NOTE(review): `mask_q` has length 100 (matching v) and `mask_v` has length 14
# (matching q) -- the two names look swapped; confirm against the consumer.
mask_q = torch.randn(1, 100).gt(0.5).float()
mask_v = torch.randn(1, 14).gt(0.5).float()
mask_c = torch.randn(1, 50).gt(0.5).float()

In [62]:
# NOTE(review): `m` is not defined in any cell visible in this notebook, so this
# call presumably relies on leftover kernel state -- define `m` in an earlier
# cell so the notebook survives Restart & Run All.
atten = m(mlevelr, mask_q, mask_v, mask_c)

In [64]:
# Display the shape of the tensor returned by `m`.
atten.shape

torch.Size([1, 3, 1])