<a href="https://colab.research.google.com/github/tejas4888/VQA-685/blob/main/VisualBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
## code from Detectron library
## added due to compatibility issues

class FastRCNNOutputs:
    """
    An internal implementation that stores information about outputs of a Fast R-CNN head,
    and provides methods that are used to decode the outputs of a Fast R-CNN head.
    """
    def __init__(
        self,
        box2box_transform,
        pred_class_logits,
        pred_proposal_deltas,
        proposals,
        smooth_l1_beta=0.0,
        box_reg_loss_type="smooth_l1",
    ):
        """
        Args:
            box2box_transform (Box2BoxTransform/Box2BoxTransformRotated):
                box2box transform instance for proposal-to-detection transformations.
            pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class
                logits for all R predicted object instances.
                Each row corresponds to a predicted object instance.
            pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for
                class-specific or class-agnostic regression. It stores the predicted deltas that
                transform proposals into final box detections.
                B is the box dimension (4 or 5).
                When B is 4, each row is [dx, dy, dw, dh (, ....)].
                When B is 5, each row is [dx, dy, dw, dh, da (, ....)].
            proposals (list[Instances]): A list of N Instances, where Instances i stores the
                proposals for image i, in the field "proposal_boxes".
                When training, each Instances must have ground-truth labels
                stored in the field "gt_classes" and "gt_boxes".
                The total number of all instances must be equal to R.
            smooth_l1_beta (float): The transition point between L1 and L2 loss in
                the smooth L1 loss function. When set to 0, the loss becomes L1. When
                set to +inf, the loss becomes constant 0.
            box_reg_loss_type (str): Box regression loss type. One of: "smooth_l1", "giou"
        """
        self.box2box_transform = box2box_transform
        self.num_preds_per_image = [len(p) for p in proposals]
        self.pred_class_logits = pred_class_logits
        self.pred_proposal_deltas = pred_proposal_deltas
        self.smooth_l1_beta = smooth_l1_beta
        self.box_reg_loss_type = box_reg_loss_type

        self.image_shapes = [x.image_size for x in proposals]

        if len(proposals):
            box_type = type(proposals[0].proposal_boxes)
            # cat(..., dim=0) concatenates over all images in the batch
            self.proposals = box_type.cat([p.proposal_boxes for p in proposals])
            assert (
                not self.proposals.tensor.requires_grad
            ), "Proposals should not require gradients!"

            # "gt_classes" exists if and only if training. But other gt fields may
            # not necessarily exist in training for images that have no groundtruth.
            if proposals[0].has("gt_classes"):
                self.gt_classes = cat([p.gt_classes for p in proposals], dim=0)

                # If "gt_boxes" does not exist, the proposals must be all negative and
                # should not be included in regression loss computation.
                # Here we just use proposal_boxes as an arbitrary placeholder because its
                # value won't be used in self.box_reg_loss().
                gt_boxes = [
                    p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes for p in proposals
                ]
                self.gt_boxes = box_type.cat(gt_boxes)
        else:
            self.proposals = Boxes(torch.zeros(0, 4, device=self.pred_proposal_deltas.device))
        self._no_instances = len(self.proposals) == 0  # no instances found

    def softmax_cross_entropy_loss(self):
        """
        Deprecated
        """
        _log_classification_stats(self.pred_class_logits, self.gt_classes)
        return cross_entropy(self.pred_class_logits, self.gt_classes, reduction="mean")

    def box_reg_loss(self):
        """
        Deprecated
        """
        if self._no_instances:
            return 0.0 * self.pred_proposal_deltas.sum()

        box_dim = self.proposals.tensor.size(1)  # 4 or 5
        cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
        device = self.pred_proposal_deltas.device

        bg_class_ind = self.pred_class_logits.shape[1] - 1
        # Box delta loss is only computed between the prediction for the gt class k
        # (if 0 <= k < bg_class_ind) and the target; there is no loss defined on predictions
        # for non-gt classes and background.
        # Empty fg_inds should produce a valid loss of zero because reduction=sum.
        fg_inds = nonzero_tuple((self.gt_classes >= 0) & (self.gt_classes < bg_class_ind))[0]

        if cls_agnostic_bbox_reg:
            # pred_proposal_deltas only corresponds to foreground class for agnostic
            gt_class_cols = torch.arange(box_dim, device=device)
        else:
            # pred_proposal_deltas for class k are located in columns [b * k : b * k + b],
            # where b is the dimension of box representation (4 or 5)
            # Note that compared to Detectron1,
            # we do not perform bounding box regression for background classes.
            gt_class_cols = box_dim * self.gt_classes[fg_inds, None] + torch.arange(
                box_dim, device=device
            )

        if self.box_reg_loss_type == "smooth_l1":
            gt_proposal_deltas = self.box2box_transform.get_deltas(
                self.proposals.tensor, self.gt_boxes.tensor
            )
            loss_box_reg = smooth_l1_loss(
                self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
                gt_proposal_deltas[fg_inds],
                self.smooth_l1_beta,
                reduction="sum",
            )
        elif self.box_reg_loss_type == "giou":
            fg_pred_boxes = self.box2box_transform.apply_deltas(
                self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
                self.proposals.tensor[fg_inds],
            )
            loss_box_reg = giou_loss(
                fg_pred_boxes,
                self.gt_boxes.tensor[fg_inds],
                reduction="sum",
            )
        else:
            raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

        loss_box_reg = loss_box_reg / self.gt_classes.numel()
        return loss_box_reg

    def losses(self):
        """
        Deprecated
        """
        return {"loss_cls": self.softmax_cross_entropy_loss(), "loss_box_reg": self.box_reg_loss()}

    def predict_boxes(self):
        """
        Deprecated
        """
        pred = self.box2box_transform.apply_deltas(self.pred_proposal_deltas, self.proposals.tensor)
        return pred.split(self.num_preds_per_image, dim=0)

    def predict_probs(self):
        """
        Deprecated
        """
        probs = F.softmax(self.pred_class_logits, dim=-1)
        return probs.split(self.num_preds_per_image, dim=0)

In [2]:
!pip install pyyaml==5.1
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

Collecting pyyaml==5.1
  Downloading PyYAML-5.1.tar.gz (274 kB)
[?25l[K     |█▏                              | 10 kB 17.8 MB/s eta 0:00:01[K     |██▍                             | 20 kB 24.1 MB/s eta 0:00:01[K     |███▋                            | 30 kB 24.8 MB/s eta 0:00:01[K     |████▉                           | 40 kB 19.3 MB/s eta 0:00:01[K     |██████                          | 51 kB 9.3 MB/s eta 0:00:01[K     |███████▏                        | 61 kB 9.9 MB/s eta 0:00:01[K     |████████▍                       | 71 kB 8.0 MB/s eta 0:00:01[K     |█████████▋                      | 81 kB 8.9 MB/s eta 0:00:01[K     |██████████▊                     | 92 kB 7.7 MB/s eta 0:00:01[K     |████████████                    | 102 kB 8.4 MB/s eta 0:00:01[K     |█████████████▏                  | 112 kB 8.4 MB/s eta 0:00:01[K     |██████████████▍                 | 122 kB 8.4 MB/s eta 0:00:01[K     |███████████████▌                | 133 kB 8.4 MB/s eta 0:00:01[K     |██

In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 8.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 495 kB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 46.3 MB/s 
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.2.1 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.14.1


In [4]:
import sys
import os
from torch.nn import functional as F
import torch, torchvision
import yaml
import json 
import cv2

In [5]:
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.structures.image_list import ImageList
from detectron2.data import transforms as T
from detectron2.modeling.box_regression import Box2BoxTransform
# from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputs
from detectron2.structures.boxes import Boxes
from detectron2.layers import nms
from detectron2 import model_zoo
from detectron2.config import get_cfg

# Model to extract visual embeddings

In [None]:
class PretrainedCNN:

    def __init__(self, cfg_path):

        self.cfg = self.load_config_and_model_weights(cfg_path)
        self.model = self.get_model(self.cfg)

    def load_config_and_model_weights(self, cfg_path):
        cfg = get_cfg()
        cfg.merge_from_file(model_zoo.get_config_file(cfg_path))

        # ROI HEADS SCORE THRESHOLD
        cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5

        # Comment the next line if you're using 'cuda'
        # cfg['MODEL']['DEVICE']='cpu'

        cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(cfg_path)

        return cfg

    def get_model(self, cfg):
        # build model
        model = build_model(cfg)

        # load weights
        checkpointer = DetectionCheckpointer(model)
        checkpointer.load(cfg.MODEL.WEIGHTS)

        # eval mode
        model.eval()
        return model        

    def prepare_image_inputs(self, img_list):

        #get model's cfg
        cfg = self.cfg 

        # Resizing the image according to the configuration
        transform_gen = T.ResizeShortestEdge([cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST)
        img_list = [transform_gen.get_transform(img).apply_image(img) for img in img_list]

        # Convert to C,H,W format
        convert_to_tensor = lambda x: torch.Tensor(x.astype("float32").transpose(2, 0, 1))

        batched_inputs = [{"image":convert_to_tensor(img), "height": img.shape[0], "width": img.shape[1]} for img in img_list]

        # Normalizing the image
        num_channels = len(cfg.MODEL.PIXEL_MEAN)
        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(num_channels, 1, 1)
        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1)
        normalizer = lambda x: (x - pixel_mean) / pixel_std
        images = [normalizer(x["image"]) for x in batched_inputs]

        # Convert to ImageList
        images =  ImageList.from_tensors(images,self.model.backbone.size_divisibility)
        
        return images, batched_inputs

        def get_visual_embeddings(self, img):

            with torch.no_grad():
                if (self.is_vision_model_loaded == False):
                    print("Loading CNN\n")
                    cfg_path = "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml"
                    self.cnn = PretrainedCNN(cfg_path)      
                    # self.cfg = cfg_path
                    # self.model = PretrainedCNN.get_model(self.cfg)
                    self.is_vision_model_loaded = True

            cfg = self.cfg
            images, batched_inputs = self.prepare_image_inputs(img)
            
            features = self.model.backbone(images.tensor.cuda())
            proposals, _ = self.model.proposal_generator(images, features)
            
            features_list = [features[f] for f in ['p2', 'p3', 'p4', 'p5']]
            box_features = self.model.roi_heads.box_pooler(features_list, [x.proposal_boxes for x in proposals])
            box_features = self.model.roi_heads.box_head.flatten(box_features)
            box_features = self.model.roi_heads.box_head.fc1(box_features)
            box_features = self.model.roi_heads.box_head.fc_relu1(box_features)
            box_features = self.model.roi_heads.box_head.fc2(box_features)
            # print (box_features.shape)
            box_features = box_features.reshape(1, -1, 1024) # depends on your config and batch size
            # box_features = box_features.reshape(1, -1, 2048) # depends on your config and batch size
        
            cls_features = self.model.roi_heads.box_pooler(features_list, [x.proposal_boxes for x in proposals])
            cls_features = self.model.roi_heads.box_head(cls_features)
            pred_class_logits, pred_proposal_deltas = self.model.roi_heads.box_predictor(cls_features)

            box2box_transform = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
            smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA

            outputs = FastRCNNOutputs(
                box2box_transform,
                pred_class_logits,
                pred_proposal_deltas,
                proposals,
                smooth_l1_beta,
            )

            boxes = outputs.predict_boxes()
            scores = outputs.predict_probs()
            image_shapes = outputs.image_shapes

            output_boxes = [self.get_output_boxes(boxes[i], batched_inputs[i], proposals[i].image_size) for i in range(len(proposals))]

            temp = [self.select_boxes(output_boxes[i], scores[i]) for i in range(len(scores))]

            keep_boxes, max_conf = [],[]
            for keep_box, mx_conf in temp:
                keep_boxes.append(keep_box)
                max_conf.append(mx_conf)

            MIN_BOXES=10
            MAX_BOXES=100

            keep_boxes = [self.filter_boxes(keep_box, mx_conf, MIN_BOXES, MAX_BOXES) for keep_box, mx_conf in zip(keep_boxes, max_conf)]
            visual_embeds = [ box_feature[keep_box.copy()] for box_feature, keep_box in zip(box_features, keep_boxes)]

        return visual_embeds

    def get_output_boxes(self, boxes, batched_inputs, image_size):
        proposal_boxes = boxes.reshape(-1, 4).clone()
        scale_x, scale_y = (batched_inputs["width"] / image_size[1], batched_inputs["height"] / image_size[0])
        output_boxes = Boxes(proposal_boxes)

        output_boxes.scale(scale_x, scale_y)
        output_boxes.clip(image_size)

        return output_boxes

    def filter_boxes(self, keep_boxes, max_conf, min_boxes, max_boxes):
        
        keep_boxes = keep_boxes.cpu()
        max_conf = max_conf.cpu()

        if len(keep_boxes) < min_boxes:
            keep_boxes = np.argsort(max_conf).numpy()[::-1][:min_boxes]
        elif len(keep_boxes) > max_boxes:
            keep_boxes = np.argsort(max_conf).numpy()[::-1][:max_boxes]
        return keep_boxes


    def select_boxes(self, output_boxes, scores):

        cfg = self.cnn.cfg
        test_score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
        test_nms_thresh = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST
        cls_prob = scores.detach()
        # print (output_boxes.shape)
        cls_boxes = output_boxes.tensor.detach().reshape(-1,80,4)
        # cls_boxes = output_boxes.tensor.detach().reshape(1000,80,4)
        max_conf = torch.zeros((cls_boxes.shape[0])).to(torch.device("cuda:0"))
        for cls_ind in range(0, cls_prob.shape[1]-1):
            cls_scores = cls_prob[:, cls_ind+1]
            det_boxes = cls_boxes[:,cls_ind,:]
            keep = torch.from_numpy(np.array(nms(det_boxes, cls_scores, test_nms_thresh).cpu())).to(torch.device("cuda:0"))
            max_conf[keep] = torch.where(cls_scores[keep] > max_conf[keep], cls_scores[keep], max_conf[keep])
        keep_boxes = torch.where(max_conf >= test_score_thresh)[0]
        return keep_boxes, max_conf

    def generate_visual_embeddings(self, data, split='train'):

        img_folder = {'train':path+'images/train/', 'test':path+'images/test/', 'val':path+'images/val/'}
        img_fmt = '.jpg'
        
        if (split=='train'):
            train = data
            for i in range(len(train)):    
                if (train[i][0] not in self.visual_embeddings):
                    img_path = img_folder['train'] + train[i][0] + img_fmt
                    img = cv2.imread(img_path)
                    visual_embeds = self.get_visual_embeddings([img])
                    self.visual_embeddings[train[i][0]] = visual_embeds
            f = open("/content/drive/MyDrive/CS685/project/train_img_features.pkl","wb")
            pickle.dump(self.visual_embeddings,f)
            f.close()
            
        elif (split=='test'):
            test = data
            self.visual_embeddings = {}
            for i in range(len(test)):    
                if (test[i][0] not in self.visual_embeddings):
                    img_path = img_folder['test'] + test[i][0] + img_fmt
                    img = cv2.imread(img_path)
                    visual_embeds = self.get_visual_embeddings([img])
                    self.visual_embeddings[test[i][0]] = visual_embeds
            f = open("/content/drive/MyDrive/CS685/project/img_features_test.pkl","wb")
            pickle.dump(self.visual_embeddings,f)
            f.close()
        
        else:
            self.visual_embeddings = {}
            for i in range(len(val)):    
                if (val[i][0] not in self.visual_embeddings):
                    img_path = img_folder['val'] + val[i][0] + img_fmt
                    img = cv2.imread(img_path)
                    visual_embeds = self.get_visual_embeddings([img])
                    self.visual_embeddings[val[i][0]] = visual_embeds
            f = open("/content/drive/MyDrive/CS685/project/img_features_val.pkl","wb")
            pickle.dump(self.visual_embeddings,f)
            f.close()


'''
    Extract visual embeddings and save in pickle file
    DONT RUN AGAIN UNLESS NEEDED
'''

cfg_path = "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml"
cnn = PretrainedCNN(cfg_path)

# path = "/content/drive/MyDrive/PathVQA/split/"
# train, test, val, ans2label = load_data(path)
# print ("Generating visual embeddings")
# cnn.generate_visual_embeddings(train, 'train')
# cnn.generate_visual_embeddings(test, 'test')

NameError: ignored

# Load data and mount google drive

Please create a shortcut in your Google Drive to this folder: **INSERT LINK**

In [6]:
import pandas as pd
import cv2
import numpy as np
from matplotlib import pyplot as plt
from google.colab import drive 

def mount_drive():
    drive.mount('/content/drive')

def load_data(path):

    # path = "/content/drive/MyDrive/PathVQA/split/"    

    train_path = path + 'qas/train/'
    test_path = path + 'qas/test/'
    val_path = path + 'qas/val/'

    print ("\nLoading PathVQA")
    train_qa = pd.read_pickle(train_path + "train_qa.pkl")
    test_qa = pd.read_pickle(test_path + "test_qa.pkl")
    val_qa = pd.read_pickle(val_path + "val_qa.pkl")
    ans2label = pd.read_pickle(path+'qas/ans2label.pkl')
    
    train = []
    for row in train_qa:
        if row['answer'] in ans2label:
            train.append([ row['image'], row['question'], int(ans2label[row['answer']]) ] )

    test = []
    for row in test_qa:
        if row['answer'] in ans2label:
            test.append([row['image'], row['question'], int(ans2label[row['answer']]) ])

    val = []
    for row in val_qa:
            if row['answer'] in ans2label:
                val.append([row['image'], row['question'], int(ans2label[row['answer']]) ])

    # train = np.array()

    # return train_qa, test_qa, val_qa, ans2label
    return np.asarray(train), np.asarray(test), np.asarray(val), ans2label

mount_drive()
path = "/content/drive/MyDrive/PathVQA/split/"
train, test, val, ans2label = load_data(path)

Mounted at /content/drive

Loading PathVQA


# Frozen VisualBERT

Losses are backpropogated only through the classifier.

In [41]:
import pickle
from transformers import BertTokenizer, VisualBertForQuestionAnswering, VisualBertForPreTraining

class Classifier(torch.nn.Module):
    def __init__(self, input_dims, output_dims):

        '''
            Build a classification head
        '''

        super().__init__()
        self.fc1 = torch.nn.Linear(input_dims, 1024)
        self.fc2 = torch.nn.Linear(1024, 2048)
        self.fc3 = torch.nn.Linear(2048, output_dims)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

class Model:

    #take necessary inputs
    #input_dims, output_dims, batch_size_test, batch_size_train
    def __init__(self, output_dims, lr):

        # self.model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa")
        # model below takes vis embeds of dim 1024 
        self.model = VisualBertForPreTraining.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre', output_hidden_states=True)
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = self.model.cuda()

        # load classifier model also 
        self.input_dims = 768
        self.output_dims = output_dims

        # tested with both a single layer classifier and 
        # a more deep classification head
        # self.classifier = torch.nn.Linear(self.input_dims, self.output_dims)
        self.classifier = Classifier(self.input_dims, self.output_dims)
        self.loss =torch.nn.CrossEntropyLoss()

        # updating only classifier head
        # visualBERT weights are frozen
        self.optimizer = torch.optim.Adam(self.classifier.parameters(), lr=lr)
        
        # self.is_vision_model_loaded = False
        self.visual_embeddings = {}
        self.visual_embeddings_train = {}
        self.visual_embeddings_test = {}
        
        self.batch_size = 1
        self.lr = lr

    def load_visual_embeddings(self, path, split='train'):

        if split=='train':
            self.visual_embeddings_train = pd.read_pickle(path)
        elif split=='test':
            self.visual_embeddings_test = pd.read_pickle(path)

    def make_prediction(self, img_id, question,split='train'):
        '''
            should be a list of imgs/ques
        '''

        tokens = self.tokenizer(question, padding='max_length', max_length=100)
        input_ids = torch.tensor(tokens["input_ids"]).cuda()#.unsqueeze(0)
        attention_mask = torch.tensor(tokens["attention_mask"]).cuda()
        token_type_ids = torch.tensor(tokens["token_type_ids"]).cuda()
        # visual_embeds = torch.stack(self.get_visual_embeddings(img_id)).cuda()

        if split=='train':
            for id in img_id:
                visual_embeds = torch.stack(self.visual_embeddings_train[img_id[0]]).cuda()        
        else:
            for id in img_id:
                visual_embeds = torch.stack(self.visual_embeddings_test[img_id[0]]).cuda()        

        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.long).cuda()
        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long).cuda()
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, visual_embeds=visual_embeds, visual_attention_mask=visual_attention_mask, visual_token_type_ids=visual_token_type_ids)
        
        return outputs

    def test(self, batch_size=2, load_path=None):

        print ("Evaluating model")
        num_batches = int(len(test)) #iterating one example at a time
        test_loss = 0.0
        total_correct = 0.0

        #load weights from given checkpoint
        if load_path is not None:
            print (f'Loading from {load_path}')
            checkpt = torch.load(load_path)
            train_loss_log = checkpt['loss']
            self.classifier.load_state_dict(checkpt['classifier_model_state_dict'])
            self.model.load_state_dict(checkpt['vb_model_state_dict'])
            start_epoch = checkpt['epoch'] + 1
            max_test = checkpt['max_test']
            test_acc_log = checkpt['test_acc_log']
            batch_size = checkpt['batch_size']
            self.lr = checkpt['lr']

        #eval mode
        self.model.eval()
        self.classifier.eval()

        for i in range(num_batches):

            batch = test[i:i+batch_size]
            imgs = batch[:,0]
            questions = list(batch[:,1])
            labels = batch[:,2]

            if (imgs == []) or (questions == []) or (labels == []):
                continue

            pred = self.make_prediction(imgs, questions, 'test')
            # Extracting [CLS] token representation
            features = pred.hidden_states[11][0][0]

            with torch.no_grad():

                class_pred = self.classifier(features)
                y_pred = torch.log_softmax(class_pred.unsqueeze(0), dim=1)
                _, y_pred_tags = torch.max(y_pred, dim=1)
                gt = torch.zeros(class_pred.shape).cuda()
                gt[int(labels[0])] = 1.0

                if (int(labels[0]) == y_pred_tags[0].item()):
                    total_correct+=1

        print (f"Test accuracy is {total_correct/len(test)} \n")
        return total_correct/len(test)

    def train(self, start_epoch=0, epochs=2, batch_size=1, load_path=None, save_path=None):

        num_batches = int(len(train)) #iterate one example at a time
        train_loss_log = []
        test_acc_log = []
        max_test = 0.0

        print ("Saving to: ", save_path)

        self.model.train()
        self.classifier.train()
        
        #load weights from given checkpoint
        if load_path is not None:
            print (load_path)
            checkpt = torch.load(load_path)
            train_loss_log = checkpt['loss']
            self.classifier.load_state_dict(checkpt['classifier_model_state_dict'])
            self.model.load_state_dict(checkpt['vb_model_state_dict'])
            start_epoch = checkpt['epoch'] + 1
            max_test = checkpt['max_test']
            test_acc_log = checkpt['test_acc_log']
            batch_size = checkpt['batch_size']
            self.lr = checkpt['lr']

        for ep in range(start_epoch, epochs):
            self.model.train()
            self.classifier.train()
            train_loss = 0.0

            for i in range(num_batches):

                assert(self.model.training and self.classifier.training)

                if (i%4000==0 and i>0):
                    print (f'Epoch {ep}, {i}/{num_batches} batches, loss is {train_loss/i}')
                    # break

                batch = train[i:i+self.batch_size]
                imgs = batch[:,0]
                questions = list(batch[:,1])
                labels = batch[:,2]

                if (imgs == []) or (questions == []) or (labels == []):
                    continue

                pred = self.make_prediction(imgs, questions, 'train')
                features = pred.hidden_states[11][0][0]

                class_pred = self.classifier(features).cuda()
                gt = torch.zeros(class_pred.shape).cuda()
                gt[int(labels[0])] = 1.0
                loss = self.loss(class_pred.unsqueeze(0), gt.unsqueeze(0))
                loss.backward()
                train_loss += loss.item()
                
                #update weights for the batch
                if ((i+1)%batch_size==0 or i==len(train)):
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                    
            (train_loss_log.append(train_loss))
            
            #save weights every 3 epochs
            if (save_path != None and (ep+1)%3==0):
                torch.save({
                    'epoch': ep,
                    'classifier_model_state_dict': self.classifier.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'loss': train_loss_log,
                    'vb_model_state_dict': self.model.state_dict(),
                    'max_test': max_test,
                    'test_acc_log': test_acc_log,
                    'batch_size': batch_size,
                    'lr' : self.lr
                }, save_path+f"/upgraded_b{batch_size}_lr{int(self.lr)}_{ep}.pth")
                print ("Saved model to: ", save_path+f"/upgraded_b{batch_size}_lr{int(10000*self.lr)}_{ep}.pth")

            
            print (f'Completed {ep+1} epochs out of {epochs}, loss is {train_loss_log[ep]/len(train)} \n')
            test_acc = self.test(batch_size=1)

            #compare test accuracy at this epoch, save best weights so far
            try:
                if (test_acc > max_test):
                    max_test = test_acc
                    torch.save({
                        'epoch': ep,
                        'classifier_model_state_dict': self.classifier.state_dict(),
                        'optimizer_state_dict': self.optimizer.state_dict(),
                        'loss': train_loss_log,
                        'vb_model_state_dict': self.model.state_dict(),
                        'max_test': max_test,
                        'test_acc_log': test_acc_log,
                        'batch_size': batch_size,
                        'lr': self.lr
                    }, save_path+f"/upgraded_b{batch_size}_lr{int(self.lr)}_best.pth")
                    print ("Saved model to: ", save_path+f"/upgraded_b{batch_size}_lr{int(10000*self.lr)}_best.pth")
            except:
                print ("Could not check for the best model")
            

In [None]:
# path = "/content/drive/MyDrive/PathVQA/split/"
# train, test, val, ans2label = load_data(path)
# train_images, test_images, load_images(train, test, val)
# classes = len(ans2label)
# #output of BERT's lasthidden state is 768
# feature_len = 768

# Uncomment and run if not able to load pickle files
# print ("Loading model")
# visualbert = Model(len(ans2label))
# visualbert.generate_visual_embeddings('test')

In [None]:
print ("Loading model")
visualbert = Model(len(ans2label), lr=0.001)

#load visual embeddings from pickle file
print ("Loading visual embeddings")
visual_embeddings_path = "/content/drive/MyDrive/CS685/project/img_features"
visualbert.load_visual_embeddings(visual_embeddings_path+"_train.pkl", 'train')
visualbert.load_visual_embeddings(visual_embeddings_path+"_test.pkl", 'test')

In [None]:
### DONT RUN WITH THE SAME PATH AS IT WILL OVERWRITE FILE

load_path = None #'/content/drive/MyDrive/CS685/project/vb/batched_best.pth')
visualbert.classifier.cuda()
visualbert.train(epochs=20,batch_size=8,save_path='/content/drive/MyDrive/CS685/project/vb', load_path=load_path) 

In [None]:
load_path = None #'/content/drive/MyDrive/CS685/project/vb/batched_best.pth')
visualbert.classifier.cuda()
visualbert.test(batch_size=1, load_path=load_path)

# Unfrozen VisualBERT

Losses are backpropogated throughout the entire model.


In [7]:
import pickle
from transformers import BertTokenizer, VisualBertForQuestionAnswering, VisualBertForPreTraining

class CustomVB(torch.nn.Module):
    def __init__(self, input_dims, output_dims):
        super().__init__()

        '''
            Integrates the classification head on top ob base VisualBERT
            Loss is backpropogated throughout the model
        '''

        self.model = VisualBertForPreTraining.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre', output_hidden_states=True)
        self.fc1 = torch.nn.Linear(input_dims, 1024)
        self.fc2 = torch.nn.Linear(1024, 2048)
        self.fc3 = torch.nn.Linear(2048, output_dims)

    def forward(self, input_ids, attention_mask, token_type_ids, visual_embeds, visual_attention_mask, visual_token_type_ids):
              
        x = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, visual_embeds=visual_embeds, visual_attention_mask=visual_attention_mask, visual_token_type_ids=visual_token_type_ids)
        # print (x.hidden_states[0].shape)
        x = x.hidden_states[12][0][0]
        
        # x = x.hidden_states[0]
        # x = x[:,0,:]

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

class VisualBERTModel:

    #take necessary inputs
    #input_dims, output_dims, batch_size_test, batch_size_train
    def __init__(self, output_dims, lr, use_weights=False):
        
        self.input_dims = 768
        self.output_dims = output_dims
        self.model = CustomVB(self.input_dims, self.output_dims)
        self.model = self.model.cuda()
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        
        if (use_weights):
            self.compute_class_weights()
        else:
            self.loss = torch.nn.CrossEntropyLoss()
    
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        
        self.visual_embeddings = {}
        self.visual_embeddings_train = {}
        self.visual_embeddings_test = {}
        
        self.batch_size = 1
        self.lr = lr

    def compute_class_weights(self):

        try:
            print (f"Total {len(ans2label)} classes, computing weights")
            weights = [0.0]*len(ans2label)
            for example in train:
                weights[int(example[2])] += 1

            normedWeights = [1 - (x / sum(weights)) for x in weights]
            weights = torch.FloatTensor(weights)
            # /float(len(train))
            # weights = 1.0 / weights
            # weights = torch.nan_to_num(weights, posinf=0.0)
            # weights = weights / weights.sum()
            # weights = torch.nan_to_num(weights)
            print ((weights).shape, weights)

        except:
            print (f"{ans2label} or {train} pickle files not loaded, check environment setup")

        self.loss = torch.nn.CrossEntropyLoss(weight = weights).cuda()


    def load_visual_embeddings(self, path, split='train'):

        if split=='train':
            self.visual_embeddings_train = pd.read_pickle(path)
        elif split=='test':
            self.visual_embeddings_test = pd.read_pickle(path)

    def make_prediction(self, img_id, question,split='train'):
        '''
            should be a list of imgs/ques
        '''

        tokens = self.tokenizer(question, padding='max_length', max_length=100)
        # tokens = self.tokenizer(question, padding='max_length', max_length=32, add 
        input_ids = torch.tensor(tokens["input_ids"]).cuda() #.unsqueeze(0)
        attention_mask = torch.tensor(tokens["attention_mask"]).cuda()
        token_type_ids = torch.tensor(tokens["token_type_ids"]).cuda()
        # visual_embeds = torch.stack(self.get_visual_embeddings(img_id)).cuda()

        if split=='train':
            for id in img_id:
                visual_embeds = torch.stack(self.visual_embeddings_train[img_id[0]]).cuda()        
        else:
            for id in img_id:
                # print ((self.visual_embeddings_test[id]))
                visual_embeds = torch.stack(self.visual_embeddings_test[img_id[0]]).cuda()        

        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.long).cuda()
        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long).cuda()
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, visual_embeds=visual_embeds, visual_attention_mask=visual_attention_mask, visual_token_type_ids=visual_token_type_ids)
        
        return outputs

    def test(self, batch_size=2, load_path=None, return_all=False):

        num_batches = int(len(test))
        test_loss = 0.0
        total_correct = 0.0
        print ("Evaluating")

        if load_path is not None:
            print (f'Loading path from {load_path}')
            checkpt = torch.load(load_path)
            train_loss_log = checkpt['loss']
            self.model.load_state_dict(checkpt['vb_model_state_dict'])
            start_epoch = checkpt['epoch'] + 1
            max_test = checkpt['max_test']
            test_acc_log = checkpt['test_acc_log']
            batch_size = checkpt['batch_size']
            self.lr = checkpt['lr']

        self.model.eval()

        predictions = []
        for i in range(num_batches):

            batch = test[i:i+self.batch_size]
            imgs = batch[:,0]
            questions = list(batch[:,1])
            labels = batch[:,2]
            if (imgs == []) or (questions == []) or (labels == []):
                continue

            with torch.no_grad():

                class_pred = self.make_prediction(imgs, questions, 'test')
                # print (class_pred.shape)
                y_pred = torch.log_softmax(class_pred.unsqueeze(0), dim=1)
                _, y_pred_tags = torch.max(y_pred, dim=1)
                gt = torch.zeros(class_pred.shape).cuda()
                gt[int(labels[0])] = 1.0
                if (int(labels[0]) == y_pred_tags[0].item()):
                    total_correct+=1
                
                predictions.append([int(labels[0]), y_pred_tags[0].item()])
                
        print (f"Test accuracy is {total_correct/len(test)} \n")

        if (return_all == True):
            return np.asarray(predictions)
        else:
            return total_correct/len(test)

    def train(self, start_epoch=0, epochs=2, batch_size=1, load_path=None, save_path=None):

        num_batches = int(len(train))
        train_loss_log = []
        test_acc_log = []
        max_test = 0.0

        print (save_path)

        self.model.train()
        
        if load_path is not None:
            print (load_path)
            checkpt = torch.load(load_path)
            train_loss_log = checkpt['loss']
            self.model.load_state_dict(checkpt['vb_model_state_dict'])
            start_epoch = checkpt['epoch'] + 1
            max_test = checkpt['max_test']
            test_acc_log = checkpt['test_acc_log']
            batch_size = checkpt['batch_size']
            self.lr = checkpt['lr']

        for ep in range(start_epoch, epochs):
            self.model.train()
            train_loss = 0.0

            for i in range(num_batches):

                assert(self.model.training)

                if (i%4000==0 and i>0):
                    print (f'Epoch {ep}, {i}/{num_batches} batches, loss is {train_loss/i}')
        
                batch = train[i:i+self.batch_size]
                imgs = batch[:,0]
                questions = list(batch[:,1])
                labels = batch[:,2]

                if (imgs == []) or (questions == []) or (labels == []):
                    continue

                class_pred = self.make_prediction(imgs, questions, 'train')
                # print (class_pred.shape)
                gt = torch.zeros(class_pred.shape).cuda()
                gt[int(labels[0])] = 1.0
                loss = self.loss(class_pred.unsqueeze(0), gt.unsqueeze(0))
                loss.backward()
                train_loss += loss.item()
                
                if ((i+1)%batch_size==0 or i==len(train)):
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                    
            (train_loss_log.append(train_loss))
            
            if (save_path != None and (ep+1)%3==0):
                torch.save({
                    'epoch': ep,
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'loss': train_loss_log,
                    'vb_model_state_dict': self.model.state_dict(),
                    'max_test': max_test,
                    'test_acc_log': test_acc_log,
                    'batch_size': batch_size,
                    'lr' : self.lr
                }, save_path+f"/unfrozen_b{batch_size}_lr{int(self.lr)}_{ep}.pth")
                print ("Saved model to: ", save_path+f"/unfrozen_b{batch_size}_lr{int(1000000*self.lr)}_{ep}.pth")

            
            print (f'Completed {ep+1} epochs out of {epochs}, loss is {train_loss_log[ep]/len(train)} \n')
            test_acc = self.test(batch_size=1)
            try:
                if (test_acc > max_test):
                    max_test = test_acc
                    torch.save({
                        'epoch': ep,
                        'optimizer_state_dict': self.optimizer.state_dict(),
                        'loss': train_loss_log,
                        'vb_model_state_dict': self.model.state_dict(),
                        'max_test': max_test,
                        'test_acc_log': test_acc_log,
                        'batch_size': batch_size,
                        'lr': self.lr
                    }, save_path+f"/unfrozen_b{batch_size}_lr{int(self.lr)}_best.pth")
                    print ("Saved model to: ", save_path+f"/unfrozen_b{batch_size}_lr{int(1000000*self.lr)}_best.pth")
            except:
                print ("Could not check for the best model")
            

In [8]:
print ("Loading model")
visualbert = VisualBERTModel(len(ans2label), lr=0.00001, use_weights=False)

#load visual embeddings from pickle file
print ("Loading visual embeddings from file")
visual_embeddings_path = "/content/drive/MyDrive/CS685/project/img_features"
visualbert.load_visual_embeddings(visual_embeddings_path+"_train.pkl", 'train')
visualbert.load_visual_embeddings(visual_embeddings_path+"_test.pkl", 'test')

print ("Success")

Loading model


Downloading:   0%|          | 0.00/631 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/425M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Loading visual embeddings from file
Success


In [None]:
load_path = None #'/content/drive/MyDrive/CS685/project/vb/batched_best.pth')
visualbert.loss = visualbert.loss.cuda()
visualbert.train(epochs=20,batch_size=8,save_path='/content/drive/MyDrive/CS685/project/vb/hs',load_path=load_path)

/content/drive/MyDrive/CS685/project/vb/hs




Epoch 0, 4000/19755 batches, loss is 5.432911711528897
Epoch 0, 8000/19755 batches, loss is 4.607990379914641
Epoch 0, 12000/19755 batches, loss is 4.394262281943423
Epoch 0, 16000/19755 batches, loss is 4.2308385299672375
Completed 1 epochs out of 20, loss is 4.091028908154803 

Evaluating




Test accuracy is 0.3208132322536182 

Saved model to:  /content/drive/MyDrive/CS685/project/vb/hs/unfrozen_b8_lr10_best.pth
Epoch 1, 4000/19755 batches, loss is 3.9027823251811786
Epoch 1, 8000/19755 batches, loss is 3.0773398000537417
Epoch 1, 12000/19755 batches, loss is 2.827122273199532
Epoch 1, 16000/19755 batches, loss is 2.707432395949989
Completed 2 epochs out of 20, loss is 2.582336950076235 

Evaluating
Test accuracy is 0.4405582356995176 

Saved model to:  /content/drive/MyDrive/CS685/project/vb/hs/unfrozen_b8_lr10_best.pth
Epoch 2, 4000/19755 batches, loss is 3.4968660045759754
Epoch 2, 8000/19755 batches, loss is 2.746453888438846
Epoch 2, 12000/19755 batches, loss is 2.5389018848154423
Epoch 2, 16000/19755 batches, loss is 2.412454244262095
Saved model to:  /content/drive/MyDrive/CS685/project/vb/hs/unfrozen_b8_lr10_2.pth
Completed 3 epochs out of 20, loss is 2.2906906412851225 

Evaluating
Test accuracy is 0.4703652653342522 

Saved model to:  /content/drive/MyDrive/CS68

KeyboardInterrupt: ignored

In [None]:
#test accuracy for the given model
load_path = '/content/drive/MyDrive/CS685/project/vb/unfrozen/unfrozen_b8_lr0_best.pth'
preds = visualbert.test(batch_size=1,load_path=load_path, return_all=True)

# Data Augmentation

In [None]:
from transformers import pipeline
nlp = pipeline('fill-mask')

ModuleNotFoundError: ignored

In [None]:
!pip install googletrans==4.0.0rc1
import pickle
import googletrans
from googletrans import Translator
translator = Translator()


languages = [
    'en', # english
    'cs',  # czech
    'de',  # german
    'es', # spanish
    'fi',  # finnish
    'fr', # french
    'hi', # hindi
    'it', # italian
    'ja', # japanese
    'pt', # portuguese
    'ru', # russian
    'vi', # vietnamese
    'zh-cn',  # chinese
    ]
num_langs = len(languages)

def paraphrase_q(ques):

    ques = ques.split(' ')
    n = len(ques)
    idx = np.random.randint(0,n,1)
    ques[int(idx)] = '<mask>'
    ques = ' '.join(ques)
    paraphrased = nlp(ques)[0]
    return paraphrased['sequence']

def backtranslate(ques):

    target_idx = np.random.randint(low=1, high=num_langs, size=1)
    translated_example = translator.translate(ques, src='en', languages[idx])
    backtranslated_ques = translator.translate(translated_example.text, src=languages[idx], src='en')
    return backtranslated_ques.text

def augment_data(train, ratio=0.2, augment_type='replace_mask'):

    augmented_train = list(train)
    augmentation_examples = {}

    path = '/content/drive/MyDrive/PathVQA/split/qas/'
    num_examples_to_augment = int(ratio*len(train))
    examples_to_augment = np.random.randint(0, len(train), num_examples_to_augment)

    cnt = 0    
    for idx in examples_to_augment:
        img_id, ques, label = train[idx]
        if (augment_type='replace_mask'):
            q = paraphrase_q(ques)
        else:
            q = backtranslate(ques)
            
        if (q!=ques):
            print (ques, "\n", q, "\n")
            augmentation_examples[cnt] = [ques, q]
            cnt += 1
            augmented_train.append([img_id, q, label])


    save_path = "/content/drive/MyDrive/CS685/project/augmented"
    f = open(save_path + f"{augment_type}_examples.pkl","wb")
    pickle.dump(augmentation_examples,f)
    f.close()

    f = open(save_path + f"{augment_type}_train.pkl","wb")
    pickle.dump(augmented_train,f)
    f.close()
    
    print (f'Length of dataset before augmentation : {len(train)}')
    print (f'Length of dataset after augmentation : {len(augmented_train)}')

    return augmented_train

augmented_train= augment_data(train, 0.15)

In [None]:
def load_augmented_data():

    path = "/content/drive/MyDrive/CS685/project/augmented_train.pkl"    
    train_qa = pd.read_pickle(path)    
    return np.asarray(train_qa)

train = load_augmented_data()

In [None]:
from transformers import BertTokenizer, VisualBertForQuestionAnswering, VisualBertForPreTraining

class CustomVB(torch.nn.Module):
    def __init__(self, input_dims, output_dims):
        super().__init__()

        '''
            Integrates the classification head on top ob base VisualBERT
            Loss is backpropogated throughout the model
        '''

        self.model = VisualBertForPreTraining.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre', output_hidden_states=True)
        self.fc1 = torch.nn.Linear(input_dims, 1024)
        self.fc2 = torch.nn.Linear(1024, 2048)
        self.fc3 = torch.nn.Linear(2048, output_dims)

    def forward(self, input_ids, attention_mask, token_type_ids, visual_embeds, visual_attention_mask, visual_token_type_ids):
              
        x = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, visual_embeds=visual_embeds, visual_attention_mask=visual_attention_mask, visual_token_type_ids=visual_token_type_ids)
        x = x.hidden_states[11][0][0]
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

class VisualBERTModel:

    #take necessary inputs
    #input_dims, output_dims, batch_size_test, batch_size_train
    def __init__(self, output_dims, lr, use_weights=False):
        
        self.input_dims = 768
        self.output_dims = output_dims
        self.model = CustomVB(self.input_dims, self.output_dims)
        self.model = self.model.cuda()
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        
        if (use_weights):
            self.compute_class_weights()
        else:
            self.loss = torch.nn.CrossEntropyLoss()
    
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        
        self.visual_embeddings = {}
        self.visual_embeddings_train = {}
        self.visual_embeddings_test = {}
        
        self.batch_size = 1
        self.lr = lr

    def compute_class_weights(self):

        try:
            print (f"Total {len(ans2label)} classes, computing weights")
            weights = [0.0]*len(ans2label)
            for example in train:
                weights[int(example[2])] += 1

            normedWeights = [1 - (x / sum(weights)) for x in weights]
            weights = torch.FloatTensor(weights)
            # /float(len(train))
            # weights = 1.0 / weights
            # weights = torch.nan_to_num(weights, posinf=0.0)
            # weights = weights / weights.sum()
            # weights = torch.nan_to_num(weights)
            print ((weights).shape, weights)

        except:
            print (f"{ans2label} or {train} pickle files not loaded, check environment setup")

        self.loss = torch.nn.CrossEntropyLoss(weight = weights).cuda()


    def load_visual_embeddings(self, path, split='train'):

        if split=='train':
            self.visual_embeddings_train = pd.read_pickle(path)
        elif split=='test':
            self.visual_embeddings_test = pd.read_pickle(path)

    def make_prediction(self, img_id, question,split='train'):
        '''
            should be a list of imgs/ques
        '''

        tokens = self.tokenizer(question, padding='max_length', max_length=100)
        input_ids = torch.tensor(tokens["input_ids"]).cuda() #.unsqueeze(0)
        attention_mask = torch.tensor(tokens["attention_mask"]).cuda()
        token_type_ids = torch.tensor(tokens["token_type_ids"]).cuda()
        # visual_embeds = torch.stack(self.get_visual_embeddings(img_id)).cuda()

        if split=='train':
            for id in img_id:
                visual_embeds = torch.stack(self.visual_embeddings_train[img_id[0]]).cuda()        
        else:
            for id in img_id:
                # print ((self.visual_embeddings_test[id]))
                visual_embeds = torch.stack(self.visual_embeddings_test[img_id[0]]).cuda()        

        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.long).cuda()
        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long).cuda()
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, visual_embeds=visual_embeds, visual_attention_mask=visual_attention_mask, visual_token_type_ids=visual_token_type_ids)
        
        return outputs

    def test(self, batch_size=2, load_path=None, return_all=False):

        num_batches = int(len(test))
        test_loss = 0.0
        total_correct = 0.0
        print ("Evaluating")

        if load_path is not None:
            print (f'Loading path from {load_path}')
            checkpt = torch.load(load_path)
            train_loss_log = checkpt['loss']
            self.model.load_state_dict(checkpt['vb_model_state_dict'])
            start_epoch = checkpt['epoch'] + 1
            max_test = checkpt['max_test']
            test_acc_log = checkpt['test_acc_log']
            batch_size = checkpt['batch_size']
            self.lr = checkpt['lr']

        self.model.eval()

        predictions = []
        for i in range(num_batches):

            batch = test[i:i+self.batch_size]
            imgs = batch[:,0]
            questions = list(batch[:,1])
            labels = batch[:,2]
            if (imgs == []) or (questions == []) or (labels == []):
                continue

            with torch.no_grad():

                class_pred = self.make_prediction(imgs, questions, 'test')
                y_pred = torch.log_softmax(class_pred.unsqueeze(0), dim=1)
                _, y_pred_tags = torch.max(y_pred, dim=1)
                gt = torch.zeros(class_pred.shape).cuda()
                gt[int(labels[0])] = 1.0
                if (int(labels[0]) == y_pred_tags[0].item()):
                    total_correct+=1
                
                predictions.append([int(labels[0]), y_pred_tags[0].item()])
                
        print (f"Test accuracy is {total_correct/len(test)} \n")

        if (return_all == True):
            return predictions
        else:
            return total_correct/len(test)

    def train(self, start_epoch=0, epochs=2, batch_size=1, load_path=None, save_path=None):

        num_batches = int(len(train))
        train_loss_log = []
        test_acc_log = []
        max_test = 0.0

        print (save_path)

        self.model.train()
        
        if load_path is not None:
            print (load_path)
            checkpt = torch.load(load_path)
            train_loss_log = checkpt['loss']
            self.model.load_state_dict(checkpt['vb_model_state_dict'])
            start_epoch = checkpt['epoch'] + 1
            max_test = checkpt['max_test']
            test_acc_log = checkpt['test_acc_log']
            batch_size = checkpt['batch_size']
            self.lr = checkpt['lr']

        for ep in range(start_epoch, epochs):
            self.model.train()
            train_loss = 0.0

            for i in range(num_batches):

                assert(self.model.training)

                if (i%4000==0 and i>0):
                    print (f'Epoch {ep}, {i}/{num_batches} batches, loss is {train_loss/i}')
        
                batch = (train[i:i+self.batch_size])
                imgs = batch[:,0]
                questions = list(batch[:,1])
                labels = batch[:,2]

                if (imgs == []) or (questions == []) or (labels == []):
                    continue

                class_pred = self.make_prediction(imgs, questions, 'train')
                gt = torch.zeros(class_pred.shape).cuda()
                gt[int(labels[0])] = 1.0
                loss = self.loss(class_pred.unsqueeze(0), gt.unsqueeze(0))
                loss.backward()
                train_loss += loss.item()
                
                if ((i+1)%batch_size==0 or i==len(train)):
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                    
            (train_loss_log.append(train_loss))
            
            if (save_path != None and (ep+1)%3==0):
                torch.save({
                    'epoch': ep,
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'loss': train_loss_log,
                    'vb_model_state_dict': self.model.state_dict(),
                    'max_test': max_test,
                    'test_acc_log': test_acc_log,
                    'batch_size': batch_size,
                    'lr' : self.lr
                }, save_path+f"/augmented_b{batch_size}_lr{int(self.lr)}_{ep}.pth")
                print ("Saved model to: ", save_path+f"/augmented_b{batch_size}_lr{int(1000000*self.lr)}_{ep}.pth")

            
            print (f'Completed {ep+1} epochs out of {epochs}, loss is {train_loss_log[ep]/len(train)} \n')
            test_acc = self.test(batch_size=1)
            try:
                if (test_acc > max_test):
                    max_test = test_acc
                    torch.save({
                        'epoch': ep,
                        'optimizer_state_dict': self.optimizer.state_dict(),
                        'loss': train_loss_log,
                        'vb_model_state_dict': self.model.state_dict(),
                        'max_test': max_test,
                        'test_acc_log': test_acc_log,
                        'batch_size': batch_size,
                        'lr': self.lr
                    }, save_path+f"/augmented_b{batch_size}_lr{int(self.lr)}_best.pth")
                    print ("Saved model to: ", save_path+f"/augmented_b{batch_size}_lr{int(1000000*self.lr)}_best.pth")
            except:
                print ("Could not check for the best model")

In [None]:
print ("Loading model")
visualbert = VisualBERTModel(len(ans2label), lr=0.00001, use_weights=False)

#load visual embeddings from pickle file
print ("Loading visual embeddings from file")
visual_embeddings_path = "/content/drive/MyDrive/CS685/project/img_features"
visualbert.load_visual_embeddings(visual_embeddings_path+"_train.pkl", 'train')
visualbert.load_visual_embeddings(visual_embeddings_path+"_test.pkl", 'test')

print ("Success")

Loading model
Loading visual embeddings from file
Success


In [None]:
# train = augmented_train
# print (train[0])
load_path = None #'/content/drive/MyDrive/CS685/project/vb/batched_best.pth')
visualbert.loss = visualbert.loss.cuda()
visualbert.train(epochs=20,batch_size=8,save_path='/content/drive/MyDrive/CS685/project/vb/augmented',load_path=load_path)

/content/drive/MyDrive/CS685/project/vb/augmented




Epoch 0, 4000/22051 batches, loss is 5.2291851657256485
Epoch 0, 8000/22051 batches, loss is 4.379922498886473
Epoch 0, 12000/22051 batches, loss is 4.11580168449316
Epoch 0, 16000/22051 batches, loss is 3.927877879515894
Epoch 0, 20000/22051 batches, loss is 3.7756617886486215
Completed 1 epochs out of 20, loss is 3.7224465140403518 

Evaluating




Test accuracy is 0.48862853204686424 

Saved model to:  /content/drive/MyDrive/CS685/project/vb/augmented/augmented_b8_lr10_best.pth
Epoch 1, 4000/22051 batches, loss is 3.9510797196064957
Epoch 1, 8000/22051 batches, loss is 2.986276704346048
Epoch 1, 12000/22051 batches, loss is 2.7146937706920724
Epoch 1, 16000/22051 batches, loss is 2.602974610980036
Epoch 1, 20000/22051 batches, loss is 2.493798980933771
Completed 2 epochs out of 20, loss is 2.523988846100198 

Evaluating
Test accuracy is 0.5075809786354238 

Saved model to:  /content/drive/MyDrive/CS685/project/vb/augmented/augmented_b8_lr10_best.pth
Epoch 2, 4000/22051 batches, loss is 3.48714043974239
Epoch 2, 8000/22051 batches, loss is 2.6173380722278963
Epoch 2, 12000/22051 batches, loss is 2.3670299600637277
Epoch 2, 16000/22051 batches, loss is 2.2581948030234926
Epoch 2, 20000/22051 batches, loss is 2.154470825381561
Saved model to:  /content/drive/MyDrive/CS685/project/vb/augmented/augmented_b8_lr10_2.pth
Completed 3 epo

# Does VisualBERT really see?

In [None]:
import pickle
from transformers import BertTokenizer, VisualBertForQuestionAnswering, VisualBertForPreTraining

class CustomVB(torch.nn.Module):
    def __init__(self, input_dims, output_dims):
        super().__init__()

        '''
            Integrates the classification head on top ob base VisualBERT
            Loss is backpropogated throughout the model
        '''

        self.model = VisualBertForPreTraining.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre', output_hidden_states=True)
        self.fc1 = torch.nn.Linear(input_dims, 1024)
        self.fc2 = torch.nn.Linear(1024, 2048)
        self.fc3 = torch.nn.Linear(2048, output_dims)

    def forward(self, input_ids, attention_mask, token_type_ids, visual_embeds, visual_attention_mask, visual_token_type_ids):
              
        x = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, visual_embeds=visual_embeds, visual_attention_mask=visual_attention_mask, visual_token_type_ids=visual_token_type_ids)
        x = x.hidden_states[11][0][0]
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

class VisualBERTModel:

    #take necessary inputs
    #input_dims, output_dims, batch_size_test, batch_size_train
    def __init__(self, output_dims, lr, use_weights=False):
        
        self.input_dims = 768
        self.output_dims = output_dims
        self.model = CustomVB(self.input_dims, self.output_dims)
        self.model = self.model.cuda()
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        
        if (use_weights):
            self.compute_class_weights()
        else:
            self.loss = torch.nn.CrossEntropyLoss()
    
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        
        self.visual_embeddings = {}
        self.visual_embeddings_train = {}
        self.visual_embeddings_test = {}
        
        self.batch_size = 1
        self.lr = lr

    def compute_class_weights(self):

        try:
            print (f"Total {len(ans2label)} classes, computing weights")
            weights = [0.0]*len(ans2label)
            for example in train:
                weights[int(example[2])] += 1

            normedWeights = [1 - (x / sum(weights)) for x in weights]
            weights = torch.FloatTensor(weights)
            # /float(len(train))
            # weights = 1.0 / weights
            # weights = torch.nan_to_num(weights, posinf=0.0)
            # weights = weights / weights.sum()
            # weights = torch.nan_to_num(weights)
            print ((weights).shape, weights)

        except:
            print (f"{ans2label} or {train} pickle files not loaded, check environment setup")

        self.loss = torch.nn.CrossEntropyLoss(weight = weights).cuda()


    def load_visual_embeddings(self, path, split='train'):

        if split=='train':
            self.visual_embeddings_train = pd.read_pickle(path)
        elif split=='test':
            self.visual_embeddings_test = pd.read_pickle(path)

    def make_prediction(self, img_id, question,split='train'):
        '''
            should be a list of imgs/ques
        '''

        tokens = self.tokenizer(question, padding='max_length', max_length=100)
        input_ids = torch.tensor(tokens["input_ids"]).cuda() #.unsqueeze(0)
        attention_mask = torch.tensor(tokens["attention_mask"]).cuda()
        token_type_ids = torch.tensor(tokens["token_type_ids"]).cuda()
        # visual_embeds = torch.stack(self.get_visual_embeddings(img_id)).cuda()

        if split=='train':
            for id in img_id:
                visual_embeds = torch.stack(self.visual_embeddings_train[img_id[0]]).cuda()        
        else:
            for id in img_id:
                # print ((self.visual_embeddings_test[id]))
                visual_embeds = torch.stack(self.visual_embeddings_test[img_id[0]]).cuda()        

        visual_embed = torch.zeros(visual_embeds.shape).cuda()

        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.long).cuda()
        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long).cuda()
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, visual_embeds=visual_embeds, visual_attention_mask=visual_attention_mask, visual_token_type_ids=visual_token_type_ids)
        
        return outputs

    def test(self, batch_size=2, load_path=None, return_all=False):

        num_batches = int(len(test))
        test_loss = 0.0
        total_correct = 0.0
        print ("Evaluating")

        if load_path is not None:
            print (f'Loading path from {load_path}')
            checkpt = torch.load(load_path)
            train_loss_log = checkpt['loss']
            self.model.load_state_dict(checkpt['vb_model_state_dict'])
            start_epoch = checkpt['epoch'] + 1
            max_test = checkpt['max_test']
            test_acc_log = checkpt['test_acc_log']
            batch_size = checkpt['batch_size']
            self.lr = checkpt['lr']

        self.model.eval()

        predictions = []
        for i in range(num_batches):

            batch = test[i:i+self.batch_size]
            imgs = batch[:,0]
            questions = list(batch[:,1])
            labels = batch[:,2]
            if (imgs == []) or (questions == []) or (labels == []):
                continue

            with torch.no_grad():

                class_pred = self.make_prediction(imgs, questions, 'test')
                y_pred = torch.log_softmax(class_pred.unsqueeze(0), dim=1)
                _, y_pred_tags = torch.max(y_pred, dim=1)
                gt = torch.zeros(class_pred.shape).cuda()
                gt[int(labels[0])] = 1.0
                if (int(labels[0]) == y_pred_tags[0].item()):
                    total_correct+=1
                
                predictions.append([int(labels[0]), y_pred_tags[0].item()])
                
        print (f"Test accuracy is {total_correct/len(test)} \n")

        if (return_all == True):
            return predictions
        else:
            return total_correct/len(test)

    def train(self, start_epoch=0, epochs=2, batch_size=1, load_path=None, save_path=None):

        num_batches = int(len(train))
        train_loss_log = []
        test_acc_log = []
        max_test = 0.0

        print (save_path)

        self.model.train()
        
        if load_path is not None:
            print (load_path)
            checkpt = torch.load(load_path)
            train_loss_log = checkpt['loss']
            self.model.load_state_dict(checkpt['vb_model_state_dict'])
            start_epoch = checkpt['epoch'] + 1
            max_test = checkpt['max_test']
            test_acc_log = checkpt['test_acc_log']
            batch_size = checkpt['batch_size']
            self.lr = checkpt['lr']

        for ep in range(start_epoch, epochs):
            self.model.train()
            train_loss = 0.0

            for i in range(num_batches):

                assert(self.model.training)

                if (i%4000==0 and i>0):
                    print (f'Epoch {ep}, {i}/{num_batches} batches, loss is {train_loss/i}')
        
                batch = train[i:i+self.batch_size]
                imgs = batch[:,0]
                questions = list(batch[:,1])
                labels = batch[:,2]

                if (imgs == []) or (questions == []) or (labels == []):
                    continue

                class_pred = self.make_prediction(imgs, questions, 'train')
                gt = torch.zeros(class_pred.shape).cuda()
                gt[int(labels[0])] = 1.0
                loss = self.loss(class_pred.unsqueeze(0), gt.unsqueeze(0))
                loss.backward()
                train_loss += loss.item()
                
                if ((i+1)%batch_size==0 or i==len(train)):
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                    
            (train_loss_log.append(train_loss))
            
            if (save_path != None and (ep+1)%3==0):
                torch.save({
                    'epoch': ep,
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'loss': train_loss_log,
                    'vb_model_state_dict': self.model.state_dict(),
                    'max_test': max_test,
                    'test_acc_log': test_acc_log,
                    'batch_size': batch_size,
                    'lr' : self.lr
                }, save_path+f"/onlytext_b{batch_size}_lr{int(self.lr)}_{ep}.pth")
                print ("Saved model to: ", save_path+f"/onlytext_b{batch_size}_lr{int(1000000*self.lr)}_{ep}.pth")

            
            print (f'Completed {ep+1} epochs out of {epochs}, loss is {train_loss_log[ep]/len(train)} \n')
            test_acc = self.test(batch_size=1)
            try:
                if (test_acc > max_test):
                    max_test = test_acc
                    torch.save({
                        'epoch': ep,
                        'optimizer_state_dict': self.optimizer.state_dict(),
                        'loss': train_loss_log,
                        'vb_model_state_dict': self.model.state_dict(),
                        'max_test': max_test,
                        'test_acc_log': test_acc_log,
                        'batch_size': batch_size,
                        'lr': self.lr
                    }, save_path+f"/onlytext_b{batch_size}_lr{int(self.lr)}_best.pth")
                    print ("Saved model to: ", save_path+f"/onlytext_b{batch_size}_lr{int(1000000*self.lr)}_best.pth")
            except:
                print ("Could not check for the best model")
            

In [None]:
print ("Loading model")
visualbert = VisualBERTModel(len(ans2label), lr=0.00001, use_weights=False)

#load visual embeddings from pickle file
print ("Loading visual embeddings from file")
visual_embeddings_path = "/content/drive/MyDrive/CS685/project/img_features"
visualbert.load_visual_embeddings(visual_embeddings_path+"_train.pkl", 'train')
visualbert.load_visual_embeddings(visual_embeddings_path+"_test.pkl", 'test')

print ("Success")

Loading model
Loading visual embeddings from file
Success


In [None]:
load_path = None #'/content/drive/MyDrive/CS685/project/vb/batched_best.pth')
visualbert.loss = visualbert.loss.cuda()
visualbert.train(epochs=20,batch_size=8,save_path='/content/drive/MyDrive/CS685/project/vb/onlytext',load_path=load_path)

/content/drive/MyDrive/CS685/project/vb/onlytext




Epoch 0, 4000/19755 batches, loss is 5.474832760185003
Epoch 0, 8000/19755 batches, loss is 4.527506230980507
Epoch 0, 12000/19755 batches, loss is 4.281121228742918
Epoch 0, 16000/19755 batches, loss is 4.107028461766705
Completed 1 epochs out of 20, loss is 3.91989210078091 

Evaluating




Test accuracy is 0.41436940041350795 

Saved model to:  /content/drive/MyDrive/CS685/project/vb/onlytext/onlytext_b8_lr10_best.pth
Epoch 1, 4000/19755 batches, loss is 3.955085031014867
Epoch 1, 8000/19755 batches, loss is 2.9828177307977604
Epoch 1, 12000/19755 batches, loss is 2.69743658782955
Epoch 1, 16000/19755 batches, loss is 2.6197090527741964
Completed 2 epochs out of 20, loss is 2.4692756544705503 

Evaluating
Test accuracy is 0.47484493452791177 

Saved model to:  /content/drive/MyDrive/CS685/project/vb/onlytext/onlytext_b8_lr10_best.pth
Epoch 2, 4000/19755 batches, loss is 3.5999522047850623
Epoch 2, 8000/19755 batches, loss is 2.661463652789462
Epoch 2, 12000/19755 batches, loss is 2.3852718366539047
Epoch 2, 16000/19755 batches, loss is 2.2729843676850723
Saved model to:  /content/drive/MyDrive/CS685/project/vb/onlytext/onlytext_b8_lr10_2.pth
Completed 3 epochs out of 20, loss is 2.1385722459412557 

Evaluating
Test accuracy is 0.47070985527222603 

Epoch 3, 4000/19755 ba

KeyboardInterrupt: ignored

# Additonal Testing/Analysis

In [11]:
# path = "/content/drive/MyDrive/PathVQA/split/"
# train, test, val, ans2label = load_data(path)
label2ans = {}
for key in ans2label:
    label2ans[ans2label[key]] = key
    # print (len(label2ans))

In [None]:
preds = np.asarray(preds)
print ((preds).shape)
label_coverage = preds[:,0]
print (len(set(label_coverage))) 

(5804, 2)
464


# Verifying test accuracies

In [None]:
load_paths = ['/content/drive/MyDrive/CS685/project/vb/linear_regressor/batched_best.pth',
              '/content/drive/MyDrive/CS685/project/vb/deep_classifier/upgraded_b8_lr0_17.pth',
              '/content/drive/MyDrive/CS685/project/vb/unfrozen/unfrozen_b8_lr0_best.pth',
              '/content/drive/MyDrive/CS685/project/vb/weighted/unfrozen_b8_lr0_best.pth',
              '/content/drive/MyDrive/CS685/project/vb/onlytext/onlytext_b8_lr0_best.pth',
              '/content/drive/MyDrive/CS685/project/vb/augmented/augmented_b8_lr0_best.pth']

In [None]:
# load_path = '/content/drive/MyDrive/CS685/project/vb/unfrozen/unfrozen_b8_lr0_best.pth'
for load_path in load_paths:
    print (load_path)
    checkpt = torch.load(load_path)
    try:
        print (checkpt['lr'])
        print (checkpt['batch_size'])
    except:
        print ("Did not store lr and batch size earlier")
    print (checkpt['test_acc_log'])
    print (checkpt['max_test'])
    print ("\n")
# preds = visualbert.test(batch_size=1,load_path=load_path,return_all=False)

/content/drive/MyDrive/CS685/project/vb/linear_regressor/batched_best.pth
Did not store lr and batch size earlier
[]
0.27705031013094417


/content/drive/MyDrive/CS685/project/vb/deep_classifier/upgraded_b8_lr0_17.pth
0.001
8
[]
0.3282219159200551


/content/drive/MyDrive/CS685/project/vb/unfrozen/unfrozen_b8_lr0_best.pth
1e-05
8
[]
0.47088215024121294


/content/drive/MyDrive/CS685/project/vb/weighted/unfrozen_b8_lr0_best.pth
1e-05
8
[]
0.4739834596829773


/content/drive/MyDrive/CS685/project/vb/onlytext/onlytext_b8_lr0_best.pth
1e-05
8
[]
0.47484493452791177


/content/drive/MyDrive/CS685/project/vb/augmented/augmented_b8_lr0_best.pth
1e-05
8
[]
0.5430737422467264




# Error analysis

In [18]:
from sklearn import metrics
load_paths = ['/content/drive/MyDrive/CS685/project/vb/augmented/augmented_b8_lr0_best.pth']
incorrect_exs = []

for load_path in load_paths:
    print(load_path)
    gt_preds = visualbert.test(batch_size=1,load_path=load_path,return_all=True)
    gt, preds = gt_preds[:,0], gt_preds[:,1]
    print (f'Predicted class spans over {len(set(preds))} out of {len(set(gt))} classes in test set')
    print (metrics.f1_score(gt, preds, average='weighted'))

/content/drive/MyDrive/CS685/project/vb/augmented/augmented_b8_lr0_best.pth
Evaluating
Loading path from /content/drive/MyDrive/CS685/project/vb/augmented/augmented_b8_lr0_best.pth




Test accuracy is 0.5366988283942109 

Predicted class spans over 43 out of 464 classes in test set
0.5142479599612664
