In [86]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [93]:
from IPython.display import clear_output, Image, display
import PIL.Image
import io
import json
import torch
import numpy as np
from _lxmert.processing_image import Preprocess
from _lxmert.visualizing_image import SingleImageViz
from _lxmert.modeling_frcnn import GeneralizedRCNN
from _lxmert._utils import Config
import _lxmert._utils
from transformers import LxmertForQuestionAnswering, LxmertTokenizer
import wget
import pickle
import os
from transformers import LxmertForPreTraining
# for visualizing output
def showarray(a, fmt="jpeg"):
    a = np.uint8(np.clip(a, 0, 255))
    f = io.BytesIO()
    PIL.Image.fromarray(a).save(f, fmt)
    display(Image(data=f.getvalue()))
    
lxmert_base = LxmertForPreTraining.from_pretrained("unc-nlp/lxmert-base-uncased").to(device)
# load models and model components
frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")

frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)

image_preprocess = Preprocess(frcnn_cfg)

lxmert_tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")

loading configuration file cache
loading weights file https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin from cache at /afs/cs.pitt.edu/usr0/arr159/.cache/torch/transformers/57f6df6abe353be2773f2700159c65615babf39ab5b48114d2b49267672ae10f.77b59256a4cf8343ae0f923246a81489fc8d82f98d082edc2d2037c977c0d9d0
All model checkpoint weights were used when initializing GeneralizedRCNN.

All the weights of GeneralizedRCNN were initialized from the model checkpoint at unc-nlp/frcnn-vg-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GeneralizedRCNN for predictions without further training.


In [44]:
import _lxmert
OBJ_URL = "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/genome/1600-400-20/objects_vocab.txt"
objids = _lxmert._utils.get_data(OBJ_URL)


In [96]:
import torch.nn as nn
import random

def get_object_detection_output(img_paths):
    images, sizes, scales_yx = image_preprocess(img_paths)
    output_dict = frcnn(
        images,
        sizes,
        scales_yx=scales_yx,
        padding="max_detections",
        max_detections=frcnn_cfg.max_detections,
        return_tensors="pt",
    )
    return output_dict
def get_objects(output_dict):
    objects=[]
    for tmp in zip(output_dict.get("obj_ids"),output_dict.get("obj_probs")):
        objects.append([objids[i] for i, p in zip(tmp[0].tolist(), tmp[1].tolist()) if p > 0.5])
    return objects
def pretrained_model_fwd_pass(img_paths, txt):
    output_dict=get_object_detection_output(img_paths)
    objects=get_objects(output_dict)
    normalized_boxes = output_dict.get("normalized_boxes").to(device)
    features = output_dict.get("roi_features").to(device)
    inputs = lxmert_tokenizer(
        txt,
        padding="max_length",
        max_length=77,
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt",
    ).to(device)
    cross_relationship_score= nn.Sigmoid()(lxmert_base(input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        visual_feats=features,
        visual_pos=normalized_boxes,
        token_type_ids=inputs.token_type_ids,
        output_attentions=False)['cross_relationship_score'])
    shuffled_ids=list(range(len(txt)))
    random.shuffle(shuffled_ids)
    inputs = lxmert_tokenizer(
        [txt[_id] for _id in shuffled_ids],
        padding="max_length",
        max_length=77,
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt",
    ).to(device)
    actually_different=[shuffled_ids[_id] == _id for _id in list(range(len(txt)))]
    random_cross_relationship_score= nn.Sigmoid()(lxmert_base(input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        visual_feats=features,
        visual_pos=normalized_boxes,
        token_type_ids=inputs.token_type_ids,
        output_attentions=False)['cross_relationship_score'])
    return objects, cross_relationship_score.tolist(), random_cross_relationship_score.tolist(), actually_different

False

In [97]:
import importlib 
import Dataset
importlib.reload(Dataset)
from Dataset import ReduceMAMIDataset, collate2
num_epochs = 30

# transform = transforms.Compose([
#     transforms.RandomHorizontalFlip(),
#     transforms.Resize((256, 256)),
#     transforms.ToTensor(),
#     transforms.Normalize((0.485, 0.456, 0.406),
#                           (0.229, 0.224, 0.225))])
from params import *
train_dataset = ReduceMAMIDataset(MAX_LEN, MAX_VOCAB, split='train', path_to_dataset='./Data/MASKED_TEXT_TRAINING', transform=None)
val_dataset = ReduceMAMIDataset(MAX_LEN, MAX_VOCAB, split='val', path_to_dataset='./Data/MASKED_TEXT_TRAINING', transform=None)

from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=16, num_workers=8, collate_fn=collate2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate2)

dataloader = {
    'train': train_loader,
    'val': val_loader
}

num_classes = 2



In [98]:
toy_data=[train_dataset[0], train_dataset[1]]
batch_img_id, batch_img_paths, batch_text=collate2(toy_data)
pretrained_model_fwd_pass(batch_img_paths, batch_text)

([['nose',
   'ear',
   'face',
   'dog',
   'eye',
   'eye',
   'eye',
   'eye',
   'mouth',
   'hair',
   'paw',
   'paw',
   'mouth',
   'hand'],
  ['eye',
   'eye',
   'face',
   'eye',
   'face',
   'face',
   'mouth',
   'nose',
   'dog',
   'face',
   'mouth',
   'dog',
   'mouth']],
 [[0.861376166343689, 0.16290029883384705],
  [0.5193160772323608, 0.4576598107814789]],
 [[0.861376166343689, 0.16290029883384705],
  [0.5193160772323608, 0.4576598107814789]],
 [True, True])

In [75]:
inputs = lxmert_tokenizer(
        shuffle(batch_text),
        padding="max_length",
        max_length=77,
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt",
    )

NameError: name 'shuffle' is not defined

In [74]:
inputs

{'input_ids': tensor([[  101,  1045,  5086,  1037, 17074,  2138,  1045,  2001,  2894,  2006,
          2026,  5798, 22843,  2014,  2005,  2321,  2781,  2059,  2081,  2014,
          2079,  1996, 10447,  1998, 11641,  2005,  1996,  2717,  1997,  1996,
          3178,  2033,  4168,  3993,  1012,  4012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101,  2023,  7966,  2056,  2026,  8407,  2298,  2066,  2070,  6167,
          4842,  2725,  1037,  3975,  1999,  2070, 17074,  6879,  1998,  2085,
          1045,  2064,  1005,  1056,  4895, 19763,  2009,  1012,  1030, 14405,
          9299,  2015,  2005,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0, 

In [8]:
import pickle
coco_data_path='/own_files/datasets/mscoco/train2014/'
with open(f'/afs/cs.pitt.edu/usr0/arr159/erhan_code/t/t/data/COCO2017_train_capdata.pkl', 'rb') as f:
    coco_df=pickle.load(f)

In [24]:
import random
caption, _, _, _= random.choice(coco_df['capdata'][0])
caption


'a long table with a flower arrangement in the middle for meetings'

In [28]:
from torch.utils.data import Dataset
import random
import os
class COCODataset(Dataset):
    def __init__(self, path='/own_files/datasets/mscoco'):
        with open(f'/afs/cs.pitt.edu/usr0/arr159/erhan_code/t/t/data/COCO2017_train_capdata.pkl', 'rb') as f:
            df=pickle.load(f)
            
        self.ids=df['path']
        self.captions=df['capdata']
        self.img_paths=[self.get_path(path, img_pth)  for img_pth in df['path']]
    def __len__(self):
        return len(self.ids)
    def get_path(self, path, filename):
        return os.path.join(path, ('train2014' if 'train2014' in filename else 'val2014'), filename)
    def __getitem__(self, i):
        caption, _, _, _= random.choice(self.captions[i])# extract cap
        return (self.ids[i], caption, self.img_paths[i])
    @staticmethod
    def collate_fn(batch):
        ids=[data[0] for data in batch]
        img_paths=[data[2] for data in batch]
        captions=[data[1] for data in batch]
        return ids, img_paths, captions
    

In [31]:
COCODataset()[7000]

('COCO_train2014_000000454708.jpg',
 'a computer sits on a l shaped desk',
 '/own_files/datasets/mscoco/train2014/COCO_train2014_000000454708.jpg')