In [1]:
import ruamel.yaml as yaml
import numpy as np
import torch
from torch import nn
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, PILToTensor
from tqdm import tqdm
import matplotlib.pyplot as plt
from PIL import Image
import json
from torch.utils.data import DataLoader
import torch
from torchvision.datasets import CocoCaptions

from collections import OrderedDict
import gc
from typing import Any, Tuple, Callable, Optional, List
from sklearn.metrics import recall_score

from torchvision import transforms
from ruamel.yaml import YAML
import os
import utils
import spacy

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Index of the GPU that tensors and the model are moved to via `.cuda(cuda_card)`.
cuda_card = 0

In [2]:
# spaCy English pipeline; used further down to read the part-of-speech tag of each caption token.
pos_tagger = spacy.load('en_core_web_sm')

In [3]:
class CocoCustom(CocoCaptions):
    """COCO captions dataset that also returns the image id and exposes the raw annotations.

    In addition to what ``CocoCaptions`` provides, the parsed annotation file is
    kept in ``self.annotations`` so captions can be addressed by their flat
    position inside ``annotations['annotations']``, and ``self.num_captions``
    holds the length of that list.
    """

    def __init__(
        self,
        root: str,
        annFile: str,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        transforms: Optional[Callable] = None,
    ) -> None:
        """Initialise the parent dataset and load the annotation JSON.

        Args:
            root: Image directory, forwarded to ``CocoCaptions``.
            annFile: Path of the caption annotation JSON file.
            transform: Forwarded to ``CocoCaptions``.
            target_transform: Forwarded to ``CocoCaptions``.
            transforms: Forwarded to ``CocoCaptions``.
        """
        super().__init__(root, annFile, transform, target_transform, transforms)

        # Context manager so the file handle is released as soon as parsing is done.
        with open(annFile) as annotation_file:
            self.annotations = json.load(annotation_file)
        self.num_captions = len(self.annotations['annotations'])

    def __getitem__(self, index: int) -> Tuple[Any, Any, Any]:
        """Return ``(image, image_id, target)`` for the sample at ``index``.

        ``self.transforms`` is applied to image and target when it is not None.
        """
        image_id = self.ids[index]
        image = self._load_image(image_id)
        target = self._load_target(image_id)

        if self.transforms is not None:
            image, target = self.transforms(image, target)

        return image, image_id, target

    def getAnnotationRange(self, index: int, count: int) -> List[Any]:
        """Return the captions of annotations ``index`` .. ``index + count - 1``.

        Raises:
            IndexError: If the requested range runs past the annotation list.
        """
        entries = self.annotations['annotations']
        return [entries[position]['caption'] for position in range(index, index + count)]

    def getImgIdFromAnnotationIndex(self, annotation_index: int) -> int:
        """Return the ``image_id`` stored on the annotation at ``annotation_index``."""
        return self.annotations['annotations'][annotation_index]['image_id']

    def _encodeCaptionBatch(self, text_encoder, tokenize, start: int, count: int):
        """Tokenize, encode and normalise ``count`` captions beginning at ``start``."""
        tokenized = tokenize(self.getAnnotationRange(start, count)).cuda(cuda_card)
        # Only the values are needed, so skip building an autograd graph.
        with torch.no_grad():
            features = text_encoder(tokenized, get_all_token=False)
        return normalize_vector(features.detach().cpu().numpy().astype('float32'))

    def buildFaissIndex(self, text_encoder, tokenize, batch_size, nlist):
        """Encode every caption and add it to a FAISS IVF-flat index.

        The index is trained on the first batch only; vectors are added in
        annotation order, so a vector's position in the index equals its
        position in ``annotations['annotations']``.

        NOTE(review): ``faiss`` and ``normalize_vector`` are not imported or
        defined in the visible part of this notebook; they must be in scope
        before this method is called.

        Args:
            text_encoder: Callable invoked as
                ``text_encoder(tokens, get_all_token=False)``.
            tokenize: Callable turning a list of caption strings into a tensor.
            batch_size: Number of captions encoded per step (>= 1).
            nlist: Passed to ``faiss.IndexIVFFlat`` as its third argument.

        Returns:
            The populated index.

        Raises:
            ValueError: If there are no captions or ``batch_size`` is below 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        if self.num_captions == 0:
            raise ValueError("cannot build an index from an empty annotation list")

        # Clamp so a dataset smaller than one batch does not index out of range.
        first_count = min(batch_size, self.num_captions)
        encoded_captions = self._encodeCaptionBatch(text_encoder, tokenize, 0, first_count)
        vector_dimension = encoded_captions.shape[1]

        quantizer = faiss.IndexFlatIP(vector_dimension)
        # NOTE(review): no metric is passed here although the quantizer is an
        # inner-product index — confirm the resulting metric is the intended one.
        index = faiss.IndexIVFFlat(quantizer, vector_dimension, nlist)
        index.train(encoded_captions)
        index.add(encoded_captions)

        # Walk all remaining captions, including the final (possibly shorter) batch,
        # using the encoder/tokenizer that were passed in rather than notebook globals.
        for start in tqdm(range(batch_size, self.num_captions, batch_size)):
            count = min(batch_size, self.num_captions - start)
            index.add(self._encodeCaptionBatch(text_encoder, tokenize, start, count))

        return index

    def __len__(self) -> int:
        """Return the number of images (entries in ``self.ids``)."""
        return len(self.ids)


In [4]:
# Round-trip YAML loader.
# NOTE: this rebinds `yaml`, which until here referred to the `ruamel.yaml` module alias.
yaml = YAML(typ='rt')
# Context manager so the config file handle is closed once parsing is done.
with open("./configs/Pretrain.yaml", 'r') as config_file:
    config = yaml.load(config_file)

In [5]:
# Per-channel mean and standard deviation used to normalise image tensors.
normalize = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
# Evaluation preprocessing: bicubic resize to a square of side config['image_res'],
# conversion to a tensor, then normalisation.
test_transform = transforms.Compose([
    transforms.Resize((config['image_res'],config['image_res']),interpolation=Image.BICUBIC),
    transforms.ToTensor(),
    normalize,
])

def collate_coco(data):
    """Collate ``(image, image_id, captions)`` samples into a single batch dict.

    Args:
        data: Sequence of 3-tuples; each image must be a tensor of identical
            shape and each ``captions`` entry must be indexable.

    Returns:
        Dict with ``"images"`` (image tensors stacked along a new first
        dimension), ``"image_ids"`` (list of ids) and ``"captions"`` (the first
        caption of every sample; the remaining captions are discarded).
    """
    # Transpose the list of samples into three per-field columns.
    image_column, id_column, caption_column = zip(*data)

    return dict(
        images=torch.stack(image_column, dim=0),
        image_ids=list(id_column),
        captions=[sample_captions[0] for sample_captions in caption_column],
    )

# Dataset root, given relative to the notebook's working directory.
path = '../../Dataset/CV/mscoco/2014'
cocoCaptions = CocoCustom(root = path + '/val2014',
                        annFile = path + '/annotations/captions_val2014.json',
                        transform=test_transform)

print('Number of samples: ', len(cocoCaptions))
# Each item is (image, image_id, captions), as returned by CocoCustom.__getitem__.
img, img_id, target = cocoCaptions[3]

batch_size = 64
# collate_coco keeps only the first caption of every sample in a batch.
data_loader = DataLoader(cocoCaptions, collate_fn=collate_coco, batch_size=batch_size)

print("Image Size:", img.size())
print("Captions:", target)
print("Image Id:", img_id)


loading annotations into memory...
Done (t=0.12s)
creating index...
index created!
Number of samples:  40504
Image Size: torch.Size([3, 256, 256])
Captions: ['A loft bed with a dresser underneath it.', 'A bed and desk in a small room.', 'Wooden bed on top of a white dresser.', 'A bed sits on top of a dresser and a desk.', 'Bunk bed with a narrow shelf sitting underneath it. ']
Image Id: 133


In [2]:
from models.model_pretrain import ALBEF
from models.vit import interpolate_pos_embed
from models.tokenization_bert import BertTokenizer
import tokenizations

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Sanity check: tokenize the first caption of the first dataset sample.
# Requires `cocoCaptions` to exist in the kernel; the NameError recorded below
# presumably comes from running this cell before the dataset cell.
index = 0
caption = cocoCaptions[index][2][0]
out = tokenizer(caption)
out

2025-06-03 15:57:02.632791: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-03 15:57:02.639294: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748966222.647129 1237903 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748966222.649496 1237903 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-03 15:57:02.658755: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

NameError: name 'cocoCaptions' is not defined

In [None]:
# Build the tokenizer and the ALBEF model from the loaded pre-training config.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = ALBEF(config=config, text_encoder="bert-base-uncased", tokenizer=tokenizer)

# NOTE(review): no checkpoint is loaded in the visible cells, so the weights are
# whatever the ALBEF constructor produces — confirm this is intended.
model = model.cuda(cuda_card)
# Switch to evaluation mode before running inference.
model.eval()
print('Done')

In [26]:
from types import SimpleNamespace

# Use the same GPU index the model was moved to with `model.cuda(cuda_card)`, so
# inputs and model stay on one device even when `cuda_card` is not 0.
device = torch.device(f'cuda:{cuda_card}' if torch.cuda.is_available() else 'cpu')

# Build the namespace directly instead of first binding `args` to a dict.
args = SimpleNamespace(distributed=False)
utils.init_distributed_mode(args)

# Single forward pass on the first batch; gradients are not needed.
with torch.no_grad():
    batch = next(iter(data_loader))
    images = batch['images']
    # Pad to the longest caption in the batch and truncate at 25 tokens.
    captions = tokenizer(batch['captions'], padding='longest', truncation=True, max_length=25, return_tensors="pt")
    images = images.to(device)
    captions = captions.to(device)
    output = model(images, captions, masking_pos='all')

Not using distributed mode


In [41]:
# Work on copies so the tokenizer output held in `captions` is not modified.
input_ids = captions.input_ids.clone()
# presumably per-token part-of-speech ids produced by the project tokenizer — TODO confirm
pos_ids = captions.pos_ids.clone()
labels = input_ids.clone()
# Disabled call to the model's masking routine, kept for reference:
# input_ids, labels = model.mask(input_ids, pos_ids, 30522, images.device, targets=labels, masking_pos="ADJ")

In [44]:
# Integer id of the part-of-speech tag to mask (`pos_hash` maps tag name -> id).
masking_pos_id = pos_hash["ADJ"]
# Allocate on the same device as the token ids: the boolean mask below lives on
# that device, and a CPU tensor cannot be indexed with a CUDA mask.
masked_indices = torch.zeros(input_ids.shape, device=input_ids.device)
# Positions carrying the selected tag get weight 0.7, all others stay at 0
# (presumably a masking probability — TODO confirm).
masked_indices[pos_ids == masking_pos_id] = 0.7

torch.Size([64, 20])

In [62]:
# Inspect the part-of-speech ids of sample 7 in the batch.
pos_ids[7]

tensor([-1,  5,  0,  7,  1,  0,  5,  7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1], device='cuda:0')

In [61]:
# Boolean mask for sample 7: True where the POS id equals `masking_pos_id`.
(pos_ids==masking_pos_id)[7]

tensor([False, False,  True, False, False,  True, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False],
       device='cuda:0')

In [39]:
# Inspect the token ids of sample 4 in the batch.
input_ids[4]

tensor([  101,  2048, 21025, 27528,  7959,  2015,  1999,  1037,  2282,  2007,
         2111,  2559,  2012,  2068,  1012,   102,     0,     0,     0,     0],
       device='cuda:0')

In [31]:
# Part-of-speech tag names; a tag's position in this list is its integer id.
pos_classes = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPACE']
pos_hash = {c: i for i, c in enumerate(pos_classes)}

# Tag the first caption of the batch with only the listed pipeline components enabled.
with pos_tagger.select_pipes(enable=['morphologizer', 'tok2vec', 'tagger', 'attribute_ruler']):
    spacy_doc = pos_tagger(batch['captions'][0])
# One POS id per spaCy token, looked up through `pos_hash`.
spacy_pos = torch.tensor([pos_hash[t.pos_] for t in spacy_doc])
spacy_tokens = [t.text for t in spacy_doc]
spacy_tokens

['This',
 'wire',
 'metal',
 'rack',
 'holds',
 'several',
 'pairs',
 'of',
 'shoes',
 'and',
 'sandals']

In [12]:
from ARO.dataset_zoo import VG_Relation, VG_Attribution, COCO_Order, Flickr30k_Order
from torchvision import transforms
from PIL import Image

In [13]:
# NOTE(review): rebinds `normalize` with the same constants used for `test_transform` above.
normalize = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
# Preprocessing for the ARO datasets: fixed 256x256 bicubic resize, tensor
# conversion, then normalisation.
preprocess = transforms.Compose([
    transforms.Resize((256,256),interpolation=Image.BICUBIC),
    transforms.ToTensor(),
    normalize,
])

# NOTE(review): absolute, user-specific path — adjust when running on another machine.
root_dir="/home/pasitt/Dataset/CV/ARO"
# Setting download=True will download the dataset to `root_dir` if it's not already there.
# For VG-R and VG-A, this is a 1GB zip file that is a subset of GQA.

vgr_dataset = VG_Relation(image_preprocess=preprocess, download=True, root_dir=root_dir)
vga_dataset = VG_Attribution(image_preprocess=preprocess, download=True, root_dir=root_dir)

In [14]:
# Build the COCO-Order dataset from files already under the given root (download disabled).
# The recorded run below was interrupted (KeyboardInterrupt), so this name may be unset.
coco_order_dataset = COCO_Order(image_preprocess=preprocess, download=False, root_dir='/home/pasitt/Dataset/CV/mscoco/2014') 

Using downloaded and verified file: /home/pasitt/Dataset/CV/mscoco/2014/coco_karpathy_test.json


49it [00:03, 15.82it/s]


KeyboardInterrupt: 

In [None]:
# Build the Flickr30k-Order dataset for the 'test' split with the shared `preprocess` transform.
flickr_order_dataset = Flickr30k_Order(image_preprocess=preprocess, root_dir='/home/pasitt/Dataset/CV/flickr/', split='test')

In [None]:
# Show the 'caption_options' entry of the first VG-Attribution sample.
vga_dataset[0]['caption_options']