In [1]:
import ruamel.yaml as yaml
import numpy as np
import torch
from torch import nn
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, PILToTensor
from tqdm import tqdm
import matplotlib.pyplot as plt
from PIL import Image
import json
from torch.utils.data import DataLoader
import torch
from torchvision.datasets import CocoCaptions

from collections import OrderedDict
from datasets import load_dataset
import gc
from typing import Any, Tuple, Callable, Optional, List
from sklearn.metrics import recall_score

from torchvision import transforms
from ruamel.yaml import YAML
import os
import utils
import spacy

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

cuda_card = 0

In [2]:
pos_tagger = spacy.load('en_core_web_sm')

In [3]:
class CocoCustom(CocoCaptions): 
    def __init__(
        self,
        root: str,
        annFile: str,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        transforms: Optional[Callable] = None,
    ) -> None:
        super().__init__(root, annFile, transform, target_transform, transforms)
        from pycocotools.coco import COCO

        self.annotations = json.load(open(annFile))
        self.num_captions = len(self.annotations['annotations'])

    def __getitem__(self, index: int) -> Tuple[Any, Any]:
        id = self.ids[index]
        image = self._load_image(id)
        target = self._load_target(id)

        if self.transforms is not None:
            image, target = self.transforms(image, target)

        return image, id, target

    def getAnnotationRange(self, index: int, count: int) -> List[Any]:
        return [self.annotations['annotations'][index]['caption'] for index in range(index, index+count)]

    def getImgIdFromAnnotationIndex(self, annotation_index: int) -> int:
        return self.annotations['annotations'][annotation_index]['image_id']
    
    def buildFaissIndex(self, text_encoder, tokenize, batch_size, nlist) :
        tokenized = tokenize(self.getAnnotationRange(0, batch_size)).cuda(cuda_card)
        encoded_captions = normalize_vector(text_encoder(tokenized, get_all_token=False).detach().cpu().numpy().astype('float32'))
        vector_dimension = encoded_captions.shape[1]
        
        quantizer = faiss.IndexFlatIP(vector_dimension)
        index = faiss.IndexIVFFlat(quantizer, vector_dimension, nlist)
        index.train(encoded_captions)
        index.add(encoded_captions)
        
        for i in tqdm(range(batch_size, self.num_captions - batch_size, batch_size)):
            tokenized = clip.tokenize(self.getAnnotationRange(i, batch_size)).cuda(cuda_card)
            encoded_captions = normalize_vector(model.encode_text(tokenized, get_all_token=False).detach().cpu().numpy().astype('float32'))
            index.add(encoded_captions)

        return index

    def __len__(self) -> int:
        return len(self.ids)


In [4]:
yaml = YAML(typ='rt')
config = yaml.load(open("./configs/Pretrain.yaml", 'r'))

In [5]:
normalize = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
test_transform = transforms.Compose([
    transforms.Resize((config['image_res'],config['image_res']),interpolation=Image.BICUBIC),
    transforms.ToTensor(),
    normalize,
])

def collate_coco(data):
    images, image_ids, captions = zip(*data)

    # Stack image tensors into a single batch tensor
    images = torch.stack(images, dim=0)  # Shape: (batch_size, C, H, W)

    # Convert image_ids into a list
    image_ids = list(image_ids)  # Or use torch.tensor(image_ids) if they are numeric

    # Collate captions
    # If captions have variable lengths, you can return them as a list
    first_captions = [cap[0] for cap in captions]  # First caption of each sample

    return {
        "images": images,
        "image_ids": image_ids,
        "captions": first_captions,  # Only the first caption
    }

path = '../../Dataset/CV/mscoco/2017'
cocoCaptions = CocoCustom(root = path + '/val2017',
                        annFile = path + '/annotations/captions_val2017.json',
                        transform=test_transform)

print('Number of samples: ', len(cocoCaptions))
img, img_id, target = cocoCaptions[3]

batch_size = 64
data_loader = DataLoader(cocoCaptions, collate_fn=collate_coco, batch_size=batch_size)

print("Image Size:", img.size())
print("Captions:", target)
print("Image Id:", img_id)


loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
Number of samples:  5000
Image Size: torch.Size([3, 256, 256])
Captions: ["A stop sign is mounted upside-down on it's post. ", 'A stop sign that is hanging upside down.', 'An upside down stop sign by the road.', 'a stop sign put upside down on a metal pole ', 'A stop sign installed upside down on a street corner']
Image Id: 724


In [6]:
from models.model_pretrain import ALBEF
from models.vit import interpolate_pos_embed
from models.tokenization_bert import BertTokenizer
import tokenizations

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

index = 0
caption = cocoCaptions[index][2][0]
out = tokenizer(caption)
out



{'input_ids': [101, 1037, 2450, 4832, 1999, 1996, 7759, 2181, 2012, 1996, 2795, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'pos_ids': [-1, 5, 7, 15, 1, 5, 7, 7, 1, 5, 7, 12, -1]}

In [7]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = ALBEF(config=config, text_encoder="bert-base-uncased", tokenizer=tokenizer)

model = model.cuda(cuda_card)
model.eval()
print('Done')

Downloading: "https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth" to /home/pasitt/.cache/torch/hub/checkpoints/deit_base_patch16_224-b5f2ef4d.pth
100%|██████████| 330M/330M [00:29<00:00, 11.7MB/s] 


reshape position embedding from 196 to 256
_IncompatibleKeys(missing_keys=[], unexpected_keys=['head.weight', 'head.bias'])


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Done


In [8]:
from types import SimpleNamespace

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

args = {
    'distributed': False
}
args = SimpleNamespace(**args)
utils.init_distributed_mode(args)

with torch.no_grad():
    batch = next(iter(data_loader))
    images = batch['images']
    captions = tokenizer(batch['captions'], padding='longest', truncation=True, max_length=25, return_tensors="pt")
    images = images.to(device)
    captions = captions.to(device)
    output = model(images, captions)

Not using distributed mode


In [9]:
captions['pos_ids'][0]

tensor([-1,  5,  7, 15,  1,  5,  7,  7,  1,  5,  7, 12, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1], device='cuda:0')

In [10]:
batch['captions'][0]

'A woman stands in the dining area at the table.'

In [11]:
pos_classes = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
pos_hash = {c: i for i, c in enumerate(pos_classes)}

with pos_tagger.select_pipes(enable=['morphologizer', 'tok2vec', 'tagger', 'attribute_ruler']):
    spacy_doc = pos_tagger(batch['captions'][4])
spacy_pos = torch.tensor([pos_hash[t.pos_] for t in spacy_doc])
spacy_tokens = [t.text for t in spacy_doc]

In [12]:
spacy_tokens

['Three',
 'teddy',
 'bears',
 ',',
 'each',
 'a',
 'different',
 'color',
 ',',
 'snuggling',
 'together',
 '.']

In [13]:
bert_tokens = tokenizer.convert_ids_to_tokens(captions.input_ids[4])
bert_tokens[1:16]

['three',
 'teddy',
 'bears',
 ',',
 'each',
 'a',
 'different',
 'color',
 ',',
 's',
 '##nu',
 '##gg',
 '##ling',
 'together',
 '.']

In [14]:
a2b, b2a = tokenizations.get_alignments(spacy_tokens, bert_tokens[1:16])
a2b

[[0], [1], [2], [3], [4], [5], [6], [7], [8], [9, 10, 11, 12], [13], [14]]

In [15]:
pos_ids = torch.zeros(len(bert_tokens[1:16]))
for idx, id_map in enumerate(a2b):
    for i in id_map:
        pos_ids[i] = spacy_pos[idx]
pos_ids

tensor([ 8.,  7.,  7., 12.,  5.,  5.,  0.,  7., 12., 15., 15., 15., 15.,  2.,
        12.])

In [18]:
import json
temp = json.load(open('data/cc3m_validation.json','r'))

JSONDecodeError: Extra data: line 2 column 1 (char 131)