In [5]:
import torch
import skimage.io as io
import skimage.transform as transform
import torchvision
import clip
import pandas as pd
from PIL import Image
import pickle
import json
import os
from tqdm import tqdm
import argparse
import string
import random
import numpy as np
from transformers import set_seed, GPT2Config, GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
def isEglish(s):
    return s.isascii()

def punc(s):
    for c in string.punctuation:
        s=s.replace(c,"")
    return s.lower() 

In [2]:


def preprocess_pathvqa(split, out_path):
    device = torch.device('cuda:0')
    clip_model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
    data =  pd.read_pickle('/media/tjvsonsbeek/Data1/vqa_datasets/pathvqa/pathVQAprocessed/split/qas/{}/{}_qa.pkl'.format(split,split))
    print("%0d captions loaded from json " % len(data))
    all_img_prefixes = []
    all_txt_prefixes = []
    img_ids = []
    img_paths = []
    all_questions = []
    all_answers = []
    compact_dict = {} 
    for i in tqdm(range(len(data))):
        d = data[i]
        if d['answer']!="yes" and d['answer']!="no":
            img_id = d["image"]
            filename = "/media/tjvsonsbeek/Data1/vqa_datasets/pathvqa/pathVQAprocessed/split/images/{}/{}.jpg".format(split,img_id)
            with torch.no_grad():
                prefix_i = clip_model.encode_image(preprocess(Image.open(filename)).unsqueeze(0).to(device)).cpu()
                prefix_t = clip_model.encode_text(clip.tokenize(d['question']).to(device)).cpu()
                
            if img_id not in compact_dict.keys():
                compact_dict[img_id] = [[d['question']],[d['answer']],[prefix_t],prefix_i,filename]
            else:
                compact_dict[img_id][0].append(d['question'])
                compact_dict[img_id][1].append(d['answer'])
                compact_dict[img_id][2].append(prefix_t)
    for img_id, imgs in enumerate(compact_dict.keys()):
        all_img_prefixes.append(compact_dict[imgs][3])
        for q in range(len(compact_dict[imgs][0])):
            all_txt_prefixes.append(compact_dict[imgs][2][q])
            all_questions.append(compact_dict[imgs][0][q])
            all_answers.append(compact_dict[imgs][1][q])
            img_ids.append(img_id)
            img_paths.append(compact_dict[imgs][4])

    all_data = {"img_prefix": torch.cat(all_img_prefixes, dim=0), "img_ids": img_ids, "questions": all_questions,'answers': all_answers, 'txt_prefix': torch.cat(all_txt_prefixes, dim=0),'img_path': img_paths}

    with open(out_path, 'wb') as f:
        pickle.dump(all_data,f)
    print('Done')
    print("%0d embeddings saved " % len(all_txt_prefixes))
for split in ['train','val','test']:
    out_path = "/media/tjvsonsbeek/Data1/vqa_datasets/pathvqa/oa_{}.pkl".format(split)
    preprocess_pathvqa(split,out_path)






19755 captions loaded from json 


100%|██████████| 19755/19755 [04:32<00:00, 72.49it/s] 


Done
9949 embeddings saved 
6279 captions loaded from json 


100%|██████████| 6279/6279 [01:26<00:00, 72.71it/s] 


Done
3144 embeddings saved 
6761 captions loaded from json 


100%|██████████| 6761/6761 [01:32<00:00, 73.14it/s] 


Done
3370 embeddings saved 


In [3]:
def update_classes(pkl_train, pkl_val, pkl_test):
    with open(pkl_train, 'rb') as f:
            data_train = pickle.load(f)
    with open(pkl_val, 'rb') as f:
            data_val = pickle.load(f)
    with open(pkl_test, 'rb') as f:
            data_test = pickle.load(f)
    
    cur_id = 0
    class_names_list = []
    class_ids_list = [[],[],[]]



    for i, data in enumerate([data_train,data_val,data_test]):
        
        for answer in data['answers']:
            if answer not in class_names_list:
                class_names_list.append(answer)
                class_ids_list[i].append(cur_id)
                cur_id+=1
            else:
                class_ids_list[i].append(class_names_list.index(answer))
    q_lens = []
    a_lens = []
    for question in data_train['questions']:
        q_lens.append(len(tokenizer.encode(question)))
    for answer in data_train['answers']:
        a_lens.append(len(tokenizer.encode(str(answer))))
    print((int(np.mean(q_lens)+2*np.std(q_lens)),int(np.mean(a_lens)+2*np.std(a_lens))))
    

    
    data_train['class_ids'] = class_ids_list[0]
    data_val['class_ids'] = class_ids_list[1]
    data_test['class_ids'] = class_ids_list[2]
    
    data_train['class_names'] = class_names_list
    data_val['class_names'] = class_names_list
    data_test['class_names'] = class_names_list
    
    data_train['max_classes']=len(class_names_list)
    data_val['max_classes']=len(class_names_list)
    data_test['max_classes']=len(class_names_list)
    print(len(class_names_list))
    print(len(data_train['questions']))
    data_train['max_seqs_len']=(int(np.mean(q_lens)+2*np.std(q_lens)),int(np.mean(a_lens)+2*np.std(a_lens)))
    data_val['max_seqs_len']=(int(np.mean(q_lens)+2*np.std(q_lens)),int(np.mean(a_lens)+2*np.std(a_lens)))
    data_test['max_seqs_len']=(int(np.mean(q_lens)+2*np.std(q_lens)),int(np.mean(a_lens)+2*np.std(a_lens)))
    
    with open(pkl_train, 'wb') as f:
        pickle.dump(data_train,f)
    with open(pkl_val, 'wb') as f:
        pickle.dump(data_val,f)
    with open(pkl_test, 'wb') as f:
        pickle.dump(data_test,f)
# update_classes("/media/tjvsonsbeek/Data1/vqa_datasets/pathvqa/oa_{}.pkl".format('train'),
#                "/media/tjvsonsbeek/Data1/vqa_datasets/pathvqa/oa_{}.pkl".format('val'),
#                "/media/tjvsonsbeek/Data1/vqa_datasets/pathvqa/oa_{}.pkl".format('test'))
# update_classes('/media/tjvsonsbeek/Data1/vqa_datasets/radvqa/train.pkl',
#                '/media/tjvsonsbeek/Data1/vqa_datasets/radvqa/val.pkl',
#                '/media/tjvsonsbeek/Data1/vqa_datasets/radvqa/test.pkl')
# update_classes('/media/tjvsonsbeek/Data1/vqa_datasets/slake/train.pkl',
#                '/media/tjvsonsbeek/Data1/vqa_datasets/slake/val.pkl',
#                '/media/tjvsonsbeek/Data1/vqa_datasets/slake/test.pkl')
update_classes('/media/tjvsonsbeek/Data1/vqa_datasets/ovqa/train.pkl',
               '/media/tjvsonsbeek/Data1/vqa_datasets/ovqa/val.pkl',
               '/media/tjvsonsbeek/Data1/vqa_datasets/ovqa/test.pkl')
# update_classes('/media/tjvsonsbeek/Data1/vqa_datasets/medvqa2019/train.pkl',
#                '/media/tjvsonsbeek/Data1/vqa_datasets/medvqa2019/val.pkl',
#                '/media/tjvsonsbeek/Data1/vqa_datasets/medvqa2019/test.pkl')

(37, 21)
1056
15189


In [None]:

answer_tokens = []
question2img = []
max_seq_len = 0
for answer, img_id in zip(answers_raw,img_ids):
    answer_tokens.append(torch.tensor(self.tokenizer.encode(str(answer)), dtype=torch.int64))
    question2img.append(img_id)
    max_seq_len = max(max_seq_len, self.answer_tokens[-1].shape[0])

# self.max_seq_len = max_seq_len
with open(f"{data_path[:-4]}_tokens.pkl", 'wb') as f:
    pickle.dump([self.answer_tokens, self.question2img, max_seq_len], f)

In [6]:
def preprocess_ovqa(split, out_path):
    device = torch.device('cuda:0')
    clip_model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
    with open('/media/tjvsonsbeek/Data1/vqa_datasets/ovqa/{}set.json'.format(split)) as f:
        data =  json.load(f)
    print("%0d captions loaded from json " % len(data))
    all_img_prefixes = []
    all_txt_prefixes = []
    img_ids = []
    img_paths = []
    all_questions = []
    all_answers = []
    compact_dict = {} 
    for i in tqdm(range(len(data))):
        d = data[i]
        if isEglish(d['answer']) and isEglish(d['question']):
            img_id = d["image_name"][:-4]
            filename = "/media/tjvsonsbeek/Data1/vqa_datasets/ovqa/img/"+d['image_name']
            with torch.no_grad():
                prefix_i = clip_model.encode_image(preprocess(Image.open(filename)).unsqueeze(0).to(device)).cpu()          
            if img_id not in compact_dict.keys():
                compact_dict[img_id] = [[punc(d['question'])],[punc(d['answer'])],prefix_i,filename]
            else:
                compact_dict[img_id][0].append(punc(d['question']))
                compact_dict[img_id][1].append(punc(d['answer']))
    for img_id, imgs in enumerate(compact_dict.keys()):
        all_img_prefixes.append(compact_dict[imgs][2])
        for q in range(len(compact_dict[imgs][0])):
            all_questions.append(compact_dict[imgs][0][q])
            all_answers.append(compact_dict[imgs][1][q])
            img_ids.append(img_id)
            img_paths.append(compact_dict[imgs][2])
    check=torch.cat(all_img_prefixes, dim=0)

    all_data = {"img_prefix": torch.cat(all_img_prefixes, dim=0), "img_ids": img_ids, "questions": all_questions,'answers': all_answers,'img_path': img_paths}

    with open(out_path, 'wb') as f:
        pickle.dump(all_data,f)
    print('Done')
    print("%0d embeddings saved " % len(all_questions))
for split in ['train','test','val']:
    out_path = "/media/tjvsonsbeek/Data1/vqa_datasets/ovqa/{}.pkl".format(split)
    preprocess_ovqa(split,out_path)

15216 captions loaded from json 


  0%|          | 0/15216 [00:00<?, ?it/s]


NameError: name 'string' is not defined

In [3]:
def preprocess_slake(split, out_path):
    device = torch.device('cuda:0')
    clip_model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
    with open('/media/tjvsonsbeek/Data1/vqa_datasets/slake/Slake1.0/{}.json'.format(split)) as f:
        data =  json.load(f)
    print("%0d captions loaded from json " % len(data))
    all_img_prefixes = []
    all_txt_prefixes = []
    img_ids = []
    img_paths = []
    all_questions = []
    all_answers = []
    compact_dict = {} 
    for i in tqdm(range(len(data))):
        d = data[i]
        if isEglish(d['answer']) and isEglish(d['question']):
            img_id = d["img_id"]
            filename = "/media/tjvsonsbeek/Data1/vqa_datasets/slake/Slake1.0/imgs/"+d['img_name']
            with torch.no_grad():
                prefix_i = clip_model.encode_image(preprocess(Image.open(filename)).unsqueeze(0).to(device)).cpu()          
            if img_id not in compact_dict.keys():
                compact_dict[img_id] = [[d['question']],[d['answer']],prefix_i,filename]
            else:
                compact_dict[img_id][0].append(d['question'])
                compact_dict[img_id][1].append(d['answer'])
    for img_id, imgs in enumerate(compact_dict.keys()):
        all_img_prefixes.append(compact_dict[imgs][2])
        for q in range(len(compact_dict[imgs][0])):
            all_questions.append(compact_dict[imgs][0][q])
            all_answers.append(compact_dict[imgs][1][q])
            img_ids.append(img_id)
            img_paths.append(compact_dict[imgs][2])
    check=torch.cat(all_img_prefixes, dim=0)

    all_data = {"img_prefix": torch.cat(all_img_prefixes, dim=0), "img_ids": img_ids, "questions": all_questions,'answers': all_answers,'img_path': img_paths}

    with open(out_path, 'wb') as f:
        pickle.dump(all_data,f)
    print('Done')
    print("%0d embeddings saved " % len(all_questions))
for split in ['train','test']:
    out_path = "/media/tjvsonsbeek/Data1/vqa_datasets/slake/{}.pkl".format(split)
    preprocess_slake(split,out_path)

9835 captions loaded from json 


100%|██████████| 9835/9835 [01:23<00:00, 117.41it/s]


Done
4919 embeddings saved 
2094 captions loaded from json 


100%|██████████| 2094/2094 [00:18<00:00, 114.58it/s]

Done
1061 embeddings saved 





: 

In [23]:

device = torch.device('cuda:0')
clip_model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
with open("/media/tjvsonsbeek/Data1/vqa_datasets/radvqa/radvqa/VQA_RAD Dataset Public.json", 'r') as f:
  data = json.load(f)
compact_dict = {} 
compact_dict_test = {}
for i in tqdm(range(len(data))):
    d = data[i]
    img_id = d["image_name"]
    filename = "/media/tjvsonsbeek/Data1/vqa_datasets/radvqa/radvqa/VQA_RAD_Image_Folder/{}".format(img_id)
    with torch.no_grad():
        prefix_i = clip_model.encode_image(preprocess(Image.open(filename)).unsqueeze(0).to(device)).cpu()
        prefix_t = clip_model.encode_text(clip.tokenize(d['question']).to(device)).cpu()
    if 'test' in d['phrase_type']:
        if img_id not in compact_dict_test.keys():
            compact_dict_test[img_id] = [[d['question']],[d['answer']],[prefix_t],prefix_i]
        else:
            compact_dict_test[img_id][0].append(d['question'])
            compact_dict_test[img_id][1].append(d['answer'])
            compact_dict_test[img_id][2].append(prefix_t)
    else:
        if img_id not in compact_dict.keys():
            compact_dict[img_id] = [[d['question']],[d['answer']],[prefix_t],prefix_i]
        else:
            compact_dict[img_id][0].append(d['question'])
            compact_dict[img_id][1].append(d['answer'])
            compact_dict[img_id][2].append(prefix_t)
#trainval
all_img_prefixes = []
for img_id, imgs in enumerate(compact_dict.keys()):
    all_img_prefixes.append(compact_dict[imgs][3])
n = len(all_img_prefixes)
indices = random.sample(range(n), int(0.8 * n)) # generate random indices for the training set
t_img_prefixes = [ all_img_prefixes[i] for i in indices ]
v_img_prefixes = [ all_img_prefixes[i] for i in range(n) if i not in indices ]


all_txt_prefixes = []
img_ids = []
all_questions = []
all_answers = []

val_all_txt_prefixes = []
val_img_ids = []
val_all_questions = []
val_all_answers = []

for img_id, imgs in tqdm(enumerate(compact_dict.keys())):
    if np.any([np.all(np.equal(compact_dict[imgs][3].numpy(), tensor_i.numpy())) for tensor_i in t_img_prefixes]):
        for q in range(len(compact_dict[imgs][0])):
            all_txt_prefixes.append(compact_dict[imgs][2][q])
            all_questions.append(compact_dict[imgs][0][q])
            all_answers.append(compact_dict[imgs][1][q])
            img_ids.append(img_id)
    elif np.any([np.all(np.equal(compact_dict[imgs][3].numpy(), tensor_i.numpy())) for tensor_i in v_img_prefixes]):
        for q in range(len(compact_dict[imgs][0])):
            val_all_txt_prefixes.append(compact_dict[imgs][2][q])
            val_all_questions.append(compact_dict[imgs][0][q])
            val_all_answers.append(compact_dict[imgs][1][q])
            val_img_ids.append(img_id)
    else:
        print('shouldnt happend')


train_data = {"img_prefix": torch.cat(t_img_prefixes, dim=0), "img_ids": img_ids, "questions": all_questions,'answers': all_answers, 'txt_prefix': torch.cat(all_txt_prefixes, dim=0)}
with open('/media/tjvsonsbeek/Data1/vqa_datasets/radvqa/train.pkl', 'wb') as f:
    pickle.dump(train_data,f)

val_data = {"img_prefix": torch.cat(v_img_prefixes, dim=0), "img_ids": val_img_ids, "questions": val_all_questions,'answers': val_all_answers, 'txt_prefix': torch.cat(val_all_txt_prefixes, dim=0)}
with open('/media/tjvsonsbeek/Data1/vqa_datasets/radvqa/val.pkl', 'wb') as f:
    pickle.dump(val_data,f)


#test
t_all_img_prefixes = []
t_all_txt_prefixes = []
t_img_ids = []
t_all_questions = []
t_all_answers = []
for img_id, imgs in enumerate(compact_dict_test.keys()):
    t_all_img_prefixes.append(compact_dict_test[imgs][3])
    for q in range(len(compact_dict_test[imgs][0])):
        t_all_txt_prefixes.append(compact_dict_test[imgs][2][q])
        t_all_questions.append(compact_dict_test[imgs][0][q])
        t_all_answers.append(compact_dict_test[imgs][1][q])
        t_img_ids.append(img_id)

test_data = {"img_prefix": torch.cat(all_img_prefixes, dim=0), "img_ids": img_ids, "questions": all_questions,'answers': all_answers, 'txt_prefix': torch.cat(all_txt_prefixes, dim=0)}
with open('/media/tjvsonsbeek/Data1/vqa_datasets/radvqa/test.pkl', 'wb') as f:
    pickle.dump(test_data,f)
print('Done')
print("%0d embeddings saved " % len(all_txt_prefixes))



100%|██████████| 2248/2248 [00:59<00:00, 37.78it/s]
313it [00:00, 356.33it/s]


Done
1455 embeddings saved 


In [13]:
def preprocess_medvqa(split, out_path):
    device = torch.device('cuda:0')
    img_base_path = '/media/tjvsonsbeek/Data1/vqa_datasets/medvqa2019/Train_images/'
    clip_model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
    with open('/media/tjvsonsbeek/Data1/vqa_datasets/medvqa2019/All_QA_Pairs_train.txt') as f:
        data = f.readlines()
    # print(data.head())
    # with open("/media/tjvsonsbeek/Data1/vqa_datasets/radvqa/radvqa/VQA_RAD_Dataset_Public.xlsx") as f:
    #    print(f)
    print("%0d captions loaded from json " % len(data))
    all_img_prefixes = []
    all_txt_prefixes = []
    img_ids = []
    all_questions = []
    all_answers = []
    compact_dict = {} 
    for i in tqdm(range(len(data))):
        d = data[i].strip().split('|')
        img_id = d[0]
        
        filename = img_base_path + img_id+'.jpg'
        with torch.no_grad():
            prefix_i = clip_model.encode_image(preprocess(Image.open(filename)).unsqueeze(0).to(device)).cpu()
            prefix_t = clip_model.encode_text(clip.tokenize(d[1]).to(device)).cpu()
            
        if img_id not in compact_dict.keys():
            compact_dict[img_id] = [[d[1]],[d[2]],[prefix_t],prefix_i]
        else:
            compact_dict[img_id][0].append(d[1])
            compact_dict[img_id][1].append(d[2])
            compact_dict[img_id][2].append(prefix_t)
            


    for img_id, imgs in enumerate(compact_dict.keys()):
        all_img_prefixes.append(compact_dict[imgs][3])
        for q in range(len(compact_dict[imgs][0])):
            all_txt_prefixes.append(compact_dict[imgs][2][q])
            all_questions.append(compact_dict[imgs][0][q])
            all_answers.append(compact_dict[imgs][1][q])
            img_ids.append(img_id)
    all_data = {"img_prefix": torch.cat(all_img_prefixes, dim=0), "img_ids": img_ids, "questions": all_questions,'answers': all_answers, 'txt_prefix': torch.cat(all_txt_prefixes, dim=0)}    
    with open(out_path, 'wb') as f:
        pickle.dump(all_data,f)
    print('Done')
    print("%0d embeddings saved " % len(all_txt_prefixes))
for split in ['train','val','test']:
    out_path = "/media/tjvsonsbeek/Data1/vqa_datasets/medvqa2019/processed_{}.pkl".format(split)
    preprocess_pathvqa(split,out_path)

12792 captions loaded from json 


100%|██████████| 12792/12792 [06:09<00:00, 34.59it/s]


Done
12792 embeddings saved 


In [15]:
with open(out_path, 'wb') as f:
    pickle.dump({"img_prefix": torch.cat(all_img_prefixes, dim=0), "img_ids": img_ids, "questions": all_questions,'answers': all_answers, 'txt_prefix': torch.cat(all_txt_prefixes, dim=0)},f)
print('Done')
print("%0d embeddings saved " % len(all_txt_prefixes))

Done
6761 embeddings saved 


In [3]:
# def expand_vqa_set(in_file, out_path):
#     with open(in_file,'rb') as f:
#         compact_dict = pickle.load(f)
#     all_imgs = []
#     all_questions = []
#     all_answers = []
#     for q_id, q in tqdm(enumerate(compact_dict['questions'])):
#         for s_id, subquestions in enumerate(q):
#             all_imgs.append(q_id)
#             # print(type(compact_dict[imgs][2]))
#             all_questions.append(compact_dict['questions'][s_id])
#             all_answers.append(compact_dict['answers'][s_id])
#     with open(out_path, 'wb') as f:
#         pickle.dump({"clip_embedding": compact_dict['clip_embedding'], "img_ids": all_imgs, "questions": all_questions,'answers': all_answers},f)
    
    

In [5]:
expand_vqa_set("/media/tjvsonsbeek/Data1/vqa_datasets/pathvqa/processed_train.pkl","/media/tjvsonsbeek/Data1/vqa_datasets/pathvqa/processed_long_train.pkl")
expand_vqa_set("/media/tjvsonsbeek/Data1/vqa_datasets/pathvqa/processed_val.pkl","/media/tjvsonsbeek/Data1/vqa_datasets/pathvqa/processed_long_val.pkl")
expand_vqa_set("/media/tjvsonsbeek/Data1/vqa_datasets/pathvqa/processed_test.pkl","/media/tjvsonsbeek/Data1/vqa_datasets/pathvqa/processed_long_test.pkl")

2599it [00:00, 282760.84it/s]
832it [00:00, 285967.46it/s]
858it [00:00, 417726.39it/s]


In [4]:
data_path = "/media/tjvsonsbeek/Data1/vqa_datasets/radvqa/radvqa/VQA_RAD%20Dataset%20Public.json"
with open(data_path, 'rb') as f:
    all_data = pickle.load(f)



In [3]:
with open("/media/tjvsonsbeek/Data1/vqa_datasets/radvqa/radvqa/VQA_RAD Dataset Public.json", 'r') as f:
  data = json.load(f)



In [11]:
data[0]

{'qid': '0',
 'phrase_type': 'freeform',
 'qid_linked_id': '03f451ca-de62-4617-9679-e836026a7642',
 'image_case_url': 'https://medpix.nlm.nih.gov/case?id=48e1dd0e-8552-46ad-a354-5eb55be86de6',
 'image_name': 'synpic54610.jpg',
 'image_organ': 'HEAD',
 'evaluation': 'not evaluated',
 'question': 'Are regions of the brain infarcted?',
 'question_rephrase': 'NULL',
 'question_relation': 'NULL',
 'question_frame': 'NULL',
 'question_type': 'PRES',
 'answer': 'Yes',
 'answer_type': 'CLOSED'}