In [1]:
import copy
import pdb
import yaml
from typing import List, Union, Dict, Tuple
from PIL import Image
import random
from tqdm import tqdm
import json

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from data.aokvqa import AOKVQADataset
from utils.okvqa_utils import postprocess_ok_vqa_generation, lemmatize
from utils.openai_utils import openai_caller

import logging

# Configure the root logger once for the whole notebook: INFO level, with a
# "timestamp - level - logger name - message" layout.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
)

# Logger for this notebook's own messages.
logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the 'train' split of the A-OKVQA dataset.
dataset = AOKVQADataset('train')

# Prefer the GPU, but fall back to CPU when CUDA is unavailable so the failure
# does not surface later, at model-load time. On a CUDA machine this is
# identical to the previous hard-coded torch.device("cuda").
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    logger.warning("CUDA is not available; falling back to CPU (inference will be slow).")
    device = torch.device("cpu")

01/14/2024 00:11:44 - INFO - data.aokvqa - Loaded A-OKVQA train dataset with 17056 examples!


In [3]:
from models.llm import LLM_CLASS_MAP
from models.vlm import VLM_CLASS_MAP
from models.qgen import QGEN_CLASS_MAP

config_file = '/net/nfs.cirrascale/mosaic/tejass/code/ReCoVERR/configs/recoverr_configs/aokvqa/chatgpt_qgen-flant5xl_llm-instructblipft5xl_vlm.yaml'

# Load the top-level experiment config. The context manager closes the file
# handle as soon as the YAML has been parsed.
with open(config_file) as config_fh:
    config = yaml.safe_load(config_fh)

# Load VLM: look up the model class named in the config, read its own
# model-level YAML config, and instantiate it on the selected device.
vlm_class = config['vlm']['class_name']
vlm_model_class = VLM_CLASS_MAP[vlm_class]
with open(config['vlm']['model_config_path']) as vlm_config_fh:
    vlm_config = yaml.safe_load(vlm_config_fh)
vlm_model = vlm_model_class(vlm_config, device)

# Apply the VQA and captioning inference parameters from the experiment config.
vlm_model.set_vqa_inference_params(config['vlm']['vqa_inference_params'])
vlm_model.set_caption_inference_params(config['vlm']['caption_inference_params'])

  deprecate(
01/14/2024 00:12:01 - INFO - root - freeze vision encoder
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.41it/s]
01/14/2024 00:12:29 - INFO - root - load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxl_trimmed.pth
01/14/2024 00:12:32 - INFO - models.vlm - Loaded BLIP (model=InstructBLIP-FlanT5XL)!
01/14/2024 00:12:32 - INFO - models.vlm - Model size: 4.02B parameters (0.19B trainable), 8.42GB in memory
01/14/2024 00:12:32 - INFO - models.vlm - ----------------------------------------------------------------------------------------------------


In [None]:
# Output path for the per-example records, and the previously saved direct-VQA
# rollouts that are joined into each record by qid.
yn_outputs_file = '/net/nfs.cirrascale/mosaic/tejass/experiments/vl_calibration/uncalibrated_yn_probs/instructblipflant5xl-aokvqa_train_direct_answer.json'
directvqa_rollouts_file = '/net/nfs.cirrascale/mosaic/tejass/experiments/recoverr/aokvqa_direct_answer/train_outputs/instructblipflant5xl_direct_vqa-1rollouts-17056examples.json'

# Read the rollouts with a context manager so the handle is closed right away.
with open(directvqa_rollouts_file) as rollouts_fh:
    directvqa_rollouts = json.load(rollouts_fh)

# Upper bound on the number of examples to run through the VLM. The loop used
# to break after index 5000, i.e. after 5001 examples; that count is kept.
max_examples = 5001

results = []
for i, d in enumerate(tqdm(dataset)):
    question = d['question']
    image = d['raw_image']
    qid = d['qid']

    # Query the VLM; it returns the answer and a dict that includes 'yn_logits'.
    answer, logprobs_dict = vlm_model.ask(image, question)

    # Entry at index 0 of the saved rollouts for this qid.
    r = directvqa_rollouts[qid][0]

    # Note: image_id, question text and lave_score are taken from the saved
    # rollout, while answer and yn_logits come from the fresh VLM call above.
    result = {
        "qid": qid,
        "image_id": r['image_id'],
        "question": r['vqa_question'],
        "answer": answer,
        "yn_logits": logprobs_dict['yn_logits'],
        "lave_score": r['lave_score'],
    }
    results.append(result)

    # Stop once the requested number of examples has been collected.
    if len(results) >= max_examples:
        break

# Write with a context manager so the JSON is flushed and the handle closed
# deterministically, instead of relying on garbage collection.
with open(yn_outputs_file, 'w') as out_fh:
    json.dump(results, out_fh, indent=2)

 27%|██████████████████████████████▋                                                                                 | 4675/17056 [24:50<1:02:15,  3.31it/s]