# Convert augmented dataset into huggingface datasets ImageFolder format
* For detail check https://huggingface.co/docs/datasets/v2.4.0/en/image_load#image-captioning
* Containing RGB Image, Disparity, Text Captioning.

In [1]:
import os
os.environ["TRANSFORMERS_OFFLINE"]="1"
os.environ["HF_DATASETS_OFFLINE"]="1"

from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch

processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)

RuntimeError: Failed to import transformers.models.blip_2.modeling_blip_2 because of the following error (look up to see its traceback):
/home/kailu/miniconda3/lib/python3.7/site-packages/nvidia/cublas/lib/libcublas.so.11: undefined symbol: cublasLtHSHMatmulAlgoInit, version libcublasLt.so.11

In [8]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [20]:
from PIL import Image
import torch
@torch.no_grad()
def get_caption(image: Image, prompt="A picture of "):
    inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16)
    generated_ids = model.generate(**inputs, max_new_tokens=30)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    return generated_text

In [10]:
data_folder = "/data/NYU_gen"
disparity_folder = data_folder + "/disparity"
image_folder = data_folder + "/images"
config_path = "/data/NYU_gen/metadata.jsonl"

In [11]:
import glob
import os
disparity_list = glob.glob(os.path.join(disparity_folder, "*.png"))
disparity_list = sorted(disparity_list, key=lambda x: int(x.split('/')[-1][:-4]))

In [12]:
import jsonlines
if os.path.exists(config_path):
    with jsonlines.open(config_path, 'r') as reader:
        metadata = [obj for obj in reader]
else:
    metadata = []

filename_meta = {d["file_name"]: d for d in metadata}

In [23]:
from PIL import Image
from tqdm import tqdm

skip_processed = True
for disparity_path in tqdm(disparity_list):
    idx = int(disparity_path.split('/')[-1][:-4])
    image_paths = glob.glob(os.path.join(image_folder,f"{idx}_gen_*.png"))
    for image_path in image_paths:
        if (image_path in filename_meta) and ("text" in filename_meta[image_path]) and skip_processed:
            pass
        image = Image.open(image_path).convert("RGB")
        prefix_list = ["A picture of ", "A styled room that ", "", "A indoor room that "]
        question_list = []
        caption = [prefix + get_caption(image, prefix) for prefix in prefix_list]
        caption += [get_caption(image, question) for question in question_list]
        info = {
            "file_name": image_path[len(data_folder):],
            "disparity": disparity_path[len(data_folder):],
            "text": caption,
        }
        metadata.append(info)
    with jsonlines.open(config_path, 'w') as writer:
        writer.write_all(metadata)

In [24]:
metadata

[{'file_name': '/images/0_gen_0.png',
  'disparity': '/disparity/0.png',
  'text': ["A picture of ikea's new store in the city of london",
   'A styled room that ia made of paper',
   'a room with colorful decorations and a large wall',
   '',
   '']},
 {'file_name': '/images/0_gen_1.png',
  'disparity': '/disparity/0.png',
  'text': ['A picture of ikegbo, a colorful african fabric',
   "A styled room that ia a part of the exhibition 'the art of the african fabric'",
   'a poster for the exhibition of african art',
   '',
   '']},
 {'file_name': '/images/0_gen_0.png',
  'disparity': '/disparity/0.png',
  'text': ["A picture of ikea's new store in the city of london",
   'A styled room that ia made of paper',
   'a room with colorful decorations and a large wall',
   'There is a colorful room with a lot of colorful items']},
 {'file_name': '/images/0_gen_1.png',
  'disparity': '/disparity/0.png',
  'text': ['A picture of ikegbo, a colorful african fabric',
   "A styled room that ia a pa