In [None]:
import gc
import json
import os

import requests
import torch
from accelerate import init_empty_weights, infer_auto_device_map
from PIL import Image
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration, InstructBlipConfig, AutoModelForVision2Seq

In [None]:
# Prefer the GPU whenever PyTorch reports a usable CUDA device; fall back to
# the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


# Read the InstructBLIP configuration from the local checkpoint directory and
# leave it as the last expression so the notebook renders it.
config = InstructBlipConfig.from_pretrained(
    "/120040051/instructblip-vicuna-7b"
)
config

In [None]:
# Instantiate the architecture described by `config` inside accelerate's
# init_empty_weights() context, i.e. without allocating real parameter
# storage, so the model structure can be inspected and a device map planned
# before any checkpoint weights are read.
with init_empty_weights():
    model = AutoModelForVision2Seq.from_config(config)
    # Re-tie shared weights on the skeleton before it is used for device-map
    # planning in a later cell.
    model.tie_weights()

In [None]:
# Display the module tree of the current `model` as this cell's output.
model

In [None]:

# Plan a placement of the model across two GPUs, capping each at 25GiB.
# The listed layer classes are never split across devices (accelerate
# recommends this for blocks that contain residual connections).
device_map = infer_auto_device_map(
    model,
    max_memory={0: "25GiB", 1: "25GiB"},
    no_split_module_classes=[
        'InstructBlipEncoderLayer',
        'InstructBlipQFormerLayer',
        'LlamaDecoderLayer',
    ],
)

# Force the LM head and the language projection onto the same device as the
# token embeddings. Presumably this avoids cross-device tensor mismatches when
# the language model is split over both GPUs — TODO confirm against the
# installed transformers/accelerate versions.
embed_device = device_map['language_model.model.embed_tokens']
device_map['language_model.lm_head'] = embed_device
device_map['language_projection'] = embed_device

# The processor is loaded without a `device_map`: that argument is only
# meaningful for the model load below, where it places the weights.
processor = InstructBlipProcessor.from_pretrained("/120040051/instructblip-vicuna-7b")

# Load the real weights, dispatching each sub-module to the device chosen above.
model = InstructBlipForConditionalGeneration.from_pretrained(
    "/120040051/instructblip-vicuna-7b",
    device_map=device_map,
)

In [None]:
# Re-derive the target device ("cuda" when a GPU is visible, else "cpu") and
# show the choice as the cell output.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

In [None]:
# Release whatever model an earlier cell left bound to `model` before loading
# a second full copy of the checkpoint. Rebinding alone may not free it
# promptly (accelerate's dispatch hooks appear to create reference cycles —
# TODO confirm), so run the garbage collector explicitly and then release
# PyTorch's cached, now-unused CUDA blocks.
model = None
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Single-device path: load the processor and the model from the local
# checkpoint directory.
processor = InstructBlipProcessor.from_pretrained("/120040051/instructblip-vicuna-7b")
model = InstructBlipForConditionalGeneration.from_pretrained("/120040051/instructblip-vicuna-7b")

# Move the whole model to `device`. The result is assigned rather than left as
# a bare trailing expression so the cell does not print the entire module tree.
model = model.to(device)

In [None]:
# Path of the test image (plain string: there is nothing to interpolate).
image_1_path = "/120040051/test_resource/images/verb_0308/q1_1.webp"

# Open the file in a context manager so the underlying handle is closed as
# soon as the pixels have been converted into a standalone RGB image.
with Image.open(image_1_path) as image_file:
    raw_image = image_file.convert("RGB")

# Multiple-choice prompt with the options inlined as "(A) ... (E) ...".
# An earlier variant listed them one per line as "A. ...", separated by "\n".
choice_question_1 = "What is the dog doing in the refrigerator? (A) mothering her kittens (B) servicing tables (C) seating customers (D) washing its paws (E) farming his land"

# Build the model inputs as PyTorch tensors and move them to `device`.
# Keyword arguments make the image/text pairing explicit rather than relying
# on positional order.
inputs = processor(images=raw_image, text=choice_question_1, return_tensors='pt').to(device)

In [None]:
# Sampling is enabled below, so seed PyTorch's RNG first; without a seed each
# re-run of this cell may produce a different answer. (Bitwise reproducibility
# across different hardware is still not guaranteed.)
torch.manual_seed(0)

# Generate answer token ids with beam-search sampling: 5 beams, low
# temperature, a repetition penalty, and a total length bounded to 4..50.
output_ids = model.generate(
    **inputs,
    max_length=50,
    min_length=4,
    do_sample=True,
    num_beams=5,
    temperature=0.2,
    repetition_penalty=1.5,
    length_penalty=1.0,
)

In [None]:
# Ask PyTorch to release cached GPU memory that is no longer in use
# (e.g. temporary buffers left over from the generation step).
torch.cuda.empty_cache()

In [None]:
# Convert the generated ids back to text without special tokens, keep the
# first sequence of the batch, and trim surrounding whitespace.
output = processor.batch_decode(
    output_ids,
    skip_special_tokens=True,
)[0].strip()

In [None]:
# Display the decoded answer string as this cell's output.
output