In [1]:
from lavis.models import load_model_and_preprocess
import torch
import argparse
import time

# Always build a torch.device so `device` has a single type on GPU and CPU hosts
# (previously the CPU fallback was the bare string "cpu").
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Loading model...')
# perf_counter() is monotonic, so the measured duration cannot be skewed by
# wall-clock adjustments while the checkpoint shards are loading.
start_time = time.perf_counter()

# Only the model and the image processors are kept; the third return value is unused here.
model, vis_processors, _ = load_model_and_preprocess(
    name="blip2_vicuna_instruct",
    model_type="vicuna7b",
    is_eval=True,
    device=device,
)
end_time = time.perf_counter()
print('Loading model done! Time cost: ', end_time - start_time, 's')

Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading model done! Time cost:  225.81879544258118 s


In [2]:
from PIL import Image

def inference2(
    image,
    prompt,
    num_captions,
    *,
    min_len=15,
    max_len=95,
    beam_size=5,
    len_penalty=1.0,
    repetition_penalty=1.2,
    top_p=0.9,
    use_nucleus_sampling=False,
):
    """Generate captions for one image from a single prompt.

    Relies on the notebook-level globals ``model``, ``vis_processors`` and
    ``device`` created by the model-loading cell.

    Args:
        image: Input accepted by ``vis_processors["eval"]`` (the notebook
            passes an image opened with PIL).
        prompt: Prompt forwarded unchanged to ``model.generate``.
        num_captions: Forwarded to ``model.generate`` as ``num_captions``.
        min_len: Forwarded as ``min_length``.
        max_len: Forwarded as ``max_length``.
        beam_size: Forwarded as ``num_beams``.
        len_penalty: Forwarded as ``length_penalty`` after conversion to float.
        repetition_penalty: Forwarded as ``repetition_penalty`` after
            conversion to float.
        top_p: Forwarded as ``top_p``.
        use_nucleus_sampling: Forwarded as ``use_nucleus_sampling``; the
            default (False) keeps the beam-search setting this function
            was originally hard-wired to.

    Returns:
        Whatever ``model.generate`` returns (presumably a list of generated
        strings for LAVIS BLIP-2 models -- confirm against the model class).
    """
    # Preprocess the image, then add a leading batch dimension of size 1.
    image_batch = vis_processors["eval"](image).unsqueeze(0).to(device)
    samples = {
        "image": image_batch,
        "prompt": prompt,
    }

    # The original computed this value and dropped it, so every call returned None.
    return model.generate(
        samples,
        length_penalty=float(len_penalty),
        repetition_penalty=float(repetition_penalty),
        num_beams=beam_size,
        max_length=max_len,
        min_length=min_len,
        top_p=top_p,
        use_nucleus_sampling=use_nucleus_sampling,
        num_captions=num_captions,
    )

In [5]:
# Normalise to 3-channel RGB before preprocessing so grayscale / RGBA / CMYK files
# do not reach the eval processor with an unexpected channel count
# (LAVIS usage examples convert to RGB the same way).
image_input = Image.open("/home/ubuntu/dev/captioning/test.jpg").convert("RGB")
prompt = "Question: what's the image about? answer:"
# Bare last expression: the notebook displays the call's return value, if any.
inference2(image_input, prompt, 4)



In [4]:
from PIL import Image

def inference(
    image,
    prompts,
    num_captions,
    *,
    min_len=15,
    max_len=95,
    beam_size=5,
    len_penalty=1.0,
    repetition_penalty=1.2,
    top_p=0.9,
    use_nucleus_sampling=False,
):
    """Generate captions for one image under several prompts in a single batch.

    The preprocessed image is replicated once per prompt so the image batch
    and the prompt list have the same length. Relies on the notebook-level
    globals ``model``, ``vis_processors`` and ``device``.

    Args:
        image: Input accepted by ``vis_processors["eval"]`` (the notebook
            passes an image opened with PIL).
        prompts: Iterable of prompt strings. A single string is treated as
            one prompt rather than being replicated once per character.
        num_captions: Forwarded to ``model.generate`` as ``num_captions``.
        min_len: Forwarded as ``min_length``.
        max_len: Forwarded as ``max_length``.
        beam_size: Forwarded as ``num_beams``.
        len_penalty: Forwarded as ``length_penalty`` after conversion to float.
        repetition_penalty: Forwarded as ``repetition_penalty`` after
            conversion to float.
        top_p: Forwarded as ``top_p``.
        use_nucleus_sampling: Forwarded as ``use_nucleus_sampling``; the
            default (False) keeps the beam-search setting this function
            was originally hard-wired to.

    Returns:
        Whatever ``model.generate`` returns (presumably a list of generated
        strings for LAVIS BLIP-2 models -- confirm against the model class).

    Raises:
        ValueError: If ``prompts`` is empty.
    """
    # len("some prompt") counts characters, so a bare string must be wrapped first.
    if isinstance(prompts, str):
        prompts = [prompts]
    else:
        prompts = list(prompts)
    if not prompts:
        raise ValueError("prompts must contain at least one prompt")

    # Preprocess once, add a batch dimension, then repeat along it: one copy per prompt.
    image_batch = vis_processors["eval"](image).unsqueeze(0).to(device)
    images = image_batch.repeat(len(prompts), 1, 1, 1)

    samples = {
        "image": images,
        "prompt": prompts,
    }

    # The original computed this value and dropped it, so every call returned None.
    return model.generate(
        samples,
        length_penalty=float(len_penalty),
        repetition_penalty=float(repetition_penalty),
        num_beams=beam_size,
        max_length=max_len,
        min_length=min_len,
        top_p=top_p,
        use_nucleus_sampling=use_nucleus_sampling,
        num_captions=num_captions,
    )

In [None]:
# Normalise to 3-channel RGB before preprocessing so grayscale / RGBA / CMYK files
# do not reach the eval processor with an unexpected channel count
# (LAVIS usage examples convert to RGB the same way).
image_input = Image.open("/home/ubuntu/dev/captioning/test.jpg").convert("RGB")
prompts = [
    "what's in the image?",
    "Question: what's the image about? answer:",
    "what's the story in the image?",
    "what's the style of the image?",
]

# Bare last expression: the notebook displays the call's return value, if any.
inference(image_input, prompts, 4)

In [7]:
# Print the concrete class of `model`, i.e. the class whose `generate` method the other cells call.
print(type(model))

<class 'lavis.models.blip2_models.blip2_vicuna_instruct.Blip2VicunaInstruct'>


In [None]:
image_input = Image.open("/home/ubuntu/dev/captioning/test.jpg")
print(type(image_input))
print(image_input.size)

prompts = ["what's in the image?", "Question: what's the image about? answer:"]

# model.generate needs a batched tensor on `device`, not a raw PIL image:
# on a PIL image `.size` is a (width, height) tuple rather than a tensor method,
# so passing it straight through fails inside the model.
image_tensor = vis_processors["eval"](image_input.convert("RGB")).unsqueeze(0).to(device)
samples = {
    # One copy of the image per prompt so both batch dimensions agree.
    "image": image_tensor.repeat(len(prompts), 1, 1, 1),
    "prompt": prompts,
}

model.generate(samples)

In [15]:
# Each image path is paired with the prompt at the same index.
image_paths = ["/home/ubuntu/dev/captioning/test.jpg", "/home/ubuntu/dev/captioning/test.jpg"]
prompts = ["what's in the image?", "Question: what's the image about? answer:"]
assert len(image_paths) == len(prompts), "need exactly one prompt per image"

# Preprocess every image, stack the results along a new leading batch dimension,
# and move the finished batch to `device` in one transfer. `images` is only ever
# the batch tensor, and the global `image_input` is no longer overwritten.
images = torch.stack(
    [
        vis_processors["eval"](Image.open(img_path).convert("RGB"))
        for img_path in image_paths
    ]
).to(device)

samples = {
    "image": images,
    "prompt": prompts,
}

output_texts = model.generate(samples)
# Bare last expression so the notebook shows the generated result.
output_texts