In [7]:
from transformers import BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
import os
import shutil

In [8]:
def download_multimodal_model(save_directory="./blip_model", model_name="Salesforce/blip-vqa-base"):
    """Download a BLIP question-answering checkpoint and save it locally.

    The processor and the model are loaded with ``from_pretrained`` and then
    written to two sub-directories of ``save_directory`` ("processor" and
    "model") so they can be reloaded from disk later.

    Args:
        save_directory: Directory that receives the "processor" and "model"
            sub-directories. It is created if it does not exist.
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the processor and the model. Defaults to the checkpoint this
            notebook was originally written against.

    Returns:
        ``save_directory``, unchanged.
    """
    os.makedirs(save_directory, exist_ok=True)

    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForQuestionAnswering.from_pretrained(model_name)

    processor_path = os.path.join(save_directory, "processor")
    model_path = os.path.join(save_directory, "model")

    processor.save_pretrained(processor_path)
    model.save_pretrained(model_path)

    print(f"Model and processor saved to {save_directory}")

    # List every saved file relative to save_directory as a quick sanity check.
    for root, _dirs, files in os.walk(save_directory):
        for file in files:
            print(os.path.relpath(os.path.join(root, file), save_directory))

    return save_directory

In [9]:
# Run the download with the default save directory and keep the returned path.
model_directory = download_multimodal_model()
print(f"\n Model downloaded to {model_directory}")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/repos/aa/ad/aaad7f73f20d7afb48036bd3013cc2ba3c6d79f49861c892725ac44fe5e081be/33786eed34def0c95fa948128cb4386be9b9219aa2c2e25f1c9c744692121bb7?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1753982832&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc1Mzk4MjgzMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy9hYS9hZC9hYWFkN2Y3M2YyMGQ3YWZiNDgwMzZiZDMwMTNjYzJiYTNjNmQ3OWY0OTg2MWM4OTI3MjVhYzQ0ZmU1ZTA4MWJlLzMzNzg2ZWVkMzRkZWYwYzk1ZmE5NDgxMjhjYjQzODZiZTliOTIxOWFhMmMyZTI1ZjFjOWM3NDQ2OTIxMjFiYjc%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=W6ygJIAUKilrwjegXRt2-do2kcGN-IRjoV9ZPIrWP-g84%7EDKAZeMQYetNEwgpgW5ZHIt7wm1NrWHiYHDGa2-XZQn5HIE9IGQigtW5iKWdxJpbpj24YsG5ze4rzqENqg4czlLPXI1wBwqLZ8AlnxZaP4fgzioX71%7EKT-X%7EYj8-ZU4Ar-7MMcgU%7ErQ%7EQtnguuaRA3eeMoFQ9yBt%7EPesADPQ0ecA5pRittPZWVxrr1bGsh2WW%7Eoeuplf6ffAdsGk

model.safetensors:  15%|#4        | 231M/1.54G [00:00<?, ?B/s]

Model and processor saved to ./blip_model
processor/tokenizer_config.json
processor/special_tokens_map.json
processor/tokenizer.json
processor/vocab.txt
processor/preprocessor_config.json
model/model.safetensors
model/config.json
model/generation_config.json

 Model downloaded to ./blip_model


In [11]:
from PIL import Image
import base64
import io
import requests

# Reload the processor and model from the local copies under ./blip_model.
# The "processor" and "model" sub-directory names must match the ones written
# by download_multimodal_model().
processor = BlipProcessor.from_pretrained("./blip_model/processor")
model = BlipForQuestionAnswering.from_pretrained("./blip_model/model")

In [17]:
def ask_about_image(image_path, question):
    """Answer a natural-language question about an image file.

    Relies on the notebook-level ``processor`` and ``model`` objects, so the
    cell that loads them must have been run first.

    Args:
        image_path: Path to the image file; it is opened with PIL and
            converted to RGB before being passed to the processor.
        question: Question text passed to the processor alongside the image.

    Returns:
        The decoded answer string. The question and answer are also printed.
    """
    # Use a context manager so the underlying file handle is released
    # deterministically; convert() returns a separate converted copy that
    # stays usable after the file is closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    inputs = processor(images=image, text=question, return_tensors="pt")
    outputs = model.generate(**inputs)

    # Only the first generated sequence is decoded.
    answer = processor.decode(outputs[0], skip_special_tokens=True)
    print(f"Question: {question}")
    print(f"Answer: {answer}")
    return answer

In [18]:
# Example: a short, open-ended question about a local image file.
ask_about_image(image_path="spagetthi.jpeg", question="What is in this image?")

Question: What is in this image?
Answer: food


'food'

In [20]:
# NOTE(review): the recorded answer for this prompt is the single word "name",
# so asking for a "detailed answer" does not appear to produce longer output
# from this checkpoint. Presumably a captioning model would suit descriptive
# prompts better - TODO confirm.
ask_about_image(image_path="friend.jpeg", question="What is in this image? You should give me a detailed answer.")

Question: What is in this image? You should give me a detailed answer.
Answer: name


'name'