# IDEFICS 모델을 사용한 시각적 질의응답 실습

In [2]:
%pip install -U transformers \
             datasets==2.21.0 \
             diffusers==0.30.0 \
             accelerate==0.33.0 \
             torch==2.3.1 \
             torchvision==0.18.1 \
             sentencepiece==0.2.0


Collecting diffusers==0.30.0
  Using cached diffusers-0.30.0-py3-none-any.whl.metadata (18 kB)
Collecting importlib-metadata (from diffusers==0.30.0)
  Using cached importlib_metadata-8.2.0-py3-none-any.whl.metadata (4.7 kB)
INFO: pip is looking at multiple versions of torchvision to determine which version is compatible with other requirements. This could take a while.
Collecting torchvision==0.18.1
  Downloading torchvision-0.18.1-cp311-cp311-win_amd64.whl.metadata (6.6 kB)
Collecting zipp>=0.5 (from importlib-metadata->diffusers==0.30.0)
  Downloading zipp-3.20.0-py3-none-any.whl.metadata (3.6 kB)
Downloading diffusers-0.30.0-py3-none-any.whl (2.6 MB)
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   -- ------------------------------------- 0.2/2.6 MB 4.8 MB/s eta 0:00:01
   ---------- ----------------------------- 0.7/2.6 MB 8.3 MB/s eta 0:00:01
   ------------------ --------------------- 1.2/2.6 MB 9.4 MB/s eta 0:00:01
   -------------------------- ---------


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import torch
from transformers import IdeficsForVisionText2Text, AutoProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "HuggingFaceM4/idefics-9b-instruct"
model = IdeficsForVisionText2Text.from_pretrained(model_name, 
                                                  torch_dtype=torch.bfloat16).to(device)
processor = AutoProcessor.from_pretrained(model_name)

# 생성 매개변수
exit_condition = processor.tokenizer("<end_of_utterance>", add_special_tokens=False).input_ids
bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids

config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors.index.json:   0%|          | 0.00/99.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

KeyboardInterrupt: 

# 제로샷 추론

![](https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg)

In [4]:
url = "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg"
img = processor.image_processor.fetch_images([url])[0]

prompts = [
    "\nUser:",
    img,
    "Describe this image.\nAssistant: ",
]

inputs = processor(prompts, return_tensors="pt", debug=True).to(device)

generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
generated_text

full_text='<s>\nUser:<fake_token_around_image><image><fake_token_around_image>Describe this image.\nAssistant:'


"<s> \nUser:<fake_token_around_image><image><fake_token_around_image> Describe this image.\nAssistant: The image features a small dog wearing a pair of black sunglasses, giving it a cool and stylish appearance. The dog is looking directly at the camera, capturing the viewer's attention. The sunglasses cover a significant portion of the dog's face, making it the central focus of the image. The dog appears to be a Boston Terrier, a breed known"

# 원샷 추론(1개 예시)으로 설명하기

![](https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg)
![](https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg)

In [5]:
url = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
img = processor.image_processor.fetch_images([url])[0]

# img 또는 url를 사용하세요.
prompts = [
    "User:",
    img,
    "Describe this image."
    "Assistant: An image of two kittens in grass." # 원샷 예제
    "User:",
    "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg", 
    "Describe this image.",
    "Assistant: "
]

inputs = processor(prompts, return_tensors="pt", debug=True).to(device)

generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
generated_text

full_text='<s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.Assistant: An image of two kittens in grass.User:<fake_token_around_image><image><fake_token_around_image>Describe this image.<end_of_utterance>Assistant:'


'User: Describe this image.Assistant: An image of two kittens in grass.User: Describe this image. Assistant: A dog wearing a pair of black glasses.'

# 이미지 주변에 특수 문자 삽입하기

In [6]:
generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
generated_text

'<s> User:<fake_token_around_image><image><fake_token_around_image> Describe this image.Assistant: An image of two kittens in grass.User:<fake_token_around_image><image><fake_token_around_image> Describe this image.<end_of_utterance> Assistant: A dog wearing a pair of black glasses.<end_of_utterance>'

# 이미지 속 텍스트에 대해 질문하기

![](img/happy-car-chris.png)

In [7]:
from PIL import Image
img = Image.open("img/happy-car-chris.png") 

prompts = [
    "User: ",
    img,
    "Describe this image.",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt", debug=True).to(device)

generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
generated_text

full_text='<s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.<end_of_utterance>Assistant:'


'<s> User:<fake_token_around_image><image><fake_token_around_image> Describe this image.<end_of_utterance> Assistant: A man is sitting on the hood of a white sports car.<end_of_utterance>'

In [8]:
from PIL import Image
img = Image.open("img/happy-car-chris.png") 

prompts = [
    "User: ",
    img,
    "Who makes this car?",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt", debug=True).to(device)

generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
generated_text

full_text='<s>User:<fake_token_around_image><image><fake_token_around_image>Who makes this car?<end_of_utterance>Assistant:'


'<s> User:<fake_token_around_image><image><fake_token_around_image> Who makes this car?<end_of_utterance> Assistant: The car is made by Porsche.<end_of_utterance>'

![](img/baby-groot-toy.jpg)

In [9]:
img = Image.open("img/baby-groot-toy.jpg") 

prompts = [
    "User: ",
    img,
    "Which movie is this character from?",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt").to(device)

generated_ids = model.generate(**inputs, max_length=100) # eos_token_id=exit_condition, bad_words_ids=bad_words_ids)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

User: Which movie is this character from? Assistant: This character is from the movie Guardians of the Galaxy.


# 연쇄적 사고(Chain of thought; CoT)

![](img/baby-groot-toy.jpg)

In [10]:
# 해당 이미지는 이곳에서 가져왔습니다. https://www.amazon.com/Hot-Toys-Marvel-Guardians-Life-Size/dp/B07257N92P
img = Image.open("img/baby-groot-toy.jpg") 

prompts = [
    "User: ",
    img,
#    "Who produced the movie that features this character?",
    "Who produced the movie that features this character? Think step-by-step.",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt").to(device)

generated_ids = model.generate(**inputs, max_length=100) #, eos_token_id=exit_condition, bad_words_ids=bad_words_ids)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

User: Who produced the movie that features this character? Think step-by-step. Assistant: Marvel Studios produced the movie that features this character.


![](img/margherita-pizza.jpg)

In [11]:
# 해당 이미지는 이곳에서 가져왔습니다. https://eu.ooni.com/blogs/recipes/margherita-pizza

img = Image.open("img/margherita-pizza.jpg") 

prompts = [
    "User: ",
    img,
    "How do I make this? Think step by step.",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt").to(device)

generated_ids = model.generate(**inputs, max_length=1000, eos_token_id=exit_condition, bad_words_ids=bad_words_ids)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

User: How do I make this? Think step by step. Assistant: To make a Margherita pizza at home, follow these steps:

1. Start by preheating your oven to the highest temperature it can reach, usually around 500°F (260°C).

2. On a lightly floured surface, roll out your pizza dough into a circle.

3. Transfer the dough to a pizza stone or baking sheet.

4. Sprinkle a layer of tomato sauce on top of the dough, leaving a small border around the edges.

5. Add a layer of fresh mozzarella cheese on top of the tomato sauce.

6. Add a layer of fresh basil leaves on top of the cheese.

7. Drizzle a small amount of olive oil over the pizza.

8. Place the pizza in the preheated oven and bake for 10-12 minutes, or until the crust is golden brown and the cheese is melted and bubbly.

9. Remove the pizza from the oven and let it cool for a few minutes before slicing and serving.

Optional: You can also add other toppings like sliced mushrooms, olives, or pepperoni to customize your Margherita pizza.


![](img/nflx-5-year-stock-chart.png)

In [12]:
img = Image.open("img/nflx-5-year-stock-chart.png") 

prompts = [
    "User: ",
    img,
    "Describe this image. Think step by step.",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt").to(device)

generated_ids = model.generate(**inputs, max_length=1000, eos_token_id=exit_condition, bad_words_ids=bad_words_ids)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

User: Describe this image. Think step by step. Assistant: The image shows the Netflix, Inc. stock performance over the last five years. The Netflix stock price is displayed on a stock market tracking app, with the current value at $424.04. The app also provides a graph that tracks the stock's performance over time. The Netflix logo is visible in the top left corner of the image.


## 차트에 대해서는 아직 제대로 작동하지 않습니다.

In [13]:
img = Image.open("img/nflx-5-year-stock-chart.png") 

prompts = [
    "User: ",
    img,
    "What is the maxmium stock price as shown in this chart. Think step by step.",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt").to(device)

generated_ids = model.generate(**inputs, max_length=1000, eos_token_id=exit_condition, bad_words_ids=bad_words_ids)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

User: What is the maxmium stock price as shown in this chart. Think step by step. Assistant: The maximum stock price shown in the chart is $4500. This can be calculated by adding the highest point on the chart (around 4300) to the lowest point on the chart (around 3000).


## 차트에 대해서는 아직 제대로 작동하지 않습니다.

In [14]:
img = Image.open("img/nflx-5-year-stock-chart.png") 

prompts = [
    "User: ",
    img,
    "What is the current stock price as shown in this chart. Think step by step.",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt").to(device)

generated_ids = model.generate(**inputs, max_length=1000, eos_token_id=exit_condition, bad_words_ids=bad_words_ids)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

User: What is the current stock price as shown in this chart. Think step by step. Assistant: The current stock price is $43.02.
