## 5.3 Fuyu  
Fuyu-8B is a small version of Adept's multimodal model. It supports arbitrary image resolutions, answers questions about graphs and diagrams, answers UI-based questions, and performs fine-grained localization on screen images.

Use the pre-downloaded model weights at `/share/lab5/fuyu`:

In [None]:
# You can directly use the downloaded weights from /share/lab5/fuyu


#!export HF_ENDPOINT=https://hf-mirror.com
#!huggingface-cli download --resume-download adept/fuyu-8b  --local-dir your_path_of_fuyu

In [None]:
from PIL import Image
import torch
from transformers import FuyuProcessor, FuyuForCausalLM
import requests
import numpy as np

In [None]:
class Fuyu:
    """Wrapper that loads a Fuyu checkpoint with its processor and answers prompts.

    The model is placed on ``cuda:0`` in float16 when CUDA is available, and
    on the CPU in bfloat16 otherwise.
    """

    def __init__(self, model_id='your_path_of_fuyu'):
        """Load the model weights and the matching processor.

        Args:
            model_id: Path or identifier handed to ``from_pretrained`` for
                both the model and the processor.
        """
        if torch.cuda.is_available():
            print("You are running the model on GPU.")
            self.device = torch.device("cuda:0")
            self.dtype = torch.float16
        else:
            print("You are running the model on CPU.")
            self.device = torch.device("cpu")
            self.dtype = torch.bfloat16

        print('Begin loading.')
        self.model = FuyuForCausalLM.from_pretrained(
            model_id, device_map=self.device, torch_dtype=self.dtype
        )
        print('Checkpoints loaded!')
        self.processor = FuyuProcessor.from_pretrained(model_id)
        print('Processor loaded!')

    def prompt(self, text, image=None, out_tokens=200):
        """Generate a completion for a text prompt and an optional image.

        Args:
            text: Prompt text. A trailing newline is appended when missing
                (an empty prompt becomes a single newline).
            image: Image to condition on, or ``None`` for a text-only prompt.
            out_tokens: Maximum number of new tokens to generate.

        Returns:
            The decoded continuation, without the prompt tokens and with
            leading whitespace stripped.
        """
        # endswith() is safe on an empty string, unlike indexing text[-1].
        if not text.endswith("\n"):
            text += "\n"

        # Only wrap a real image in a list; passing [None] would hand the
        # processor a non-image entry instead of "no image".
        images = None if image is None else [image]
        inputs = self.processor(text=text, images=images, return_tensors="pt").to(self.device)

        prompt_len = inputs["input_ids"].shape[-1]
        print(f"prompt length: {prompt_len}")

        generation_output = self.model.generate(**inputs, max_new_tokens=out_tokens)
        # Slice off the prompt so only newly generated tokens are decoded.
        generation_text = self.processor.batch_decode(
            generation_output[:, prompt_len:], skip_special_tokens=True
        )
        return generation_text[0].lstrip()


In [None]:
# Load the model from the pre-downloaded weights. Loading can take a while the first time.
# If you encounter an 'out of memory' error, make sure no other programs are running on this GPU or restart the kernel.
# `fuyu` is the shared model instance used by the helper functions and example cells below.
model_id = "/share/lab5/fuyu" 
fuyu = Fuyu(model_id)

In [None]:
## A handy utility function that prints text with a maximum number of characters per line
def pprint(text, line_char=256):
    """Print ``text`` in chunks of ``line_char`` characters, keeping words whole.

    The text is cut every ``line_char`` characters. The word at the end of
    each chunk (which may have been cut in two) is held back and prepended to
    the next chunk, so a printed line can be longer than ``line_char``. Only
    the space character is treated as a word separator.

    Args:
        text: The string to print. Nothing is printed for an empty string.
        line_char: Chunk size in characters; must be a positive integer.
    """
    carry = ""  # trailing word held back from the previous chunk
    for start in range(0, len(text), line_char):
        words = (carry + text[start:start + line_char]).split(" ")
        carry = words[-1]
        # Hold the last word back only when another chunk follows. Using
        # "<=" here would also hold it back on the final chunk whenever
        # len(text) is a multiple of line_char, losing that word.
        if start + line_char < len(text):
            words = words[:-1]
        print(" ".join(words))


In [None]:
# This is how you use the model to generate text.

def run_image(prompt, img_path):
    """Show the image at ``img_path``, query the model with it, and print the answer.

    Args:
        prompt: Text prompt passed to ``fuyu.prompt``.
        img_path: Path of the image file to open.
    """
    # The context manager closes the underlying file once the pixels are copied.
    with Image.open(img_path) as image_pil:
        image_pil.show()
        # Convert to RGB so grayscale, palette and RGBA images all reach the
        # model as a 3-channel array. Stacking a 2-D array is only correct
        # for true grayscale, not for palette indices.
        image = np.array(image_pil.convert("RGB"))

    output = fuyu.prompt(prompt, image, out_tokens=100)
    pprint(f"OUTPUT: \n{output}", 50)


In [None]:
# Example: count the bottles of one named beer brand, with a hint that other brands may be present.
prompt = 'How many bottles of [Magna] beer are there? Please note that several types of beer might be on the table.'
img_path = '/share/lab5/data/test0.jpg'
run_image(prompt, img_path)

In [None]:
# Example: the prompt refers to numbered objects and a circled glass, testing whether the model follows such references.
prompt =' Describe what is Object 1 and object 2. Tell me what is in the circled glass.'
img_path = '/share/lab5/data/test1.jpg'
run_image(prompt, img_path)

In [None]:
## You can also load the image directly from a URL

def run_image_url(prompt, img_url, timeout=30):
    """Download an image from ``img_url``, show it, query the model, and print the answer.

    Args:
        prompt: Text prompt passed to ``fuyu.prompt``.
        img_url: URL of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    import io

    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(img_url, timeout=timeout)
    # Fail here with a clear HTTP error rather than later when PIL tries to
    # parse an error page as an image.
    response.raise_for_status()

    # response.content is the fully downloaded, content-decoded body.
    image_pil = Image.open(io.BytesIO(response.content))
    image_pil.show()
    # Convert to RGB so grayscale, palette and RGBA images all reach the
    # model as a 3-channel array.
    image = np.array(image_pil.convert("RGB"))

    output = fuyu.prompt(prompt, image, out_tokens=100)
    pprint(f"OUTPUT: \n{output}", 50)

In [None]:
## Example: ask an open-ended question about an image fetched from a URL.
prompt = 'what is in the image?'
url = "https://k.sinaimg.cn/n/sinakd20240410s/106/w1024h682/20240410/cf40-2931ffbf2b8611590b5b3384c200f2d4.png/w700d1q75cms.jpg?by=cms_fixed_width"
run_image_url(prompt, url)

In [None]:
# Example: structured extraction. The prompt asks for text read from the image as JSON with fixed keys, using "N/A" for missing fields.
prompt= ' Please read the text in this image and return the information in the following JSON format (note xxx is placeholder, if the information is not available in the image, put "N/A" instead). {"class": xxx, "DLN": xxx, "DOB": xxx, "Name": xxx, "Address": xxx, "EXP": xxx, "ISS": xxx, "SEX": xxx, "HGT": xxx, "WGT": xxx, "EYES": xxx, "HAIR": xxx, "DONOR": xxx}'
img_path = '/share/lab5/data/test2.jpg'
run_image(prompt, img_path)

In [None]:
# Example: ask the model to read out all scene text in the image.
prompt = 'What are all the scene text in the image?'
img_path = '/share/lab5/data/test15.jpg'
run_image(prompt, img_path)

In [None]:
# Example: plain object counting.
prompt ='Count the number of apples in the image.'
img_path = '/share/lab5/data/test3.jpg'
run_image(prompt, img_path)

In [None]:
# Example: object counting on test3.jpg, with the prompt asking for a row-by-row count.
prompt = 'Count the number of apples in the image row-by-row.'
img_path = '/share/lab5/data/test3.jpg'
run_image(prompt, img_path)

In [None]:
# Example: ask the model to describe a landmark.
prompt = 'Describe the landmark in the image.'
img_path = '/share/lab5/data/test6.jpg'
run_image(prompt, img_path)

In [None]:
# Example: ask the model to name a dish.
prompt = 'Describe the name of the dish.'
img_path = '/share/lab5/data/test7.jpg'
run_image(prompt, img_path)

In [None]:
# Example: ask the model to spot what is wrong with the foot shown in the figure.
prompt = 'What is wrong with the foot in this figure??'
img_path = '/share/lab5/data/test8.jpg'
run_image(prompt, img_path)

In [None]:
# Example: spatial reasoning about the relative position of two things in the image.
prompt ='What is the spatial relation between the frisbee and the man?'
img_path = '/share/lab5/data/test9.jpg'
run_image(prompt, img_path)

In [None]:
# Example: geography question about the oceans to the east and west of Africa.
prompt = 'Which oceans surround Africa?  both to the east and to the west.'
img_path = '/share/lab5/data/test13.jpg'
run_image(prompt, img_path)

In [None]:
#### Your Task ####
# Try at least three examples with your own images, using different prompts.  See if the model can generate the correct answer.