# InstructBLIP Eval Code

In [None]:
!pip install torch transformers Pillow scikit-learn numba

## Test Script

In [None]:
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
import torch
from PIL import Image
import requests
from numba import cuda

cuda.select_device(0) #COMMENT THIS IF NOT USED!!
cuda.close()

model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-13b")
processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-13b")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.half()  # Use float16 to reduce memory usage
model.to(device)


url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
prompt = "What is unusual about this image?"
inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

outputs = model.generate(
        **inputs,
        do_sample=True,
        max_length=30,  # Reduce max_length
        top_p=0.1,
        temperature=0.1,
)

generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
print(generated_text)


## Libs

In [1]:
import requests
from PIL import Image
from io import BytesIO
import re
import torch
import numpy as np
from sklearn.preprocessing import normalize
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration

## Logits Calculator

In [2]:
def calculate_log_prob(model, tokenizer, prefix, targets):
    log_sums = []
    for target in targets:
        input_tokens = tokenizer.encode(prefix, add_special_tokens=False, return_tensors='pt')
        output_tokens = tokenizer.encode(target, add_special_tokens=False, return_tensors='pt')

        tokens = torch.cat([input_tokens, output_tokens], dim=1)
        with torch.no_grad():
            outputs = model(tokens)
            logits = outputs.logits

        log_sum = 0
        range_index = range(input_tokens.shape[1] - 1, tokens.shape[1] - 1)
        len_range = tokens.shape[1] - 1 - (input_tokens.shape[1] - 1) 
        for i in range_index:
            past_tok, current_tok = i, i + 1
            token_logit = logits[0, past_tok, :]
            token_log_probs = torch.nn.functional.log_softmax(token_logit, dim=-1)
            log_token_prob = token_log_probs[tokens[0, current_tok]].item()
            log_sum += log_token_prob

        log_sums.append(log_sum / len_range)

    normalized_scores = normalize(log_sums)
    pred = targets[np.argmax(normalized_scores)]
    return pred, normalized_scores

## Non-Model Helpers

In [3]:
def image_parser(image_file):
    out = image_file.split(',')
    return out


def load_image(image_file):
    if image_file.startswith("http") or image_file.startswith("https"):
        response = requests.get(image_file)
        image = Image.open(BytesIO(response.content)).convert("RGB")
    else:
        image = Image.open(image_file).convert("RGB")
    return image


def load_images(image_files):
    out = []
    for image_file in image_files:
        image = load_image(image_file)
        out.append(image)
    return out

## Inference Helpers

In [4]:
def count_all_parameters(model):
    return sum(p.numel() for p in model.parameters())

def load_model_processor(model_path,fp_16=True):
    model = InstructBlipForConditionalGeneration.from_pretrained(model_path)
    processor = InstructBlipProcessor.from_pretrained(model_path)
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if fp_16:
        model.half()
    model.to(device)
    
    return model, processor, device


def eval_model(model, processor, image_files, query, options,device='cuda'):
    image_files = image_parser(image_files) 
    images = load_images(image_files)

    log_lik_scores = []

    for image in images:
        images_tensor = processor(images=image, return_tensors="pt").pixel_values.to(device).half() # cast to fp16

        for option in options:
            target_prompt = query + ' ' + option

            inputs = processor(text=target_prompt, return_tensors="pt").to(device)
            input_ids = inputs.input_ids
            attention_mask = inputs.attention_mask

            # Mandatory q-former for InstructBLIP
            qformer_inputs = processor(images=image, text=target_prompt, return_tensors="pt").to(device)
            qformer_input_ids = qformer_inputs.input_ids

            with torch.inference_mode(), torch.cuda.amp.autocast():
                outputs = model(
                    input_ids=input_ids,
                    labels=input_ids,
                    attention_mask=attention_mask,
                    pixel_values=images_tensor, #float-16
                    qformer_input_ids=qformer_input_ids, #required
                )

            log_lik_scores.append((option, -outputs.loss.item()))

    pred_id = np.argmax(np.asarray([x[1] for x in log_lik_scores]))
    print(log_lik_scores)
    print('Prediction: {}'.format(log_lik_scores[pred_id]))
    return log_lik_scores[pred_id]

In [5]:
from numba import cuda

if __name__ == '__main__':    

#     cuda.select_device(0) #COMMENT THIS IF NOT USED!!
#     cuda.close()
    
    model_path = "Salesforce/instructblip-vicuna-13b"
    model,processor, device = load_model_processor(model_path, fp_16=True)

    prompt = "What is this dish name?"
    image_file = "https://eatwellabi.com/wp-content/uploads/2019/01/IMG_5172-500x375.jpg"

    shared_prompt = 'This is an image of a '
    options = [shared_prompt+x for x in ['nasi goreng', 'nasi uduk', 'laksa', 'nasi kuning']]

    eval_model(
        model = model,
        processor = processor,
        image_files = image_file,
        query = prompt,
        options = options,
        device = device,
    )

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
Expanding inputs for image tokens in InstructBLIP should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
  with torch.inference_mode(), torch.cuda.amp.autocast():
Expanding inputs for image tokens in InstructBLIP should be 

[('This is an image of a nasi goreng', -2.786306381225586), ('This is an image of a nasi uduk', -3.217968463897705), ('This is an image of a laksa', -3.9249234199523926), ('This is an image of a nasi kuning', -3.511770725250244)]
Prediction: ('This is an image of a nasi goreng', -2.786306381225586)
