# Add Yarn package repository
# Step 1: download Yarn's public signing key and register it with apt.
# NOTE(review): `apt-key` is reported as deprecated on recent Debian/Ubuntu releases — confirm
# it still exists on the target image before relying on this line.
!curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add -
# Step 2: write the Yarn "stable main" source entry to /etc/apt/sources.list.d/yarn.list.
!echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list

!export NVM_DIR="$HOME/.nvm"
![ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh"
!nvm install 16
# Install Yarn
!sudo apt update
!sudo apt install yarn

# Install the Python dependencies, using `uv` as the package installer.
!pip install uv
!uv pip install -r requirements.txt
!uv pip install 'accelerate>=0.26.0'
# torch is installed before flash-attn: `--no-build-isolation` builds against the packages
# already present in the environment (presumably flash-attn needs torch at build time — confirm).
!uv pip install torch
!uv pip install flash-attn==2.7.4.post1 --no-build-isolation
# Run the repository's CircuitsVis install script.
!./install_circuitsvis.sh
# NOTE: Yarn must be installed as described in README.md (presumably a prerequisite of
# install_circuitsvis.sh — confirm against the script).

In [1]:
%load_ext autoreload
%autoreload 2
import torch
from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration
from PIL import Image
import matplotlib.pyplot as plt
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

torch.set_grad_enabled(False)  # avoid blowing up mem
device = "cuda"

In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch

# Load the SmolVLM-Instruct processor and model onto the GPU.
DEVICE = "cuda"
checkpoint = "HuggingFaceTB/SmolVLM-Instruct"

processor = AutoProcessor.from_pretrained(checkpoint)

# bfloat16 weights with the "eager" attention implementation
# (presumably chosen so attention weights can be read out for the plots — confirm).
model = AutoModelForVision2Seq.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,
    attn_implementation="eager",
)
model = model.to(DEVICE)

In [None]:
from PIL import Image
from transformers.image_utils import load_image

# Load the example image used by the single-image cells that follow.
img_path = "imgs/frisbee.jpg"
image = load_image(img_path)
# Hide the axes, then draw the image; assigning to `_` keeps the notebook from echoing
# the object returned by `imshow` as cell output.
plt.axis("off")
_ = plt.imshow(image)

In [None]:
from getAttentionLib import get_response, get_attention, dump_attn, get_img_grid_sizes, plot_mult_attn_sums

# Single-image VQA example: build the prompt, run generation, and print tokens and answer.
text = "<image>What color is the frisbee?"

inputs = processor(text=text, images=image, return_tensors="pt").to(model.device)
# NOTE(review): this assigns an *attribute* on `inputs`, while `model.generate(**inputs)`
# below unpacks the mapping's keys. If `inputs` is a dict-like BatchFeature, the sliced
# tensor presumably never reaches the model and only changes the shape printed on the next
# line. Confirm whether the slice is meant to take effect (it would then likely also need
# the image tokens / attention mask to match a single image tile).
inputs.pixel_values = inputs.pixel_values[:, :1, :, :, :]
print(inputs.pixel_values.shape)
print(inputs.keys())
print(inputs.input_ids)
# Token strings for the first (only) sequence in the batch.
input_tokens = processor.tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
outputs = model.generate(**inputs, max_new_tokens=100)
# Decode the first generated sequence, dropping special tokens.
response: str = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)


print('inputs_tokens')
print(input_tokens)
print(outputs)

print('\nResponse:')
print(response)


# Alternative path through the library helper (disabled).
# NOTE(review): `prompt` is not defined in the cells visible here; `text` is the likely argument.
# Prepare inputs
# inputs_tokens, response = get_response(model, processor, prompt, image)
# print(inputs_tokens)
# print(response)


In [24]:
# text = "<image>What color is the frisbee?"
# inputs = processor(text=text, images=image, return_tensors="pt").to(model.device)

# # Check dimensions of inputs
# print("Input IDs shape:", inputs.input_ids.shape, inputs.input_ids[:13])
# print("Image embeddings shape:", inputs.image_embeds.shape if hasattr(inputs, 'image_embeds') else "No direct image embeddings")

# # Get attention from first layer to see actual dimensions
# outputs = model(input_ids=inputs.input_ids, image=image, output_attentions=True)
# first_layer_attention = outputs.attentions[0]
# print("\nAttention shape:", first_layer_attention.shape)

# Create Multimodal Attention Visualization

# Plot Region-wise Attention

In [None]:
# Plot the attention sums of the frisbee example for layers 0, 10 and 20,
# write the figure to disk, then display it.
fig = plot_mult_attn_sums(
    model,
    inputs,
    layers=[0, 10, 20],
)
fig.savefig("imgs/blockwise-attn-sums-frisbee.png")
fig.show()

# Verify Numbers on VQA

In [None]:
import tqdm
from getAttentionLib import compute_mult_attn_sums, load_vqa_ds, plot_images_grid

# Collect responses and per-layer attention sums for the first `n_vqa_samples`
# distinct images of the VQA training split.
n_vqa_samples = 15 # 1000
ds = load_vqa_ds(split="train")

layers = [0, 10, 20]
attens_tensor = []  # one stacked tensor (one entry per layer) per processed sample
responses = []      # "question|answer" strings; split again on '|' by process_response
imgs = []
seen_imgs = set()

# Context manager so the progress bar is closed even if an iteration raises.
with tqdm.tqdm(total=n_vqa_samples) as pbar:
    for row in ds:
        if len(imgs) >= n_vqa_samples:
            break

        # Use each image at most once, even if the dataset has several questions for it.
        if row["image_id"] in seen_imgs:
            continue
        seen_imgs.add(row["image_id"])

        # Take the question straight from the row. `str.strip('<image>')` strips a *set of
        # characters* from both ends, not the literal prefix, and could eat question text.
        question = row["question"]
        print(question)
        text = f"<image>{question}"
        try:
            inputs = processor(text=text, images=row["image"], return_tensors="pt").to(model.device)
        except ValueError as e: # Unsupported number of image dimensions: 2
            if 'Unsupported number' in str(e):
                continue
            # Any other ValueError is unexpected. Re-raise instead of silently carrying on
            # with the `inputs` left over from a previous sample.
            raise

        response = get_response(model, processor, text, row["image"])[1]
        #responses.append(response.replace("\n", " A: ").replace("Answer en", "Q:"))
        # Keep what follows the echoed question; if the question is not echoed, keep the
        # whole response rather than failing with an IndexError.
        _, echoed, answer = response.partition(question)
        if not echoed:
            answer = response
        responses.append(question + '|' + answer)

        imgs.append(row["image"])

        mult_attn_sums = compute_mult_attn_sums(model, inputs, layers=layers)
        attens_tensor.append(torch.stack(mult_attn_sums))

        pbar.update(1)

stacked_attens = torch.stack(attens_tensor)
# Sanity check: one 3x3 block of sums per layer per sample. This fails if the dataset
# yielded fewer than `n_vqa_samples` usable images.
assert stacked_attens.shape == (n_vqa_samples, len(layers), 3, 3)

In [None]:
# Aggregate over the sample dimension: mean and standard deviation of the 3x3 sums per layer.
expected_shape = (len(layers), 3, 3)
means = stacked_attens.mean(dim=0)
stds = stacked_attens.std(dim=0)
assert means.shape == expected_shape
assert stds.shape == expected_shape

# Plot the precomputed statistics (no model or inputs are passed), save, then display.
fig = plot_mult_attn_sums(
    None,
    None,
    layers=layers,
    mult_attn_sums=means,
    stds=stds,
)
fig.savefig("imgs/blockwise-attn-sums-vqa1000.png")
fig.show()

# Show VQA Grid

In [None]:
import textwrap

def process_response(r: str, *, chars_per_line: int = 20) -> str:
    """Format a ``"question|answer"`` string as a short two-part caption.

    Only the first sentence of the answer is kept (text up to the first ``.``
    or ``?``), and it is word-wrapped so it fits inside a subplot.

    Args:
        r: Question and answer joined by ``'|'``. Only the first ``'|'`` is
            treated as the separator, so the answer itself may contain ``'|'``.
        chars_per_line: Maximum line width used when wrapping the answer.

    Returns:
        ``"Q: <question>\\nA: <wrapped first sentence of the answer>"``.

    Raises:
        ValueError: If ``r`` contains no ``'|'`` separator.
    """
    # maxsplit=1: a '|' inside the answer must not break the two-way unpacking.
    q, a = r.split('|', 1)
    first_response = a.split('.')[0].split('?')[0]

    # Wrap text to fit within subplot
    a = textwrap.fill(first_response, width=chars_per_line)
    return f"Q: {q}\nA: {a}"

# Preview the formatted caption for the first collected response (shown as the cell output).
process_response(responses[0])

In [None]:
# Show the first 15 images with their formatted question/answer captions in a 3x5 grid,
# save the figure, then display it.
proc_responses = list(map(process_response, responses))
fig = plot_images_grid(
    imgs[:15],
    proc_responses[:15],
    nrows=3,
    ncols=5,
    figsize=(10, 6),
)
fig.savefig("imgs/vqa-grid-of-img-question-answer.png")
fig.show()