In [11]:
from transformers import AutoImageProcessor, AutoModel
from transformers.models.dinov2.modeling_dinov2 import Dinov2Model
from PIL import Image
import requests
import torch
import numpy as np

# Download the COCO sample image used throughout this notebook.
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of letting PIL choke on an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
print(image.height, image.width)  # 480 640

# Load the DINOv2-small checkpoint with its default preprocessing.
processor = AutoImageProcessor.from_pretrained('facebook/dinov2-small')
model = Dinov2Model.from_pretrained('facebook/dinov2-small')


inputs = processor(images=image, return_tensors="pt")
print(f"processed image shape: {inputs.pixel_values.shape}")  # [1, 3, 224, 224]


# Inference only: no_grad avoids building an autograd graph for the forward pass.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
print(len(outputs.hidden_states))  # 13
# Same tensor as outputs[0], but addressed by name for readability.
last_hidden_states = outputs.last_hidden_state
print(f"last hidden states shape: {last_hidden_states.shape}")  # [1, 1 + 256, 384]

# Token 0 is the CLS token; the remaining tokens are the flattened patch grid,
# which is reshaped back to (height, width) in units of patches.
num_patches_height = inputs.pixel_values.shape[2] // model.config.patch_size
num_patches_width = inputs.pixel_values.shape[3] // model.config.patch_size
cls_token = last_hidden_states[:, 0, :]
patch_features = last_hidden_states[:, 1:, :].unflatten(1, (num_patches_height, num_patches_width))

print(f"cls token shape: {cls_token.shape}")  # [1, 384]
print(f"patch features shape: {patch_features.shape}")  # [1, 16, 16, 384]





480 640
processed image shape: torch.Size([1, 3, 224, 224])
13
last hidden states shape: torch.Size([1, 257, 384])
cls token shape: torch.Size([1, 384])
patch features shape: torch.Size([1, 16, 16, 384])


In [None]:
# Change the input image size: resize straight to 1024x1024 and skip the center crop.
processor = AutoImageProcessor.from_pretrained(
    'facebook/dinov2-small',
    size={"width": 1024, "height": 1024},
    do_center_crop=False,
)
inputs = processor(images=image, return_tensors="pt")
print(f"processed image shape: {inputs.pixel_values.shape}")  # [1, 3, 1024, 1024]


# Inference only: no_grad avoids building an autograd graph, which matters here
# because the sequence is ~5k tokens long. The intermediate hidden states are not
# used in this cell, so they are not requested.
with torch.no_grad():
    outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
print(f"last hidden states shape: {last_hidden_states.shape}")  # [1, 1 + 73 * 73, 384]

# 1024 is not a multiple of the patch size, so the integer division rounds down:
# the patch grid spans fewer pixels than the image (see the last print below).
num_patches_height = inputs.pixel_values.shape[2] // model.config.patch_size
num_patches_width = inputs.pixel_values.shape[3] // model.config.patch_size
cls_token = last_hidden_states[:, 0, :]
patch_features = last_hidden_states[:, 1:, :].unflatten(1, (num_patches_height, num_patches_width))

print(f"cls token shape: {cls_token.shape}")  # [1, 384]
print(f"patch features shape: {patch_features.shape}")  # [1, 73, 73, 384]

# Pixel extent actually covered by whole patches (1022 x 1022 for a 1024 x 1024 input).
print(f"patches x patch size: {num_patches_height*model.config.patch_size} x {num_patches_width*model.config.patch_size}")

processed image shape: torch.Size([1, 3, 1024, 1024])
last hidden states shape: torch.Size([1, 5330, 384])
cls token shape: torch.Size([1, 384])
patch features shape: torch.Size([1, 73, 73, 384])
patches x patch size: 1022 x 1022


13
torch.Size([1, 257, 384])


NameError: name 'batch_size' is not defined

In [None]:
# Show the concrete classes behind the model, its config and the image processor,
# one per line.
print(*(type(component) for component in (model, model.config, processor)), sep="\n")

<class 'transformers.models.dinov2.modeling_dinov2.Dinov2Model'>
<class 'transformers.models.dinov2.configuration_dinov2.Dinov2Config'>
<class 'transformers.models.bit.image_processing_bit.BitImageProcessor'>


In [None]:
# Now do the same for the DINOv2-large model, resizing to 518x518 without a center crop.

model = AutoModel.from_pretrained('facebook/dinov2-large')
processor = AutoImageProcessor.from_pretrained('facebook/dinov2-large',size={"height":518,"width":518},do_center_crop=False)
print(processor)
# Timeout and status check so a stalled or failed download errors out clearly.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

inputs = processor(images=image, return_tensors="pt")
print(inputs.pixel_values.shape)  # [1, 3, 518, 518]
batch_size, rgb, img_height, img_width = inputs.pixel_values.shape
# `patch_size` was never defined in this notebook (the cell only worked with leftover
# kernel state); read it from the loaded model's config.
patch_size = model.config.patch_size
num_patches_height, num_patches_width = img_height // patch_size, img_width // patch_size
num_patches_flat = num_patches_height * num_patches_width

# Inference only: no_grad avoids building an autograd graph for the forward pass.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
print(len(outputs.hidden_states))  # 25
# Same tensor as outputs[0], but addressed by name for readability.
last_hidden_states = outputs.last_hidden_state
print(last_hidden_states.shape)  # [1, 1 + 1369, 1024]
# One CLS token plus one token per patch, each of width hidden_size.
assert last_hidden_states.shape == (batch_size, 1 + num_patches_flat, model.config.hidden_size)
print(num_patches_height,num_patches_width)  # 37 37











BitImageProcessor {
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": false,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "BitImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 518,
    "width": 518
  }
}

torch.Size([1, 3, 518, 518])
25
torch.Size([1, 1370, 1024])
37 37
