In [None]:
import torch

# Hub repository, entry point and local checkpoint for the DINOv3 backbone.
HUB_REPO = 'facebookresearch/dinov3'
MODEL_NAME = 'dinov3_convnext_small'
WEIGHTS_PATH = '/home/spactureai/xalil/dinov3/models/dinov3_convnext_small_pretrain_lvd1689m-296db49d.pth'

# NOTE(review): a later cell rebinds `model` to the dino.txt model, so in the
# cells visible here this backbone is not used again — confirm it is still needed.
model = torch.hub.load(repo_or_dir=HUB_REPO, model=MODEL_NAME, weights=WEIGHTS_PATH)

# MODEL_NAME can be one of:
# - dinov3_vits16
# - dinov3_vits16plus
# - dinov3_vitb16
# - dinov3_vitl16
# - dinov3_vith16plus
# - dinov3_vit7b16
# - dinov3_convnext_tiny
# - dinov3_convnext_small
# - dinov3_convnext_base
# - dinov3_convnext_large



In [None]:
#import torch

from dinov3.hub.dinotxt import dinov3_vitl16_dinotxt_tet1280d20h24l
# Build the dino.txt model together with its tokenizer, using the factory's
# default arguments.
# NOTE(review): this rebinds `model`, replacing the object assigned by the
# torch.hub.load cell above.
model, tokenizer = dinov3_vitl16_dinotxt_tet1280d20h24l()
# Move the model to the GPU (requires an available CUDA device).
model = model.to(device="cuda")

In [None]:
import urllib
from PIL import Image

def load_image_from_url(url: str) -> Image.Image:
    """Download an image and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch.

    Returns:
        The image opened from the response and converted to "RGB" mode.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
    """
    # `import urllib` on its own does not guarantee that the `urllib.request`
    # submodule is loaded; import it explicitly so this works on a fresh kernel.
    import urllib.request

    with urllib.request.urlopen(url) as response:
        # convert() runs inside the `with`, i.e. before the response is closed.
        return Image.open(response).convert("RGB")


# Sample picture fetched over the network; `img_pil` is reused by a later cell
# as the input to the image preprocessing transform.
EXAMPLE_IMAGE_URL = "https://dl.fbaipublicfiles.com/dinov2/images/example.jpg"
# load_image_from_url returns the image already converted to RGB.
img_pil = load_image_from_url(EXAMPLE_IMAGE_URL)
# `display` is not imported in the visible cells; it is presumably the
# IPython/Jupyter built-in, so this line only works inside a notebook kernel.
display(img_pil)

In [None]:
import torch
from dinov3.data.transforms import make_classification_eval_transform

# Preprocess the example image and stack it into a batch of one on the GPU.
image_preprocess = make_classification_eval_transform()
image_tensor = torch.stack([image_preprocess(img_pil)], dim=0).cuda()

# One text prompt per candidate class; `class_names` follows the same order.
class_names = ["dog", "chair", "bowl", "tupperware"]
texts = [
    "photo of dogs",
    "photo of a chair",
    "photo of a bowl",
    "photo of a tupperware",
]
tokenized_texts_tensor = tokenizer.tokenize(texts).cuda()
model = model.cuda()

# NOTE(review): dtype=torch.float is float32 — confirm autocast is meant to run
# with that dtype rather than a reduced-precision one.
with torch.autocast('cuda', dtype=torch.float), torch.no_grad():
    image_features = model.encode_image(image_tensor)
    text_features = model.encode_text(tokenized_texts_tensor)

# Scale every embedding to unit L2 norm (in place) so that the dot products
# below are cosine similarities.
image_features.div_(image_features.norm(dim=-1, keepdim=True))
text_features.div_(text_features.norm(dim=-1, keepdim=True))

# Rows follow `texts`, columns follow the images in `image_tensor`.
text_features_np = text_features.cpu().float().numpy()
image_features_np = image_features.cpu().float().numpy()
similarity = text_features_np @ image_features_np.T
print(similarity)

In [None]:
from torchvision.datasets import ImageFolder
from dinov3.data.transforms import make_classification_eval_transform

# Rebuild the eval transform with a 512 resize and a 512 crop. This rebinds
# `image_preprocess`, which an earlier cell created with default arguments.
image_preprocess = make_classification_eval_transform(resize_size=512, crop_size=512)
# Please update the following directory to the root of ImageNet1k val dataset.
imagenet_val_root_dir = "/home/spactureai/xalil/dinov3/imagenet"
# NOTE(review): a later cell deletes files and renames directories under this
# same root. If ImageFolder indexes files when constructed (TODO confirm), a
# dataset built before that cell runs will hold stale paths — check cell order.
val_dataset = ImageFolder(imagenet_val_root_dir, image_preprocess)
# Put the model in eval mode and make sure it is on the GPU.
model = model.eval().cuda()

In [None]:
import os
import json


def get_list(path):
    """Return the names of the entries in `path`, sorted alphabetically.

    os.listdir() returns entries in arbitrary order; sorting makes any
    "first n entries" logic built on top of this listing reproducible.

    Args:
        path: Directory to list.

    Returns:
        A sorted list of entry names (not full paths).
    """
    return sorted(os.listdir(path))


def rename_directory(root_path, directory, targets):
    """Rename `root_path/directory` to the name that `targets` maps it to.

    Calling this again after a successful rename is a no-op: an entry whose
    name is already one of the target names is left untouched, so the
    enclosing cell can be re-run safely.

    Args:
        root_path: Directory that contains `directory`.
        directory: Current name of the entry to rename.
        targets: Mapping from current name to new name.

    Raises:
        KeyError: If `directory` is neither a key nor a value of `targets`.
    """
    if directory not in targets:
        if directory in targets.values():
            # Already carries its target name (e.g. the cell was re-run).
            return
        raise KeyError(f"no target name known for directory {directory!r}")
    source = os.path.join(root_path, directory)
    target = os.path.join(root_path, targets[directory])
    os.rename(source, target)


def remove_files(root_path, directory, n_remained):
    """Delete all but the first `n_remained` entries of `root_path/directory`.

    Entries are ordered alphabetically before trimming, so the surviving
    subset is the same on every run and on every filesystem.

    Args:
        root_path: Directory that contains `directory`.
        directory: Name of the sub-directory to trim.
        n_remained: Number of entries to keep; must be >= 0.

    Raises:
        ValueError: If `n_remained` is negative (a negative slice start would
            silently keep the wrong number of entries).
    """
    if n_remained < 0:
        raise ValueError(f"n_remained must be non-negative, got {n_remained}")
    target_dir = os.path.join(root_path, directory)
    # os.listdir order is arbitrary; sort so "the first n" is well defined.
    file_names = sorted(os.listdir(target_dir))
    for file_name in file_names[n_remained:]:
        os.remove(os.path.join(target_dir, file_name))


# Run the program: trim every directory under `root_path` to 5 files, then
# rename the directories according to ./imagenet_class_index.json.
# WARNING: this deletes and renames files on disk.
root_path = '/home/spactureai/xalil/dinov3/imagenet'

with open('./imagenet_class_index.json') as index_file:
    json_data = json.load(index_file)

# Invert the index: the first element of each entry (the current directory
# name) maps to the entry's key (the new directory name).
targets = {entry[0]: key for key, entry in json_data.items()}

directories = get_list(root_path)
# Trim all directories first, and only then rename them.
for directory in directories:
    remove_files(root_path, directory, 5)
for directory in directories:
    rename_directory(root_path, directory, targets)

In [8]:
# Encode the image once more, this time keeping the per-patch tokens, and take
# the slice of the text embedding used for patch-level matching.
with torch.autocast('cuda', dtype=torch.float), torch.no_grad():
    image_class_tokens, image_patch_tokens, backbone_patch_tokens = (
        model.encode_image_with_patch_tokens(image_tensor)
    )
    # Part of text features that is aligned to patch features.
    # NOTE(review): 1024 is presumably the width of the leading part of the
    # text embedding that is not patch-aligned — confirm against the model.
    text_features_aligned_to_patch = model.encode_text(tokenized_texts_tensor)[:, 1024:]

In [9]:
import torch.nn.functional as F

# image_patch_tokens is unpacked as three dimensions, used below as
# (batch, patches, channels).
B, P, D = image_patch_tokens.shape
# The patches are assumed to form a square grid.
H = W = int(P**0.5)
# [B, P, D] -> [B, D, P] -> [B, D, H, W]
x = image_patch_tokens.transpose(1, 2).unflatten(-1, (H, W)).float()
# Upsample the patch grid to 480x640.
# NOTE(review): presumably the example image's resolution — confirm.
x = F.interpolate(x, size=(480, 640), mode="bicubic", align_corners=False)
# Unit-normalise along the channel axis on both sides so the contraction
# below yields cosine similarities.
x = F.normalize(x, p=2, dim=1)
y = F.normalize(text_features_aligned_to_patch.float(), p=2, dim=1)
# [B, D, H, W] x [C, D] -> [B, C, H, W]: one similarity map per text prompt.
per_patch_similarity_to_text = torch.einsum("bdhw,cd->bchw", x, y)
# Index of the best-matching prompt at every pixel; the batch axis is dropped
# when it has size 1.
pred_idx = per_patch_similarity_to_text.argmax(dim=1).squeeze(dim=0)