In [2]:
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import cv2

from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Running on device: {device}')

Running on device: cuda


In [4]:
REPO_NAME = "facebookresearch/dinov2"
MODEL_NAME = "dinov2_vits14_reg"
model = torch.hub.load(REPO_NAME, 'dinov2_vits14').to(device)

Using cache found in /home/mn4560/.cache/torch/hub/facebookresearch_dinov2_main


In [5]:
class DinoV2Matcher:
    def __init__(
    self, 
    repo_name=REPO_NAME, 
    model_name=MODEL_NAME, 
    smaller_edge_size=448, 
    half_precision=False, 
    device="cuda"
    ):
        self.repo_name = repo_name
        self.model_name = model_name
        self.smaller_edge_size = smaller_edge_size
        self.half_precision = half_precision
        self.device = device

        if self.half_precision:
            self.model = torch.hub.load(repo_or_dir=repo_name, model=model_name).half().to(self.device)
        else:
            self.model = torch.hub.load(repo_or_dir=repo_name, model=model_name).to(self.device)

        self.model.eval()

        # Rescale to ImageNet defaults
        self.transform = transforms.Compose([
            transforms.Resize(
                size=smaller_edge_size, 
                interpolation=transforms.InterpolationMode.BICUBIC, 
                antialias=True
            ),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
            ])


    # https://github.com/facebookresearch/dinov2/blob/255861375864acdd830f99fdae3d9db65623dafe/notebooks/features.ipynb
    def prepare_image(
        self, 
        rgb_image_numpy
    ):
        image = Image.fromarray(rgb_image_numpy)
        image_tensor = self.transform(image)
        resize_scale = image.width / image_tensor.shape[2]

        # Crop image to dimensions that are a multiple of the patch size
        height, width = image_tensor.shape[1:] # -> C x H x W
        
        # crop a bit from right and bottom parts
        cropped_width, cropped_height = width - width % self.model.patch_size, height - height % self.model.patch_size
        image_tensor = image_tensor[:, :cropped_height, :cropped_width]

        grid_size = (cropped_height // self.model.patch_size, cropped_width // self.model.patch_size)
        return image_tensor, grid_size, resize_scale


    def prepare_mask(
        self, 
        mask_image_numpy, 
        grid_size, 
        resize_scale
    ):
        cropped_mask_image_numpy = mask_image_numpy[:int(grid_size[0]*self.model.patch_size*resize_scale), :int(grid_size[1]*self.model.patch_size*resize_scale)]
        image = Image.fromarray(cropped_mask_image_numpy)
        resized_mask = image.resize((grid_size[1], grid_size[0]), resample=Image.Resampling.NEAREST)
        resized_mask = np.asarray(resized_mask).flatten()
        return resized_mask


    def extract_features(
        self, 
        image_tensor
    ):
        with torch.inference_mode():
            if self.half_precision:
                image_batch = image_tensor.unsqueeze(0).half().to(self.device)
            else:
                image_batch = image_tensor.unsqueeze(0).to(self.device)

            tokens = self.model.get_intermediate_layers(image_batch)[0].squeeze()
            print(tokens)
            return tokens.cpu().numpy()


    def idx_to_source_position(
        self, 
        idx, 
        grid_size, 
        resize_scale
    ):
        row = (idx // grid_size[1])*self.model.patch_size*resize_scale + self.model.patch_size / 2
        col = (idx % grid_size[1])*self.model.patch_size*resize_scale + self.model.patch_size / 2
        return row, col


    def get_embedding_visualization(
        self, 
        tokens, 
        grid_size, 
        resized_mask=None
    ):
        pca = PCA(n_components=3)
        if resized_mask is not None:
            tokens = tokens[resized_mask]
        reduced_tokens = pca.fit_transform(tokens.astype(np.float32))
        if resized_mask is not None:
            tmp_tokens = np.zeros((*resized_mask.shape, 3), dtype=reduced_tokens.dtype)
            tmp_tokens[resized_mask] = reduced_tokens
            reduced_tokens = tmp_tokens
        reduced_tokens = reduced_tokens.reshape((*grid_size, -1))
        normalized_tokens = (reduced_tokens-np.min(reduced_tokens))/(np.max(reduced_tokens)-np.min(reduced_tokens))
        return normalized_tokens


    def get_combined_embedding_visualization(
        self, 
        tokens1, 
        token2, 
        grid_size1, 
        grid_size2, 
        mask1=None, 
        mask2=None, 
        random_state=20
    ):
        pca = PCA(n_components=3, random_state=random_state)
        
        token1_shape = tokens1.shape[0]
        if mask1 is not None:
            tokens1 = tokens1[mask1]
        if mask2 is not None:
            token2 = token2[mask2]
        combinedtokens= np.concatenate((tokens1, token2), axis=0)
        reduced_tokens = pca.fit_transform(combinedtokens.astype(np.float32))
        
        
        if mask1 is not None and mask2 is not None:
            resized_mask = np.concatenate((mask1, mask2), axis=0)
            tmp_tokens = np.zeros((*resized_mask.shape, 3), dtype=reduced_tokens.dtype)
            tmp_tokens[resized_mask] = reduced_tokens
            reduced_tokens = tmp_tokens
        elif mask1 is not None and mask2 is None:
            return sys.exit("Either use both masks or none")
        elif mask1 is None and mask2 is not None:
            return sys.exit("Either use both masks or none")
        
        print("tokens1.shape", tokens1.shape)
        print("token2.shape", token2.shape)
        print("reduced_tokens.shape", reduced_tokens.shape)
        normalized_tokens = (reduced_tokens-np.min(reduced_tokens))/(np.max(reduced_tokens)-np.min(reduced_tokens))

        rgbimg1 = normalized_tokens[0:token1_shape,:]
        rgbimg2 = normalized_tokens[token1_shape:,:]

        rgbimg1 = rgbimg1.reshape((*grid_size1, -1))
        rgbimg2 = rgbimg2.reshape((*grid_size2, -1))
        return rgbimg1,rgbimg2

In [6]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

def get_keypoints(features, grid_size, resize_scale, top_k=10):
    # Compute the mean feature vector
    mean_feature = np.mean(features, axis=0)

    # Compute the L2 distance between each feature vector and the mean feature vector
    distances = np.linalg.norm(features - mean_feature, axis=1)

    # Get the indices of the top-k salient features
    salient_indices = np.argsort(distances)[-top_k:]

    # Convert salient indices to keypoint locations
    keypoints = [dm.idx_to_source_position(idx, grid_size, resize_scale) for idx in salient_indices]

    return keypoints


# Load image and mask
image1 = cv2.cvtColor(cv2.imread('Images/asd1.jpg', cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB)
mask1 = cv2.imread('Images/asd1_mask.jpg', cv2.IMREAD_COLOR)[:, :, 0] > 127

image2 = cv2.cvtColor(cv2.imread('Images/asd2.jpg', cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB)
mask2 = cv2.imread('Images/asd2_mask.jpg', cv2.IMREAD_COLOR)[:, :, 0] > 127

# Init model
dm = DinoV2Matcher(repo_name=REPO_NAME, model_name=MODEL_NAME, half_precision=False)

# Extract features
image_tensor1, grid_size1, resize_scale1 = dm.prepare_image(image1)
features1 = dm.extract_features(image_tensor1)

image_tensor2, grid_size2, resize_scale2 = dm.prepare_image(image2)
features2 = dm.extract_features(image_tensor2)

# Prepare masks
resized_mask1 = dm.prepare_mask(mask1, grid_size1, resize_scale1)
resized_mask2 = dm.prepare_mask(mask2, grid_size2, resize_scale2)

# Get keypoints
keypoints1 = get_keypoints(features1, grid_size1, resize_scale1, top_k=300)
keypoints2 = get_keypoints(features2, grid_size2, resize_scale2, top_k=300)

# Compute PCA between patches of the images
pca = PCA(n_components=3)
pca_features1 = []
pca_features2 = []

for keypoint1, keypoint2 in zip(keypoints1, keypoints2):
    patch1 = features1[keypoint1[0], keypoint1[1], :]
    patch2 = features2[keypoint2[0], keypoint2[1], :]
    pca_features = pca.fit_transform(np.vstack((patch1, patch2)))
    pca_features1.append(pca_features[0])
    pca_features2.append(pca_features[1])

pca_features1 = np.array(pca_features1)
pca_features2 = np.array(pca_features2)

# Normalize PCA features
pca_features1 = (pca_features1 - np.min(pca_features1)) / (np.max(pca_features1) - np.min(pca_features1))
pca_features2 = (pca_features2 - np.min(pca_features2)) / (np.max(pca_features2) - np.min(pca_features2))

# Threshold the first PCA component to remove background
threshold = 0.2
pca_features1[:, 0] = np.where(pca_features1[:, 0] > threshold, pca_features1[:, 0], 0)
pca_features2[:, 0] = np.where(pca_features2[:, 0] > threshold, pca_features2[:, 0], 0)

# Visualization
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 12))

# Plot PCA features for image1
ax1.imshow(image1)
for keypoint, pca_feature in zip(keypoints1, pca_features1):
    color = pca_feature
    ax1.plot(keypoint[1], keypoint[0], 'o', markersize=5, color=color)
ax1.set_title("Image 1 with PCA Features")

# Plot PCA features for image2
ax3.imshow(image2)
for keypoint, pca_feature in zip(keypoints2, pca_features2):
    color = pca_feature
    ax3.plot(keypoint[1], keypoint[0], 'o', markersize=5, color=color)
ax3.set_title("Image 2 with PCA Features")

# Use mask
vis_image3, vis_image4 = dm.get_combined_embedding_visualization(
    features1, features2, grid_size1, grid_size2, resized_mask1, resized_mask2
)

ax2.imshow(vis_image3)
ax2.set_title("Combined Embedding Visualization (Image 1)")
ax4.imshow(vis_image4)
ax4.set_title("Combined Embedding Visualization (Image 2)")

fig.tight_layout()
plt.show()

Using cache found in /home/mn4560/.cache/torch/hub/facebookresearch_dinov2_main


tensor([[-1.5454,  2.0980,  3.3032,  ...,  2.4588,  2.0603,  3.0550],
        [-1.3895,  1.8239,  3.3942,  ...,  1.7078,  2.0420,  2.8918],
        [-0.6265,  2.0717,  3.9208,  ...,  0.4604,  2.1315,  2.5929],
        ...,
        [-0.0944, -1.7360,  3.8943,  ..., -0.0304,  0.8920,  3.4019],
        [-0.0931,  0.7979,  3.7121,  ...,  0.9696,  0.8654,  2.9916],
        [-0.3459, -0.2272,  2.5107,  ...,  0.9055,  1.2308,  2.6028]],
       device='cuda:0')
tensor([[-1.6779,  1.1822,  2.1663,  ...,  2.5440,  1.4690,  2.4890],
        [-1.7166,  1.0868,  2.7987,  ...,  1.5234,  1.7411,  2.6487],
        [-2.6033,  0.5032,  2.5419,  ...,  2.0332,  1.3798,  2.6301],
        ...,
        [-1.6551, -2.2631,  1.7313,  ...,  1.0665,  0.2895,  3.9159],
        [-1.4922, -0.7579,  2.4468,  ...,  1.3105,  0.5018,  3.8212],
        [-1.2103, -1.1017,  1.5953,  ...,  1.6000,  0.4576,  3.6525]],
       device='cuda:0')


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

: 

### Video Example ###

ModuleNotFoundError: No module named 'mmcv'

: 