# Computing the PCA of a Foreground Object

### Setup

Let's start by loading some pre-requisites and checking the DINOv3 repository location:
- `local` if `DINOV3_LOCATION` environment variable was set to work with a local version of DINOv3 repository;
- `github` if the code should be loaded via torch hub.

In [1]:
import pickle
import os
import urllib

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

import torch
import torchvision.transforms.functional as TF
from sklearn.decomposition import PCA
from scipy import signal

import random

DINOV3_GITHUB_LOCATION = "/home/lades/computer_vision/wesley/dino-soja/dinov3"

if os.getenv("DINOV3_LOCATION") is not None:
    DINOV3_LOCATION = os.getenv("DINOV3_LOCATION")
else:
    DINOV3_LOCATION = DINOV3_GITHUB_LOCATION

print(f"DINOv3 location set to {DINOV3_LOCATION}")

# examples of available DINOv3 models:
MODEL_DINOV3_VITS = "dinov3_vits16"
MODEL_DINOV3_VITSP = "dinov3_vits16plus"
MODEL_DINOV3_VITB = "dinov3_vitb16"
MODEL_DINOV3_VITL = "dinov3_vitl16"
MODEL_DINOV3_VITHP = "dinov3_vith16plus"
MODEL_DINOV3_VIT7B = "dinov3_vit7b16"

MODEL_NAME = MODEL_DINOV3_VITL

print(f"Loading DINOv3 model {MODEL_NAME}...")

model = torch.hub.load(
    repo_or_dir=DINOV3_LOCATION,
    model=MODEL_NAME,
    source="local",
    weights="https://dinov3.llamameta.net/dinov3_vitl16/dinov3_vitl16_pretrain_sat493m-eadcf0ff.pth?Policy=eyJTdGF0ZW1lbnQiOlt7InVuaXF1ZV9oYXNoIjoiamVkbGhoemY3bnlpYmJ1NnVhdmJ2NGtrIiwiUmVzb3VyY2UiOiJodHRwczpcL1wvZGlub3YzLmxsYW1hbWV0YS5uZXRcLyoiLCJDb25kaXRpb24iOnsiRGF0ZUxlc3NUaGFuIjp7IkFXUzpFcG9jaFRpbWUiOjE3NTcxNzAwMzh9fX1dfQ__&Signature=YktyLH8aRk9MBqJe9X6LfVwzezdJo51sVHwKx29n7BTo88T8QR4HHDj9kCyi2wVyUsI4qDVXoJWfHDdbbkxiEPqXukQUDVtWq25HG5ODC8JP-%7ENDbRUPQYkPUHER6ssK9WK5K4Nva6EdiBtIKqMm9G3RMh1uFA86wGz4X4FvgDiaRh4aCWsM5jLg3Gsvr1QIFoPpgAO%7EOnYYG-wxj1VRnrY32Wm6OoNw61M96kgfPXNxEFwYphBu2ImJykrvgc0Yea1J2jV3FwECfDmdVftQ3okThpbxwPJ-wsAxDXc2z2-bJaPQowafr8afLKO2hFl57iMiDQELW2DDS9GYgvpw7Q__&Key-Pair-Id=K15QRJLYKIFSLZ&Download-Request-ID=783982447506725"
)
model.cuda()

DINOv3 location set to /home/lades/computer_vision/wesley/dino-soja/dinov3
Loading DINOv3 model dinov3_vitl16...


DinoVisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (rope_embed): RopePositionEmbedding()
  (blocks): ModuleList(
    (0-23): 24 x SelfAttentionBlock(
      (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (attn): SelfAttention(
        (qkv): LinearKMaskedBias(in_features=1024, out_features=3072, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
      (ls2): LayerScale()
    )
  )
  (norm)

In [None]:
PATCH_SIZE = 16
IMAGE_SIZE = 256

SATELLITE_MEAN = (0.430, 0.411, 0.296)
SATELLITE_STD = (0.213, 0.156, 0.143)

MODEL_TO_NUM_LAYERS = {
    MODEL_DINOV3_VITS: 12,
    MODEL_DINOV3_VITSP: 12,
    MODEL_DINOV3_VITB: 12,
    MODEL_DINOV3_VITL: 24,
    MODEL_DINOV3_VITHP: 32,
    MODEL_DINOV3_VIT7B: 40,
}

n_layers = MODEL_TO_NUM_LAYERS[MODEL_NAME]
    
def load_image(path: str) -> Image:
    return Image.open(path).convert("RGB")

# image resize transform to dimensions divisible by patch size
def resize_transform(
    mask_image: Image,
    image_size: int = IMAGE_SIZE,
    patch_size: int = PATCH_SIZE,
) -> torch.Tensor:
    w, h = mask_image.size
    h_patches = int(image_size / patch_size)
    w_patches = int((w * image_size) / (h * patch_size))
    return TF.to_tensor(TF.resize(mask_image, (h_patches * patch_size, w_patches * patch_size)))


In [25]:
# Obter imagem aleatória do dataset
dataset_path = "/home/lades/computer_vision/wesley/dataset/daninhas_multiclasse/DATASET_CARURU/rgb/"
image_uri = f"{dataset_path}{random.choice(os.listdir(dataset_path))}"

print(f"Using image {image_uri}")

# Obtém o respectivo label
label_uri = image_uri.replace("rgb", "labels").replace(".jpg", ".png")
label = np.array(Image.open(label_uri))

print(f"Using label {label_uri}")
np.unique(label)

Using image /home/lades/computer_vision/wesley/dataset/daninhas_multiclasse/DATASET_CARURU/rgb/2025-09-17_TH134_06-05-2025_c_mask_36_34169_21541.jpg
Using label /home/lades/computer_vision/wesley/dataset/daninhas_multiclasse/DATASET_CARURU/labels/2025-09-17_TH134_06-05-2025_c_mask_36_34169_21541.png


array([  0,   1, 255], dtype=uint8)

In [38]:
def extrair_tensores(label, image_uri):
    image = load_image(image_uri)
    image_resized = resize_transform(image)
    image_resized_norm = TF.normalize(image_resized, mean=SATELLITE_MEAN, std=SATELLITE_STD)

    label_mask = (label == 1).astype(np.float32)
    label_mask = signal.convolve2d(label_mask, np.ones((PATCH_SIZE, PATCH_SIZE)), mode='valid')[::PATCH_SIZE, ::PATCH_SIZE]
    fg_score_mf = torch.from_numpy(label_mask > 0.5)

    h_patches, w_patches = image_resized_norm.shape[1] // PATCH_SIZE, image_resized_norm.shape[2] // PATCH_SIZE
    print(f"Image size: {image_resized_norm.shape}, patches: {h_patches}x{w_patches}")

    with torch.inference_mode():
        with torch.autocast(device_type='cuda', dtype=torch.float32):
            feats = model.get_intermediate_layers(image_resized_norm.unsqueeze(0).cuda(), n=range(n_layers), reshape=True, norm=True)
            x = feats[-1].squeeze().detach().cpu()
            dim = x.shape[0]
            x = x.view(dim, -1).permute(1, 0)
            
    x.shape, fg_score_mf.shape

    # Dividir x em foreground e background, considerando a label_mask
    foreground_x = x[fg_score_mf.view(-1) > 0.5]
    background_x = x[fg_score_mf.view(-1) <= 0.5]

    print(f'Foreground: {foreground_x.shape}, Background: {background_x.shape}')

    return x, foreground_x, background_x


In [20]:
def calcular_pca(label, image_uri): 
    # Cria a máscara de foreground (tudo que estiver no fundo preto que seja diferente de branco)
    label_mask = (label == 1).astype(np.float32)
    label_mask = signal.convolve2d(label_mask, np.ones((PATCH_SIZE, PATCH_SIZE)), mode='valid')[::PATCH_SIZE, ::PATCH_SIZE]
    fg_score_mf = torch.from_numpy(label_mask > 0.5)

    image = load_image(image_uri)
    image_resized = resize_transform(image)
    image_resized_norm = TF.normalize(image_resized, mean=SATELLITE_MEAN, std=SATELLITE_STD)

    h_patches, w_patches = image_resized_norm.shape[1] // PATCH_SIZE, image_resized_norm.shape[2] // PATCH_SIZE
    print(f"Image size: {image_resized_norm.shape}, patches: {h_patches}x{w_patches}")

    with torch.inference_mode():
        with torch.autocast(device_type='cuda', dtype=torch.float32):
            feats = model.get_intermediate_layers(image_resized_norm.unsqueeze(0).cuda(), n=range(n_layers), reshape=True, norm=True)
            x = feats[-1].squeeze().detach().cpu()
            dim = x.shape[0]
            x = x.view(dim, -1).permute(1, 0)
            
    fg_patches = x

    pca = PCA(n_components=3, whiten=True)
    pca.fit(fg_patches)

    # apply the PCA, and then reshape
    projected_image = torch.from_numpy(pca.transform(x.numpy())).view(h_patches, w_patches, 3)

    # multiply by 2.0 and pass through a sigmoid to get vibrant colors 
    projected_image = torch.nn.functional.sigmoid(projected_image.mul(2.0)).permute(2, 0, 1)

    # mask the background using the fg_score_mf
    projected_foreground_image = projected_image * (fg_score_mf.unsqueeze(0) > 0.5)
    
    projected_background_image = projected_image * (fg_score_mf.unsqueeze(0) <= 0.5)

    return projected_image, projected_foreground_image, projected_background_image

def plotar_imagens(image_uri, projected_image, projected_foreground_image, label_mask, label_uri):
    plt.figure(figsize=(20, 10))
    plt.subplot(1, 5, 1)
    plt.imshow(np.array(Image.open(image_uri)))
    plt.title("Imagem original")
    plt.subplot(1, 5, 2)
    plt.imshow(projected_image.permute(1, 2, 0))
    plt.title("PCA da imagem projetada")
    plt.subplot(1, 5, 3)
    plt.imshow(projected_foreground_image.permute(1, 2, 0))
    plt.title("PCA da imagem projetada + mask")
    plt.subplot(1, 5, 4)
    plt.imshow(np.array(Image.fromarray(label_mask)))
    plt.title("PCA da mask label (GT)")
    plt.subplot(1, 5, 5)
    plt.imshow(Image.open(label_uri))
    plt.title("Label (GT)")
    plt.show()

    # Exibir label_mask com os valores de cada bloquinho
    plt.figure(figsize=(10, 10))
    plt.imshow(label_mask)
    for i in range(label_mask.shape[0]):
        for j in range(label_mask.shape[1]):
            plt.text(j, i, f"{label_mask[i, j]:.1f}", ha='center', va='center', color='white')
    plt.show()

    # Exibir projected_image com os valores de cada bloquinho
    plt.figure(figsize=(10, 10))
    plt.imshow(projected_image.permute(1, 2, 0))
    for i in range(projected_image.shape[1]):
        for j in range(projected_image.shape[2]):
            plt.text(j, i, f"{projected_image[0, i, j]:.1f}", ha='center', va='center', color='white')
    plt.show()

### Experimento
Neste experimento, irei considerar o mask label da respectiva imagem, para separar os patches do PCA da imagem que estiverem dentro da mask label da imagem. Isso sera o foreground e o restante, background.

In [None]:
dataset_path_rgb = "/home/lades/computer_vision/wesley/dataset/daninhas_multiclasse/DATASET_CARURU/rgb/"
dataset_path_label = "/home/lades/computer_vision/wesley/dataset/daninhas_multiclasse/DATASET_CARURU/labels/" # Extensão png

lista_imagens = [f"{dataset_path_rgb}{file}" for file in os.listdir(dataset_path_rgb) if file.endswith('.jpg')]
lista_labels = [f"{dataset_path_label}{file.split('.')[0]}.png" for file in os.listdir(dataset_path_rgb) if file.endswith('.jpg')]

len(lista_imagens), len(lista_labels)

x_caruru = []
x_not_caruru = []

for image_uri, label_uri in zip(lista_imagens, lista_labels):
    label = np.array(Image.open(label_uri))
    x, foreground_x, background_x = extrair_tensores(label, image_uri)
    x_caruru.append(foreground_x)
    x_not_caruru.append(background_x)
    

Image size: torch.Size([3, 256, 256]), patches: 16x16
Foreground: torch.Size([31, 1024]), Background: torch.Size([225, 1024])
Image size: torch.Size([3, 256, 256]), patches: 16x16
Foreground: torch.Size([130, 1024]), Background: torch.Size([126, 1024])
Image size: torch.Size([3, 256, 256]), patches: 16x16
Foreground: torch.Size([63, 1024]), Background: torch.Size([193, 1024])
Image size: torch.Size([3, 256, 256]), patches: 16x16
Foreground: torch.Size([5, 1024]), Background: torch.Size([251, 1024])
Image size: torch.Size([3, 256, 256]), patches: 16x16
Foreground: torch.Size([43, 1024]), Background: torch.Size([213, 1024])
Image size: torch.Size([3, 256, 256]), patches: 16x16
Foreground: torch.Size([7, 1024]), Background: torch.Size([249, 1024])
Image size: torch.Size([3, 256, 256]), patches: 16x16
Foreground: torch.Size([84, 1024]), Background: torch.Size([172, 1024])
Image size: torch.Size([3, 256, 256]), patches: 16x16
Foreground: torch.Size([5, 1024]), Background: torch.Size([251, 1

In [48]:
all_x_caruru = torch.cat(x_caruru, dim=0)
all_x_not_caruru = torch.cat(x_not_caruru, dim=0)

all_x_caruru.shape, all_x_not_caruru.shape

(torch.Size([5205, 1024]), torch.Size([19627, 1024]))

In [49]:
# Calcular distância média entre os pontos da própria classe (mean_dist_caruru_caruru e mean_dist_caruru_not_caruru)
from sklearn.metrics import pairwise_distances

mean_dist_caruru_caruru = pairwise_distances(all_x_caruru).mean()
mean_dist_caruru_not_caruru = pairwise_distances(all_x_caruru, all_x_not_caruru).mean()

mean_dist_caruru_caruru, mean_dist_caruru_not_caruru


(5.208234, 5.623066)