# Computing the PCA of a Foreground Object

### Setup

Let's start by loading some prerequisites and setting the DINOv3 repository location. The model is always loaded from a local checkout of the DINOv3 repository (`source="local"`):
- if the `DINOV3_LOCATION` environment variable is set, its value is used as the path to the checkout;
- otherwise the default path defined in the configuration cell below is used.

In [2]:
import pickle
import os
import urllib

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

import torch
import torchvision.transforms.functional as TF
from sklearn.decomposition import PCA
from scipy import signal

import random
import tqdm # Progress bar
from sklearn.metrics import pairwise_distances # Isso aqui é para calcular a distancia média entre os patches

In [3]:

# Default location of the DINOv3 repository. Despite the name, this is a local
# checkout: the model is loaded below with source="local".
DINOV3_GITHUB_LOCATION = "/home/lades/computer_vision/wesley/dino-soja/dinov3"

# The DINOV3_LOCATION environment variable, when set, overrides the default checkout path.
DINOV3_LOCATION = os.getenv("DINOV3_LOCATION", DINOV3_GITHUB_LOCATION)

print(f"DINOv3 location set to {DINOV3_LOCATION}")

# examples of available DINOv3 models:
MODEL_DINOV3_VITS = "dinov3_vits16"
MODEL_DINOV3_VITSP = "dinov3_vits16plus"
MODEL_DINOV3_VITB = "dinov3_vitb16"
MODEL_DINOV3_VITL = "dinov3_vitl16"
MODEL_DINOV3_VITHP = "dinov3_vith16plus"
MODEL_DINOV3_VIT7B = "dinov3_vit7b16"

MODEL_NAME = MODEL_DINOV3_VITL

# Checkpoint passed to the hub entry point as `weights`. The default file name
# (dinov3_vitl16_pretrain_sat493m) is specific to the ViT-L/16 model, so changing
# MODEL_NAME requires pointing DINOV3_WEIGHTS at a matching checkpoint.
# SECURITY NOTE(review): the default below is a pre-signed download URL (it embeds
# Policy / Signature / Key-Pair-Id query parameters). Such URLs are personal and
# appear to be time-limited; avoid committing them. Prefer setting the
# DINOV3_WEIGHTS environment variable instead — presumably a local checkpoint
# path is accepted as well as a URL; confirm against the DINOv3 README.
DINOV3_WEIGHTS_DEFAULT = "https://dinov3.llamameta.net/dinov3_vitl16/dinov3_vitl16_pretrain_sat493m-eadcf0ff.pth?Policy=eyJTdGF0ZW1lbnQiOlt7InVuaXF1ZV9oYXNoIjoiamVkbGhoemY3bnlpYmJ1NnVhdmJ2NGtrIiwiUmVzb3VyY2UiOiJodHRwczpcL1wvZGlub3YzLmxsYW1hbWV0YS5uZXRcLyoiLCJDb25kaXRpb24iOnsiRGF0ZUxlc3NUaGFuIjp7IkFXUzpFcG9jaFRpbWUiOjE3NTcxNzAwMzh9fX1dfQ__&Signature=YktyLH8aRk9MBqJe9X6LfVwzezdJo51sVHwKx29n7BTo88T8QR4HHDj9kCyi2wVyUsI4qDVXoJWfHDdbbkxiEPqXukQUDVtWq25HG5ODC8JP-%7ENDbRUPQYkPUHER6ssK9WK5K4Nva6EdiBtIKqMm9G3RMh1uFA86wGz4X4FvgDiaRh4aCWsM5jLg3Gsvr1QIFoPpgAO%7EOnYYG-wxj1VRnrY32Wm6OoNw61M96kgfPXNxEFwYphBu2ImJykrvgc0Yea1J2jV3FwECfDmdVftQ3okThpbxwPJ-wsAxDXc2z2-bJaPQowafr8afLKO2hFl57iMiDQELW2DDS9GYgvpw7Q__&Key-Pair-Id=K15QRJLYKIFSLZ&Download-Request-ID=783982447506725"
DINOV3_WEIGHTS = os.getenv("DINOV3_WEIGHTS", DINOV3_WEIGHTS_DEFAULT)

print(f"Loading DINOv3 model {MODEL_NAME}...")

model = torch.hub.load(
    repo_or_dir=DINOV3_LOCATION,
    model=MODEL_NAME,
    source="local",
    weights=DINOV3_WEIGHTS,
)
# Move the model to the GPU; as the last expression of the cell, the module repr is displayed.
model.cuda()

DINOv3 location set to /home/lades/computer_vision/wesley/dino-soja/dinov3
Loading DINOv3 model dinov3_vitl16...


DinoVisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (rope_embed): RopePositionEmbedding()
  (blocks): ModuleList(
    (0-23): 24 x SelfAttentionBlock(
      (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (attn): SelfAttention(
        (qkv): LinearKMaskedBias(in_features=1024, out_features=3072, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
      (ls2): LayerScale()
    )
  )
  (norm)

In [4]:
# Side length (pixels) of one patch, and target image height used by resize_transform.
PATCH_SIZE = 16
IMAGE_SIZE = 256

# Per-channel mean / std handed to TF.normalize before the forward pass.
SATELLITE_MEAN = (0.430, 0.411, 0.296)
SATELLITE_STD = (0.213, 0.156, 0.143)

# Number of transformer blocks for each supported model name.
MODEL_TO_NUM_LAYERS = dict(
    [
        (MODEL_DINOV3_VITS, 12),
        (MODEL_DINOV3_VITSP, 12),
        (MODEL_DINOV3_VITB, 12),
        (MODEL_DINOV3_VITL, 24),
        (MODEL_DINOV3_VITHP, 32),
        (MODEL_DINOV3_VIT7B, 40),
    ]
)

# Depth of the model selected in the configuration cell.
n_layers = MODEL_TO_NUM_LAYERS[MODEL_NAME]
    
def load_image(path: str) -> Image.Image:
    """Open the image file at ``path`` and return it converted to RGB mode.

    The source file is opened in a ``with`` block so its handle is released as
    soon as the converted copy has been produced.
    """
    with Image.open(path) as source_image:
        return source_image.convert("RGB")

# image resize transform to dimensions divisible by patch size
def resize_transform(
    mask_image: Image.Image,
    image_size: int = IMAGE_SIZE,
    patch_size: int = PATCH_SIZE,
) -> torch.Tensor:
    """Resize an image so both sides are multiples of ``patch_size`` and return it as a tensor.

    The output height is ``image_size`` rounded down to a whole number of
    patches; the output width follows the input aspect ratio, also rounded down
    to a whole number of patches (but never fewer than one patch).

    Args:
        mask_image: PIL image to resize.
        image_size: Target height in pixels.
        patch_size: Patch side length in pixels.

    Returns:
        The resized image converted with ``TF.to_tensor``.
    """
    w, h = mask_image.size
    # int(...) keeps the resize dimensions integral even if a caller passes float sizes.
    h_patches = int(image_size // patch_size)
    # Guard against a zero-width output for images much taller than they are wide.
    w_patches = max(1, int((w * image_size) // (h * patch_size)))
    return TF.to_tensor(TF.resize(mask_image, (h_patches * patch_size, w_patches * patch_size)))


In [5]:
# Pick a random image from the dataset (only .jpg files are candidates)
dataset_path = "/home/lades/computer_vision/wesley/dataset/daninhas_multiclasse/DATASET_CARURU/rgb/"
image_files = sorted(file for file in os.listdir(dataset_path) if file.endswith(".jpg"))
image_uri = f"{dataset_path}{random.choice(image_files)}"

print(f"Using image {image_uri}")

# Locate the matching label: same file stem with a ".png" extension, inside the
# sibling "labels" directory. The path is built from its components so that an
# "rgb" or ".jpg" substring elsewhere in the path or file name is never rewritten.
labels_dir = os.path.join(os.path.dirname(os.path.normpath(dataset_path)), "labels")
image_stem = os.path.splitext(os.path.basename(image_uri))[0]
label_uri = os.path.join(labels_dir, f"{image_stem}.png")
with Image.open(label_uri) as label_image:
    label = np.array(label_image)

print(f"Using label {label_uri}")
# Distinct pixel values present in the label image
np.unique(label)

Using image /home/lades/computer_vision/wesley/dataset/daninhas_multiclasse/DATASET_CARURU/rgb/2025-09-17_TH134_06-05-2025_c_mask_14_8807_24073.jpg
Using label /home/lades/computer_vision/wesley/dataset/daninhas_multiclasse/DATASET_CARURU/labels/2025-09-17_TH134_06-05-2025_c_mask_14_8807_24073.png


array([  0,   1, 255], dtype=uint8)

In [6]:
def _patch_foreground_mask(label, patch_size):
    """Return a boolean (rows, cols) tensor marking patches that contain class-1 pixels.

    ``label`` is split into non-overlapping ``patch_size`` x ``patch_size``
    blocks (trailing rows/columns that do not fill a block are dropped) and the
    number of pixels equal to 1 is counted per block. A block is foreground
    when that count is > 0.5, i.e. when it holds at least one such pixel.
    """
    pixel_mask = (label == 1).astype(np.float32)
    rows = pixel_mask.shape[0] // patch_size
    cols = pixel_mask.shape[1] // patch_size
    # Block sums via reshape: same values as convolving with a ones kernel in
    # 'valid' mode and keeping every patch_size-th sample, without evaluating
    # the convolution at every pixel offset.
    block_counts = (
        pixel_mask[: rows * patch_size, : cols * patch_size]
        .reshape(rows, patch_size, cols, patch_size)
        .sum(axis=(1, 3))
    )
    # NOTE(review): the 0.5 threshold is applied to a pixel count, not a fraction
    # of the patch area — confirm that "any foreground pixel" is the intent.
    return torch.from_numpy(block_counts > 0.5)


def _extract_patch_features(image_uri):
    """Load, resize and normalize the image, and return one feature row per patch.

    Returns a CPU tensor of shape (n_patches, dim) taken from the last entry
    returned by ``model.get_intermediate_layers``.
    """
    image_resized_norm = TF.normalize(
        resize_transform(load_image(image_uri)), mean=SATELLITE_MEAN, std=SATELLITE_STD
    )

    with torch.inference_mode():
        with torch.autocast(device_type='cuda', dtype=torch.float32):
            feats = model.get_intermediate_layers(image_resized_norm.unsqueeze(0).cuda(), n=range(n_layers), reshape=True, norm=True)
            # Drop only the batch dimension so a grid with a single row or column keeps its shape.
            x = feats[-1].squeeze(0).detach().cpu()
            dim = x.shape[0]
            # (dim, h, w) -> (h * w, dim): one row per patch
            x = x.view(dim, -1).permute(1, 0)
    return x


def extrair_tensores(label, image_uri):
    """Extract patch features of an image and split them by the label mask.

    Args:
        label: 2-D array of per-pixel class ids; pixels equal to 1 are foreground.
        image_uri: Path of the RGB image to encode.

    Returns:
        Tuple ``(x, foreground_x, background_x)``: all patch features, the rows
        of ``x`` whose patch is foreground, and the remaining rows.

    Raises:
        ValueError: If the number of label patches differs from the number of
            feature patches (the label grid is computed at the label's own
            resolution, while the image is resized before encoding).
    """
    x = _extract_patch_features(image_uri)
    fg_flat = _patch_foreground_mask(label, PATCH_SIZE).reshape(-1)

    if fg_flat.shape[0] != x.shape[0]:
        raise ValueError(
            f"Label yields {fg_flat.shape[0]} patches but the image yields {x.shape[0]} "
            f"for {image_uri}; the label resolution must match the resized image."
        )

    # Split x into foreground and background according to the patch mask
    foreground_x = x[fg_flat]
    background_x = x[~fg_flat]

    return x, foreground_x, background_x


In [7]:
def calcular_pca(label, image_uri):
    """Project the patch features of an image onto 3 PCA components for display.

    Args:
        label: 2-D array of per-pixel class ids; pixels equal to 1 are foreground.
        image_uri: Path of the RGB image to encode.

    Returns:
        Tuple of three tensors shaped (3, h_patches, w_patches):
        the PCA projection squashed to (0, 1), the same projection with
        background patches zeroed, and the projection with foreground patches
        zeroed.

    Raises:
        ValueError: If the label patch grid does not match the feature patch grid.
    """
    # Foreground patch mask: a patch is foreground when it holds at least one
    # pixel equal to 1. Non-overlapping block sums give the per-patch pixel
    # count (same values as a strided 'valid' convolution with a ones kernel).
    pixel_mask = (label == 1).astype(np.float32)
    rows, cols = pixel_mask.shape[0] // PATCH_SIZE, pixel_mask.shape[1] // PATCH_SIZE
    label_mask = (
        pixel_mask[: rows * PATCH_SIZE, : cols * PATCH_SIZE]
        .reshape(rows, PATCH_SIZE, cols, PATCH_SIZE)
        .sum(axis=(1, 3))
    )
    fg_score_mf = torch.from_numpy(label_mask > 0.5)

    image = load_image(image_uri)
    image_resized = resize_transform(image)
    image_resized_norm = TF.normalize(image_resized, mean=SATELLITE_MEAN, std=SATELLITE_STD)

    h_patches, w_patches = image_resized_norm.shape[1] // PATCH_SIZE, image_resized_norm.shape[2] // PATCH_SIZE
    print(f"Image size: {image_resized_norm.shape}, patches: {h_patches}x{w_patches}")

    if tuple(fg_score_mf.shape) != (h_patches, w_patches):
        raise ValueError(
            f"Label patch grid {tuple(fg_score_mf.shape)} does not match the image patch grid "
            f"({h_patches}, {w_patches}) for {image_uri}."
        )

    with torch.inference_mode():
        with torch.autocast(device_type='cuda', dtype=torch.float32):
            feats = model.get_intermediate_layers(image_resized_norm.unsqueeze(0).cuda(), n=range(n_layers), reshape=True, norm=True)
            # Drop only the batch dimension so a grid with a single row or column keeps its shape.
            x = feats[-1].squeeze(0).detach().cpu()
            dim = x.shape[0]
            # (dim, h, w) -> (h * w, dim): one row per patch
            x = x.view(dim, -1).permute(1, 0)

    # NOTE(review): the PCA is fitted on ALL patches of the image, not only on
    # the foreground ones — confirm this is intended.
    patch_features = x.numpy()
    pca = PCA(n_components=3, whiten=True)
    pca.fit(patch_features)

    # apply the PCA, and then reshape to the patch grid
    projected_image = torch.from_numpy(pca.transform(patch_features)).view(h_patches, w_patches, 3)

    # multiply by 2.0 and pass through a sigmoid to get vibrant colors
    projected_image = torch.sigmoid(projected_image.mul(2.0)).permute(2, 0, 1)

    # fg_score_mf is already boolean: use it (and its negation) to zero out the other region
    fg_mask = fg_score_mf.unsqueeze(0)
    projected_foreground_image = projected_image * fg_mask
    projected_background_image = projected_image * ~fg_mask

    return projected_image, projected_foreground_image, projected_background_image

def plotar_imagens(image_uri, projected_image, projected_foreground_image, label_mask, label_uri):
    """Show the image, its PCA projections and its labels, then two annotated grids.

    Three figures are produced: a row of five panels (original image, PCA
    projection, masked PCA projection, patch-level label mask, label image),
    the patch-level label mask with each cell's value printed on it, and the
    PCA projection with the first channel's value printed on each cell.
    """
    # (content, title) of each panel of the overview row, left to right
    panels = [
        (np.array(Image.open(image_uri)), "Imagem original"),
        (projected_image.permute(1, 2, 0), "PCA da imagem projetada"),
        (projected_foreground_image.permute(1, 2, 0), "PCA da imagem projetada + mask"),
        (np.array(Image.fromarray(label_mask)), "PCA da mask label (GT)"),
        (Image.open(label_uri), "Label (GT)"),
    ]
    _, overview_axes = plt.subplots(1, 5, figsize=(20, 10))
    for panel_ax, (content, title) in zip(overview_axes, panels):
        panel_ax.imshow(content)
        panel_ax.set_title(title)
    plt.show()

    # label_mask with the value of every cell written on top of it
    _, mask_ax = plt.subplots(figsize=(10, 10))
    mask_ax.imshow(label_mask)
    for row, col in np.ndindex(label_mask.shape[0], label_mask.shape[1]):
        mask_ax.text(col, row, f"{label_mask[row, col]:.1f}", ha='center', va='center', color='white')
    plt.show()

    # projected_image with the first channel's value written on every cell
    _, pca_ax = plt.subplots(figsize=(10, 10))
    pca_ax.imshow(projected_image.permute(1, 2, 0))
    for row, col in np.ndindex(projected_image.shape[1], projected_image.shape[2]):
        pca_ax.text(col, row, f"{projected_image[0, row, col]:.1f}", ha='center', va='center', color='white')
    plt.show()

### Experimento
Neste experimento, a máscara de rótulo (mask label) de cada imagem é usada para separar os patches da imagem: os patches que contêm pixels da máscara formam o foreground, e todos os demais formam o background.

In [8]:
def agrupar_tensores(dataset_path): # dataset_path must be the dataset directory (e.g. .../DATASET_CARURU/)
    """Extract and concatenate the patch features of every labelled image of a dataset.

    Images are read from ``<dataset_path>/rgb/*.jpg`` and labels from
    ``<dataset_path>/labels/*.png``. Each image is paired with the label that
    has the same file stem, so a missing file cannot shift the pairing of the
    remaining ones; images without a label are reported and skipped.

    Returns:
        Tuple ``(all_foreground, all_background)`` with the foreground and
        background patch features of all processed images concatenated along
        dimension 0.

    Raises:
        ValueError: If no image has a matching label.
    """
    dataset_path_rgb = f"{dataset_path}/rgb/"
    dataset_path_label = f"{dataset_path}/labels/" # .png extension

    lista_imagens = sorted(f"{dataset_path_rgb}{file}" for file in os.listdir(dataset_path_rgb) if file.endswith('.jpg'))
    lista_labels = sorted(f"{dataset_path_label}{file}" for file in os.listdir(dataset_path_label) if file.endswith('.png'))

    print(f"Foram encontrados {len(lista_imagens)} imagens e {len(lista_labels)} labels.")

    # Pair by file stem instead of by position in the two sorted lists.
    labels_por_nome = {os.path.splitext(os.path.basename(path))[0]: path for path in lista_labels}
    pares = []
    imagens_sem_label = []
    for image_uri in lista_imagens:
        nome = os.path.splitext(os.path.basename(image_uri))[0]
        label_uri = labels_por_nome.get(nome)
        if label_uri is None:
            imagens_sem_label.append(image_uri)
        else:
            pares.append((image_uri, label_uri))

    if imagens_sem_label:
        print(f"Aviso: {len(imagens_sem_label)} imagens sem label correspondente foram ignoradas.")
    if not pares:
        raise ValueError(f"Nenhum par imagem/label encontrado em {dataset_path}")

    x_foreground = []
    x_background = []

    # Iterate over the pairs with a tqdm progress bar
    for image_uri, label_uri in tqdm.tqdm(pares, total=len(pares)):
        with Image.open(label_uri) as label_image:
            label = np.array(label_image)
        _, foreground, background = extrair_tensores(label, image_uri)
        x_foreground.append(foreground)
        x_background.append(background)

    all_foreground = torch.cat(x_foreground, dim=0)
    all_background = torch.cat(x_background, dim=0)

    print(f'Total Foreground: {all_foreground.shape}, Total Background: {all_background.shape}')

    return all_foreground, all_background

In [9]:
dataset_path = "/home/lades/computer_vision/wesley/dataset/daninhas_multiclasse/"

# Class directories are the sub-directories whose name starts with this prefix.
CLASS_DIR_PREFIX = "DATASET_"

dir_classes = [
    d
    for d in os.listdir(dataset_path)
    if d.startswith(CLASS_DIR_PREFIX) and os.path.isdir(os.path.join(dataset_path, d))
]

classes = []
for dir_class in dir_classes:
    # Strip only the leading prefix, so a later "DATASET_" inside the name is kept.
    class_name = dir_class[len(CLASS_DIR_PREFIX):]
    # Nickname: the initials of the words for multi-word names, otherwise the
    # first three letters. Empty words (doubled or trailing "_") are ignored.
    words = [word for word in class_name.split("_") if word]
    if len(words) > 1:
        nickname = "".join(word[0] for word in words).upper()
    else:
        nickname = class_name[:3].upper()
    classes.append((class_name, dir_class, nickname))

classes.sort()

# List of (class_name, directory_name, nickname) tuples, sorted by class name
classes

[('CARURU', 'DATASET_CARURU', 'CAR'),
 ('GRAMINEA_PORTE_ALTO', 'DATASET_GRAMINEA_PORTE_ALTO', 'GPA'),
 ('GRAMINEA_PORTE_BAIXO', 'DATASET_GRAMINEA_PORTE_BAIXO', 'GPB'),
 ('MAMONA', 'DATASET_MAMONA', 'MAM'),
 ('OUTRAS_FOLHAS_LARGAS', 'DATASET_OUTRAS_FOLHAS_LARGAS', 'OFL'),
 ('TREPADEIRA', 'DATASET_TREPADEIRA', 'TRE')]

In [None]:
def calcular_distancias_e_desvio_padrao(all_x_foreground, all_x_background, 
                                 max_samples_fg=5000, max_samples_bg=10000):
    """
    Compute mean and standard deviation of pairwise distances on sampled patches.

    At most ``max_samples_fg`` foreground rows and ``max_samples_bg`` background
    rows are drawn at random (without replacement). Distances are then computed
    foreground-vs-foreground ("intra") and foreground-vs-background ("inter").

    The distance matrices are never materialised in full: they are produced in
    chunks and reduced to a running sum and sum of squares, so each matrix is
    computed once and peak memory is bounded by the chunk size rather than by
    n_fg * n_bg.

    Args:
        all_x_foreground: (n_fg, dim) tensor of foreground patch features.
        all_x_background: (n_bg, dim) tensor of background patch features.
        max_samples_fg: Maximum number of foreground rows to use.
        max_samples_bg: Maximum number of background rows to use.

    Returns:
        Tuple ``(mean_dist_intra, mean_dist_inter, std_intra, std_inter)`` of floats.
        The standard deviations are population values (ddof=0).
    """
    # Local import: the notebook's import cell only brings in pairwise_distances.
    from sklearn.metrics import pairwise_distances_chunked

    def _amostrar(patches, max_samples):
        # Random subset of rows when there are more than max_samples of them.
        if patches.shape[0] > max_samples:
            return patches[torch.randperm(patches.shape[0])[:max_samples]]
        return patches

    def _media_e_desvio(a, b=None):
        # Mean / std of all entries of the distance matrix, accumulated chunk by chunk in float64.
        total, total_sq, count = 0.0, 0.0, 0
        for chunk in pairwise_distances_chunked(a, b, working_memory=256):
            chunk = chunk.astype(np.float64, copy=False)
            total += float(chunk.sum())
            total_sq += float(np.square(chunk).sum())
            count += chunk.size
        mean = total / count
        # Clamp tiny negative values caused by rounding in E[d^2] - E[d]^2.
        variance = max(total_sq / count - mean * mean, 0.0)
        return mean, variance ** 0.5

    print(f"Original - FG: {all_x_foreground.shape[0]:,}, BG: {all_x_background.shape[0]:,}")

    # Foreground is sampled before background, as each draw consumes random state.
    fg_sample = _amostrar(all_x_foreground, max_samples_fg)
    bg_sample = _amostrar(all_x_background, max_samples_bg)

    print(f"Calculando distâncias: {fg_sample.shape[0]:,} x {fg_sample.shape[0]:,} (intra)")
    print(f"Calculando distâncias: {fg_sample.shape[0]:,} x {bg_sample.shape[0]:,} (inter)")

    fg_array = np.asarray(fg_sample)
    bg_array = np.asarray(bg_sample)

    # NOTE(review): the intra statistics include the zero self-distances on the
    # diagonal and count every pair twice, exactly as a full pairwise matrix does.
    mean_dist_intra, std_intra = _media_e_desvio(fg_array)
    mean_dist_inter, std_inter = _media_e_desvio(fg_array, bg_array)

    return mean_dist_intra, mean_dist_inter, std_intra, std_inter

: 

In [None]:
# Iterate over the classes and collect one entry per class in `results`: the
# concatenated foreground / background patch features plus the mean and standard
# deviation of the class-vs-class and class-vs-not-class distances.

results = []

for i, (class_name, dir_class, nickname) in enumerate(classes):
    print(f"Processando classe {class_name} – {nickname}...")
    all_x_foreground, all_x_background = agrupar_tensores(f"{dataset_path}{dir_class}")

    # Distances are computed on a random half of each patch set.
    (
        mean_dist_classe_classe,
        mean_dist_classe_not_classe,
        std_classe_classe,
        std_classe_not_classe,
    ) = calcular_distancias_e_desvio_padrao(
        all_x_foreground,
        all_x_background,
        max_samples_fg=int(0.5*all_x_foreground.shape[0]),
        max_samples_bg=int(0.5*all_x_background.shape[0]),
    )

    class_entry = {}
    class_entry["class"] = nickname
    class_entry["all_foreground"] = all_x_foreground
    class_entry["all_background"] = all_x_background
    class_entry["mean_dist_classe_classe"] = mean_dist_classe_classe
    class_entry["mean_dist_classe_not_classe"] = mean_dist_classe_not_classe
    class_entry["std_classe_classe"] = std_classe_classe
    class_entry["std_classe_not_classe"] = std_classe_not_classe
    results.append(class_entry)

    print(f"Distância média {nickname}_{nickname}: {mean_dist_classe_classe}, {nickname}_not_{nickname}: {mean_dist_classe_not_classe}")
    print(f"Desvio padrão {nickname}_{nickname}: {std_classe_classe}, {nickname}_not_{nickname}: {std_classe_not_classe}")
    

Processando classe CARURU – CAR...
Foram encontrados 97 imagens e 97 labels.


100%|██████████| 97/97 [00:13<00:00,  6.93it/s]


Total Foreground: torch.Size([5205, 1024]), Total Background: torch.Size([19627, 1024])
Original - FG: 5,205, BG: 19,627
Calculando distâncias: 2,602 x 2,602 (intra)
Calculando distâncias: 2,602 x 9,813 (inter)
Distância média CAR_CAR: 5.203938007354736, CAR_not_CAR: 5.625357151031494
Desvio padrão CAR_CAR: 0.6614007949829102, CAR_not_CAR: 0.6321923136711121
Processando classe GRAMINEA_PORTE_ALTO – GPA...
Foram encontrados 2726 imagens e 2726 labels.


100%|██████████| 2726/2726 [02:37<00:00, 17.26it/s]


Total Foreground: torch.Size([27369, 1024]), Total Background: torch.Size([670487, 1024])
Original - FG: 27,369, BG: 670,487
Calculando distâncias: 13,684 x 13,684 (intra)
Calculando distâncias: 13,684 x 335,243 (inter)
Distância média GPA_GPA: 6.252010822296143, GPA_not_GPA: 6.465545654296875
Desvio padrão GPA_GPA: 1.2339450120925903, GPA_not_GPA: 1.2388334274291992
Processando classe GRAMINEA_PORTE_BAIXO – GPB...
Foram encontrados 1747 imagens e 1747 labels.


100%|██████████| 1747/1747 [01:44<00:00, 16.72it/s]


Total Foreground: torch.Size([15705, 1024]), Total Background: torch.Size([431527, 1024])
Original - FG: 15,705, BG: 431,527
Calculando distâncias: 7,852 x 7,852 (intra)
Calculando distâncias: 7,852 x 215,763 (inter)
Distância média GPB_GPB: 5.868480682373047, GPB_not_GPB: 6.166003227233887
Desvio padrão GPB_GPB: 0.6617034673690796, GPB_not_GPB: 0.5884485244750977
Processando classe MAMONA – MAM...
Foram encontrados 1103 imagens e 1103 labels.


100%|██████████| 1103/1103 [01:18<00:00, 14.01it/s]


Total Foreground: torch.Size([10524, 1024]), Total Background: torch.Size([271844, 1024])
Original - FG: 10,524, BG: 271,844
Calculando distâncias: 5,262 x 5,262 (intra)
Calculando distâncias: 5,262 x 135,922 (inter)
Distância média MAM_MAM: 7.132387638092041, MAM_not_MAM: 7.052482604980469
Desvio padrão MAM_MAM: 4.106703758239746, MAM_not_MAM: 3.2247474193573
Processando classe OUTRAS_FOLHAS_LARGAS – OFL...
Foram encontrados 2999 imagens e 2999 labels.


100%|██████████| 2999/2999 [03:38<00:00, 13.74it/s]


Total Foreground: torch.Size([28817, 1024]), Total Background: torch.Size([738927, 1024])
Original - FG: 28,817, BG: 738,927
Calculando distâncias: 14,408 x 14,408 (intra)
Calculando distâncias: 14,408 x 369,463 (inter)


In [None]:
# Show only the per-class statistics: the "all_foreground" / "all_background"
# entries hold the full patch-feature tensors and would flood the cell output.
[{key: value for key, value in entry.items() if not key.startswith("all_")} for entry in results]

NameError: name 'results' is not defined

### Experimento 2: o foreground será a máscara das imagens da classe e o background será a máscara das imagens das outras classes