# Final Project

## Environment Setup

In [8]:
import os
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import AutoFeatureExtractor, AutoModel

## Custom DogVsAi Dataset Class

In [12]:
class DogVsAiDataset(Dataset):
    """Image/label dataset read from ``<root_dir>/Images`` and ``<root_dir>/Labels``.

    Every file in the image directory whose name ends in ``.jpg`` is one sample.
    Its label is the integer stored in the text file with the same stem in the
    label directory.

    Args:
        root_dir: Directory containing the ``Images`` and ``Labels`` sub-directories.
        transform: Optional callable applied to the RGB PIL image before it is
            returned from ``__getitem__``.
    """

    IMAGE_EXT = '.jpg'
    LABEL_EXT = '.txt'

    def __init__(self, root_dir, transform=None):
        self.image_dir = os.path.join(root_dir, "Images")
        self.label_dir = os.path.join(root_dir, "Labels")
        self.transform = transform
        # Strip only the trailing extension so stems that themselves contain
        # dots (e.g. "dog.01.jpg" -> "dog.01") still map back to the right file.
        # Sorting makes sample indices stable, since os.listdir returns entries
        # in arbitrary order.
        self.image_names = sorted(
            filename[:-len(self.IMAGE_EXT)]
            for filename in os.listdir(self.image_dir)
            if filename.endswith(self.IMAGE_EXT)
        )

    def __len__(self):
        """Return the number of ``.jpg`` images found under ``Images``."""
        return len(self.image_names)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the sample at position ``idx``.

        Raises:
            ValueError: If the label file does not contain a single integer.
        """
        name = self.image_names[idx]
        image_path = os.path.join(self.image_dir, name + self.IMAGE_EXT)
        label_path = os.path.join(self.label_dir, name + self.LABEL_EXT)

        # convert() returns a new, fully loaded image, so the underlying file
        # can be closed as soon as the conversion is done.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform:
            image = self.transform(image)

        with open(label_path, 'r') as f:
            raw_label = f.read().strip()
        try:
            label = int(raw_label)
        except ValueError as exc:
            # Same exception type as before, but now naming the offending file.
            raise ValueError(
                f"Label file {label_path!r} does not contain an integer: {raw_label!r}"
            ) from exc

        return image, label

## ViT Feature Extractor Setup: Image Transformations and DINO Model Initialization

In [10]:
# Per-image preprocessing applied by the dataset: fixed 128x128 resize,
# then conversion to a tensor.
transform = transforms.Compose([transforms.Resize((128, 128)), transforms.ToTensor()])

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# One checkpoint name drives both the preprocessing config and the model weights.
dino_model_name = "facebook/dino-vits16"
feature_extractor = AutoFeatureExtractor.from_pretrained(dino_model_name)
dino_model = AutoModel.from_pretrained(dino_model_name)
dino_model.to(device)
# Switch to evaluation mode; as the cell's last expression this also displays
# the model structure.
dino_model.eval()

preprocessor_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at facebook/dino-vits16 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViTLayer(
        (attention): ViTAttention(
          (attention): ViTSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=384, out_features=1536, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): ViTOutput(
          (d

model.safetensors:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

## Feature Extraction Pipeline and Dataset Processing

In [16]:
# ======================
# Feature extraction (pretrained DINO embeddings)
# ======================
def extract_and_save_features(dataloader, name_prefix):
    """Embed every batch of ``dataloader`` with the DINO model and save the result.

    Each batch of image tensors is converted back to PIL images, preprocessed
    by the module-level ``feature_extractor`` and passed through the
    module-level ``dino_model`` without gradient tracking. One vector per image
    is obtained by averaging ``last_hidden_state`` over dimension 1. All
    features and labels are concatenated and saved together as a tuple to
    ``features_<name_prefix>.pt`` in the current working directory.

    Args:
        dataloader: Iterable yielding ``(images, labels)`` batches, where
            ``images`` can be iterated image by image and ``labels`` is a tensor.
        name_prefix: Tag used in the progress-bar description and in the
            output file name.

    Raises:
        ValueError: If ``dataloader`` yields no batches, so there is nothing to save.
    """
    # Build the tensor -> PIL converter once instead of once per image.
    to_pil = transforms.ToPILImage()
    feature_batches = []
    label_batches = []

    for images, labels in tqdm(dataloader, desc=f"Extracting {name_prefix}"):
        pil_images = [to_pil(img) for img in images]
        # NOTE(review): the former ``padding=True`` argument was dropped; it is a
        # tokenizer-style option and the image processor is not expected to use it.
        inputs = feature_extractor(images=pil_images, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = dino_model(**inputs)
            # Average over dimension 1 to get a single vector per image.
            features = outputs.last_hidden_state.mean(dim=1)

        # Move results off the compute device so they accumulate in host memory.
        feature_batches.append(features.cpu())
        label_batches.append(labels)

    if not feature_batches:
        # torch.cat on an empty list fails with an unhelpful message; fail
        # early and say which split was empty.
        raise ValueError(
            f"No batches to extract for '{name_prefix}': the dataloader is empty."
        )

    all_features = torch.cat(feature_batches)
    all_labels = torch.cat(label_batches)

    save_path = f"features_{name_prefix}.pt"
    torch.save((all_features, all_labels), save_path)
    print(f"Saved: {save_path} | Shape: {all_features.shape}")

# ======================
# Process dataset
# ======================
def process_dataset(base_path, suffix, batch_size=16):
    """Extract and save features for the Train, Valid and Test splits of one dataset.

    For each split, a ``DogVsAiDataset`` is built from ``<base_path>/<split>``
    with the module-level ``transform``, wrapped in a ``DataLoader`` and passed
    to ``extract_and_save_features`` with the tag ``<split>_<suffix>``
    (``train_``, ``val_`` and ``test_`` respectively).

    Args:
        base_path: Directory containing the ``Train``, ``Valid`` and ``Test``
            sub-directories.
        suffix: Dataset tag appended to each split name in the output file
            names and printed (upper-cased) in the banner.
        batch_size: Number of images per batch for all three loaders.
            Defaults to 16, the value previously hard-coded.
    """
    print(f"\n=== Processing dataset: {suffix.upper()} ===")

    # (sub-directory name, tag used in output file names, shuffle flag)
    # NOTE(review): the train split is shuffled without a fixed seed, so the row
    # order of the saved train features differs between runs - confirm this is
    # intended; it is kept as-is here to avoid changing the saved files.
    splits = (
        ("Train", "train", True),
        ("Valid", "val", False),
        ("Test", "test", False),
    )

    # Build every dataset and loader up front so a missing or broken split
    # fails before any (slow) extraction has started.
    datasets = {
        tag: DogVsAiDataset(os.path.join(base_path, folder), transform=transform)
        for folder, tag, _ in splits
    }
    loaders = {
        tag: DataLoader(datasets[tag], batch_size=batch_size, shuffle=shuffle)
        for _, tag, shuffle in splits
    }

    print(
        f"Train: {len(datasets['train'])} | "
        f"Val: {len(datasets['val'])} | "
        f"Test: {len(datasets['test'])}"
    )

    for _, tag, _ in splits:
        extract_and_save_features(loaders[tag], f"{tag}_{suffix}")

# ======================
# Get features
# ======================
# Run the extraction for both dataset variants, full set first.
for dataset_root, dataset_tag in (
    ("archive/Dogs Vs AiDogs", "full"),
    ("archive/Dogs Vs AiDogs_CUTTED", "cutted"),
):
    process_dataset(dataset_root, dataset_tag)


=== Processing dataset: FULL ===
Train: 18605 | Val: 5317 | Test: 2658


  return self.preprocess(images, **kwargs)
Extracting train_full: 100%|██████████| 1163/1163 [26:14<00:00,  1.35s/it]


Saved: features_train_full.pt | Shape: torch.Size([18605, 384])


Extracting val_full: 100%|██████████| 333/333 [09:19<00:00,  1.68s/it]


Saved: features_val_full.pt | Shape: torch.Size([5317, 384])


Extracting test_full: 100%|██████████| 167/167 [04:06<00:00,  1.48s/it]


Saved: features_test_full.pt | Shape: torch.Size([2658, 384])

=== Processing dataset: CUTTED ===
Train: 344 | Val: 100 | Test: 50


Extracting train_cutted: 100%|██████████| 22/22 [00:24<00:00,  1.11s/it]


Saved: features_train_cutted.pt | Shape: torch.Size([344, 384])


Extracting val_cutted: 100%|██████████| 7/7 [00:07<00:00,  1.02s/it]


Saved: features_val_cutted.pt | Shape: torch.Size([100, 384])


Extracting test_cutted: 100%|██████████| 4/4 [00:03<00:00,  1.10it/s]

Saved: features_test_cutted.pt | Shape: torch.Size([50, 384])



