In [1]:
# Note: The model and training settings do not follow the reference settings
# from the paper. The settings are chosen such that the example can easily be
# run on a small dataset with a single GPU.
import os
import pytorch_lightning as pl
import torch
import torchvision
from torch import nn

from lightly.loss import VICRegLLoss

## The global projection head is the same as the Barlow Twins one
from lightly.models.modules import BarlowTwinsProjectionHead
from lightly.models.modules.heads import VicRegLLocalProjectionHead
from lightly.transforms.vicregl_transform import VICRegLTransform
from lightly.transforms import utils
from lightly.data import LightlyDataset


from pytorch_lightning.loggers.neptune import NeptuneLogger

from pytorch_lightning.callbacks import LearningRateMonitor

In [2]:
convnext = torchvision.models.convnext_small()
convnext.classifier = nn.Identity()
backbone = convnext.features

In [3]:
img = torch.rand((1, 3, 224, 224))
backbone(img).shape

torch.Size([1, 768, 7, 7])

In [3]:
class VICRegL(pl.LightningModule):
    def __init__(self):
        super().__init__()
        #resnet = torchvision.models.resnet18()
        #self.backbone = nn.Sequential(*list(resnet.children())[:-2])
        #out_dim = 512 # resnet18
        
        self.backbone = torchvision.models.convnext_small().features
        out_dim = 768 # convnext
        
        self.projection_head = BarlowTwinsProjectionHead(out_dim, 2048, 2048)
        self.local_projection_head = VicRegLLocalProjectionHead(out_dim, 128, 128)
        self.average_pool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.criterion = VICRegLLoss()

    def forward(self, x):
        x = self.backbone(x)
        y = self.average_pool(x).flatten(start_dim=1)
        z = self.projection_head(y)
        y_local = x.permute(0, 2, 3, 1)  # (B, D, W, H) to (B, W, H, D)
        z_local = self.local_projection_head(y_local)
        return z, z_local

    def training_step(self, batch, batch_index):
        views_and_grids = batch[0]
        views = views_and_grids[: len(views_and_grids) // 2]
        grids = views_and_grids[len(views_and_grids) // 2 :]
        features = [self.forward(view) for view in views]
        loss = self.criterion(
            global_view_features=features[:2],
            global_view_grids=grids[:2],
            local_view_features=features[2:],
            local_view_grids=grids[2:],
        )
        
        self.log('train/batch/loss', loss.detach().cpu())
        
        return loss
    
    def configure_optimizers(self):
        
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-4, weight_decay=5e-5)
        
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=self.trainer.max_epochs, eta_min=1e-6)
        
        return {"optimizer": optimizer, "lr_scheduler": scheduler}


model = VICRegL()

transform = VICRegLTransform()

In [4]:
import pickle

if os.path.exists("dataset_cache.pkl"):
    
    with open("dataset_cache.pkl", "rb") as fp:
        dataset = pickle.load(fp)

else:
    # or create a dataset from a folder containing images or videos:
    dataset = LightlyDataset("/data/2m/train", transform=transform)
    
    with open("dataset_cache.pkl", "wb+") as fp:
        pickle.dump(dataset, fp)

In [5]:
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    drop_last=True,
    num_workers=8,
)

In [6]:
neptune_logger = NeptuneLogger(
    project="cape/dinov2",  
    tags=["training", "vicregl"],  # optional
)

In [7]:
lr_monitor = LearningRateMonitor(logging_interval='step')

In [8]:
accelerator = "gpu" # if torch.cuda.is_available() else "cpu"

trainer = pl.Trainer(
    max_steps=10, 
    devices=1, 
    accelerator=accelerator, 
    precision='16-mixed', 
    logger=neptune_logger,
    callbacks=[lr_monitor]
)


trainer.fit(model=model, train_dataloaders=dataloader)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  self._run_instance = neptune.init_run(**self._neptune_init_args)


https://app.neptune.ai/cape/dinov2/e/DIN-219


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name                  | Type                       | Params
---------------------------------------------------------------------
0 | backbone              | Sequential                 | 49.5 M
1 | projection_head       | BarlowTwinsProjectionHead  | 10.0 M
2 | local_projection_head | VicRegLLocalProjectionHead | 131 K 
3 | average_pool          | AdaptiveAvgPool2d          | 0     
4 | criterion             | VICRegLLoss                | 0     
---------------------------------------------------------------------
59.6 M    Trainable params
0         Non-trainable params
59.6 M    Total params
238.226   Total estimated model params size (MB)


Epoch 0:   0% 10/68036 [00:13<24:53:14,  1.32s/it, v_num=-219]

`Trainer.fit` stopped: `max_steps=10` reached.


Epoch 0:   0% 10/68036 [00:16<31:47:40,  1.68s/it, v_num=-219]


In [9]:
trainer.logger.experiment.stop()

Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 3 operations to synchronize with Neptune. Do not kill this process.
Still waiting for the remaining 3 operations (0.00% done). Please wait.
All 3 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/cape/dinov2/e/DIN-219/metadata


In [10]:
import torch

In [11]:
torch.cuda.empty_cache()

In [10]:
dataset.dataset.classes

['n00000001', 'n00000002', 'n00000003', 'n00000004', 'n00000005', 'n00000006']

In [12]:
val_transforms = torchvision.transforms.Compose(
    [
        torchvision.transforms.Resize((256, 256)),
        torchvision.transforms.CenterCrop((224, 224)),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(
            mean=utils.IMAGENET_NORMALIZE["mean"],
            std=utils.IMAGENET_NORMALIZE["std"],
        ),
    ]
)


In [18]:
from torchvision.datasets import ImageFolder

val_dataset = ImageFolder("/data/2m/val", transform=val_transforms)

In [22]:
from pytorch_metric_learning import testers

tester = testers.GlobalEmbeddingSpaceTester()
dataset_dict = {"val": val_dataset}
all_accuracies = tester.test(dataset_dict, 1, model.cuda())

  0% 0/625 [00:00<?, ?it/s]


AttributeError: 'tuple' object has no attribute 'size'

In [23]:
%debug

> [0;32m/root/lightly/lib/python3.10/site-packages/pytorch_metric_learning/testers/base_tester.py[0m(92)[0;36mcompute_all_embeddings[0;34m()[0m
[0;32m     90 [0;31m                    all_q = torch.zeros(
[0m[0;32m     91 [0;31m                        [0mlen[0m[0;34m([0m[0mdataloader[0m[0;34m.[0m[0mdataset[0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 92 [0;31m                        [0mq[0m[0;34m.[0m[0msize[0m[0;34m([0m[0;36m1[0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     93 [0;31m                        [0mdevice[0m[0;34m=[0m[0mself[0m[0;34m.[0m[0mdata_device[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     94 [0;31m                        [0mdtype[0m[0;34m=[0m[0mq[0m[0;34m.[0m[0mdtype[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  !q


(tensor([[ 0.0978, -0.1171, -0.4288,  ..., -0.2400,  0.3144, -0.3985],
        [-0.1264, -0.0027, -0.3156,  ..., -0.3442,  0.3584, -0.4917],
        [ 0.1914,  0.0583, -0.5164,  ..., -0.0239,  0.1547, -0.1759],
        ...,
        [ 0.3428,  0.0072,  0.0225,  ...,  0.0831,  0.3483, -0.2851],
        [ 0.0324,  0.1356, -0.2858,  ..., -0.4837,  0.0833, -0.3803],
        [ 0.0204, -0.0584, -0.3292,  ..., -0.3697, -0.0414, -0.3590]],
       device='cuda:0'), tensor([[[[-1.2808e+00, -3.8357e-01, -1.1779e-01,  ..., -2.5510e-02,
            2.3517e-01, -2.0001e-01],
          [-4.3602e-01, -1.0324e-01, -2.8352e-01,  ..., -3.4322e-03,
           -4.1164e-02, -2.4013e-01],
          [-5.6435e-01, -5.0985e-01, -1.9225e-01,  ...,  1.9506e-01,
            8.5388e-02, -4.5820e-01],
          ...,
          [-4.0865e-01, -4.1763e-01,  7.4128e-02,  ...,  1.1858e-01,
            4.7185e-01, -2.1594e-01],
          [-8.6833e-01,  2.2274e-01,  5.5374e-02,  ...,  3.0943e-01,
            7.6995e-01,  3.8

ipdb>  !q.size()


*** AttributeError: 'tuple' object has no attribute 'size'


ipdb>  !q.shape


*** AttributeError: 'tuple' object has no attribute 'shape'


ipdb>  len(q)


2


ipdb>  exit


In [29]:
from pathlib import Path

import torch
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import DeviceStatsMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader
from torchvision import transforms as T

from lightly.data import LightlyDataset
from lightly.transforms.utils import IMAGENET_NORMALIZE
from lightly.utils.benchmarking import KNNClassifier, MetricCallback
from lightly.utils.dist import print_rank_zero


def knn_eval(
    model: LightningModule,
    train_dir: Path,
    val_dir: Path,
    log_dir: Path,
    batch_size_per_device: int,
    num_workers: int,
    accelerator: str,
    devices: int,
    num_classes: int,
    strategy="ddp_find_unused_parameters_true"
) -> None:
    """Runs KNN evaluation on the given model.

    Parameters follow InstDisc [0] settings.

    The most important settings are:
        - Num nearest neighbors: 200
        - Temperature: 0.1

    References:
       - [0]: InstDict, 2018, https://arxiv.org/abs/1805.01978
    """
    print_rank_zero("Running KNN evaluation...")

    # Setup training data.
    transform = T.Compose(
        [
            T.Resize(256),
            T.CenterCrop(224),
            T.ToTensor(),
            T.Normalize(mean=IMAGENET_NORMALIZE["mean"], std=IMAGENET_NORMALIZE["std"]),
        ]
    )
    train_dataset = LightlyDataset(input_dir=str(train_dir), transform=transform)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size_per_device,
        shuffle=False,
        num_workers=num_workers,
        drop_last=False,
    )

    # Setup validation data.
    val_dataset = LightlyDataset(input_dir=str(val_dir), transform=transform)
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size_per_device,
        shuffle=False,
        num_workers=num_workers,
    )

    classifier = KNNClassifier(
        model=model,
        num_classes=num_classes,
        feature_dtype=torch.float16,
    )

    # Run KNN evaluation.
    metric_callback = MetricCallback()
    trainer = Trainer(
        max_epochs=1,
        accelerator=accelerator,
        devices=devices,
        logger=TensorBoardLogger(save_dir=str(log_dir), name="knn_eval"),
        callbacks=[
            DeviceStatsMonitor(),
            metric_callback,
        ],
        strategy=strategy,
        num_sanity_val_steps=0,
    )
    trainer.fit(
        model=classifier,
        train_dataloaders=train_dataloader,
        val_dataloaders=val_dataloader,
    )
    for metric in ["val_top1", "val_top5"]:
        print_rank_zero(f"knn {metric}: {max(metric_callback.val_metrics[metric])}")

In [30]:
knn_eval(
    model=model, 
    train_dir="/data/2m/val/", 
    val_dir="/data/2m/val", 
    log_dir=".", 
    batch_size_per_device=32, 
    num_workers=8, 
    accelerator='gpu', 
    devices=1, 
    num_classes=len(dataset.dataset.classes),
    strategy="ddp_notebook"
)

Running KNN evaluation...


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


RuntimeError: Lightning can't create new processes if CUDA is already initialized. Did you manually call `torch.cuda.*` functions, have moved the model to the device, or allocated memory on the GPU any other way? Please remove any such calls, or change the selected strategy. You will have to restart the Python kernel.

In [32]:
dataset.dataset.classes

['n00000001', 'n00000002', 'n00000003', 'n00000004', 'n00000005', 'n00000006']

In [33]:
type(dataset)

lightly.data.dataset.LightlyDataset