In [67]:
from slimtp.modules import AMDataset
from slimtp.pipelines import AMPreprocessing
from torch.utils.data import DataLoader
from omegaconf import OmegaConf

from pj_cape_foundation_eval.models.dino_embedding_creator import DinoEmbeddingCreator, build_model_for_eval
from dinov2.configs import dinov2_default_config

In [94]:
import pytorch_lightning as pl
import pandas as pd
import numpy as np
import yaml
import torch
import nest_asyncio

In [69]:
# Apply the nest_asyncio patch before any of the pipeline code below runs.
# NOTE(review): presumably required because a later step drives its own asyncio
# event loop inside the notebook's already-running loop — confirm which call needs it.
nest_asyncio.apply()

In [70]:
# Load the data/preprocessing configuration and hand it to the preprocessing pipeline.
# `OmegaConf` is imported by name (`from omegaconf import OmegaConf`); the bare
# `omegaconf` module is never imported in this notebook, so referring to it raises
# NameError on a fresh kernel.
data_config = OmegaConf.load("./configs/config.yaml")
ampreproc = AMPreprocessing(data_config)

In [71]:
# Execute the preprocessing pipeline built from `data_config`.
# NOTE(review): the recorded warning mentions `df.to_hdf(path, key="data", mode="w")`,
# so this presumably writes the HDF file that the next cell reads — confirm against AMPreprocessing.
ampreproc.run()

100%|██████████| 10/10 [00:00<00:00, 54.18it/s]
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['geometry', 'imagery_source', 'split', 'use', 'cache_key'], dtype='object')]

  df.to_hdf(path, key="data", mode="w")


In [72]:
# Read back the table produced by the preprocessing step.
df = pd.read_hdf("/cnvrg/output/preprocessing/data.h5")

# `cache_key` is stored as text; evaluate each entry to recover the Python object.
# NOTE(review): eval() executes arbitrary code. If this file can ever come from an
# untrusted source, ast.literal_eval would be the safer parser — confirm the stored
# values are plain literals before switching.
df = df.assign(cache_key=df["cache_key"].apply(eval))

# Show the first record as a sanity check on the columns and parsed cache key.
df.iloc[0]

geometry          b"\x01\x03\x00\x00\x00\x01\x00\x00\x00C\x00\x0...
imagery_source    sv2:nearmap_vertical_jpg:ffd055ee-e430-11e8-b9...
imagery_date                              2018-10-26 00:00:00+00:00
split                                                          test
use                                                            test
identifier                                                        0
cache_key         {'source': 'shower_v2', 'imagery_source': 'sv2...
Name: 0, dtype: object

In [73]:
class DinoInferenceDataset(AMDataset):
    """Inference-only view of ``AMDataset`` that yields ``(sample, identifier)`` pairs.

    Each item is built from the row returned by ``get_df_data`` and passed through
    ``get_aug_item`` with ``self.t_clean``.
    NOTE(review): ``t_clean`` is presumably the non-augmenting transform defined on
    ``AMDataset`` — confirm in the base class.
    """

    def __getitem__(self, index: int):
        """Return the transformed sample for ``index`` together with its ``identifier``."""
        row = self.get_df_data(index)
        # Read the identifier before the row is handed to the transform step.
        identifier = row["identifier"]
        clean_sample = self.get_aug_item(row, self.t_clean)
        return clean_sample, identifier

In [88]:
# Wrap the preprocessed table in the inference dataset.
amdataset = DinoInferenceDataset(df=df, config=data_config)

# Batch the dataset in its original order (no shuffling), 5 samples per batch.
data_loader = DataLoader(dataset=amdataset, shuffle=False, batch_size=5)

In [75]:
# Local path of the teacher checkpoint that is loaded when the model is built.
# If the file is not present yet, download it with the gsutil command below.
checkpoint_path = "./data/teacher_checkpoint.pth"

# Only run the below command if the above path doesn't exist
# !gsutil cp 'gs://cape-ml-projects-data/data_stores/dinov2/experiments/a100x4/DIN-145/eval/training_324999/teacher_checkpoint.pth' $checkpoint_path

Copying gs://cape-ml-projects-data/data_stores/dinov2/experiments/a100x4/DIN-145/eval/training_324999/teacher_checkpoint.pth...
/ [1 files][  1.5 GiB/  1.5 GiB]   44.5 MiB/s                                   
Operation completed over 1 objects/1.5 GiB.                                      


In [77]:
# Local path of the training config that corresponds to the checkpoint above.
# If the file is not present yet, download it with the gsutil command below.
config_path = "./data/teacher_config.yaml"

# Only run the below command if the above path doesn't exist
# !gsutil cp 'gs://cape-ml-projects-data/data_stores/dinov2/experiments/a100x4/DIN-145/config.yaml' $config_path

In [78]:
# Assemble the model config: the experiment's own settings are merged on top of
# the DINOv2 defaults.
new_config = OmegaConf.load(config_path)
default_cfg = OmegaConf.create(dinov2_default_config)
cfg = OmegaConf.merge(default_cfg, new_config)

# Build the backbone from the checkpoint (cuda=True), then wrap it in the
# embedding-producing module that the trainer will run.
backbone_model = build_model_for_eval(cfg, checkpoint_path, cuda=True)
dino_model = DinoEmbeddingCreator(backbone_model=backbone_model)

Predicting: 0it [05:27, ?it/s]
Take key teacher in provided checkpoint dict
Pretrained weights found at ./data/teacher_checkpoint.pth and loaded with msg: _IncompatibleKeys(missing_keys=[], unexpected_keys=['dino_head.mlp.0.weight', 'dino_head.mlp.0.bias', 'dino_head.mlp.2.weight', 'dino_head.mlp.2.bias', 'dino_head.mlp.4.weight', 'dino_head.mlp.4.bias', 'dino_head.last_layer.weight_g', 'dino_head.last_layer.weight_v', 'ibot_head.mlp.0.weight', 'ibot_head.mlp.0.bias', 'ibot_head.mlp.2.weight', 'ibot_head.mlp.2.bias', 'ibot_head.mlp.4.weight', 'ibot_head.mlp.4.bias', 'ibot_head.last_layer.weight_g', 'ibot_head.last_layer.weight_v'])


In [79]:
# Single-GPU Lightning trainer; it is only used for prediction further down.
trainer = pl.Trainer(accelerator="gpu", devices=1, max_epochs=1, log_every_n_steps=10)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [80]:
# Call eval() on the model before predicting (presumably torch's nn.Module.eval(),
# i.e. inference mode for dropout/normalisation layers — confirm).
# The result is bound to `_` only to keep the returned object's repr out of the cell output.
_ = dino_model.eval()

In [89]:
# Run the model over every batch of `data_loader`; `results` holds one entry per batch.
results = trainer.predict(dino_model, dataloaders=data_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting DataLoader 0: 100%|██████████| 2/2 [00:00<00:00,  3.70it/s]


In [90]:
# Inspect the first batch of predictions; in the recorded run this is a tuple of
# (embedding tensor, identifier tensor).
results[0]

(tensor([[-0.4636, -2.2672,  1.0947,  ...,  0.2755, -0.2878,  0.0938],
         [ 0.4437,  0.2556, -0.1204,  ...,  0.2766, -0.0068,  0.2490],
         [-0.2920,  0.6529,  1.0361,  ...,  0.1713,  0.3475,  0.1404],
         [-0.0382, -0.8151, -0.0095,  ...,  0.1934, -0.0895,  0.5180],
         [ 0.0240, -0.6696, -1.9730,  ...,  0.0438, -0.4117, -0.0673]]),
 tensor([0, 1, 2, 3, 4]))

In [101]:
# Split the per-batch (embedding, identifier) results into two parallel lists.
embedding_batches = [batch[0] for batch in results]
identifier_batches = [list(batch[1]) for batch in results]

# Stack all batches along the sample axis and move the result to NumPy.
embeddings = torch.cat(embedding_batches, 0).cpu().numpy()
# Flatten the identifiers into one array, in the same order as the embeddings.
identifiers = np.concatenate(identifier_batches)

In [104]:
# One row per sample (indexed by identifier), one `emb_<i>` column per embedding dimension.
embedding_columns = [f"emb_{i}" for i in range(embeddings.shape[1])]
embeddings_df = pd.DataFrame(
    data=embeddings,
    index=identifiers,
    columns=embedding_columns,
)
