# Imports

In [1]:
import os
import torch
import multiprocessing
import torch.nn as nn
import numpy as np
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torch.utils.tensorboard import SummaryWriter
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, OneCycleLR
from PIL import Image
from transformers import AutoImageProcessor, DPTForDepthEstimation, get_scheduler
from datetime import datetime
from tqdm.auto import tqdm

`AnnotionFormat` is deprecated and will be removed in v4.38. Please use `transformers.image_utils.AnnotationFormat` instead.


In [2]:
torch.cuda.empty_cache()

# Constants

In [3]:
IMAGE_SIZE = (256, 255)

In [4]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda:0


In [5]:
BACKBONE_TYPE = 'small' # in ("small", "base", "large" or "giant")

In [6]:
HEAD_TYPE = 'nyu' # in ("nyu", "kitti")

In [7]:
MODEL_NAME = f"facebook/dpt-dinov2-{BACKBONE_TYPE}-{HEAD_TYPE}"

In [8]:
RESUME_PATH = "HF-model_small_nyu_20231230_082544_40"
if RESUME_PATH is not None:
    BACKBONE_TYPE = RESUME_PATH.split('/')[-1].split('_')[1]
    HEAD_TYPE = RESUME_PATH.split('/')[-1].split('_')[2]

In [9]:
TRAIN_PATH = "/home/jovyan/work/saved_data/data/thumbnails/train"
TEST_PATH = "/home/jovyan/work/saved_data/data/thumbnails/test"

In [10]:
EPOCHS = 100
BATCH_SIZE = 32
EVAL_INTERVAL = 10

In [11]:
START_LR = 1e-8
MIN_LR = 1e-9
MAX_LR = 1e-7

In [12]:
TRAIN_BACKBONE = True
TRAIN_NECK = True
TRAIN_HEAD = True

# Classes

In [13]:
class CustomNPZDataset(Dataset):
    def __init__(self, path, image_processor, transform=None):
        self.path = path
        self.files = list(Path(path).glob('*.npz'))
        self.transform = transform
        self.image_processor = image_processor

    def __len__(self):
        return len(self.files)

    def __getitem__(self, item):
        with np.load(str(self.files[item])) as data:
            X_numpy = data['X']
            y_numpy = data['y']
        X_torch = torch.from_numpy(X_numpy)
        y_torch = torch.from_numpy(y_numpy).unsqueeze(0)
        if self.transform is not None:
            X_torch = self.transform(X_torch)
            y_torch = self.transform(y_torch)
        return X_torch, y_torch

In [14]:
class SigLoss(nn.Module):
    def __init__(
        self, valid_mask=True, max_depth=None):
        super(SigLoss, self).__init__()
        
        self.valid_mask = valid_mask
        self.max_depth = max_depth

        self.eps = 0.001  # avoid grad explode

    def sigloss(self, input, target):
        if self.valid_mask:
            valid_mask = target > 0
            if self.max_depth is not None:
                valid_mask = torch.logical_and(target > 0, target <= self.max_depth)
            input = input[valid_mask]
            target = target[valid_mask]

        g = torch.log(input + self.eps) - torch.log(target + self.eps)
        Dg = torch.var(g) + 0.15 * torch.pow(torch.mean(g), 2)
        return torch.sqrt(Dg)

    def forward(self, depth_pred, depth_gt):
        loss_depth = self.sigloss(depth_pred, depth_gt)
        return loss_depth

In [15]:
class MaskedMAE(nn.Module):
    def __init__(self, valid_mask=True, max_depth=None):
        super(MaskedMAE, self).__init__()
        
        self.valid_mask = valid_mask
        self.max_depth = max_depth

    def mae(self, input, target):
        if self.valid_mask:
            valid_mask = target > 0
            if self.max_depth is not None:
                valid_mask = torch.logical_and(target > 0, target <= self.max_depth)
            input = input[valid_mask]
            target = target[valid_mask]

        mae = torch.abs(input - target).mean()
        return mae
    
    def forward(self, depth_pred, depth_gt):
        metric_mae = self.mae(depth_pred, depth_gt)
        return metric_mae

In [16]:
class MaskedR2Score(nn.Module):
    def __init__(self, valid_mask=True, max_depth=None):
        super(MaskedR2Score, self).__init__()

        self.valid_mask = valid_mask
        self.max_depth = max_depth

    def r2(self, input, target):
        if self.valid_mask:
            valid_mask = target > 0
            if self.max_depth is not None:
                valid_mask = torch.logical_and(target > 0, target <= self.max_depth)
            input = input[valid_mask]
            target = target[valid_mask]

        mean_target = torch.mean(target)
        ss_total = torch.sum((target - mean_target)**2)
        ss_residual = torch.sum((input - target)**2)

        r2 = 1 - (ss_residual / ss_total)
        return r2
    
    def forward(self, depth_pred, depth_gt):
        metric_r2 = self.r2(depth_pred, depth_gt)
        return metric_r2

# Initiallization

In [17]:
image_processor = AutoImageProcessor.from_pretrained(MODEL_NAME)
if RESUME_PATH is None:
    model = DPTForDepthEstimation.from_pretrained(MODEL_NAME)
else:
    model = DPTForDepthEstimation.from_pretrained(RESUME_PATH)

In [18]:
model.cuda()

DPTForDepthEstimation(
  (backbone): Dinov2Backbone(
    (embeddings): Dinov2Embeddings(
      (patch_embeddings): Dinov2PatchEmbeddings(
        (projection): Conv2d(3, 384, kernel_size=(14, 14), stride=(14, 14))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): Dinov2Encoder(
      (layer): ModuleList(
        (0-11): 12 x Dinov2Layer(
          (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
          (attention): Dinov2Attention(
            (attention): Dinov2SelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): Dinov2SelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
         

In [19]:
augmentation_transform = transforms.Compose([
    transforms.RandomVerticalFlip(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(90)
    #transforms.RandomResizedCrop((IMAGE_SIZE[0], IMAGE_SIZE[1]), scale=(0.8, 1.0)),
    #transforms.ColorJitter(brightness=0.4, contrast=0.2)
])

In [20]:
train_dataset = CustomNPZDataset(path=TRAIN_PATH, image_processor=image_processor, transform=augmentation_transform)
validation_dataset = CustomNPZDataset(path=TEST_PATH, image_processor=image_processor)

In [21]:
training_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE, num_workers=10, pin_memory=True)
validation_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, num_workers=8, pin_memory=True)

In [22]:
optimizer = AdamW(model.parameters(), lr=START_LR)

In [23]:
total_steps = EPOCHS * len(training_loader)
#lr_scheduler = CosineAnnealingLR(optimizer, total_steps, eta_min=MIN_LR)
lr_scheduler = OneCycleLR(optimizer, MAX_LR, total_steps=total_steps, anneal_strategy='linear')

In [24]:
loss_fn = SigLoss()

In [25]:
mae_fn = MaskedMAE()
r2_fn = MaskedR2Score()

# Helper Functions

In [26]:
def run_model(data):
    inputs, labels = data

    images = [Image.fromarray(input.numpy().transpose(1, 2, 0)) for input in inputs]

    inputs = image_processor(images=images, return_tensors="pt")

    inputs = inputs.to(DEVICE)
    
    outputs = model(**inputs)

    predicted_depth = outputs['predicted_depth']

    predictions = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),
        size=IMAGE_SIZE,
        mode="bicubic",
        align_corners=False,
    )

    del inputs, outputs, predicted_depth
    torch.cuda.empty_cache()

    labels = labels.to(DEVICE)
    
    loss = loss_fn(predictions, labels)

    mae = mae_fn(predictions, labels)
    r2 = r2_fn(predictions, labels)

    del predictions, labels
    torch.cuda.empty_cache()
    
    return loss, mae, r2

In [27]:
def train_one_epoch():
    running_loss = 0.
    epoch_loss = 0.

    running_mae = 0.
    running_r2 = 0.
    epoch_mae = 0.
    epoch_r2 = 0.

    for i, data in tqdm(enumerate(training_loader), total=len(training_loader)):
        loss, mae, r2 = run_model(data)

        if loss.isnan().all():
            raise Exception('Exploding Gradients!')

        optimizer.zero_grad()
        
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)

        optimizer.step()

        running_loss += loss.item()

        running_mae += mae.item()
        running_r2 += r2.item()

    epoch_loss = running_loss / len(training_loader)

    epoch_mae = running_mae / len(training_loader)
    epoch_r2 = running_r2 / len(training_loader)
    
    return epoch_loss, epoch_mae, epoch_r2

# Training Loop

In [28]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f"runs/dinov2_{BACKBONE_TYPE}_dpt_{HEAD_TYPE}_{timestamp}")

epoch_number = 1

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number))

    model.train(True)
    
    for param in model.backbone.parameters():
        param.requires_grad = TRAIN_BACKBONE
    for param in model.neck.parameters():
        param.requires_grad = TRAIN_NECK
    for param in model.head.parameters():
        param.requires_grad = TRAIN_HEAD
        
    avg_loss, avg_mae, avg_r2 = train_one_epoch()

    print('LOSS Train {}'.format(avg_loss))

    print('MAE Train {}'.format(avg_mae))
    print('R2 Train {}'.format(avg_r2))

    writer.add_scalars('Training Loss',
                        { 'Training' : avg_loss },
                        epoch_number)
    writer.add_scalars('Training MAE',
                    { 'Training' : avg_mae },
                    epoch_number)
    writer.add_scalars('Training R2',
                    { 'Training' : avg_r2 },
                    epoch_number)

    if epoch_number % EVAL_INTERVAL == 0:
        running_vloss = 0.0

        running_vmae = 0.0
        running_vr2 = 0.0
    
        model.eval()
    
        with torch.no_grad():
            for i, vdata in tqdm(enumerate(validation_loader), total=len(validation_loader)):
                vloss, vmae, vr2 = run_model(vdata)
                running_vloss += vloss
                running_vmae += vmae
                running_vr2 += vr2
    
        avg_vloss = running_vloss / len(validation_loader)
        print('LOSS valid {}'.format(avg_vloss))
    
        avg_vmae = running_vmae / len(validation_loader)
        print('MAE valid {}'.format(avg_vmae))
        avg_vr2 = running_vr2 / len(validation_loader)
        print('R2 valid {}'.format(avg_vr2))
    
        writer.add_scalars('Validation Loss',
                        { 'Validation' : avg_vloss },
                        epoch_number)
        writer.add_scalars('Validation MAE',
                        { 'Validation' : avg_vmae },
                        epoch_number)
        writer.add_scalars('Validation R2',
                        { 'Validation' : avg_vr2 },
                        epoch_number)

        if avg_vloss < best_vloss:
            best_vloss = avg_vloss
            model_path = 'model_{}_{}_{}_{}'.format(BACKBONE_TYPE, HEAD_TYPE, timestamp, epoch_number)
            torch.save(model.state_dict(), model_path)
            os.mkdir(f"HF-{model_path}")
            model.save_pretrained(f"HF-{model_path}")
    
    writer.flush()
    
    epoch_number += 1

    lr_scheduler.step()

EPOCH 1:


  0%|          | 0/120 [00:00<?, ?it/s]

LOSS Train 1.9611224134763081
MAE Train 8.310295307636261
R2 Train -0.5459240287542343
EPOCH 2:


  0%|          | 0/120 [00:00<?, ?it/s]

LOSS Train 1.8725258549054464
MAE Train 7.941031324863434
R2 Train -0.42842679619789126
EPOCH 3:


  0%|          | 0/120 [00:00<?, ?it/s]

LOSS Train 1.8161075860261917
MAE Train 7.657207369804382
R2 Train -0.32806953142086664
EPOCH 4:


  0%|          | 0/120 [00:00<?, ?it/s]

LOSS Train 1.7852516978979112
MAE Train 7.428497409820556
R2 Train -0.24537241458892822
EPOCH 5:


  0%|          | 0/120 [00:00<?, ?it/s]

LOSS Train 1.763675394654274
MAE Train 7.198892585436503
R2 Train -0.17159229069948195
EPOCH 6:


  0%|          | 0/120 [00:00<?, ?it/s]

LOSS Train 1.7451706101497015
MAE Train 7.043346099058787
R2 Train -0.12012081742286682
EPOCH 7:


  0%|          | 0/120 [00:00<?, ?it/s]

LOSS Train 1.7353169898192087
MAE Train 6.910624627272288
R2 Train -0.08063958982626597
EPOCH 8:


  0%|          | 0/120 [00:00<?, ?it/s]

LOSS Train 1.7209773361682892
MAE Train 6.80936926205953
R2 Train -0.05071255515019099
EPOCH 9:


  0%|          | 0/120 [00:00<?, ?it/s]

LOSS Train 1.7176567941904068
MAE Train 6.693831300735473
R2 Train -0.013770333925882975
EPOCH 10:


  0%|          | 0/120 [00:00<?, ?it/s]

LOSS Train 1.7107423812150955
MAE Train 6.626129305362701
R2 Train 0.00046727657318115237


  0%|          | 0/140 [00:00<?, ?it/s]

LOSS valid nan
MAE valid 7.8555707931518555
R2 valid -0.07703547924757004
EPOCH 11:


  0%|          | 0/120 [00:00<?, ?it/s]

Exception: Exploding Gradients!