In [1]:
!git clone https://github.com/isl-org/ZoeDepth.git

Cloning into 'ZoeDepth'...
remote: Enumerating objects: 111, done.[K
remote: Counting objects: 100% (105/105), done.[K
remote: Compressing objects: 100% (77/77), done.[K
remote: Total 111 (delta 39), reused 63 (delta 24), pack-reused 6[K
Receiving objects: 100% (111/111), 4.09 MiB | 38.10 MiB/s, done.
Resolving deltas: 100% (39/39), done.


In [2]:
!ls /kaggle/working

ZoeDepth


In [22]:
from IPython.display import FileLink
FileLink(r'zoedepth.zip')

In [3]:
%cd /kaggle/working

/kaggle/working


In [20]:
!zip -r zoedepth.zip /kaggle/working/depth_anything

  adding: kaggle/working/depth_anything/ (stored 0%)
  adding: kaggle/working/depth_anything/4/ (stored 0%)
  adding: kaggle/working/depth_anything/4/rgb_00531.png (deflated 0%)
  adding: kaggle/working/depth_anything/4/rgb_00971.png (deflated 0%)
  adding: kaggle/working/depth_anything/4/rgb_00512.png (deflated 0%)
  adding: kaggle/working/depth_anything/4/rgb_00787.png (deflated 0%)
  adding: kaggle/working/depth_anything/4/rgb_01095.png (deflated 0%)
  adding: kaggle/working/depth_anything/4/rgb_01355.png (deflated 0%)
  adding: kaggle/working/depth_anything/4/rgb_00581.png (deflated 0%)
  adding: kaggle/working/depth_anything/4/rgb_00187.png (deflated 0%)
  adding: kaggle/working/depth_anything/4/rgb_00637.png (deflated 0%)
  adding: kaggle/working/depth_anything/4/rgb_01228.png (deflated 0%)
  adding: kaggle/working/depth_anything/4/rgb_00035.png (deflated 0%)
  adding: kaggle/working/depth_anything/4/rgb_00728.png (deflated 0%)
  adding: kaggle/working/depth_anything/4/rgb_00566.

In [3]:
%%writefile /kaggle/working/ZoeDepth/zoedepth/data/data_mono.py

# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat

# This file is partly inspired from BTS (https://github.com/cleinc/bts/blob/master/pytorch/bts_dataloader.py); author: Jin Han Lee

import itertools
import os
import random

import numpy as np
import cv2
import torch
import torch.nn as nn
import torch.utils.data.distributed
from zoedepth.utils.easydict import EasyDict as edict
from PIL import Image, ImageOps
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms

from zoedepth.utils.config import change_dataset

from .ddad import get_ddad_loader
from .diml_indoor_test import get_diml_indoor_loader
from .diml_outdoor_test import get_diml_outdoor_loader
from .diode import get_diode_loader
from .hypersim import get_hypersim_loader
from .ibims import get_ibims_loader
from .sun_rgbd_loader import get_sunrgbd_loader
from .vkitti import get_vkitti_loader
from .vkitti2 import get_vkitti2_loader
import matplotlib.pyplot as plt
from .preprocess import CropParams, get_white_border, get_black_border
import numpy as np
from PIL import Image

def adu2photons(image, qe=0.69, sensitivity=5.88):
    """Convert sensor readings in ADU back to an estimated photon count.

    Args:
        image: Scalar or array of ADU values.
        qe: Quantum-efficiency factor.
        sensitivity: Sensitivity factor (the same factor ``add_camera_noise``
            multiplies electrons by to obtain ADU).

    Returns:
        ``image`` divided by ``qe * sensitivity`` (plus a tiny epsilon).
    """
    # The epsilon keeps the divisor non-zero when qe or sensitivity is 0.
    adu_per_photon = qe * sensitivity + 1e-7
    return image / adu_per_photon

def add_camera_noise(input_irrad_photons, qe=0.69, sensitivity=5.88,
                     dark_noise=2.29, bitdepth=8, baseline=100,
                     rs=np.random.RandomState(seed=42)):
    """Simulate sensor noise on an array of photon counts.

    Args:
        input_irrad_photons: Array of expected photon counts (must expose
            ``.shape``).
        qe: Quantum-efficiency factor applied to the sampled photons.
        sensitivity: Factor converting electrons to ADU.
        dark_noise: Standard deviation of the zero-mean Gaussian noise added
            to the electron signal.
        bitdepth: Output values are clipped to ``[0, 2**bitdepth - 1]``.
        baseline: Accepted for interface compatibility; it is not applied to
            the output.
        rs: ``numpy.random.RandomState`` used for sampling. The default
            instance is created once, when the function is defined, and is
            shared by every call that omits this argument.

    Returns:
        Integer array of ADU values with the same shape as the input.
    """
    # Shot noise: Poisson-sample the photon counts.
    shot_photons = rs.poisson(input_irrad_photons,
                              size=input_irrad_photons.shape)

    # Photo-electrons plus Gaussian dark noise.
    signal_electrons = qe * shot_photons
    dark_electrons = rs.normal(scale=dark_noise, size=signal_electrons.shape)
    noisy_electrons = dark_electrons + signal_electrons

    # Quantise to integer ADU (truncation) and clamp to the sensor range.
    counts = (noisy_electrons * sensitivity).astype(int)
    return np.clip(counts, 0, 2**bitdepth - 1)

def _is_pil_image(img):
    """Return True when ``img`` is a PIL ``Image.Image`` instance."""
    pil_image_type = Image.Image
    return isinstance(img, pil_image_type)


def _is_numpy_image(img):
    return isinstance(img, np.ndarray) and (img.ndim in {2, 3})


def preprocessing_transforms(mode, **kwargs):
    """Build the default transform pipeline: a single ``ToTensor`` step.

    Args:
        mode: Forwarded to ``ToTensor`` as ``mode``.
        **kwargs: Extra keyword arguments forwarded to ``ToTensor``.

    Returns:
        A ``transforms.Compose`` wrapping the ``ToTensor`` instance.
    """
    to_tensor_step = ToTensor(mode=mode, **kwargs)
    return transforms.Compose([to_tensor_step])


class DepthDataLoader(object):
    def __init__(self, config, mode, dn=0, device='cpu', transform=None, **kwargs):
        """
        Data loader for depth datasets

        The resulting ``torch`` data loader is exposed as ``self.data``.

        Args:
            config (dict): Config dictionary. Refer to utils/config.py
            mode (str): "train", "online_eval" or "test"
            dn (float, optional): Noise level stored on the loader and forwarded to
                DataLoadPreprocess in "online_eval" mode only. Defaults to 0.
            device (str, optional): Device to load the data on. Defaults to 'cpu'.
            transform (torchvision.transforms, optional): Transform to apply to the data. Defaults to None.
            **kwargs: "shuffle_test" (bool) is read in "online_eval" mode; other keys are ignored.
        """

        self.config = config
        self.dn = dn

        # Evaluation-only datasets ship their own loaders; they ignore mode/transform/dn.
        if config.dataset == 'ibims':
            self.data = get_ibims_loader(config, batch_size=1, num_workers=1)
            return

        if config.dataset == 'sunrgbd':
            self.data = get_sunrgbd_loader(
                data_dir_root=config.sunrgbd_root, batch_size=1, num_workers=1)
            return

        if config.dataset == 'diml_indoor':
            self.data = get_diml_indoor_loader(
                data_dir_root=config.diml_indoor_root, batch_size=1, num_workers=1)
            return

        if config.dataset == 'diml_outdoor':
            self.data = get_diml_outdoor_loader(
                data_dir_root=config.diml_outdoor_root, batch_size=1, num_workers=1)
            return

        if "diode" in config.dataset:
            self.data = get_diode_loader(
                config[config.dataset+"_root"], batch_size=1, num_workers=1)
            return

        if config.dataset == 'hypersim_test':
            self.data = get_hypersim_loader(
                config.hypersim_test_root, batch_size=1, num_workers=1)
            return

        if config.dataset == 'vkitti':
            self.data = get_vkitti_loader(
                config.vkitti_root, batch_size=1, num_workers=1)
            return

        if config.dataset == 'vkitti2':
            self.data = get_vkitti2_loader(
                config.vkitti2_root, batch_size=1, num_workers=1)
            return

        if config.dataset == 'ddad':
            self.data = get_ddad_loader(config.ddad_root, resize_shape=(
                352, 1216), batch_size=1, num_workers=1)
            return

        # img_size is only honoured when input resizing is enabled in the config.
        img_size = self.config.get("img_size", None)
        img_size = img_size if self.config.get(
            "do_input_resize", False) else None

        if transform is None:
            transform = preprocessing_transforms(mode, size=img_size)

        if mode == 'train':

            self.training_samples = DataLoadPreprocess(
                config, mode, transform=transform, device=device)

            if config.distributed:
                self.train_sampler = torch.utils.data.distributed.DistributedSampler(
                    self.training_samples)
            else:
                self.train_sampler = None

            self.data = DataLoader(self.training_samples,
                                   batch_size=config.batch_size,
                                   shuffle=(self.train_sampler is None),
                                   num_workers=config.workers,
                                   pin_memory=True,
                                   # DataLoader raises ValueError for
                                   # persistent_workers=True with num_workers == 0.
                                   persistent_workers=config.workers > 0,
                                #    prefetch_factor=2,
                                   sampler=self.train_sampler)

        elif mode == 'online_eval':
            self.testing_samples = DataLoadPreprocess(
                config, mode, transform=transform, dn=self.dn)
            # Give whole test set to all processes (and report evaluation only on one),
            # whether or not the run is distributed.
            self.eval_sampler = None
            self.data = DataLoader(self.testing_samples, 1,
                                   shuffle=kwargs.get("shuffle_test", False),
                                   num_workers=1,
                                   pin_memory=False,
                                   sampler=self.eval_sampler)

        elif mode == 'test':
            self.testing_samples = DataLoadPreprocess(
                config, mode, transform=transform)
            self.data = DataLoader(self.testing_samples,
                                   1, shuffle=False, num_workers=1)

        else:
            # Unknown mode: report it; ``self.data`` is left unset.
            print(
                'mode should be one of \'train, test, online_eval\'. Got {}'.format(mode))

            
def image2depth(path):
    """Load a depth image and rescale it to floating point.

    The raw pixel values are divided by ``2**16 - 1`` and multiplied by 10,
    so a full-scale 16-bit value maps to 10.0 (presumably metres; confirm
    against the dataset).

    Args:
        path: Path of the depth image to read.

    Returns:
        ``float32`` array of scaled depth values.

    Raises:
        IOError: If the file is missing or cannot be decoded.
    """
    raw = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    if raw is None:
        # cv2.imread reports failure by returning None instead of raising;
        # surface it as IOError so callers' ``except IOError`` handling works.
        raise IOError('Could not read depth image: {}'.format(path))
    depth = raw.astype('float32')
    depth /= (2**16 - 1)
    depth *= 10.0
    return depth

def repetitive_roundrobin(*iterables):
    """
    Interleave several iterables sample by sample.

    Each pass yields one item from every iterable in order. An iterable that
    runs out is restarted (via ``itertools.cycle``) and keeps contributing one
    item per pass. Iteration stops at the end of the pass in which the last
    remaining iterable has run out, so the leading items of the longest
    iterable are yielded once more.

    repetitive_roundrobin('ABC', 'D', 'EF') --> A D E B D F C D E A D F
    """
    active = [iter(source) for source in iterables]
    ran_out = [False for _ in iterables]
    while not all(ran_out):
        for idx in range(len(active)):
            try:
                item = next(active[idx])
            except StopIteration:
                # Restart this source from its beginning and keep cycling it.
                ran_out[idx] = True
                active[idx] = itertools.cycle(iterables[idx])
                item = next(active[idx])
            yield item


class RepetitiveRoundRobinDataLoader(object):
    """Wrap several data loaders and iterate them with ``repetitive_roundrobin``."""

    def __init__(self, *dataloaders):
        """Store the loaders to interleave."""
        self.dataloaders = dataloaders

    def __iter__(self):
        """Return a fresh round-robin iterator over the stored loaders."""
        return repetitive_roundrobin(*self.dataloaders)

    def __len__(self):
        """Number of loaders times (length of the longest loader + 1)."""
        n_loaders = len(self.dataloaders)
        longest = max(map(len, self.dataloaders))
        # The extra pass accounts for the first samples being repeated.
        return n_loaders * (longest + 1)


class MixedNYUKITTI(object):
    """Loader mixing NYU and KITTI for training; NYU only for other modes."""

    def __init__(self, config, mode, device='cpu', **kwargs):
        """Build ``self.data`` from the NYU and KITTI variants of ``config``.

        Args:
            config: Base configuration; it is copied and its ``workers`` value
                is halved (integer division) before the per-dataset configs
                are derived.
            mode: "train" builds a round-robin loader over both datasets; any
                other value builds the NYU loader only.
            device: Forwarded to ``DepthDataLoader``.
            **kwargs: Unused.
        """
        base_conf = edict(config)
        base_conf.workers = base_conf.workers // 2
        nyu_conf = change_dataset(edict(base_conf), 'nyu')
        kitti_conf = change_dataset(edict(base_conf), 'kitti')

        # make nyu default for testing
        self.config = nyu_conf

        if self.config.get("do_input_resize", False):
            img_size = self.config.get("img_size", None)
        else:
            img_size = None

        if mode != 'train':
            self.data = DepthDataLoader(nyu_conf, mode, device=device).data
            return

        nyu_transform = preprocessing_transforms(mode, size=img_size)
        nyu_loader = DepthDataLoader(
            nyu_conf, mode, device=device, transform=nyu_transform).data
        kitti_transform = preprocessing_transforms(mode, size=img_size)
        kitti_loader = DepthDataLoader(
            kitti_conf, mode, device=device, transform=kitti_transform).data
        # It has been changed to repetitive roundrobin
        self.data = RepetitiveRoundRobinDataLoader(nyu_loader, kitti_loader)


def remove_leading_slash(s):
    """Drop a single leading slash or backslash from ``s``, if present.

    Args:
        s: Path-like string; may be empty.

    Returns:
        ``s`` without its first character when that character is a forward
        slash or a backslash; otherwise ``s`` unchanged.
    """
    # Slicing (rather than s[0]) keeps the empty string from raising IndexError.
    if s[:1] in ('/', '\\'):
        return s[1:]
    return s


class CachedReader:
    """Image reader that memoises opened images by file path."""

    def __init__(self, shared_dict=None):
        """
        Args:
            shared_dict: Optional mapping used as the cache. When ``None`` a
                private dict is created.
        """
        # Compare against None rather than truthiness: a shared mapping is
        # normally empty when handed over and must still be used as the cache.
        self._cache = {} if shared_dict is None else shared_dict

    def open(self, fpath):
        """Return the cached image for ``fpath``, opening it on first use."""
        im = self._cache.get(fpath, None)
        if im is None:
            im = self._cache[fpath] = Image.open(fpath)
        return im


class ImReader:
    """Uncached image reader: every ``open`` call goes straight to PIL."""

    def __init__(self):
        """No state is kept."""

    def open(self, fpath):
        """Open ``fpath`` with ``Image.open`` and return the result."""
        opened = Image.open(fpath)
        return opened

def make_noise(path_in, path_out, dn):
    """Write a noisy copy of an image.

    The image is converted to photon counts with ``adu2photons``, passed
    through ``add_camera_noise`` and saved as 8-bit data.

    Args:
        path_in: Path of the source image.
        path_out: Destination path; its parent directory is created if needed.
        dn: Passed to ``add_camera_noise`` as ``dark_noise``.
    """
    out_dir = os.path.dirname(path_out)
    if out_dir:
        # Image.save does not create missing directories.
        os.makedirs(out_dir, exist_ok=True)

    # Close the source file as soon as the pixels have been copied out.
    with Image.open(path_in) as image:
        pixels = np.asarray(image)

    photons = adu2photons(pixels)
    noised = add_camera_noise(photons, dark_noise=dn)

    out_image = Image.fromarray(noised.astype(np.uint8))
    out_image.save(path_out)
    
    
class DataLoadPreprocess(Dataset):
    """Dataset reading image/depth pairs listed in a filenames file.

    Each line of the filenames file is whitespace separated: field 0 is the
    image path, field 1 the depth path and field 2 the focal length. In train
    mode on KITTI with ``use_right`` enabled, fields 3 and 4 are read as an
    alternative image/depth pair.
    """

    def __init__(self, config, mode, dn=0, transform=None, is_for_online_eval=False, **kwargs):
        """
        Args:
            config: Configuration object (attribute access) describing paths
                and augmentation switches.
            mode: 'train', 'online_eval', or anything else for plain testing.
            dn: Noise level; when non-zero every input image is replaced by a
                noisy copy produced with ``make_noise``.
            transform: Optional callable applied to each sample dict.
            is_for_online_eval: Stored on the instance.
            **kwargs: Ignored.
        """
        self.config = config
        if mode == 'online_eval':
            with open(config.filenames_file_eval, 'r') as f:
                self.filenames = f.readlines()
        else:
            with open(config.filenames_file, 'r') as f:
                self.filenames = f.readlines()

        self.mode = mode
        self.dn = dn
        self.transform = transform
        self.to_tensor = ToTensor(mode)
        self.is_for_online_eval = is_for_online_eval
        if config.use_shared_dict:
            self.reader = CachedReader(config.shared_dict)
        else:
            self.reader = ImReader()

    def postprocess(self, sample):
        """Hook applied after the transform; returns the sample unchanged."""
        return sample

    def _noisy_copy(self, image_path):
        """Return the path to load the image from, adding noise when enabled.

        With ``self.dn == 0`` the path is returned untouched. Otherwise a noisy
        copy named after the file's last path component is written to the
        noise directory and that new path is returned.
        """
        if self.dn == 0:
            return image_path
        noise_dir = "/kaggle/working/noise"
        # Make sure the target directory exists before the copy is saved.
        os.makedirs(noise_dir, exist_ok=True)
        new_path = os.path.join(noise_dir, image_path.split("/")[-1])
        make_noise(image_path, new_path, self.dn)
        return new_path

    def __getitem__(self, idx):
        """Load, augment and package the sample at position ``idx``."""
        sample_path = self.filenames[idx]
        fields = sample_path.split()
        focal = float(fields[2])
        sample = {}

        if self.mode == 'train':
            if self.config.dataset == 'kitti' and self.config.use_right and random.random() > 0.5:
                image_path = os.path.join(
                    self.config.data_path, remove_leading_slash(fields[3]))
                depth_path = os.path.join(
                    self.config.gt_path, remove_leading_slash(fields[4]))
            else:
                image_path = os.path.join(
                    self.config.data_path, remove_leading_slash(fields[0]))
                depth_path = os.path.join(
                    self.config.gt_path, remove_leading_slash(fields[1]))

            image_path = self._noisy_copy(image_path)

            image = self.reader.open(image_path)
            depth_gt = self.reader.open(depth_path)
            w, h = image.size

            if self.config.do_kb_crop:
                height = image.height
                width = image.width
                top_margin = int(height - 352)
                left_margin = int((width - 1216) / 2)
                depth_gt = depth_gt.crop(
                    (left_margin, top_margin, left_margin + 1216, top_margin + 352))
                image = image.crop(
                    (left_margin, top_margin, left_margin + 1216, top_margin + 352))

            # Avoid blank boundaries due to pixel registration?
            # Train images have white border. Test images have black border.
            if self.config.dataset == 'nyu' and self.config.avoid_boundary:
                # print("Avoiding Blank Boundaries!")
                # We just crop and pad again with reflect padding to original size
                # original_size = image.size
                crop_params = get_white_border(np.array(image, dtype=np.uint8))
                image = image.crop((crop_params.left, crop_params.top, crop_params.right, crop_params.bottom))
                depth_gt = depth_gt.crop((crop_params.left, crop_params.top, crop_params.right, crop_params.bottom))

                # Use reflect padding to fill the blank
                image = np.array(image)
                image = np.pad(image, ((crop_params.top, h - crop_params.bottom), (crop_params.left, w - crop_params.right), (0, 0)), mode='reflect')
                image = Image.fromarray(image)

                depth_gt = np.array(depth_gt)
                depth_gt = np.pad(depth_gt, ((crop_params.top, h - crop_params.bottom), (crop_params.left, w - crop_params.right)), 'constant', constant_values=0)
                depth_gt = Image.fromarray(depth_gt)


            if self.config.do_random_rotate and (self.config.aug):
                random_angle = (random.random() - 0.5) * 2 * self.config.degree
                image = self.rotate_image(image, random_angle)
                depth_gt = self.rotate_image(
                    depth_gt, random_angle, flag=Image.NEAREST)

            image = np.asarray(image, dtype=np.float32) / 255.0
            depth_gt = np.asarray(depth_gt, dtype=np.float32)
            depth_gt = np.expand_dims(depth_gt, axis=2)

            # Dataset-specific scale from stored integer depth to the working unit.
            if self.config.dataset == 'nyu':
                depth_gt = depth_gt / 1000.0
            else:
                depth_gt = depth_gt / 256.0

            if self.config.aug and (self.config.random_crop):
                image, depth_gt = self.random_crop(
                    image, depth_gt, self.config.input_height, self.config.input_width)

            if self.config.aug and self.config.random_translate:
                # print("Random Translation!")
                image, depth_gt = self.random_translate(image, depth_gt, self.config.max_translation)

            image, depth_gt = self.train_preprocess(image, depth_gt)
            mask = np.logical_and(depth_gt > self.config.min_depth,
                                  depth_gt < self.config.max_depth).squeeze()[None, ...]
            sample = {'image': image, 'depth': depth_gt, 'focal': focal,
                      'mask': mask, **sample}

        else:
            if self.mode == 'online_eval':
                data_path = self.config.data_path_eval
            else:
                data_path = self.config.data_path

            image_path = os.path.join(
                data_path, remove_leading_slash(fields[0]))

            image_path = self._noisy_copy(image_path)

            image = np.asarray(self.reader.open(image_path),
                               dtype=np.float32) / 255.0
            if self.mode == 'online_eval':
                gt_path = self.config.gt_path_eval
                depth_path = os.path.join(
                    gt_path, remove_leading_slash(fields[1]))
                has_valid_depth = False
                try:
                    depth_gt = image2depth(depth_path)#self.reader.open(depth_path)
                    has_valid_depth = True
                except IOError:
                    depth_gt = False
                    # print('Missing gt for {}'.format(image_path))

                if has_valid_depth:
                    depth_gt = np.expand_dims(depth_gt, axis=2)


                    mask = np.logical_and(
                        depth_gt >= self.config.min_depth, depth_gt <= self.config.max_depth).squeeze()[None, ...]
                else:
                    mask = False

            if self.config.do_kb_crop:
                height = image.shape[0]
                width = image.shape[1]
                top_margin = int(height - 352)
                left_margin = int((width - 1216) / 2)
                image = image[top_margin:top_margin + 352,
                              left_margin:left_margin + 1216, :]
                if self.mode == 'online_eval' and has_valid_depth:
                    depth_gt = depth_gt[top_margin:top_margin +
                                        352, left_margin:left_margin + 1216, :]

            if self.mode == 'online_eval':
                # 'image_path' carries the path as listed in the filenames file.
                sample = {'image_path': fields[0], 'image': image, 'depth': depth_gt, 'focal': focal,
                          'has_valid_depth': has_valid_depth, 'depth_path': fields[1],
                          'mask': mask}
            else:
                sample = {'image': image, 'focal': focal}

        # For train samples and eval samples with ground truth, the mask is
        # (re)computed here with strict bounds and replaces any earlier one.
        if (self.mode == 'train') or ('has_valid_depth' in sample and sample['has_valid_depth']):
            mask = np.logical_and(depth_gt > self.config.min_depth,
                                  depth_gt < self.config.max_depth).squeeze()[None, ...]
            sample['mask'] = mask

        if self.transform:
            sample = self.transform(sample)

        sample = self.postprocess(sample)
        sample['dataset'] = self.config.dataset
        sample = {**sample, 'image_path': fields[0], 'depth_path': fields[1]}

        return sample

    def rotate_image(self, image, angle, flag=Image.BILINEAR):
        """Rotate a PIL image by ``angle`` using resampling filter ``flag``."""
        result = image.rotate(angle, resample=flag)
        return result

    def random_crop(self, img, depth, height, width):
        """Crop the same random ``height`` x ``width`` window from both arrays."""
        assert img.shape[0] >= height
        assert img.shape[1] >= width
        assert img.shape[0] == depth.shape[0]
        assert img.shape[1] == depth.shape[1]
        x = random.randint(0, img.shape[1] - width)
        y = random.randint(0, img.shape[0] - height)
        img = img[y:y + height, x:x + width, :]
        depth = depth[y:y + height, x:x + width, :]

        return img, depth

    def random_translate(self, img, depth, max_t=20):
        """With probability ``config.translate_prob``, shift both arrays.

        The shift is a random integer offset in ``[-max_t, max_t]`` on each
        axis, applied with ``cv2.warpAffine``.
        """
        assert img.shape[0] == depth.shape[0]
        assert img.shape[1] == depth.shape[1]
        p = self.config.translate_prob
        do_translate = random.random()
        if do_translate > p:
            return img, depth
        x = random.randint(-max_t, max_t)
        y = random.randint(-max_t, max_t)
        M = np.float32([[1, 0, x], [0, 1, y]])
        # print(img.shape, depth.shape)
        img = cv2.warpAffine(img, M, (img.shape[1], img.shape[0]))
        depth = cv2.warpAffine(depth, M, (depth.shape[1], depth.shape[0]))
        depth = depth.squeeze()[..., None]  # add channel dim back. Affine warp removes it
        # print("after", img.shape, depth.shape)
        return img, depth

    def train_preprocess(self, image, depth_gt):
        """Apply random horizontal flip and colour augmentation when ``config.aug`` is set."""
        if self.config.aug:
            # Random flipping
            do_flip = random.random()
            if do_flip > 0.5:
                image = (image[:, ::-1, :]).copy()
                depth_gt = (depth_gt[:, ::-1, :]).copy()

            # Random gamma, brightness, color augmentation
            do_augment = random.random()
            if do_augment > 0.5:
                image = self.augment_image(image)

        return image, depth_gt

    def augment_image(self, image):
        """Random gamma, brightness and per-channel colour scaling, clipped to [0, 1]."""
        # gamma augmentation
        gamma = random.uniform(0.9, 1.1)
        image_aug = image ** gamma

        # brightness augmentation
        if self.config.dataset == 'nyu':
            brightness = random.uniform(0.75, 1.25)
        else:
            brightness = random.uniform(0.9, 1.1)
        image_aug = image_aug * brightness

        # color augmentation
        colors = np.random.uniform(0.9, 1.1, size=3)
        white = np.ones((image.shape[0], image.shape[1]))
        color_image = np.stack([white * colors[i] for i in range(3)], axis=2)
        image_aug *= color_image
        image_aug = np.clip(image_aug, 0, 1)

        return image_aug

    def __len__(self):
        """Number of lines read from the filenames file."""
        return len(self.filenames)


class ToTensor(object):
    """Convert a sample dict's image (and depth) to channel-first tensors."""

    def __init__(self, mode, do_normalize=False, size=None):
        """
        Args:
            mode: 'train', 'test', or another value treated as evaluation.
            do_normalize: When True the image is normalised with the fixed
                mean/std below; otherwise it is passed through unchanged.
            size: Optional target size handed to ``transforms.Resize``; when
                None no resizing is performed.
        """
        self.mode = mode
        self.normalize = transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) if do_normalize else nn.Identity()
        self.size = size
        if size is not None:
            self.resize = transforms.Resize(size=size)
        else:
            self.resize = nn.Identity()

    def __call__(self, sample):
        """Return a new sample dict with the image converted, normalised and resized.

        In 'test' mode only ``image`` and ``focal`` are returned. In 'train'
        mode the depth is converted to a tensor as well. In any other mode the
        depth is passed through as-is together with ``has_valid_depth`` and
        the path entries.
        """
        image, focal = sample['image'], sample['focal']
        image = self.to_tensor(image)
        image = self.normalize(image)
        image = self.resize(image)

        if self.mode == 'test':
            return {'image': image, 'focal': focal}

        depth = sample['depth']
        if self.mode == 'train':
            depth = self.to_tensor(depth)
            return {**sample, 'image': image, 'depth': depth, 'focal': focal}

        # The image has already been resized above; it is not resized again.
        has_valid_depth = sample['has_valid_depth']
        return {**sample, 'image': image, 'depth': depth, 'focal': focal, 'has_valid_depth': has_valid_depth,
                'image_path': sample['image_path'], 'depth_path': sample['depth_path']}

    def to_tensor(self, pic):
        """Convert a PIL image or a 2-D/3-D ndarray to a C x H x W tensor.

        Raises:
            TypeError: If ``pic`` is neither a PIL image nor a 2-D/3-D ndarray.
        """
        if isinstance(pic, np.ndarray):
            if pic.ndim == 2:
                # Single-channel H x W input: add the channel axis first.
                pic = pic[:, :, None]
            elif pic.ndim != 3:
                raise TypeError(
                    'pic should be PIL Image or ndarray. Got {}'.format(type(pic)))
            img = torch.from_numpy(pic.transpose((2, 0, 1)))
            return img

        if not _is_pil_image(pic):
            raise TypeError(
                'pic should be PIL Image or ndarray. Got {}'.format(type(pic)))

        # handle PIL Image
        # np.asarray copies only when needed; np.array(..., copy=False)
        # raises under NumPy 2 whenever a copy is required.
        if pic.mode == 'I':
            img = torch.from_numpy(np.asarray(pic, dtype=np.int32))
        elif pic.mode == 'I;16':
            img = torch.from_numpy(np.asarray(pic, dtype=np.int16))
        else:
            img = torch.ByteTensor(
                torch.ByteStorage.from_buffer(pic.tobytes()))
        # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
        if pic.mode == 'YCbCr':
            nchannel = 3
        elif pic.mode == 'I;16':
            nchannel = 1
        else:
            nchannel = len(pic.mode)
        img = img.view(pic.size[1], pic.size[0], nchannel)

        # H x W x C -> C x H x W
        img = img.transpose(0, 1).transpose(0, 2).contiguous()
        if isinstance(img, torch.ByteTensor):
            return img.float()
        else:
            return img

Overwriting /kaggle/working/ZoeDepth/zoedepth/data/data_mono.py


In [4]:
%%writefile /kaggle/working/ZoeDepth/new_evaluate.py

# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat

import argparse
from pprint import pprint

import torch
from zoedepth.utils.easydict import EasyDict as edict
from tqdm import tqdm
import sys
from zoedepth.data.data_mono import DepthDataLoader
from zoedepth.models.builder import build_model
from zoedepth.utils.arg_utils import parse_unknown
from zoedepth.utils.config import change_dataset, get_config, ALL_EVAL_DATASETS, ALL_INDOOR, ALL_OUTDOOR
from zoedepth.utils.misc import (RunningAverageDict, colors, compute_metrics,
                        count_parameters)
import numpy as np
import cv2
from PIL import Image
import matplotlib.pyplot as plt
import os, shutil
import time

def main(config, demo_dir="/kaggle/working/demo", dn=5):
    """Build the model and evaluate it once on the online-eval split.

    Args:
        config: Model/dataset configuration passed to ``build_model`` and
            ``DepthDataLoader``.
        demo_dir: Directory handed to ``evaluate`` as its ``demo_dir``
            argument; it is created if missing.
        dn: Noise level forwarded to ``DepthDataLoader``.

    Returns:
        The metrics returned by ``evaluate``.
    """
    model = build_model(config)
    test_loader = DepthDataLoader(config, 'online_eval', dn=dn).data
    model = model.cuda()
    # evaluate() takes the output directory as its fourth argument.
    os.makedirs(demo_dir, exist_ok=True)
    metrics = evaluate(model, test_loader, config, demo_dir)
    return metrics


def evalu(config, model_name="depth_anything", batch_size=1):
    """Evaluate one model at noise levels 21 through 30.

    A fresh ``/kaggle/working/<model_name>`` directory is created (any
    existing one is deleted) with one sub-directory per noise level, which is
    passed to ``evaluate`` as its output directory.

    Args:
        config: Configuration passed to ``build_model`` and ``DepthDataLoader``.
        model_name: Name of the output directory under ``/kaggle/working``.
        batch_size: Unused.

    Returns:
        List with one metrics result per noise level, in increasing order.
    """
    model = build_model(config).cuda()

    demo_dir = os.path.join("/kaggle/working", model_name)
    # Start from an empty output directory on every run.
    if os.path.exists(demo_dir):
        shutil.rmtree(demo_dir)
    os.mkdir(demo_dir)

    all_metrics = []
    for dn in range(21, 31):
        print("Noise " + str(dn) + " processing...")
        noisy_loader = DepthDataLoader(config, 'online_eval', dn=dn).data

        level_dir = os.path.join(demo_dir, str(dn))
        os.mkdir(level_dir)

        level_metrics = evaluate(model, noisy_loader, config, level_dir)
        print(level_metrics)
        all_metrics.append(level_metrics)
    return all_metrics

def image2depth(path):
    """Load a depth image and rescale it to floating point.

    The raw pixel values are divided by ``2**16 - 1`` and multiplied by 10,
    so a full-scale 16-bit value maps to 10.0 (presumably metres; confirm
    against the dataset).

    Args:
        path: Path of the depth image to read.

    Returns:
        ``float32`` array of scaled depth values.

    Raises:
        IOError: If the file is missing or cannot be decoded.
    """
    raw = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    if raw is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError('Could not read depth image: {}'.format(path))
    depth = raw.astype('float32')
    depth /= (2**16 - 1)
    depth *= 10.0
    return depth


@torch.no_grad()
def infer(model, images, *, benchmark_iters=0, **kwargs):
    """Inference with flip augmentation.

    The model is run on ``images`` and on their horizontally flipped copy;
    the second prediction is flipped back and the two are averaged.

    Args:
        model: Callable invoked as ``model(images, **kwargs)``.
        images: Input batch; shape N, C, H, W.
        benchmark_iters: When positive, the forward pass is first timed this
            many times and the mean duration in seconds is printed. Inference
            then proceeds normally. Defaults to 0 (no timing).
        **kwargs: Forwarded to every model call.

    Returns:
        The mean of the direct and the flip-augmented depth prediction.
    """
    # images.shape = N, C, H, W
    def get_depth_from_prediction(pred):
        """Pick the depth tensor out of a tensor, list/tuple or dict output."""
        if isinstance(pred, torch.Tensor):
            pred = pred  # pass
        elif isinstance(pred, (list, tuple)):
            pred = pred[-1]
        elif isinstance(pred, dict):
            pred = pred['metric_depth'] if 'metric_depth' in pred else pred['out']
        else:
            raise NotImplementedError(f"Unknown output type {type(pred)}")
        return pred

    if benchmark_iters > 0:
        # CUDA kernels run asynchronously: synchronise around the timed call
        # so the measurement covers the actual computation.
        on_cuda = images.is_cuda
        durations = []
        for _ in range(benchmark_iters):
            if on_cuda:
                torch.cuda.synchronize()
            start = time.perf_counter()
            model(images, **kwargs)
            if on_cuda:
                torch.cuda.synchronize()
            durations.append(time.perf_counter() - start)
        print(sum(durations) / len(durations))

    pred1 = model(images, **kwargs)
    pred1 = get_depth_from_prediction(pred1)

    # Flip along the width axis, predict, and flip the prediction back.
    pred2 = model(torch.flip(images, [3]), **kwargs)
    pred2 = get_depth_from_prediction(pred2)
    pred2 = torch.flip(pred2, [3])

    mean_pred = 0.5 * (pred1 + pred2)

    return mean_pred


def write_demo(pred, path, dir_root):
    """Save the first prediction of a batch as an 8-bit greyscale image.

    Args:
        pred: Prediction tensor indexed as ``[batch, channel, ...]``; element
            ``[0, 0]`` is written.
        path: Source image path; only its file name is used for the output.
        dir_root: Directory the image is written to.
    """
    depth_map = pred.cpu().detach().numpy()[0, 0]
    out_path = os.path.join(dir_root, os.path.basename(path))

    # Scale so the largest value maps to 255. An all-zero (or non-positive)
    # map would divide by zero, so it is written as a black image instead.
    peak = depth_map.max()
    if peak > 0:
        scaled = depth_map * 255 / peak
    else:
        scaled = np.zeros_like(depth_map)
    # Clamp before the cast so negative values cannot wrap around in uint8.
    scaled = np.clip(scaled, 0, 255).astype(np.uint8)

    Image.fromarray(scaled).save(out_path)



#depth_gt = np.expand_dims(depth_gt, axis=2)
@torch.no_grad()
def evaluate(model, test_loader, config, demo_dir=None, round_vals=True, round_precision=3):
    """Run the model over ``test_loader`` and average the per-sample metrics.

    Args:
        model: Model to evaluate; switched to eval mode.
        test_loader: Iterable of sample dicts with at least 'image', 'depth',
            'image_path' and 'dataset' entries.
        config: Unused here.
        demo_dir: Directory predictions are written to with ``write_demo``;
            when None no images are written.
        round_vals: Accepted for interface compatibility; currently not applied.
        round_precision: Accepted for interface compatibility; currently not applied.

    Returns:
        Dict mapping each metric name to its mean over the evaluated samples;
        empty when no sample was evaluated.
    """
    model.eval()
    metrics = list()
    for i, sample in tqdm(enumerate(test_loader), total=len(test_loader)):
        # Samples without ground truth cannot be scored.
        if 'has_valid_depth' in sample:
            if not sample['has_valid_depth']:
                continue
        image, depth = sample['image'], sample['depth']
        img_path = sample['image_path']

        image, depth = image.cuda(), depth.cuda()
        depth = depth.squeeze().unsqueeze(0).unsqueeze(0)
        focal = sample.get('focal', torch.Tensor(
            [715.0873]).cuda())  # This magic number (focal) is only used for evaluating BTS model
        pred = infer(model, image, dataset=sample['dataset'][0], focal=focal)
        if demo_dir is not None:
            write_demo(pred, img_path[0], demo_dir)

        vals = compute_metrics(depth, pred)
        metrics.append(vals)

    if not metrics:
        return dict()

    # Average over the samples actually evaluated, not a fixed dataset size.
    n_samples = len(metrics)
    result = dict()
    for k in metrics[0].keys():
        result[k] = sum(m[k] for m in metrics) / n_samples

    return result






def eval_model(model_name, pretrained_resource, dataset='nyu', **kwargs):
    """Build the eval config for ``model_name`` on ``dataset`` and run the evaluation.

    Args:
        model_name (str): Name passed to ``get_config``.
        pretrained_resource (str): When non-empty it overrides the
            "pretrained_resource" entry of the config; otherwise the default
            from the model config is kept.
        dataset (str): Dataset name passed to ``get_config``.
        **kwargs: Additional config overrides.

    Returns:
        Whatever ``evalu(config)`` returns (defined elsewhere in this file).
    """
    # Load default pretrained resource defined in config if not set
    if pretrained_resource:
        overwrite = {**kwargs, "pretrained_resource": pretrained_resource}
    else:
        overwrite = kwargs

    config = get_config(model_name, "eval", dataset, **overwrite)
    print(config)
    # config = change_dataset(config, dataset)  # change the dataset
    pprint(config)
    print(f"Evaluating {model_name} on {dataset}...")
    return evalu(config)


if __name__ == '__main__':
    # Command-line entry point: evaluate one model on one or more datasets.
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", type=str,
                        required=True, help="Name of the model to evaluate")
    parser.add_argument("-p", "--pretrained_resource", type=str,
                        required=False, default="", help="Pretrained resource to use for fetching weights. If not set, default resource from model config is used,  Refer models.model_io.load_state_from_resource for more details.")
    parser.add_argument("-d", "--dataset", type=str, required=False,
                        default='nyu', help="Dataset to evaluate on")

    # Unrecognised arguments are forwarded as config overrides.
    args, unknown_args = parser.parse_known_args()
    overwrite_kwargs = parse_unknown(unknown_args)

    # Resolve the dataset selector: named groups first, then comma lists.
    if "ALL_INDOOR" in args.dataset:
        datasets = ALL_INDOOR
    elif "ALL_OUTDOOR" in args.dataset:
        datasets = ALL_OUTDOOR
    elif "ALL" in args.dataset:
        datasets = ALL_EVAL_DATASETS
    elif "," in args.dataset:
        datasets = args.dataset.split(",")
    else:
        datasets = [args.dataset]

    for dataset in datasets:
        metrics = eval_model(args.model, pretrained_resource=args.pretrained_resource,
                    dataset=dataset, **overwrite_kwargs)
        # Print inside the loop so every dataset's metrics are reported,
        # not only those of the last dataset evaluated.
        print(metrics)



Writing /kaggle/working/ZoeDepth/new_evaluate.py


In [5]:
%%writefile /kaggle/working/ZoeDepth/zoedepth/utils/misc.py

# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat

"""Miscellaneous utility functions."""

from scipy import ndimage

import base64
import math
import re
from io import BytesIO

import matplotlib
import matplotlib.cm
import numpy as np
import requests
import torch
import torch.distributed as dist
import torch.nn
import torch.nn as nn
import torch.utils.data.distributed
from PIL import Image
from torchvision.transforms import ToTensor


class RunningAverage:
    """Arithmetic mean that is updated one value at a time."""

    def __init__(self):
        self.count = 0
        self.avg = 0

    def append(self, value):
        """Fold ``value`` into the mean and bump the sample count."""
        seen = self.count
        weighted_previous = seen * self.avg
        self.avg = (value + weighted_previous) / (seen + 1)
        self.count = seen + 1

    def get_value(self):
        """Return the current mean (0 before anything was appended)."""
        return self.avg


def denormalize(x):
    """Undo the ImageNet mean/std normalization of an image batch.

    Args:
        x (torch.Tensor - shape(N,3,H,W)): normalized input tensor

    Returns:
        torch.Tensor - shape(N,3,H,W): ``x * std + mean`` with per-channel statistics
    """
    def per_channel(values):
        # Shape the three channel statistics so they broadcast over N, H and W.
        return torch.Tensor(values).view(1, 3, 1, 1).to(x.device)

    imagenet_mean = per_channel([0.485, 0.456, 0.406])
    imagenet_std = per_channel([0.229, 0.224, 0.225])
    return x * imagenet_std + imagenet_mean


class RunningAverageDict:
    """Keeps one RunningAverage per key of the dictionaries passed to update()."""

    def __init__(self):
        # Stays None until the first non-None update arrives.
        self._dict = None

    def update(self, new_dict):
        """Append every value of ``new_dict`` to the average stored under its key.

        ``None`` is ignored. The set of tracked keys is fixed by the first
        dictionary received.
        """
        if new_dict is None:
            return

        if self._dict is None:
            self._dict = {key: RunningAverage() for key, _ in new_dict.items()}

        for key, value in new_dict.items():
            tracker = self._dict[key]
            tracker.append(value)

    def get_value(self):
        """Return ``{key: current mean}``, or None if nothing was recorded yet."""
        if self._dict is None:
            return None
        averages = dict()
        for key, tracker in self._dict.items():
            averages[key] = tracker.get_value()
        return averages


def colorize(value, vmin=None, vmax=None, cmap='gray_r', invalid_val=-99, invalid_mask=None, background_color=(128, 128, 128, 255), gamma_corrected=False, value_transform=None):
    """Converts a depth map to a color image.

    Args:
        value (torch.Tensor, numpy.ndarry): Input depth map. Shape: (H, W) or (1, H, W) or (1, 1, H, W). All singular dimensions are squeezed
        vmin (float, optional): vmin-valued entries are mapped to start color of cmap. If None, the 2nd percentile of the valid pixels is used. Defaults to None.
        vmax (float, optional):  vmax-valued entries are mapped to end color of cmap. If None, the 85th percentile of the valid pixels is used. Defaults to None.
        cmap (str, optional): matplotlib colormap to use. Defaults to 'gray_r'.
        invalid_val (int, optional): Specifies value of invalid pixels that should be colored as 'background_color'. Defaults to -99.
        invalid_mask (numpy.ndarray, optional): Boolean mask for invalid regions. Defaults to None.
        background_color (tuple[int], optional): 4-tuple RGB color to give to invalid pixels. Defaults to (128, 128, 128, 255).
        gamma_corrected (bool, optional): Apply gamma correction to colored image. Defaults to False.
        value_transform (Callable, optional): Apply transform function to the normalized values before coloring. Defaults to None.

    Returns:
        numpy.ndarray, dtype - uint8: Colored depth map. Shape: (H, W, 4)
    """
    if isinstance(value, torch.Tensor):
        value = value.detach().cpu().numpy()

    value = value.squeeze()
    if invalid_mask is None:
        invalid_mask = value == invalid_val
    mask = np.logical_not(invalid_mask)

    # normalize
    vmin = np.percentile(value[mask], 2) if vmin is None else vmin
    vmax = np.percentile(value[mask], 85) if vmax is None else vmax
    if vmin != vmax:
        value = (value - vmin) / (vmax - vmin)  # vmin..vmax
    else:
        # Avoid 0-division
        value = value * 0.

    # grey out the invalid values
    value[invalid_mask] = np.nan

    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9; prefer the
    # colormap registry when it exists and fall back for older releases.
    if isinstance(cmap, str) and hasattr(matplotlib, "colormaps"):
        cmapper = matplotlib.colormaps[cmap]
    else:
        cmapper = matplotlib.cm.get_cmap(cmap)

    if value_transform:
        value = value_transform(value)
        # value = value / value.max()
    value = cmapper(value, bytes=True)  # (nxmx4)

    img = value[...]
    img[invalid_mask] = background_color

    if gamma_corrected:
        # gamma correction
        img = img / 255
        img = np.power(img, 2.2)
        img = img * 255
        img = img.astype(np.uint8)
    return img


def count_parameters(model, include_all=False):
    """Count the elements of ``model``'s parameters.

    Only parameters with ``requires_grad`` set are counted unless
    ``include_all`` is true.
    """
    total = 0
    for param in model.parameters():
        if include_all or param.requires_grad:
            total += param.numel()
    return total



import scipy.signal as sps

def evaluate_model(inference):
    """Score ``inference`` on the NYU test folder and return the mean of each metric.

    Every file whose name contains "colors" is opened and passed to
    ``inference``; the matching "depth" file is inverted, scaled by its
    maximum and compared against the prediction with ``get_metrics``.

    Args:
        inference (Callable): Maps a PIL image to a predicted depth array.
            Presumably the output must be laid out like the arrays
            ``get_metrics`` expects (batch, H, W) - verify against the caller.

    Returns:
        dict: Metric name -> mean over the evaluated image pairs. Empty when
        the folder contains no "colors" file.
    """
    # Imported locally because this module's import block provides neither.
    import os
    from tqdm import tqdm

    test_path = "/kaggle/input/nyu-depth-v2/nyu_data/data/nyu2_test"
    metrics = list()
    for filename in tqdm(os.listdir(test_path)):
        if "colors" not in filename:
            continue
        image = Image.open(os.path.join(test_path, filename))
        depth = inference(image)

        depth_filename = filename.replace("colors", "depth")
        depth_image = Image.open(os.path.join(test_path, depth_filename))
        ground_truth_depth = np.asarray(depth_image)

        # Inverse depth scaled by its maximum.
        # NOTE(review): a zero in the depth file makes this division produce
        # inf; confirm the dataset never contains zero-valued pixels.
        ground_truth_depth = 1 / ground_truth_depth
        ground_truth_depth = ground_truth_depth / ground_truth_depth.max()

        metrics.append(get_metrics(depth, ground_truth_depth))

    if not metrics:
        return dict()

    # Average over the pairs that were scored. The directory listing also
    # contains the depth files, so its length is not the sample count.
    result = dict()
    for k in metrics[0]:
        result[k] = sum(x[k] for x in metrics) / len(metrics)
    return result

def get_metrics(x, y, size=5):
    """Compute all error metrics between prediction ``x`` and reference ``y``.

    Entries of the prediction where ``y`` equals 0 are set to 0 before
    scoring. Each per-image metric vector is summed into a scalar.

    Args:
        x (numpy.ndarray): Predictions; the helpers reduce over axes (1, 2),
            so a (batch, H, W) layout is expected.
        y (numpy.ndarray): Reference values with the same layout.
        size (int): Window size forwarded to ``rank_and_census``.

    Returns:
        dict: Keys 'rmse', 'mae', 'mre', 'gradient', 'rank', 'census',
        'delta1', 'delta2', 'delta3'.
    """
    # Work on a copy so the caller's prediction array is left untouched.
    x = np.array(x, copy=True)
    x[y == 0] = 0

    rank, census = rank_and_census(x, y, size)

    values = dict()
    values['rmse'] = np.sum(rmse(x, y))
    values['mae'] = np.sum(mae(x, y))
    values['mre'] = np.sum(mre(x, y))
    values['gradient'] = np.sum(gradient_metric(x, y))
    values['rank'] = np.sum(rank)
    values['census'] = np.sum(census)

    # Threshold accuracies for 1.25, 1.25^2 and 1.25^3.
    values['delta1'] = np.sum(get_delta(x, y, 1.25))
    values['delta2'] = np.sum(get_delta(x, y, 1.25 ** 2))
    values['delta3'] = np.sum(get_delta(x, y, 1.25 ** 3))

    return values

def get_delta(x, y, delta):
    """Per-image fraction of pixels whose larger ratio x/y or y/x is below ``delta``.

    Both denominators are offset by 1e-6. Inputs are reduced over axes (1, 2).
    """
    ratio_xy = x / (y + 1e-6)
    ratio_yx = y / (x + 1e-6)
    worst_ratio = np.stack((ratio_xy, ratio_yx), axis=-1).max(axis=-1)

    pixels_per_image = x.shape[1] * x.shape[2]
    within = worst_ratio < delta
    return within.sum(axis=(1, 2)) / pixels_per_image

def rmse(x, y):
    """Per-image root mean squared error, reduced over axes (1, 2)."""
    squared_error = (x - y) ** 2
    pixels_per_image = x.shape[1] * x.shape[2]
    return np.sqrt(squared_error.sum(axis=(1, 2)) / pixels_per_image)

def mae(x, y):
    """Per-image mean absolute error, reduced over axes (1, 2)."""
    absolute_error = np.abs(x - y)
    pixels_per_image = x.shape[1] * x.shape[2]
    return absolute_error.sum(axis=(1, 2)) / pixels_per_image

def mre(x, y):
    """Per-image mean of |x - y| / (y + 1e-7), reduced over axes (1, 2)."""
    relative_error = np.abs(x - y) / (y + 1e-7)
    pixels_per_image = x.shape[1] * x.shape[2]
    return relative_error.sum(axis=(1, 2)) / pixels_per_image

def convolution(image, conv):
    """Slide ``conv`` over a zero-padded 2-D ``image`` and sum the element-wise products.

    The kernel is applied as given (it is not flipped). The padding width is
    ``conv.shape[0] // 2`` on every side, and the output has the image's shape.
    """
    rows, cols = image.shape[0], image.shape[1]
    half = conv.shape[0] // 2
    window = 2 * half + 1
    padded = np.pad(image, half, 'constant')

    output = np.zeros((rows, cols))
    for r, c in np.ndindex(rows, cols):
        patch = padded[r:r + window, c:c + window]
        output[r, c] = np.sum(patch * conv)
    return output

def gradient_metric(x, y):
    """Per-image mean absolute difference between the Sobel responses of ``x`` and ``y``.

    Both inputs are convolved with a horizontal and a vertical 3x3 Sobel
    kernel; the absolute response differences are added and averaged over
    axes (1, 2).
    """
    # Kernels carry a leading singleton axis so they apply to each image of the batch.
    sobel_horizontal = np.array([[
        [1, 0, -1],
        [2, 0, -2],
        [1, 0, -1],
    ]])
    sobel_vertical = np.array([[
        [1, 2, 1],
        [0, 0, 0],
        [-1, -2, -1],
    ]])

    def response(batch, kernel):
        return sps.fftconvolve(batch, kernel, mode='same')

    diff_horizontal = response(x, sobel_horizontal) - response(y, sobel_horizontal)
    diff_vertical = response(x, sobel_vertical) - response(y, sobel_vertical)

    per_pixel = np.abs(diff_horizontal) + np.abs(diff_vertical)
    return np.sum(per_pixel, axis=(1, 2)) / (x.shape[1] * x.shape[2])

def neighborhood(x, size=5):
    """Gather the zero-padded window around every pixel of a 3-D batch.

    Returns an array of shape (batch, (2 * (size // 2) + 1) ** 2, H, W) in
    which channel ``k`` holds the input shifted by the k-th (row, column)
    offset, offsets being enumerated row-major from the top-left corner of
    the window.
    """
    half = size // 2
    height, width = x.shape[1:]
    padded = np.pad(x, ((0,), (half,), (half,)), 'constant')

    shifts = range(2 * half + 1)
    windows = [
        padded[:, row_shift:row_shift + height, col_shift:col_shift + width]
        for row_shift in shifts
        for col_shift in shifts
    ]
    return np.stack(windows, axis=1)
    
def one_hot(x, size):
    """Binary code of each window: 1 where a neighbour is smaller than the centre.

    ``x`` holds the windows along axis 1; the centre channel
    (index ``size ** 2 // 2``) is dropped from the result.
    """
    center_index = size ** 2 // 2
    center_values = np.expand_dims(x[:, center_index], axis=1)

    below_center = np.less(x, center_values).astype(int)
    keep = np.arange(below_center.shape[1]) != center_index
    return below_center[:, keep]
    
def rank_metric(en_x, en_y, size=5):
    """Per-image mean absolute difference of the per-pixel code sums (ranks)."""
    rank_gap = np.abs(en_x.sum(axis=1) - en_y.sum(axis=1))
    pixels_per_image = en_x.shape[2] * en_x.shape[3]
    return rank_gap.sum(axis=(1, 2)) / pixels_per_image

def census_metric(en_x, en_y, size=5):
    """Per-image count of differing code bits divided by the number of pixels."""
    disagreements = np.not_equal(en_x, en_y).astype(int)
    pixels_per_image = en_x.shape[2] * en_x.shape[3]
    return disagreements.sum(axis=(1, 2, 3)) / pixels_per_image

def rank_and_census(x, y, size=5):
    """Return ``(rank_metric, census_metric)`` computed on the window codes of ``x`` and ``y``."""
    windows_x = neighborhood(x, size)
    windows_y = neighborhood(y, size)
    codes_x = one_hot(windows_x, size)
    codes_y = one_hot(windows_y, size)
    return rank_metric(codes_x, codes_y, size), census_metric(codes_x, codes_y, size)





def compute_errors(gt, pred):
    """Compute the standard depth-estimation errors of 'pred' against 'gt'.

    Args:
        gt (numpy.ndarray): Ground truth values
        pred (numpy.ndarray): Predicted values

        gt.shape should be equal to pred.shape

    Returns:
        dict: Dictionary containing the following metrics:
            'a1': Delta1 accuracy: Fraction of pixels that are within a scale factor of 1.25
            'a2': Delta2 accuracy: Fraction of pixels that are within a scale factor of 1.25^2
            'a3': Delta3 accuracy: Fraction of pixels that are within a scale factor of 1.25^3
            'abs_rel': Absolute relative error
            'rmse': Root mean squared error
            'log_10': Absolute log10 error
            'sq_rel': Squared relative error
            'rmse_log': Root mean squared error on the log scale
            'silog': Scale invariant log error

    The input shapes and the resulting dictionary are also printed.
    """
    print("SHAPES:")
    print(gt.shape, pred.shape)

    # Threshold accuracies on the larger of the two ratios.
    ratio = np.maximum((gt / pred), (pred / gt))
    a1, a2, a3 = ((ratio < 1.25 ** power).mean() for power in (1, 2, 3))

    residual = gt - pred
    abs_rel = np.mean(np.abs(residual) / gt)
    sq_rel = np.mean((residual ** 2) / gt)
    rmse = np.sqrt((residual ** 2).mean())

    log_gt, log_pred = np.log(gt), np.log(pred)
    rmse_log = np.sqrt(((log_gt - log_pred) ** 2).mean())

    err = log_pred - log_gt
    silog = np.sqrt(np.mean(err ** 2) - np.mean(err) ** 2) * 100

    log_10 = (np.abs(np.log10(gt) - np.log10(pred))).mean()

    result = {
        'a1': a1,
        'a2': a2,
        'a3': a3,
        'abs_rel': abs_rel,
        'rmse': rmse,
        'log_10': log_10,
        'rmse_log': rmse_log,
        'silog': silog,
        'sq_rel': sq_rel,
    }
    print(result)
    return result

def compute_metrics(gt, pred, interpolate=True, garg_crop=False, eigen_crop=True, dataset='nyu', min_depth_eval=0.1, max_depth_eval=10, **kwargs):
    """Prepare a ground-truth/prediction pair and score it with get_metrics.

    The prediction is resized to the ground-truth resolution if needed,
    clamped to [min_depth_eval, max_depth_eval], optionally cropped, and then
    both maps are passed to get_metrics with a leading batch axis.

    Args:
        gt (torch.Tensor): Ground truth, indexed as ``gt[0, 0]``.
        pred (torch.Tensor): Prediction; bilinearly resized to ``gt``'s last
            two dimensions when they differ and ``interpolate`` is true.
        interpolate (bool): Allow the resize described above.
        garg_crop (bool): See NOTE(review) below - currently has no effect on
            the returned metrics.
        eigen_crop (bool): When set (and ``garg_crop`` is not) and ``dataset``
            is not 'kitti', both maps are cropped to rows 45:471, cols 41:601.
        dataset (str): Only compared against 'kitti'.
        min_depth_eval (float): Lower clamp for the prediction.
        max_depth_eval (float): Upper clamp for the prediction.
        **kwargs: If 'config' is given, its garg_crop, eigen_crop,
            min_depth_eval and max_depth_eval replace the arguments above
            (``dataset`` is not taken from it).

    Returns:
        dict: The result of ``get_metrics(pred, gt)``.
    """
    if 'config' in kwargs:
        config = kwargs['config']
        garg_crop = config.garg_crop
        eigen_crop = config.eigen_crop
        min_depth_eval = config.min_depth_eval
        max_depth_eval = config.max_depth_eval

    if gt.shape[-2:] != pred.shape[-2:] and interpolate:
        pred = nn.functional.interpolate(
            pred, gt.shape[-2:], mode='bilinear', align_corners=True)

    # Clamp the prediction into the evaluation range; inf/NaN are mapped to
    # the upper/lower bound respectively.
    pred = pred.squeeze().cpu().numpy()
    pred[pred < min_depth_eval] = min_depth_eval
    pred[pred > max_depth_eval] = max_depth_eval
    pred[np.isinf(pred)] = max_depth_eval
    pred[np.isnan(pred)] = min_depth_eval

    gt_depth = gt.squeeze().cpu().numpy()
    # NOTE(review): valid_mask is computed but never applied below.
    valid_mask = np.logical_and(
        gt_depth > min_depth_eval, gt_depth < max_depth_eval)
    

    gt = gt[0, 0]

    
    if garg_crop or eigen_crop:
        gt_height, gt_width = gt_depth.shape
        # NOTE(review): eval_mask is filled in the garg and kitti branches but
        # never used afterwards, so those crops do not influence the metrics.
        eval_mask = np.zeros(valid_mask.shape)

        if garg_crop:
            eval_mask[int(0.40810811 * gt_height):int(0.99189189 * gt_height),
                      int(0.03594771 * gt_width):int(0.96405229 * gt_width)] = 1

        elif eigen_crop:
            # print("-"*10, " EIGEN CROP ", "-"*10)
            if dataset == 'kitti':
                eval_mask[int(0.3324324 * gt_height):int(0.91351351 * gt_height),
                          int(0.0359477 * gt_width):int(0.96405229 * gt_width)] = 1
            else:
                # assert gt_depth.shape == (480, 640), "Error: Eigen crop is currently only valid for (480, 640) images"
                #eval_mask[45:471, 41:601] = 1
                # The crop is applied by slicing both maps instead of masking.
                gt = gt[45:471, 41:601]
                pred = pred[45:471, 41:601]
        else:
            # NOTE(review): unreachable - the enclosing condition guarantees
            # that garg_crop or eigen_crop is true.
            eval_mask = np.ones(valid_mask.shape)
    gt = gt.cpu().detach().numpy()
    
    # get_metrics reduces over axes (1, 2), so add a leading batch axis.
    pred, gt = pred[np.newaxis, ...], gt[np.newaxis, ...]
    metrics = get_metrics(pred, gt)
    return metrics


#################################### Model utils ################################################


def parallelize(config, model, find_unused_parameters=True):
    """Move ``model`` to GPU and wrap it for multi-GPU use as dictated by ``config``.

    Three cases are handled:
      * ``config.distributed`` is true: the process group is initialised and
        the model is wrapped in DistributedDataParallel.
      * otherwise, ``config.gpu`` is None: the model is wrapped in DataParallel.
      * otherwise: the model stays on the single GPU ``config.gpu``, unwrapped.

    Side effects on ``config``: ``multigpu`` is always set; in the distributed
    case ``rank``, ``batch_size`` and ``workers`` are rewritten as well.

    Args:
        config: Config object read and mutated as described above.
        model (torch.nn.Module): Model to place on the GPU(s).
        find_unused_parameters (bool): Forwarded to DistributedDataParallel.

    Returns:
        The model, possibly wrapped.
    """

    if config.gpu is not None:
        torch.cuda.set_device(config.gpu)
        model = model.cuda(config.gpu)

    config.multigpu = False
    if config.distributed:
        # Use DDP
        config.multigpu = True
        # Combine the incoming rank with the local GPU index.
        config.rank = config.rank * config.ngpus_per_node + config.gpu
        dist.init_process_group(backend=config.dist_backend, init_method=config.dist_url,
                                world_size=config.world_size, rank=config.rank)
        # Split the batch size across the GPUs of the node.
        config.batch_size = int(config.batch_size / config.ngpus_per_node)
        # config.batch_size = 8
        # Workers per process: num_workers divided by GPUs per node, rounded up.
        config.workers = int(
            (config.num_workers + config.ngpus_per_node - 1) / config.ngpus_per_node)
        print("Device", config.gpu, "Rank",  config.rank, "batch size",
              config.batch_size, "Workers", config.workers)
        torch.cuda.set_device(config.gpu)
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        model = model.cuda(config.gpu)
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.gpu], output_device=config.gpu,
                                                          find_unused_parameters=find_unused_parameters)

    elif config.gpu is None:
        # Use DP
        config.multigpu = True
        model = model.cuda()
        model = torch.nn.DataParallel(model)

    return model


#################################################################################################


#####################################################################################################


class colors:
    '''ANSI escape sequences for styling terminal output.

    Text attributes (bold, disable, underline, reverse, strikethrough,
    invisible) are available directly on the class, e.g. colors.bold.
    Foreground and background colors live in the nested classes fg and bg,
    e.g. colors.fg.red or colors.bg.green.
    Emit colors.reset to restore the default style.
    '''
    # Text attributes.
    reset = '\033[0m'
    bold = '\033[01m'
    disable = '\033[02m'
    underline = '\033[04m'
    reverse = '\033[07m'
    strikethrough = '\033[09m'
    invisible = '\033[08m'

    # Foreground (text) colors.
    class fg:
        black = '\033[30m'
        red = '\033[31m'
        green = '\033[32m'
        orange = '\033[33m'
        blue = '\033[34m'
        purple = '\033[35m'
        cyan = '\033[36m'
        lightgrey = '\033[37m'
        darkgrey = '\033[90m'
        lightred = '\033[91m'
        lightgreen = '\033[92m'
        yellow = '\033[93m'
        lightblue = '\033[94m'
        pink = '\033[95m'
        lightcyan = '\033[96m'

    # Background colors.
    class bg:
        black = '\033[40m'
        red = '\033[41m'
        green = '\033[42m'
        orange = '\033[43m'
        blue = '\033[44m'
        purple = '\033[45m'
        cyan = '\033[46m'
        lightgrey = '\033[47m'


def printc(text, color):
    """Print ``text`` preceded by the escape sequence ``color`` and followed by a style reset."""
    styled = "{}{}{}".format(color, text, colors.reset)
    print(styled)

############################################

def get_image_from_url(url, timeout=30):
    """Download an image and return it as an RGB PIL image.

    Args:
        url (str): Address of the image.
        timeout (float | tuple | None): Forwarded to ``requests.get``. Without
            a timeout a stalled server would block the call indefinitely;
            pass None to restore that behaviour.

    Returns:
        PIL.Image.Image: The decoded image converted to "RGB".
    """
    response = requests.get(url, timeout=timeout)
    img = Image.open(BytesIO(response.content)).convert("RGB")
    return img

def url_to_torch(url, size=(384, 384)):
    """Download an image and return it as a float tensor scaled to [0, 1].

    Args:
        url (str): Address of the image.
        size (tuple[int, int]): Target size passed to ``Image.resize``.

    Returns:
        torch.Tensor: Channel-first (3, H, W) float tensor with values
        divided by 255.
    """
    img = get_image_from_url(url)
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
    # and is also available in older Pillow releases.
    img = img.resize(size, Image.LANCZOS)
    img = torch.from_numpy(np.asarray(img)).float()
    # HWC -> CHW
    img = img.permute(2, 0, 1)
    img.div_(255)
    return img

def pil_to_batched_tensor(img):
    """Convert ``img`` with torchvision's ToTensor and prepend a batch dimension."""
    to_tensor = ToTensor()
    tensor = to_tensor(img)
    return tensor.unsqueeze(0)

def save_raw_16bit(depth, fpath="raw.png"):
    """Write a 2-D depth map to ``fpath`` as a 16-bit image.

    Values are multiplied by 256 and cast to uint16 before saving. Torch
    tensors are squeezed and moved to the CPU first.
    """
    array = depth.squeeze().cpu().numpy() if isinstance(depth, torch.Tensor) else depth

    assert isinstance(array, np.ndarray), "Depth must be a torch tensor or numpy array"
    assert array.ndim == 2, "Depth must be 2D"

    # scale for 16-bit png
    scaled = (array * 256).astype(np.uint16)
    Image.fromarray(scaled).save(fpath)
    print("Saved raw depth to", fpath)

Overwriting /kaggle/working/ZoeDepth/zoedepth/utils/misc.py


In [6]:
%%writefile /kaggle/working/ZoeDepth/zoedepth/utils/config.py

# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat

import json
import os

from zoedepth.utils.easydict import EasyDict as edict

from zoedepth.utils.arg_utils import infer_type
import pathlib
import platform

# Directory two levels above this file; model config files are looked up
# relative to it (see get_model_config).
ROOT = pathlib.Path(__file__).parent.parent.resolve()

# Base directory that the dataset paths below are built from. Hard-coded to a
# Kaggle input dataset; adjust when running elsewhere.
HOME_DIR = "/kaggle/input/nyuv2-official-split-dataset/test/official"

# Settings shared by every model regardless of mode or dataset.
COMMON_CONFIG = {
    "save_dir": os.path.expanduser("~/shortcuts/monodepth3_checkpoints"),
    "project": "ZoeDepth",
    "tags": '',
    "notes": "",
    "gpu": None,
    "root": ".",
    "uid": None,
    "print_losses": False
}

# Per-dataset settings keyed by dataset name: data locations, depth ranges and
# the cropping/augmentation flags to use.
# Only the "nyu" entry points directly at HOME_DIR; all other entries resolve
# to "shortcuts/datasets/..." sub-directories of HOME_DIR.
DATASETS_CONFIG = {
    "kitti": {
        "dataset": "kitti",
        "min_depth": 0.001,
        "max_depth": 80,
        "data_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
        "gt_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
        "filenames_file": "./train_test_inputs/kitti_eigen_train_files_with_gt.txt",
        "input_height": 352,
        "input_width": 1216,  # 704
        "data_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
        "gt_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
        "filenames_file_eval": "./train_test_inputs/kitti_eigen_test_files_with_gt.txt",

        "min_depth_eval": 1e-3,
        "max_depth_eval": 80,

        "do_random_rotate": True,
        "degree": 1.0,
        "do_kb_crop": True,
        "garg_crop": True,
        "eigen_crop": False,
        "use_right": False
    },
    # Same values as "kitti" except that random rotation is disabled.
    "kitti_test": {
        "dataset": "kitti",
        "min_depth": 0.001,
        "max_depth": 80,
        "data_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
        "gt_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
        "filenames_file": "./train_test_inputs/kitti_eigen_train_files_with_gt.txt",
        "input_height": 352,
        "input_width": 1216,
        "data_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"),
        "gt_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"),
        "filenames_file_eval": "./train_test_inputs/kitti_eigen_test_files_with_gt.txt",

        "min_depth_eval": 1e-3,
        "max_depth_eval": 80,

        "do_random_rotate": False,
        "degree": 1.0,
        "do_kb_crop": True,
        "garg_crop": True,
        "eigen_crop": False,
        "use_right": False
    },
    # Train and eval paths both point at HOME_DIR itself.
    "nyu": {
        "dataset": "nyu",
        "avoid_boundary": False,
        "min_depth": 1e-3,   # originally 0.1
        "max_depth": 10,
        "data_path": HOME_DIR,
        "gt_path": HOME_DIR,
        "filenames_file": "./train_test_inputs/nyudepthv2_train_files_with_gt.txt",
        "input_height": 480,
        "input_width": 640,
        "data_path_eval": HOME_DIR,
        "gt_path_eval": HOME_DIR,
        "filenames_file_eval": "./train_test_inputs/nyudepthv2_test_files_with_gt.txt",
        "min_depth_eval": 1e-3,
        "max_depth_eval": 10,
        "min_depth_diff": -10,
        "max_depth_diff": 10,

        "do_random_rotate": True,
        "degree": 1.0,
        "do_kb_crop": False,
        "garg_crop": False,
        "eigen_crop": True
    },
    # The remaining entries define only a root directory, crop flags and
    # depth ranges.
    "ibims": {
        "dataset": "ibims",
        "ibims_root": os.path.join(HOME_DIR, "shortcuts/datasets/ibims/ibims1_core_raw/"),
        "eigen_crop": True,
        "garg_crop": False,
        "do_kb_crop": False,
        "min_depth_eval": 0,
        "max_depth_eval": 10,
        "min_depth": 1e-3,
        "max_depth": 10
    },
    "sunrgbd": {
        "dataset": "sunrgbd",
        "sunrgbd_root": os.path.join(HOME_DIR, "shortcuts/datasets/SUNRGBD/test/"),
        "eigen_crop": True,
        "garg_crop": False,
        "do_kb_crop": False,
        "min_depth_eval": 0,
        "max_depth_eval": 8,
        "min_depth": 1e-3,
        "max_depth": 10
    },
    "diml_indoor": {
        "dataset": "diml_indoor",
        "diml_indoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diml_indoor_test/"),
        "eigen_crop": True,
        "garg_crop": False,
        "do_kb_crop": False,
        "min_depth_eval": 0,
        "max_depth_eval": 10,
        "min_depth": 1e-3,
        "max_depth": 10
    },
    "diml_outdoor": {
        "dataset": "diml_outdoor",
        "diml_outdoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diml_outdoor_test/"),
        "eigen_crop": False,
        "garg_crop": True,
        "do_kb_crop": False,
        "min_depth_eval": 2,
        "max_depth_eval": 80,
        "min_depth": 1e-3,
        "max_depth": 80
    },
    "diode_indoor": {
        "dataset": "diode_indoor",
        "diode_indoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diode_indoor/"),
        "eigen_crop": True,
        "garg_crop": False,
        "do_kb_crop": False,
        "min_depth_eval": 1e-3,
        "max_depth_eval": 10,
        "min_depth": 1e-3,
        "max_depth": 10
    },
    "diode_outdoor": {
        "dataset": "diode_outdoor",
        "diode_outdoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diode_outdoor/"),
        "eigen_crop": False,
        "garg_crop": True,
        "do_kb_crop": False,
        "min_depth_eval": 1e-3,
        "max_depth_eval": 80,
        "min_depth": 1e-3,
        "max_depth": 80
    },
    "hypersim_test": {
        "dataset": "hypersim_test",
        "hypersim_test_root": os.path.join(HOME_DIR, "shortcuts/datasets/hypersim_test/"),
        "eigen_crop": True,
        "garg_crop": False,
        "do_kb_crop": False,
        "min_depth_eval": 1e-3,
        "max_depth_eval": 80,
        "min_depth": 1e-3,
        "max_depth": 10
    },
    "vkitti": {
        "dataset": "vkitti",
        "vkitti_root": os.path.join(HOME_DIR, "shortcuts/datasets/vkitti_test/"),
        "eigen_crop": False,
        "garg_crop": True,
        "do_kb_crop": True,
        "min_depth_eval": 1e-3,
        "max_depth_eval": 80,
        "min_depth": 1e-3,
        "max_depth": 80
    },
    "vkitti2": {
        "dataset": "vkitti2",
        "vkitti2_root": os.path.join(HOME_DIR, "shortcuts/datasets/vkitti2/"),
        "eigen_crop": False,
        "garg_crop": True,
        "do_kb_crop": True,
        "min_depth_eval": 1e-3,
        "max_depth_eval": 80,
        "min_depth": 1e-3,
        "max_depth": 80,
    },
    "ddad": {
        "dataset": "ddad",
        "ddad_root": os.path.join(HOME_DIR, "shortcuts/datasets/ddad/ddad_val/"),
        "eigen_crop": False,
        "garg_crop": True,
        "do_kb_crop": True,
        "min_depth_eval": 1e-3,
        "max_depth_eval": 80,
        "min_depth": 1e-3,
        "max_depth": 80,
    },
}

# Named dataset groups (keys of DATASETS_CONFIG).
ALL_INDOOR = ["nyu", "ibims", "sunrgbd", "diode_indoor", "hypersim_test"]
ALL_OUTDOOR = ["kitti", "diml_outdoor", "diode_outdoor",  "vkitti2", "ddad"]
ALL_EVAL_DATASETS = ALL_INDOOR + ALL_OUTDOOR

# Training defaults.
# NOTE(review): how these are merged with the model config is decided in
# get_config, which is not fully visible here.
COMMON_TRAINING_CONFIG = {
    "dataset": "nyu",
    "distributed": True,
    "workers": 16,
    "clip_grad": 0.1,
    "use_shared_dict": False,
    "shared_dict": None,
    "use_amp": False,

    # Augmentation switches.
    "aug": True,
    "random_crop": False,
    "random_translate": False,
    "translate_prob": 0.2,
    "max_translation": 100,

    "validate_every": 0.25,
    "log_images_every": 0.1,
    "prefetch": False,
}


def flatten(config, except_keys=('bin_conf',)):
    """Flatten a nested dictionary into a single-level dictionary.

    Nested dictionaries are replaced by their own (recursively flattened)
    entries. Entries whose key is listed in ``except_keys`` are kept as they
    are and are not descended into. When the same key occurs more than once,
    the value encountered last wins.

    Args:
        config (dict): Possibly nested dictionary. Anything that is not a
            dict yields an empty result.
        except_keys (tuple[str] | str): Keys to keep unflattened. A single
            string is treated as one key name, not as a set of characters.

    Returns:
        dict: The flattened dictionary.
    """
    # A bare string would turn the membership test below into a substring
    # match (e.g. "bin" in "bin_conf"), so wrap it into a tuple.
    if isinstance(except_keys, str):
        except_keys = (except_keys,)

    def recurse(inp):
        if isinstance(inp, dict):
            for key, value in inp.items():
                if key in except_keys:
                    yield (key, value)
                elif isinstance(value, dict):
                    yield from recurse(value)
                else:
                    yield (key, value)

    return dict(recurse(config))


def split_combined_args(kwargs):
    """Expand arguments whose keys and values were packed into a single entry.

       A packed key starts with '__' and lists the real keys separated by
       '__'; its value lists the matching values separated by ';'.
       For example, '__n_bins__lr=256;0.001' yields n_bins='256' and lr='0.001'.
       The packed entry itself is kept in the result.

    Args:
        kwargs (dict): key-value pairs of arguments where key-value is optionally combined according to the above format.

    Returns:
        dict: Copy of ``kwargs`` extended with the individual key-value pairs.
    """
    expanded = dict(kwargs)
    for key, value in kwargs.items():
        if not key.startswith("__"):
            continue
        keys = key.split("__")[1:]
        values = value.split(";")
        assert len(keys) == len(
            values), f"Combined arguments should have equal number of keys and values. Keys are separated by '__' and Values are separated with ';'. For example, '__n_bins__lr=256;0.001. Given (keys,values) is ({keys}, {values})"
        expanded.update(zip(keys, values))
    return expanded


def parse_list(config, key, dtype=int):
    """Turn a comma-separated string stored under ``key`` into a list of ``dtype`` values.

    Modifies the config in place. Nothing happens when ``key`` is absent.
    After the optional conversion the entry must be a list whose elements
    are all instances of ``dtype``, otherwise an AssertionError is raised.
    """
    if key not in config:
        return

    raw = config[key]
    if isinstance(raw, str):
        config[key] = [dtype(item) for item in raw.split(',')]

    is_typed_list = isinstance(config[key], list) and all([isinstance(e, dtype) for e in config[key]])
    assert is_typed_list, f"{key} should be a list of values dtype {dtype}. Given {config[key]} of type {type(config[key])} with values of type {[type(e) for e in config[key]]}."


def get_model_config(model_name, model_version=None):
    """Locate and load the .json config file of a model.

    Args:
        model_name (str): name of the model. The config file should be named config_{model_name}[_{model_version}].json under the models/{model_name} directory.
        model_version (str, optional): Specific config version. If specified config_{model_name}_{model_version}.json is searched for and used. Otherwise config_{model_name}.json is used. Defaults to None.

    Returns:
        easydict: the config dictionary for the model, or None when the file does not exist.
    """
    if model_version is None:
        config_fname = f"config_{model_name}.json"
    else:
        config_fname = f"config_{model_name}_{model_version}.json"

    config_file = os.path.join(ROOT, "models", model_name, config_fname)
    if not os.path.exists(config_file):
        return None

    with open(config_file, "r") as f:
        config = edict(json.load(f))

    # handle dictionary inheritance
    # only training config is supported for inheritance
    wants_parent = "inherit" in config.train and config.train.inherit is not None
    if wants_parent:
        parent_train = get_model_config(config.train["inherit"]).train
        # Copy over only the entries the child does not define itself.
        for key, value in parent_train.items():
            if key in config.train:
                continue
            config.train[key] = value
    return edict(config)


def update_model_config(config, mode, model_name, model_version=None, strict=False):
    """Overlay a model's config file onto ``config``.

    The model config's ``model`` section and its ``mode`` section are merged
    (the ``mode`` section wins on clashes), flattened, and laid over ``config``.

    Args:
        config (dict): current configuration.
        mode (str): name of the model-config section to merge on top of ``model``.
        model_name (str): model whose config file is looked up.
        model_version (str, optional): config version to look up. Defaults to None.
        strict (bool, optional): raise if no config file is found. Defaults to False.

    Returns:
        dict: a new merged dict when the model config exists; otherwise the
        ``config`` object that was passed in.

    Raises:
        ValueError: if the config file is missing and ``strict`` is true.
    """
    model_config = get_model_config(model_name, model_version)

    if model_config is None:
        if strict:
            raise ValueError(f"Config file for model {model_name} not found.")
        return config

    model_sections = {**model_config.model, **model_config[mode]}
    return {**config, **flatten(model_sections)}


def check_choices(name, value, choices):
    """Validate that ``value`` is one of ``choices``.

    Args:
        name (str): label used in the error message.
        value: value to validate.
        choices: container of accepted values.

    Raises:
        ValueError: if ``value`` is not in ``choices``.
    """
    if value in choices:
        return
    raise ValueError(f"{name} {value} not in supported choices {choices}")


# Config keys that get_config() casts with bool() once all overrides have been
# merged into the config.
KEYS_TYPE_BOOL = ["use_amp", "distributed", "use_shared_dict", "same_lr", "aug", "three_phase",
                  "prefetch", "cycle_momentum"]  # Casting is not strictly necessary, as the int-cast values of these keys in the config are 0 or 1


def get_config(model_name, mode='train', dataset=None, **overwrite_kwargs):
    """Main entry point to get the config for the model.

    Args:
        model_name (str): name of the desired model ("zoedepth" or "zoedepth_nk").
        mode (str, optional): "train", "infer" or "eval". Defaults to 'train'.
        dataset (str, optional): If specified, the corresponding dataset configuration is loaded as well.
            It is validated against the supported choices only when mode is "train". Defaults to None.

    Keyword Args: key-value pairs of arguments to overwrite the default config.

    The order of precedence for overwriting the config is (Higher precedence first):
        # 1. overwrite_kwargs
        # 2. "config_version": Config file version if specified in overwrite_kwargs. The corresponding config loaded is config_{model_name}_{config_version}.json
        # 3. "version_name": Default Model version specific config specified in overwrite_kwargs. The corresponding config loaded is config_{model_name}_{version_name}.json
        # 4. common_config: Default config for all models specified in COMMON_CONFIG

    Dataset defaults from DATASETS_CONFIG are merged underneath all of the
    above, i.e. they only fill keys that are not already set.

    Returns:
        easydict: The config dictionary for the model, including a "hostname"
        entry holding platform.node().

    Raises:
        ValueError: if model_name, mode or (in train mode) dataset is not a supported choice.
    """

    check_choices("Model", model_name, ["zoedepth", "zoedepth_nk"])
    check_choices("Mode", mode, ["train", "infer", "eval"])
    if mode == "train":
        check_choices("Dataset", dataset, ["nyu", "kitti", "mix", None])

    config = flatten({**COMMON_CONFIG, **COMMON_TRAINING_CONFIG})
    config = update_model_config(config, mode, model_name)

    # update with model version specific config
    version_name = overwrite_kwargs.get("version_name", config["version_name"])
    config = update_model_config(config, mode, model_name, version_name)

    # update with config version if specified
    config_version = overwrite_kwargs.get("config_version", None)
    if config_version is not None:
        print("Overwriting config with config_version", config_version)
        config = update_model_config(config, mode, model_name, config_version)

    # update with overwrite_kwargs
    # Combined args are useful for hyperparameter search
    overwrite_kwargs = split_combined_args(overwrite_kwargs)
    config = {**config, **overwrite_kwargs}

    # Casting to bool   # TODO: Not necessary. Remove and test
    for key in KEYS_TYPE_BOOL:
        if key in config:
            config[key] = bool(config[key])

    # Model specific post processing of config
    parse_list(config, "n_attractors")

    # adjust n_bins for each bin configuration if bin_conf is given and n_bins is passed in overwrite_kwargs
    if 'bin_conf' in config and 'n_bins' in overwrite_kwargs:
        bin_conf = config['bin_conf']  # list of dicts
        n_bins = overwrite_kwargs['n_bins']
        new_bin_conf = []
        for conf in bin_conf:
            conf['n_bins'] = n_bins
            new_bin_conf.append(conf)
        config['bin_conf'] = new_bin_conf

    if mode == "train":
        orig_dataset = dataset
        if dataset == "mix":
            dataset = 'nyu'  # Use nyu as default for mix. Dataset config is changed accordingly while loading the dataloader
        if dataset is not None:
            config['project'] = f"MonoDepth3-{orig_dataset}"  # Set project for wandb

    if dataset is not None:
        config['dataset'] = dataset
        # Values already in config win over the dataset defaults.
        config = {**DATASETS_CONFIG[dataset], **config}

    config['model'] = model_name
    typed_config = {k: infer_type(v) for k, v in config.items()}
    # Add hostname to the dict that is actually returned. It used to be written
    # to `config` after `typed_config` had been built, so it was silently lost.
    # It is set after type inference so the hostname string is stored verbatim.
    typed_config['hostname'] = platform.node()
    return edict(typed_config)


def change_dataset(config, new_dataset):
    """Overwrite ``config`` in place with the settings of another dataset.

    Args:
        config: configuration mapping supporting ``update``; modified in place.
        new_dataset (str): key into DATASETS_CONFIG.

    Returns:
        The same ``config`` object, after the update.
    """
    dataset_settings = DATASETS_CONFIG[new_dataset]
    config.update(dataset_settings)
    return config

Overwriting /kaggle/working/ZoeDepth/zoedepth/utils/config.py


In [7]:
import os

test_path = "/kaggle/input/nyuv2-official-split-dataset/test/official"
path = "/kaggle/working/ZoeDepth/train_test_inputs/nyudepthv2_test_files_with_gt.txt"
focal = "518.8579"

# One "<rgb file> <depth file> <focal>" entry per RGB frame found in test_path.
# The listing is sorted because os.listdir() returns names in arbitrary order,
# which would make the generated file differ between runs.
entries = [
    "{0} {1} {2}".format(filename, filename.replace("rgb", "depth"), focal)
    for filename in sorted(os.listdir(test_path))
    if "rgb" in filename
]

# Joining with "\n" yields a file without a trailing newline in a single write,
# replacing the previous write -> read back -> rewrite-without-last-char dance.
with open(path, "w") as f:
    f.write("\n".join(entries))

In [8]:
%%writefile /kaggle/working/ZoeDepth/zoedepth/models/base_models/midas.py
# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat

import torch
import torch.nn as nn
import numpy as np
from torchvision.transforms import Normalize


def denormalize(x):
    """Undo ImageNet mean/std normalization on a batch of images.

    Args:
        x (torch.Tensor - shape(N,3,H,W)): normalized input tensor

    Returns:
        torch.Tensor - shape(N,3,H,W): ``x * std + mean`` computed per channel
    """
    # Per-channel statistics reshaped so they broadcast over batch and space.
    broadcast_shape = (1, 3, 1, 1)
    channel_mean = torch.Tensor([0.485, 0.456, 0.406]).view(*broadcast_shape).to(x.device)
    channel_std = torch.Tensor([0.229, 0.224, 0.225]).view(*broadcast_shape).to(x.device)

    rescaled = x * channel_std
    return rescaled + channel_mean

def get_activation(name, bank):
    """Build a forward hook that records a module's output.

    Args:
        name: key under which the output is stored.
        bank (dict): mapping that receives ``bank[name] = output`` on every call.

    Returns:
        callable: hook taking ``(module, inputs, output)``; it returns None.
    """
    def store_output(module, inputs, output):
        bank.__setitem__(name, output)
    return store_output


class Resize(object):
    """Callable that bilinearly resizes a tensor towards a target (width, height).
    """

    def __init__(
        self,
        width,
        height,
        resize_target=True,
        keep_aspect_ratio=False,
        ensure_multiple_of=1,
        resize_method="lower_bound",
    ):
        """Record the resize settings and echo them to stdout.

        Args:
            width (int): desired output width
            height (int): desired output height
            resize_target (bool, optional):
                Accepted and printed, but not stored or otherwise used by this
                class. Defaults to True.
            keep_aspect_ratio (bool, optional):
                True: use one common scale factor for both axes, chosen
                according to 'resize_method', so the output might not have the
                given width and height.
                Defaults to False.
            ensure_multiple_of (int, optional):
                Output width and height are snapped to a multiple of this value.
                Defaults to 1.
            resize_method (str, optional):
                "lower_bound": Output will be at least as large as the given size.
                "upper_bound": Output will be at most as large as the given size. (Output size might be smaller than given size.)
                "minimal": Scale as little as possible. (Output size might be smaller than given size.)
                Defaults to "lower_bound".
        """
        print("Params passed to Resize transform:")
        for label, setting in (
            ("\twidth: ", width),
            ("\theight: ", height),
            ("\tresize_target: ", resize_target),
            ("\tkeep_aspect_ratio: ", keep_aspect_ratio),
            ("\tensure_multiple_of: ", ensure_multiple_of),
            ("\tresize_method: ", resize_method),
        ):
            print(label, setting)

        self.__width = width
        self.__height = height

        self.__keep_aspect_ratio = keep_aspect_ratio
        self.__multiple_of = ensure_multiple_of
        self.__resize_method = resize_method

    def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
        """Snap ``x`` to a multiple of ``ensure_multiple_of``.

        Rounds to the nearest multiple first; falls back to rounding down when
        that exceeds ``max_val``, and then to rounding up when the result is
        below ``min_val``.
        """
        step = self.__multiple_of
        in_steps = x / step

        snapped = (np.round(in_steps) * step).astype(int)

        if max_val is not None and snapped > max_val:
            snapped = (np.floor(in_steps) * step).astype(int)

        if snapped < min_val:
            snapped = (np.ceil(in_steps) * step).astype(int)

        return snapped

    def get_size(self, width, height):
        """Compute the output ``(width, height)`` for an input of the given size.

        Raises:
            ValueError: if the configured resize method is not one of
                "lower_bound", "upper_bound" or "minimal".
        """
        # Per-axis scale factors needed to hit the target exactly.
        ratio_h = self.__height / height
        ratio_w = self.__width / width

        if self.__keep_aspect_ratio:
            # Decide which axis dictates the single shared scale factor.
            if self.__resize_method == "lower_bound":
                follow_width = ratio_w > ratio_h
            elif self.__resize_method == "upper_bound":
                follow_width = ratio_w < ratio_h
            elif self.__resize_method == "minimal":
                follow_width = abs(1 - ratio_w) < abs(1 - ratio_h)
            else:
                raise ValueError(
                    f"resize_method {self.__resize_method} not implemented"
                )
            ratio_h = ratio_w = ratio_w if follow_width else ratio_h

        # Bounds handed to the snapping step depend on the resize method.
        if self.__resize_method == "lower_bound":
            limits_h = {"min_val": self.__height}
            limits_w = {"min_val": self.__width}
        elif self.__resize_method == "upper_bound":
            limits_h = {"max_val": self.__height}
            limits_w = {"max_val": self.__width}
        elif self.__resize_method == "minimal":
            limits_h = {}
            limits_w = {}
        else:
            raise ValueError(
                f"resize_method {self.__resize_method} not implemented")

        new_height = self.constrain_to_multiple_of(ratio_h * height, **limits_h)
        new_width = self.constrain_to_multiple_of(ratio_w * width, **limits_w)

        return (new_width, new_height)

    def __call__(self, x):
        """Resize ``x`` (last two dims taken as height, width) with bilinear interpolation."""
        trailing_dims = tuple(x.shape[-2:])
        out_width, out_height = self.get_size(*reversed(trailing_dims))
        target = (int(out_height), int(out_width))
        return nn.functional.interpolate(x, target, mode='bilinear', align_corners=True)

class PrepForMidas(object):
    """Input preparation step: optional resize followed by normalization."""

    def __init__(self, resize_mode="minimal", keep_aspect_ratio=True, img_size=384, do_resize=True):
        """Build the normalization and resize stages.

        Args:
            resize_mode (str, optional): passed to Resize as ``resize_method``. Defaults to "minimal".
            keep_aspect_ratio (bool, optional): passed through to Resize. Defaults to True.
            img_size (int or pair, optional): target size as (height, width); a
                single int is used for both. Defaults to 384.
            do_resize (bool, optional): when False the resize stage is an
                ``nn.Identity``. Defaults to True.
        """
        # A bare int stands for a square (height, width) target.
        net_h, net_w = (img_size, img_size) if isinstance(img_size, int) else img_size

        # Per-channel normalization with mean 0.5 and std 0.5.
        self.normalization = Normalize(
            mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

        if do_resize:
            self.resizer = Resize(net_w, net_h, keep_aspect_ratio=keep_aspect_ratio,
                                  ensure_multiple_of=32, resize_method=resize_mode)
        else:
            self.resizer = nn.Identity()

    def __call__(self, x):
        """Resize ``x`` (if enabled), then normalize it."""
        resized = self.resizer(x)
        return self.normalization(resized)


class MidasCore(nn.Module):
    def __init__(self, midas, trainable=False, fetch_features=True, layer_names=('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1'), freeze_bn=False, keep_aspect_ratio=True,
                 img_size=384, **kwargs):
        """Midas Base model used for multi-scale feature extraction.

        Args:
            midas (torch.nn.Module): Midas model.
            trainable (bool, optional): Train midas model. Defaults to False.
            fetch_features (bool, optional): Extract multi-scale features. Defaults to True.
            layer_names (tuple, optional): Layers used for feature extraction. Order = (head output features, last layer features, ...decoder features). Defaults to ('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1').
            freeze_bn (bool, optional): Freeze BatchNorm. Generally results in better finetuning performance. Defaults to False.
            keep_aspect_ratio (bool, optional): Keep the aspect ratio of input images while resizing. Defaults to True.
            img_size (int, tuple, optional): Input resolution. Defaults to 384.
        """
        super().__init__()
        self.core = midas
        self.output_channels = None
        self.core_out = {}
        self.trainable = trainable
        self.fetch_features = fetch_features
        # midas.scratch.output_conv = nn.Identity()
        self.handles = []
        # self.layer_names = ['out_conv','l4_rn', 'r4', 'r3', 'r2', 'r1']
        self.layer_names = layer_names

        self.set_trainable(trainable)
        self.set_fetch_features(fetch_features)

        self.prep = PrepForMidas(keep_aspect_ratio=keep_aspect_ratio,
                                 img_size=img_size, do_resize=kwargs.get('do_resize', True))

        if freeze_bn:
            self.freeze_bn()

    def set_trainable(self, trainable):
        self.trainable = trainable
        if trainable:
            self.unfreeze()
        else:
            self.freeze()
        return self

    def set_fetch_features(self, fetch_features):
        self.fetch_features = fetch_features
        if fetch_features:
            if len(self.handles) == 0:
                self.attach_hooks(self.core)
        else:
            self.remove_hooks()
        return self

    def freeze(self):
        for p in self.parameters():
            p.requires_grad = False
        self.trainable = False
        return self

    def unfreeze(self):
        for p in self.parameters():
            p.requires_grad = True
        self.trainable = True
        return self

    def freeze_bn(self):
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
        return self

    def forward(self, x, denorm=False, return_rel_depth=False):
        with torch.no_grad():
            if denorm:
                x = denormalize(x)
            x = self.prep(x)
            # print("Shape after prep: ", x.shape)

        with torch.set_grad_enabled(self.trainable):

            # print("Input size to Midascore", x.shape)
            rel_depth = self.core(x)
            # print("Output from midas shape", rel_depth.shape)
            if not self.fetch_features:
                return rel_depth
        out = [self.core_out[k] for k in self.layer_names]

        if return_rel_depth:
            return rel_depth, out
        return out

    def get_rel_pos_params(self):
        for name, p in self.core.pretrained.named_parameters():
            if "relative_position" in name:
                yield p

    def get_enc_params_except_rel_pos(self):
        for name, p in self.core.pretrained.named_parameters():
            if "relative_position" not in name:
                yield p

    def freeze_encoder(self, freeze_rel_pos=False):
        if freeze_rel_pos:
            for p in self.core.pretrained.parameters():
                p.requires_grad = False
        else:
            for p in self.get_enc_params_except_rel_pos():
                p.requires_grad = False
        return self

    def attach_hooks(self, midas):
        if len(self.handles) > 0:
            self.remove_hooks()
        if "out_conv" in self.layer_names:
            self.handles.append(list(midas.scratch.output_conv.children())[
                                3].register_forward_hook(get_activation("out_conv", self.core_out)))
        if "r4" in self.layer_names:
            self.handles.append(midas.scratch.refinenet4.register_forward_hook(
                get_activation("r4", self.core_out)))
        if "r3" in self.layer_names:
            self.handles.append(midas.scratch.refinenet3.register_forward_hook(
                get_activation("r3", self.core_out)))
        if "r2" in self.layer_names:
            self.handles.append(midas.scratch.refinenet2.register_forward_hook(
                get_activation("r2", self.core_out)))
        if "r1" in self.layer_names:
            self.handles.append(midas.scratch.refinenet1.register_forward_hook(
                get_activation("r1", self.core_out)))
        if "l4_rn" in self.layer_names:
            self.handles.append(midas.scratch.layer4_rn.register_forward_hook(
                get_activation("l4_rn", self.core_out)))

        return self

    def remove_hooks(self):
        for h in self.handles:
            h.remove()
        return self

    def __del__(self):
        self.remove_hooks()

    def set_output_channels(self, model_type):
        self.output_channels = MIDAS_SETTINGS[model_type]

    @staticmethod
    def build(midas_model_type="DPT_BEiT_L_384", train_midas=False, use_pretrained_midas=True, fetch_features=False, freeze_bn=True, force_keep_ar=False, force_reload=False, **kwargs):
        if midas_model_type not in MIDAS_SETTINGS:
            raise ValueError(
                f"Invalid model type: {midas_model_type}. Must be one of {list(MIDAS_SETTINGS.keys())}")
        if "img_size" in kwargs:
            kwargs = MidasCore.parse_img_size(kwargs)
        img_size = kwargs.pop("img_size", [384, 384])
        print("img_size", img_size)
        midas = torch.hub.load("intel-isl/MiDaS", midas_model_type,
                               pretrained=use_pretrained_midas, force_reload=force_reload)
        kwargs.update({'keep_aspect_ratio': force_keep_ar})
        midas_core = MidasCore(midas, trainable=train_midas, fetch_features=fetch_features,
                               freeze_bn=freeze_bn, img_size=img_size, **kwargs)
        midas_core.set_output_channels(midas_model_type)
        return midas_core

    @staticmethod
    def build_from_config(config):
        return MidasCore.build(**config)

    @staticmethod
    def parse_img_size(config):
        assert 'img_size' in config
        if isinstance(config['img_size'], str):
            assert "," in config['img_size'], "img_size should be a string with comma separated img_size=H,W"
            config['img_size'] = list(map(int, config['img_size'].split(",")))
            assert len(
                config['img_size']) == 2, "img_size should be a string with comma separated img_size=H,W"
        elif isinstance(config['img_size'], int):
            config['img_size'] = [config['img_size'], config['img_size']]
        else:
            assert isinstance(config['img_size'], list) and len(
                config['img_size']) == 2, "img_size should be a list of H,W"
        return config


# Output-channel tuple -> names of the MiDaS variants that produce it.
nchannels2models = {
    (256,) * 5: [
        "DPT_BEiT_L_384",
        "DPT_BEiT_L_512",
        "DPT_BEiT_B_384",
        "DPT_SwinV2_L_384",
        "DPT_SwinV2_B_384",
        "DPT_SwinV2_T_256",
        "DPT_Large",
        "DPT_Hybrid",
    ],
    (512, 256, 128, 64, 64): ["MiDaS_small"],
}

# Model name to number of output channels (the mapping above, inverted)
MIDAS_SETTINGS = dict(
    (model_name, channels)
    for channels, model_names in nchannels2models.items()
    for model_name in model_names
)

Overwriting /kaggle/working/ZoeDepth/zoedepth/models/base_models/midas.py


In [9]:
# -p keeps this cell re-runnable: plain mkdir fails once the directory exists.
!mkdir -p /kaggle/working/noise

In [15]:
!pwd

/kaggle/working/ZoeDepth


## Запуск

In [10]:
%cd ZoeDepth

/kaggle/working/ZoeDepth


In [13]:
!python new_evaluate.py -m zoedepth -d nyu

  warn(
{'dataset': 'nyu', 'avoid_boundary': False, 'min_depth': 0.001, 'max_depth': 10, 'data_path': '/kaggle/input/nyuv2-official-split-dataset/test/official', 'gt_path': '/kaggle/input/nyuv2-official-split-dataset/test/official', 'filenames_file': './train_test_inputs/nyudepthv2_train_files_with_gt.txt', 'input_height': 480, 'input_width': 640, 'data_path_eval': '/kaggle/input/nyuv2-official-split-dataset/test/official', 'gt_path_eval': '/kaggle/input/nyuv2-official-split-dataset/test/official', 'filenames_file_eval': './train_test_inputs/nyudepthv2_test_files_with_gt.txt', 'min_depth_eval': 0.001, 'max_depth_eval': 10, 'min_depth_diff': -10, 'max_depth_diff': 10, 'do_random_rotate': True, 'degree': 1.0, 'do_kb_crop': False, 'garg_crop': False, 'eigen_crop': True, 'save_dir': '/root/shortcuts/monodepth3_checkpoints', 'project': 'ZoeDepth', 'tags': '', 'notes': '', 'gpu': None, 'root': '.', 'uid': None, 'print_losses': False, 'distributed': True, 'workers': 16, 'clip_grad': 0.1, 'use

In [17]:
# -p keeps this cell re-runnable: plain mkdir fails once the directory exists.
!mkdir -p /kaggle/working/noise

In [16]:
metrics = [{'rmse': 0.340304751230513, 'mae': 0.23593246336371385, 'mre': 0.0867020212024018, 'gradient': 0.174683911654515, 'rank': 2.7737496333762026, 'census': 6.064811037478692, 'delta1': 0.9313147552491292, 'delta2': 0.9879942050365393, 'delta3': 0.996578425702021}, {'rmse': 0.37890817604062127, 'mae': 0.2682416052476159, 'mre': 0.09400768660476401, 'gradient': 0.17444271702610994, 'rank': 2.7554587399524553, 'census': 6.116146644135966, 'delta1': 0.9144736218021695, 'delta2': 0.9853109354393429, 'delta3': 0.9962669044337383}, {'rmse': 0.3872871257553789, 'mae': 0.2757061655299087, 'mre': 0.0964177169816861, 'gradient': 0.17456817335086888, 'rank': 2.7506409699276166, 'census': 6.1377781020988325, 'delta1': 0.9101174260137785, 'delta2': 0.9843741859926124, 'delta3': 0.996193996291716}, {'rmse': 0.39835986663048023, 'mae': 0.28539497621900617, 'mre': 0.09950534876725897, 'gradient': 0.17470979392112002, 'rank': 2.744794621449389, 'census': 6.165706522519419, 'delta1': 0.9035818247917679, 'delta2': 0.9829364246129183, 'delta3': 0.9961294910133578}, {'rmse': 0.4109984273028108, 'mae': 0.2963121945233976, 'mre': 0.10292759875183502, 'gradient': 0.1748623486536599, 'rank': 2.739008926135813, 'census': 6.19504936089524, 'delta1': 0.895535842475854, 'delta2': 0.9813585193628658, 'delta3': 0.9958903715360453}, {'rmse': 0.4246461892733868, 'mae': 0.308044903309212, 'mre': 0.10662340239182536, 'gradient': 0.17504777749945827, 'rank': 2.733360772432761, 'census': 6.22823734583854, 'delta1': 0.8867844170014988, 'delta2': 0.9801838618356429, 'delta3': 0.9955647301238622}, {'rmse': 0.43918792794337314, 'mae': 0.32057156463624886, 'mre': 0.11062951391908361, 'gradient': 0.1752561355212028, 'rank': 2.7279340415582163, 'census': 6.26446203982303, 'delta1': 0.8774088016888272, 'delta2': 0.9789045114212301, 'delta3': 0.9951721093636224}, {'rmse': 0.45440328713738065, 'mae': 0.3336605294140007, 'mre': 0.11486229659074035, 'gradient': 0.1755195118649241, 'rank': 
2.7230305059203337, 'census': 6.3040252280758935, 'delta1': 0.866444186269503, 'delta2': 0.9774564050972501, 'delta3': 0.9946306534415459}, {'rmse': 0.47012801433845686, 'mae': 0.34708235110017205, 'mre': 0.11924506626083728, 'gradient': 0.17583706343322997, 'rank': 2.7182656463757047, 'census': 6.3433311515371456, 'delta1': 0.8549248921151776, 'delta2': 0.9754211494758561, 'delta3': 0.9939461180949095}, {'rmse': 0.48641462648905476, 'mae': 0.36094011192417264, 'mre': 0.12379983671287706, 'gradient': 0.17620253193815552, 'rank': 2.7139780835881737, 'census': 6.385143006356176, 'delta1': 0.8423008361073681, 'delta2': 0.9725849746798831, 'delta3': 0.9936116379725862}, {'rmse': 0.5034805276690713, 'mae': 0.37548135546473194, 'mre': 0.12861381101390465, 'gradient': 0.17663434142187476, 'rank': 2.710043928197114, 'census': 6.432522133309548, 'delta1': 0.8286741857875073, 'delta2': 0.9690326400297811, 'delta3': 0.9932203375707858}]


import json

# Persist the `metrics` list defined at the top of this cell as JSON.
# NOTE(review): open(..., "w") does not create missing parent directories, so
# /kaggle/working/depth_anything must already exist — confirm which earlier
# step creates it.
with open("/kaggle/working/depth_anything/metrics.json", "w") as f:
    json.dump(metrics, f)

In [7]:
import json


# New metrics, first batch: 10 hard-coded records, each a dict with the keys
# 'rmse', 'mae', 'mre', 'gradient', 'rank', 'census', 'delta1', 'delta2', 'delta3'.
# This batch is appended to the metrics loaded from disk further down in this cell.
# NOTE(review): presumably copied from a separate evaluation run - confirm the source.
new_metrics1 = [{'rmse': 0.5214592776320631, 'mae': 0.39087804621559197, 'mre': 0.13374813024603602, 'gradient': 0.1771206408352508, 'rank': 2.706434670715424, 'census': 6.483251701852289, 'delta1': 0.8128770905248005, 'delta2': 0.9652207716226006, 'delta3': 0.992555812704976}, {'rmse': 0.5402979676778293, 'mae': 0.40707260991642497, 'mre': 0.1391846743042471, 'gradient': 0.17759674508847478, 'rank': 2.7032632210182608, 'census': 6.53676814967276, 'delta1': 0.7968549895191746, 'delta2': 0.9611191934994269, 'delta3': 0.9916392916623084}, {'rmse': 0.559712094962883, 'mae': 0.423933803580484, 'mre': 0.14481767039643792, 'gradient': 0.1780635900494511, 'rank': 2.700129119518331, 'census': 6.593649031036368, 'delta1': 0.7798328708233088, 'delta2': 0.9571964470308088, 'delta3': 0.9906725072658178}, {'rmse': 0.579860945348232, 'mae': 0.44148826796053786, 'mre': 0.15061636917570898, 'gradient': 0.17856015232782155, 'rank': 2.6970808733645475, 'census': 6.6530434967091, 'delta1': 0.7608531605022588, 'delta2': 0.9525316911663672, 'delta3': 0.9897249321617785}, {'rmse': 0.6011546144654439, 'mae': 0.45998957347471114, 'mre': 0.1566912458903946, 'gradient': 0.17907223781910814, 'rank': 2.6941977489298727, 'census': 6.7142857463332515, 'delta1': 0.7411570147182788, 'delta2': 0.9469886918350057, 'delta3': 0.988711050707918}, {'rmse': 0.6238297247591876, 'mae': 0.4796993464526668, 'mre': 0.16314150483978482, 'gradient': 0.17961290495884397, 'rank': 2.6916112052026757, 'census': 6.779455203442884, 'delta1': 0.7198982311298987, 'delta2': 0.9409532693100502, 'delta3': 0.9872300443845535}, {'rmse': 0.6479636152016665, 'mae': 0.500611194237409, 'mre': 0.1699644577739956, 'gradient': 0.1801867162524036, 'rank': 2.689282669769894, 'census': 6.848752088217374, 'delta1': 0.697987241748145, 'delta2': 0.9332157829751201, 'delta3': 0.9854553416318507}, {'rmse': 0.6733668937234597, 'mae': 0.52252454412041, 'mre': 0.17705439928212177, 'gradient': 0.18078518814746714, 'rank': 2.687215969107202, 
'census': 6.921485692954871, 'delta1': 0.6763758134946273, 'delta2': 0.9242601441985256, 'delta3': 0.9835479620844326}, {'rmse': 0.699957674885162, 'mae': 0.5453046218959428, 'mre': 0.18433468080223545, 'gradient': 0.18140855310377388, 'rank': 2.685669316613238, 'census': 6.996858303234291, 'delta1': 0.6524434835311557, 'delta2': 0.9141979617255017, 'delta3': 0.9811888468937985}, {'rmse': 0.7274772205864249, 'mae': 0.5687725655331423, 'mre': 0.1917651077432716, 'gradient': 0.1820741866400675, 'rank': 2.684255379371024, 'census': 7.074898678513497, 'delta1': 0.6285040902909819, 'delta2': 0.903145311727655, 'delta3': 0.9783886550700743}]
# Second batch of new metrics: 10 hard-coded records with the same nine keys
# ('rmse', 'mae', 'mre', 'gradient', 'rank', 'census', 'delta1', 'delta2', 'delta3').
# NOTE(review): presumably the continuation of the previous batch - confirm the source.
new_metrics2 = [{'rmse': 0.7556868932088155, 'mae': 0.5928432275232786, 'mre': 0.19933632042234975, 'gradient': 0.18275387049016387, 'rank': 2.6830188893298614, 'census': 7.156332906972929, 'delta1': 0.6049525042712948, 'delta2': 0.8916349460165678, 'delta3': 0.9749223616418187}, {'rmse': 0.7844886016871722, 'mae': 0.6174693396926692, 'mre': 0.20701860764805438, 'gradient': 0.18344393147415405, 'rank': 2.681900545731063, 'census': 7.240703657469795, 'delta1': 0.580616407414928, 'delta2': 0.8793378069128334, 'delta3': 0.9715191569908751}, {'rmse': 0.8136513710156286, 'mae': 0.642359073250926, 'mre': 0.2147016090674719, 'gradient': 0.18413944243831715, 'rank': 2.6807515390508194, 'census': 7.32668605286151, 'delta1': 0.5570823065303125, 'delta2': 0.8664994169912442, 'delta3': 0.9668123419415571}, {'rmse': 0.8428874736065098, 'mae': 0.6672078784516234, 'mre': 0.22227477082817754, 'gradient': 0.18482101503390438, 'rank': 2.6803375105372305, 'census': 7.413681047805698, 'delta1': 0.5329029541674103, 'delta2': 0.8535224535285099, 'delta3': 0.9621436378208083}, {'rmse': 0.871957388236123, 'mae': 0.6918876753308619, 'mre': 0.22968048794301832, 'gradient': 0.18549295819913628, 'rank': 2.6800724453756146, 'census': 7.503080344964795, 'delta1': 0.5104866713020219, 'delta2': 0.8392485006881247, 'delta3': 0.9569625128446527}, {'rmse': 0.9004972403830855, 'mae': 0.7161243379500049, 'mre': 0.23684104483928242, 'gradient': 0.1861475370833181, 'rank': 2.679958683036036, 'census': 7.589969948385524, 'delta1': 0.48949624095233957, 'delta2': 0.8239972198122469, 'delta3': 0.9514721291561807}, {'rmse': 0.9284402630003299, 'mae': 0.7397563143849001, 'mre': 0.24370038935181174, 'gradient': 0.186779880639978, 'rank': 2.680318198692665, 'census': 7.67427254659456, 'delta1': 0.47045498654516293, 'delta2': 0.8090027678814984, 'delta3': 0.9459083181556204}, {'rmse': 0.9554663319980284, 'mae': 0.7625534954353533, 'mre': 0.25022492632512056, 'gradient': 0.1873975104979018, 'rank': 
2.680658985769865, 'census': 7.758227787981713, 'delta1': 0.4540993412052336, 'delta2': 0.7951409014740844, 'delta3': 0.9397397124848991}, {'rmse': 0.9812773235264055, 'mae': 0.7843276430571523, 'mre': 0.2563898817628333, 'gradient': 0.18799535362231934, 'rank': 2.68124471215673, 'census': 7.842302021866166, 'delta1': 0.43968309730964816, 'delta2': 0.7816435629577672, 'delta3': 0.9322931408532744}, {'rmse': 1.0056787407884094, 'mae': 0.804909249575663, 'mre': 0.26219219203953303, 'gradient': 0.18856208738786134, 'rank': 2.681796654032245, 'census': 7.927717028470522, 'delta1': 0.42670613384691397, 'delta2': 0.768575513991184, 'delta3': 0.9263866776089769}]

# Read back the metric records saved by an earlier session.
json_path = "/kaggle/input/zoedepth-metrics/zoedepth_metrics.json"
with open(json_path, "r") as saved_file:
    data = json.load(saved_file)

# Seam checks: MRE must be strictly larger at the start of each new batch than
# at the end of the records that precede it.
# NOTE(review): presumably a guard against appending batches in the wrong order - confirm.
assert data[-1]['mre'] < new_metrics1[0]['mre']
assert new_metrics1[-1]['mre'] < new_metrics2[0]['mre']

# Append both batches, in order, to the loaded list.
data.extend(new_metrics1)
data.extend(new_metrics2)

# The merged list is expected to contain exactly 31 records before it is written out.
assert len(data) == 31
with open("/kaggle/working/zoedepth_metrics.json", "w") as merged_file:
    json.dump(data, merged_file)

1;2

In [12]:
# Print the merged metrics as a ';'-separated table with values rounded to 3 decimals.
#
# Two fixes relative to the previous version of this cell:
#   * the header now uses the same ';' delimiter as the rows (it used tabs before),
#     so the whole output can be parsed or pasted as a single table;
#   * rounding is done on a display-only copy of each record, so `data` keeps its
#     full-precision values for any later cell that reads or saves it.
# Each record also carries a 'rank' value; the table has never shown it, and that
# is left unchanged here.
print("#;RMSE;MAE;MRE;GR;CEN;delta1;delta2;delta3")
for i, row in enumerate(data):
    shown = {k: round(v, 3) for k, v in row.items()}  # rounded copy; `row` is untouched
    print("{};{};{};{};{};{};{};{};{}".format(i, shown['rmse'], shown['mae'], shown['mre'], shown['gradient'], shown['census'], shown['delta1'], shown['delta2'], shown['delta3']))

#	RMSE	MAE	MRE	GR	CEN	delta1	delta2	delta3
0;0.34;0.236;0.087;0.175;6.065;0.931;0.988;0.997
1;0.379;0.268;0.094;0.174;6.116;0.914;0.985;0.996
2;0.387;0.276;0.096;0.175;6.138;0.91;0.984;0.996
3;0.398;0.285;0.1;0.175;6.166;0.904;0.983;0.996
4;0.411;0.296;0.103;0.175;6.195;0.896;0.981;0.996
5;0.425;0.308;0.107;0.175;6.228;0.887;0.98;0.996
6;0.439;0.321;0.111;0.175;6.264;0.877;0.979;0.995
7;0.454;0.334;0.115;0.176;6.304;0.866;0.977;0.995
8;0.47;0.347;0.119;0.176;6.343;0.855;0.975;0.994
9;0.486;0.361;0.124;0.176;6.385;0.842;0.973;0.994
10;0.503;0.375;0.129;0.177;6.433;0.829;0.969;0.993
11;0.521;0.391;0.134;0.177;6.483;0.813;0.965;0.993
12;0.54;0.407;0.139;0.178;6.537;0.797;0.961;0.992
13;0.56;0.424;0.145;0.178;6.594;0.78;0.957;0.991
14;0.58;0.441;0.151;0.179;6.653;0.761;0.953;0.99
15;0.601;0.46;0.157;0.179;6.714;0.741;0.947;0.989
16;0.624;0.48;0.163;0.18;6.779;0.72;0.941;0.987
17;0.648;0.501;0.17;0.18;6.849;0.698;0.933;0.985
18;0.673;0.523;0.177;0.181;6.921;0.676;0.924;0.984
19;0.7;0.545;0.

In [9]:
# Create the /kaggle/working/noise directory.
# -p makes the cell safe to re-run: plain mkdir errors out if the directory already exists.
!mkdir -p /kaggle/working/noise

In [11]:
# Pin timm to 0.6.13; per the install log this replaces the preinstalled 0.9.16.
# NOTE(review): presumably the version the ZoeDepth code expects - confirm.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install timm==0.6.13

Collecting timm==0.6.13
  Downloading timm-0.6.13-py3-none-any.whl.metadata (38 kB)
Downloading timm-0.6.13-py3-none-any.whl (549 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m549.1/549.1 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: timm
  Attempting uninstall: timm
    Found existing installation: timm 0.9.16
    Uninstalling timm-0.9.16:
      Successfully uninstalled timm-0.9.16
Successfully installed timm-0.6.13


In [12]:
# Pin torch to 2.0.1.
# %pip (rather than !pip) installs into the environment of the running kernel.
# A kernel restart may be needed if torch was already imported - NOTE(review): confirm.
%pip install torch==2.0.1

Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.1)
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch==2.0.1)
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cufft-cu11==10.9.0.58 (from torch==2.0.1)
  Downloading nvidia_cufft_cu11-10.9.0.58-py3-none-man

In [13]:
# List installed packages to verify the pinned versions above.
# %pip guarantees the listing comes from the running kernel's environment.
%pip list

Package                                  Version
---------------------------------------- -------------------
absl-py                                  1.4.0
accelerate                               0.29.3
access                                   1.1.9
affine                                   2.4.0
aiobotocore                              2.12.3
aiofiles                                 22.1.0
aiohttp                                  3.9.1
aiohttp-cors                             0.7.0
aioitertools                             0.11.0
aiorwlock                                1.3.0
aiosignal                                1.3.1
aiosqlite                                0.19.0
albumentations                           1.4.0
alembic                                  1.13.1
altair                                   5.3.0
annotated-types                          0.6.0
annoy                                    1.17.3
anyio                                    4.2.0
apache-beam                          