In [1]:
!git clone https://github.com/ashutosh1807/PixelFormer.git

Cloning into 'PixelFormer'...
remote: Enumerating objects: 67, done.[K
remote: Counting objects: 100% (67/67), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 67 (delta 22), reused 54 (delta 15), pack-reused 0[K
Unpacking objects: 100% (67/67), 1.01 MiB | 4.39 MiB/s, done.


In [2]:
!mkdir /kaggle/working/PixelFormer/pixelformer/pretrained

In [3]:
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt




In [5]:
!mv /kaggle/input/nyuv2-official-split-dataset/test/official/rgb_00001.png /kaggle/working
!mv /kaggle/input/nyuv2-official-split-dataset/test/official/rgb_00002.png /kaggle/working
!mv /kaggle/input/nyuv2-official-split-dataset/test/official/rgb_00009.png /kaggle/working
!mv /kaggle/input/nyuv2-official-split-dataset/test/official/rgb_00014.png /kaggle/working

mv: cannot remove '/kaggle/input/nyuv2-official-split-dataset/test/official/rgb_00001.png': Read-only file system
mv: cannot remove '/kaggle/input/nyuv2-official-split-dataset/test/official/rgb_00002.png': Read-only file system
mv: cannot remove '/kaggle/input/nyuv2-official-split-dataset/test/official/rgb_00009.png': Read-only file system
mv: cannot remove '/kaggle/input/nyuv2-official-split-dataset/test/official/rgb_00014.png': Read-only file system


In [10]:
path = "/kaggle/input/nyuv2-official-split-dataset/test/official/depth_00014.png"
image = np.asarray(Image.open(path))
plt.imsave("/kaggle/working/depth_14.png", image, cmap='inferno')

In [4]:
!cp /kaggle/input/pixelformer/pytorch/nyu/1/nyu.pth /kaggle/working/PixelFormer/pixelformer/pretrained/nyu.pth

In [5]:
%%writefile /kaggle/working/PixelFormer/pixelformer/new_eval.py

import torch
import torch.backends.cudnn as cudnn

import os, sys
import time
import argparse
import numpy as np
from tqdm import tqdm
import scipy.signal as sps
import shutil
from PIL import Image

from utils import post_process_depth, flip_lr, compute_errors
from networks.PixelFormer import PixelFormer


def convert_arg_line_to_args(arg_line):
    for arg in arg_line.split():
        if not arg.strip():
            continue
        yield arg


def get_metrics(x, y, size=5):
    x[y == 0] = 0
    values = dict()
    values['rmse'] = np.sum(rmse(x, y))
    values['mae'] = np.sum(mae(x, y))
    values['mre'] = np.sum(mre(x, y))
    values['gradient'] = np.sum(gradient_metric(x, y))
    values['rank'], values['census'] = rank_and_census(x, y, size)
    values['rank'] = np.sum(values['rank'])
    values['census'] = np.sum(values['census'])
    
    values['delta1'] = np.sum(get_delta(x, y, 1.25))
    values['delta2'] = np.sum(get_delta(x, y, 1.25 ** 2))
    values['delta3'] = np.sum(get_delta(x, y, 1.25 ** 3))

    return values

def get_delta(x, y, delta):
    frac1, frac2 = x / (y + 1e-6), y / (x + 1e-6)
    frac1, frac2 = frac1[..., np.newaxis], frac2[..., np.newaxis]
    frac = np.concatenate((frac1, frac2), axis=-1)

    delta_values = np.max(frac, axis=-1)
    return np.sum((delta_values < delta), axis=(1, 2)) / (x.shape[1] * x.shape[2])

def rmse(x, y):
    return np.sqrt(np.sum((x - y) ** 2, axis=(1, 2)) / (x.shape[1] * x.shape[2]))

def mae(x, y):
    return np.sum(np.abs(x - y), axis=(1, 2)) / (x.shape[1] * x.shape[2])

def mre(x, y):
    return np.sum(np.abs(x - y) / (y + 1e-7), axis=(1, 2)) / (x.shape[1] * x.shape[2])

def convolution(image, conv):
    height, width = image.shape[0], image.shape[1]
    padding_width = conv.shape[0] // 2
    image = np.pad(image, padding_width, 'constant')
    result = np.zeros((height, width))
    for i in range(padding_width, height + padding_width):
        for j in range(padding_width, width + padding_width):
            result[i-padding_width][j-padding_width] = np.sum(image[(i - padding_width):(i + padding_width + 1), (j - padding_width):(j + padding_width + 1)] * conv)
    return result

def gradient_metric(x, y):
    kernel1 = np.array([[
        [1, 0, -1],
        [2, 0, -2],
        [1, 0, -1],
    ]])
    
    kernel2 = np.array([[
        [1, 2, 1],
        [0, 0, 0],
        [-1, -2, -1],
    ]])
    
    x_deriv1, y_deriv1 = sps.fftconvolve(x, kernel1, mode='same'), sps.fftconvolve(y, kernel1, mode='same')
    x_deriv2, y_deriv2 = sps.fftconvolve(x, kernel2, mode='same'), sps.fftconvolve(y, kernel2, mode='same')

    return np.sum(np.abs(x_deriv1 - y_deriv1) + np.abs(x_deriv2 - y_deriv2), axis=(1, 2)) / (x.shape[1] * x.shape[2])

def neighborhood(x, size=5):
    indices = np.indices(x.shape[1:])
    padding_width = size // 2
    indices += padding_width
    pad_x = np.pad(x, ((0,), (padding_width,), (padding_width,)), 'constant')
    
    index_maps = list()
    for i in range(-padding_width, padding_width+1):
        for j in range(-padding_width, padding_width+1):
            index_map = np.zeros(indices.shape).astype(int)
            index_map[0] = indices[0] + i
            index_map[1] = indices[1] + j
            index_maps.append(index_map)
            
    index_maps = np.array(index_maps)
    return pad_x[:, index_maps[:, 0, :, :], index_maps[:, 1, :, :]]
    
def one_hot(x, size):
    center_index = size ** 2 // 2
    center = x[:, center_index]
    center = center[:, np.newaxis]

    encoding = (x < center).astype(int)
    return np.delete(encoding, center_index, axis=1)
    
def rank_metric(en_x, en_y, size=5):
    rank_x, rank_y = np.sum(en_x, axis=1), np.sum(en_y, axis=1)
    return np.sum(np.abs(rank_x - rank_y), axis=(1, 2)) / (en_x.shape[2] * en_x.shape[3])

def census_metric(en_x, en_y, size=5):
    mask = (en_x != en_y).astype(int)
    return np.sum(mask, axis=(1, 2, 3)) / (en_x.shape[2] * en_x.shape[3])

def rank_and_census(x, y, size=5):
    nx, ny = neighborhood(x, size), neighborhood(y, size)
    en_x, en_y = one_hot(nx, size), one_hot(ny, size)
    rank, census = rank_metric(en_x, en_y, size), census_metric(en_x, en_y, size)
    return rank, census        
        
        
        
parser = argparse.ArgumentParser(description='PixelFormer PyTorch implementation.', fromfile_prefix_chars='@')
parser.convert_arg_line_to_args = convert_arg_line_to_args

parser.add_argument('--model_name',                type=str,   help='model name', default='pixelformer')
parser.add_argument('--encoder',                   type=str,   help='type of encoder, base07, large07', default='large07')
parser.add_argument('--checkpoint_path',           type=str,   help='path to a checkpoint to load', default='')

# Dataset
parser.add_argument('--dataset',                   type=str,   help='dataset to train on, kitti or nyu', default='nyu')
parser.add_argument('--input_height',              type=int,   help='input height', default=480)
parser.add_argument('--input_width',               type=int,   help='input width',  default=640)
parser.add_argument('--max_depth',                 type=float, help='maximum depth in estimation', default=10)

# Preprocessing
parser.add_argument('--do_random_rotate',                      help='if set, will perform random rotation for augmentation', action='store_true')
parser.add_argument('--degree',                    type=float, help='random rotation maximum degree', default=2.5)
parser.add_argument('--do_kb_crop',                            help='if set, crop input images as kitti benchmark images', action='store_true')
parser.add_argument('--use_right',                             help='if set, will randomly use right images when train on KITTI', action='store_true')

# Eval
parser.add_argument('--data_path_eval',            type=str,   help='path to the data for evaluation', required=False)
parser.add_argument('--gt_path_eval',              type=str,   help='path to the groundtruth data for evaluation', required=False)
parser.add_argument('--filenames_file_eval',       type=str,   help='path to the filenames text file for evaluation', required=False)
parser.add_argument('--min_depth_eval',            type=float, help='minimum depth for evaluation', default=1e-3)
parser.add_argument('--max_depth_eval',            type=float, help='maximum depth for evaluation', default=80)
parser.add_argument('--eigen_crop',                            help='if set, crops according to Eigen NIPS14', action='store_true')
parser.add_argument('--garg_crop',                             help='if set, crops according to Garg  ECCV16', action='store_true')


if sys.argv.__len__() == 2:
    arg_filename_with_prefix = '@' + sys.argv[1]
    args = parser.parse_args([arg_filename_with_prefix])
else:
    args = parser.parse_args()

if args.dataset == 'kitti' or args.dataset == 'nyu':
    from dataloaders.dataloader import NewDataLoader
elif args.dataset == 'kittipred':
    from dataloaders.dataloader_kittipred import NewDataLoader

    
def write_demo(pred, path, dir_root):
    path = os.path.join(dir_root, path.split("/")[-1])
    pred = pred * 255 / pred.max()
    pred = pred.astype(np.uint8)
    img = Image.fromarray(pred)
    img.save(path)

def eval(model, dataloader_eval, demo_dir, post_process=False):
    eval_measures = torch.zeros(10).cuda()
    metrics = list()
    i = 0
    for _, eval_sample_batched in enumerate(tqdm(dataloader_eval.data)):
        with torch.no_grad():
            image = torch.autograd.Variable(eval_sample_batched['image'].cuda())
            gt_depth = eval_sample_batched['depth']
            has_valid_depth = eval_sample_batched['has_valid_depth']
            if not has_valid_depth:
                # print('Invalid depth. continue.')
                continue

            times = list()
            for i in range(200):
                start = time.time()
                pred_depth = model(image)
                times.append(time.time() - start)
            print(sum(times) / len(times))
            sys.exit(0)    
            if post_process:
                image_flipped = flip_lr(image)
                pred_depth_flipped = model(image_flipped)
                pred_depth = post_process_depth(pred_depth, pred_depth_flipped)
            # pred_depth[pred_depth>8] = 8

            pred_depth = pred_depth.cpu().numpy().squeeze()
            gt_depth = gt_depth.cpu().numpy().squeeze()

        if args.do_kb_crop:
            height, width = gt_depth.shape
            top_margin = int(height - 352)
            left_margin = int((width - 1216) / 2)
            pred_depth_uncropped = np.zeros((height, width), dtype=np.float32)
            pred_depth_uncropped[top_margin:top_margin + 352, left_margin:left_margin + 1216] = pred_depth
            pred_depth = pred_depth_uncropped

        pred_depth[pred_depth < args.min_depth_eval] = args.min_depth_eval
        pred_depth[pred_depth > args.max_depth_eval] = args.max_depth_eval
        pred_depth[np.isinf(pred_depth)] = args.max_depth_eval
        pred_depth[np.isnan(pred_depth)] = args.min_depth_eval

        write_demo(pred_depth, str(i)+".png", demo_dir)
        i += 1
        
        gt_depth = gt_depth[45:471, 41:601]
        pred_depth = pred_depth[45:471, 41:601]
        
        gt_depth, pred_depth = gt_depth[np.newaxis, ...], pred_depth[np.newaxis, ...]
        metrics.append(get_metrics(pred_depth, gt_depth))
    result = dict()
    for k in metrics[0].keys():
        result[k] = sum([m[k] for m in metrics]) / 654
        
    return result
        
        
def evalu(model, model_name="pixel_former", batch_size=1):
    demo_root = "/kaggle/working"
    all_metrics = list()

    demo_dir = os.path.join(demo_root, model_name)
    if os.path.exists(demo_dir):
        shutil.rmtree(demo_dir)
    os.mkdir(demo_dir)
    for dn in range(11, 21):
        print("Noise " + str(dn) + " processing...")
        loader = NewDataLoader(args, 'online_eval', dn=dn)

        noise_dir = os.path.join(demo_dir, str(dn))
        os.mkdir(noise_dir)
        
        metrics = eval(model, loader, noise_dir)
        print(metrics)
        all_metrics.append(metrics)
    return all_metrics
        
        
        
        

def main_worker(args):

    model = PixelFormer(version=args.encoder, inv_depth=False, max_depth=args.max_depth, pretrained=None)
    model.train()

    num_params = sum([np.prod(p.size()) for p in model.parameters()])
    print("== Total number of parameters: {}".format(num_params))

    num_params_update = sum([np.prod(p.shape) for p in model.parameters() if p.requires_grad])
    print("== Total number of learning parameters: {}".format(num_params_update))

    model = torch.nn.DataParallel(model)
    model.cuda()

    print("== Model Initialized")

    if args.checkpoint_path != '':
        if os.path.isfile(args.checkpoint_path):
            print("== Loading checkpoint '{}'".format(args.checkpoint_path))
            checkpoint = torch.load(args.checkpoint_path, map_location='cpu')
            model.load_state_dict(checkpoint['model'])
            print("== Loaded checkpoint '{}'".format(args.checkpoint_path))
            del checkpoint
        else:
            print("== No checkpoint found at '{}'".format(args.checkpoint_path))

    cudnn.benchmark = True


    # ===== Evaluation ======
    model.eval()
    with torch.no_grad():
        eval_measures = evalu(model)
    print(eval_measures)


def main():
    torch.cuda.empty_cache()
    args.distributed = False
    ngpus_per_node = torch.cuda.device_count()
    if ngpus_per_node > 1:
        print("This machine has more than 1 gpu. Please set \'CUDA_VISIBLE_DEVICES=0\'")
        return -1
    
    main_worker(args)


if __name__ == '__main__':
    main()

Writing /kaggle/working/PixelFormer/pixelformer/new_eval.py


In [6]:
%%writefile /kaggle/working/PixelFormer/configs/arguments_eval_nyu.txt
--model_name pixelformer_nyu
--encoder large07
--dataset nyu
--input_height 480
--input_width 640
--max_depth 10

--data_path_eval datasets/nyu_depth_v2/official_splits/test/
--gt_path_eval datasets/nyu_depth_v2/official_splits/test/
--filenames_file_eval data_splits/nyudepthv2_test_files_with_gt.txt
--min_depth_eval 1e-3
--max_depth_eval 10
--eigen_crop

--checkpoint_path /kaggle/working/PixelFormer/pixelformer/pretrained/nyu.pth

Overwriting /kaggle/working/PixelFormer/configs/arguments_eval_nyu.txt


In [7]:
import os

test_path = "/kaggle/input/nyuv2-official-split-dataset/test/official"
path = "/kaggle/working/PixelFormer/data_splits/nyudepthv2_test_files_with_gt.txt"


with open(path, "w") as f:
    focal = "518.8579"
    for filename in os.listdir(test_path):
        if "rgb" in filename:
            f.write("{0} {1} {2}\n".format(filename, filename.replace("rgb", "depth"), focal))
        

with open(path, "r") as f:
    string = f.read()
    



with open(path, "w") as f:
    f.write(string[:-1])


In [8]:
!mkdir /kaggle/working/noise

In [9]:
%%writefile /kaggle/working/PixelFormer/pixelformer/dataloaders/dataloader.py
import torch
from torch.utils.data import Dataset, DataLoader
import torch.utils.data.distributed
from torchvision import transforms

import numpy as np
from PIL import Image
import os
import random
import cv2
import scipy.signal as sps

from utils import DistributedSamplerNoEvenlyDivisible


def _is_pil_image(img):
    return isinstance(img, Image.Image)


def _is_numpy_image(img):
    return isinstance(img, np.ndarray) and (img.ndim in {2, 3})


def preprocessing_transforms(mode):
    return transforms.Compose([
        ToTensor(mode=mode)
    ])


class NewDataLoader(object):
    def __init__(self, args, mode, dn=0):
        if mode == 'train':
            self.training_samples = DataLoadPreprocess(args, mode, transform=preprocessing_transforms(mode))
            if args.distributed:
                self.train_sampler = torch.utils.data.distributed.DistributedSampler(self.training_samples)
            else:
                self.train_sampler = None
    
            self.data = DataLoader(self.training_samples, args.batch_size,
                                   shuffle=(self.train_sampler is None),
                                   num_workers=args.num_threads,
                                   pin_memory=True,
                                   sampler=self.train_sampler)

        elif mode == 'online_eval':
            self.testing_samples = DataLoadPreprocess(args, mode, dn=dn, transform=preprocessing_transforms(mode))
            if args.distributed:
                # self.eval_sampler = torch.utils.data.distributed.DistributedSampler(self.testing_samples, shuffle=False)
                self.eval_sampler = DistributedSamplerNoEvenlyDivisible(self.testing_samples, shuffle=False)
            else:
                self.eval_sampler = None
            self.data = DataLoader(self.testing_samples, 1,
                                   shuffle=False,
                                   num_workers=1,
                                   pin_memory=True,
                                   sampler=self.eval_sampler)
        
        elif mode == 'test':
            self.testing_samples = DataLoadPreprocess(args, mode, dn=dn, transform=preprocessing_transforms(mode))
            self.data = DataLoader(self.testing_samples, 1, shuffle=False, num_workers=1)

        else:
            print('mode should be one of \'train, test, online_eval\'. Got {}'.format(mode))
            
def image2depth(path):
    depth = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    depth = depth.astype('float32')
    depth /= (2**16 - 1)
    depth *= 10.0
    return depth

def make_noise(path_in, path_out, dn):
    image = Image.open(path_in)
    photons = adu2photons(np.asarray(image))
    noised = add_camera_noise(photons, dark_noise=dn)

    out_image = Image.fromarray(noised.astype(np.uint8))
    out_image.save(path_out)
    
def adu2photons(image, qe=0.69, sensitivity=5.88):
    return image / (qe * sensitivity + 1e-7)

def add_camera_noise(input_irrad_photons, qe=0.69, sensitivity=5.88,
                     dark_noise=2.29, bitdepth=8, baseline=100,
                     rs=np.random.RandomState(seed=42)):
 
    # Add shot noise
    photons = rs.poisson(input_irrad_photons, size=input_irrad_photons.shape)
    
    # Convert to electrons
    electrons = qe * photons
    
    # Add dark noise
    electrons_out = rs.normal(scale=dark_noise, size=electrons.shape) + electrons
    
    # Convert to ADU and add baseline
    max_adu     = 2**bitdepth - 1
    adu         = (electrons_out * sensitivity).astype(int) # Convert to discrete numbers
    adu = np.clip(adu, 0, max_adu)
    
    return adu

class DataLoadPreprocess(Dataset):
    def __init__(self, args, mode, dn=0, transform=None, is_for_online_eval=False):
        self.args = args
        self.dn = dn
        self.args.data_path = "/kaggle/input/nyuv2-official-split-dataset/test/official"
        self.args.gt_path = "/kaggle/input/nyuv2-official-split-dataset/test/official"
        if mode == 'online_eval':
            with open(args.filenames_file_eval, 'r') as f:
                self.filenames = f.readlines()
        else:
            with open(args.filenames_file, 'r') as f:
                self.filenames = f.readlines()
    
        self.mode = mode
        self.transform = transform
        self.to_tensor = ToTensor
        self.is_for_online_eval = is_for_online_eval
    
    def __getitem__(self, idx):
        sample_path = self.filenames[idx]
        # focal = float(sample_path.split()[2])
        focal = 518.8579

        if self.mode == 'train':
            if self.args.dataset == 'kitti':
                rgb_file = sample_path.split()[0]
                depth_file = os.path.join(sample_path.split()[0].split('/')[0], sample_path.split()[1])
                if self.args.use_right is True and random.random() > 0.5:
                    rgb_file.replace('image_02', 'image_03')
                    depth_file.replace('image_02', 'image_03')
            else:
                rgb_file = sample_path.split()[0]
                depth_file = sample_path.split()[1]

            image_path = os.path.join(self.args.data_path, rgb_file)
            depth_path = os.path.join(self.args.gt_path, depth_file)
    
            image = Image.open(image_path)
            depth_gt = Image.open(depth_path)
            
            

            if self.args.do_kb_crop is True:
                height = image.height
                width = image.width
                top_margin = int(height - 352)
                left_margin = int((width - 1216) / 2)
                depth_gt = depth_gt.crop((left_margin, top_margin, left_margin + 1216, top_margin + 352))
                image = image.crop((left_margin, top_margin, left_margin + 1216, top_margin + 352))
            
            # To avoid blank boundaries due to pixel registration
            if self.args.dataset == 'nyu':
                if self.args.input_height == 480:
                    depth_gt = np.array(depth_gt)
                    valid_mask = np.zeros_like(depth_gt)
                    valid_mask[45:472, 43:608] = 1
                    depth_gt[valid_mask==0] = 0
                    depth_gt = Image.fromarray(depth_gt)
                else:
                    depth_gt = depth_gt.crop((43, 45, 608, 472))
                    image = image.crop((43, 45, 608, 472))
    
            if self.args.do_random_rotate is True:
                random_angle = (random.random() - 0.5) * 2 * self.args.degree
                image = self.rotate_image(image, random_angle)
                depth_gt = self.rotate_image(depth_gt, random_angle, flag=Image.NEAREST)
            
            image = np.asarray(image, dtype=np.float32) / 255.0
            depth_gt = np.asarray(depth_gt, dtype=np.float32)
            depth_gt = np.expand_dims(depth_gt, axis=2)

            if self.args.dataset == 'nyu':
                depth_gt = depth_gt / 1000.0
                img, depth = image, depth_gt
                #<https://arxiv.org/abs/2107.07684>
                H, W = img.shape[0], img.shape[1]
                a, b, c, d = random.uniform(0,1), random.uniform(0,1), random.uniform(0,1), random.uniform(0,1)
                l, u = int(a*W), int(b*H)
                w, h = int(max((W-a*W)*c*0.75, 1)), int(max((H-b*H)*d*0.75, 1))
                depth_copied = np.repeat(depth, 3, axis=2)
                M = np.ones(img.shape)
                M[l:l+h, u:u+w, :] = 0
                img = M*img + (1-M)*depth_copied
                image = img.astype(np.float32)
            else:
                depth_gt = depth_gt / 256.0

            if image.shape[0] != self.args.input_height or image.shape[1] != self.args.input_width:
                image, depth_gt = self.random_crop(image, depth_gt, self.args.input_height, self.args.input_width)
            image, depth_gt = self.train_preprocess(image, depth_gt)
            sample = {'image': image, 'depth': depth_gt, 'focal': focal}
        
        else:
            if self.mode == 'online_eval':
                data_path = self.args.data_path_eval
            else:
                data_path = self.args.data_path

            image_path = os.path.join(self.args.data_path, sample_path.split()[0])
            
            if self.dn != 0:
                new_path = os.path.join("/kaggle/working/noise", image_path.split("/")[-1])
                make_noise(image_path, new_path, self.dn)
                image_path = new_path
                
            image = np.asarray(Image.open(image_path),
                               dtype=np.float32) / 255.0
            
            if self.mode == 'online_eval':
                gt_path = self.args.data_path
                depth_path = os.path.join(gt_path, sample_path.split()[1])
                depth_gt = image2depth(depth_path)

            if self.args.do_kb_crop is True:
                height = image.shape[0]
                width = image.shape[1]
                top_margin = int(height - 352)
                left_margin = int((width - 1216) / 2)
                image = image[top_margin:top_margin + 352, left_margin:left_margin + 1216, :]
                if self.mode == 'online_eval' and has_valid_depth:
                    depth_gt = depth_gt[top_margin:top_margin + 352, left_margin:left_margin + 1216, :]
            if self.mode == 'online_eval':
                sample = {'image': image, 'depth': depth_gt, 'focal': focal, 'has_valid_depth': True, 'path': image_path}
            else:
                sample = {'image': image, 'focal': focal}
        
        if self.transform:
            sample = self.transform(sample)
        
        return sample
    
    def rotate_image(self, image, angle, flag=Image.BILINEAR):
        result = image.rotate(angle, resample=flag)
        return result

    def random_crop(self, img, depth, height, width):
        assert img.shape[0] >= height
        assert img.shape[1] >= width
        assert img.shape[0] == depth.shape[0]
        assert img.shape[1] == depth.shape[1]
        x = random.randint(0, img.shape[1] - width)
        y = random.randint(0, img.shape[0] - height)
        img = img[y:y + height, x:x + width, :]
        depth = depth[y:y + height, x:x + width, :]
        return img, depth

    def train_preprocess(self, image, depth_gt):
        # Random flipping
        do_flip = random.random()
        if do_flip > 0.5:
            image = (image[:, ::-1, :]).copy()
            depth_gt = (depth_gt[:, ::-1, :]).copy()
    
        # Random gamma, brightness, color augmentation
        do_augment = random.random()
        if do_augment > 0.5:
            image = self.augment_image(image)
    
        return image, depth_gt
    
    def augment_image(self, image):
        # gamma augmentation
        gamma = random.uniform(0.9, 1.1)
        image_aug = image ** gamma

        # brightness augmentation
        if self.args.dataset == 'nyu':
            brightness = random.uniform(0.75, 1.25)
        else:
            brightness = random.uniform(0.9, 1.1)
        image_aug = image_aug * brightness

        # color augmentation
        colors = np.random.uniform(0.9, 1.1, size=3)
        white = np.ones((image.shape[0], image.shape[1]))
        color_image = np.stack([white * colors[i] for i in range(3)], axis=2)
        image_aug *= color_image
        image_aug = np.clip(image_aug, 0, 1)

        return image_aug
    
    def __len__(self):
        return len(self.filenames)


class ToTensor(object):
    def __init__(self, mode):
        self.mode = mode
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    
    def __call__(self, sample):
        image, focal = sample['image'], sample['focal']
        image = self.to_tensor(image)
        image = self.normalize(image)

        if self.mode == 'test':
            return {'image': image, 'focal': focal}

        depth = sample['depth']
        if self.mode == 'train':
            depth = self.to_tensor(depth)
            return {'image': image, 'depth': depth, 'focal': focal}
        else:
            has_valid_depth = sample['has_valid_depth']
            return {'image': image, 'depth': depth, 'focal': focal, 'has_valid_depth': has_valid_depth, 'path': sample['path']}
    
    def to_tensor(self, pic):
        if not (_is_pil_image(pic) or _is_numpy_image(pic)):
            raise TypeError(
                'pic should be PIL Image or ndarray. Got {}'.format(type(pic)))
        
        if isinstance(pic, np.ndarray):
            img = torch.from_numpy(pic.transpose((2, 0, 1)))
            return img
        
        # handle PIL Image
        if pic.mode == 'I':
            img = torch.from_numpy(np.array(pic, np.int32, copy=False))
        elif pic.mode == 'I;16':
            img = torch.from_numpy(np.array(pic, np.int16, copy=False))
        else:
            img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
        # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
        if pic.mode == 'YCbCr':
            nchannel = 3
        elif pic.mode == 'I;16':
            nchannel = 1
        else:
            nchannel = len(pic.mode)
        img = img.view(pic.size[1], pic.size[0], nchannel)
        
        img = img.transpose(0, 1).transpose(0, 2).contiguous()
        if isinstance(img, torch.ByteTensor):
            return img.float()
        else:
            return img

Overwriting /kaggle/working/PixelFormer/pixelformer/dataloaders/dataloader.py


## Запуск

In [17]:
%cd /kaggle/working/PixelFormer

/kaggle/working/PixelFormer


In [11]:
!ls

LICENSE  README.md  configs  data_splits  pixelformer


In [None]:
!python ./pixelformer/new_eval.py ./configs/arguments_eval_nyu.txt

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
== Load encoder backbone from: None
== Total number of parameters: 270895920
== Total number of learning parameters: 270895920
== Model Initialized
== Loading checkpoint '/kaggle/working/PixelFormer/pixelformer/pretrained/nyu.pth'
== Loaded checkpoint '/kaggle/working/PixelFormer/pixelformer/pretrained/nyu.pth'
Noise 11 processing...
  0%|                                                   | 0/654 [00:00<?, ?it/s]

In [89]:
metrics = [{'rmse': 0.4025952238671212, 'mae': 0.28483502760488527, 'mre': 0.1056012374482374, 'gradient': 0.2039616459511303, 'rank': 3.6091340666322087, 'census': 7.690249261881169, 'delta1': 0.8923962608474497, 'delta2': 0.9806395649636852, 'delta3': 0.9951307295864896}, {'rmse': 0.4632419999846905, 'mae': 0.3353133314307312, 'mre': 0.11800526869186112, 'gradient': 0.2070533960555077, 'rank': 3.7986903903030846, 'census': 8.205050146700795, 'delta1': 0.8533504544084068, 'delta2': 0.9716245421048207, 'delta3': 0.9929096431289066}, {'rmse': 0.4734405920664047, 'mae': 0.3433937875241084, 'mre': 0.12063120687643653, 'gradient': 0.20769407969825948, 'rank': 3.8089659965398908, 'census': 8.251687052744597, 'delta1': 0.8456172239861179, 'delta2': 0.9692183298568174, 'delta3': 0.9923170970266035}, {'rmse': 0.48750899699590755, 'mae': 0.3546013050313556, 'mre': 0.12427738164433214, 'gradient': 0.20821731103675753, 'rank': 3.8176311885071916, 'census': 8.304876115766973, 'delta1': 0.8350448191185843, 'delta2': 0.9658124524414584, 'delta3': 0.9915696010927951}, {'rmse': 0.5043111406883812, 'mae': 0.3679688106462045, 'mre': 0.12857171099971915, 'gradient': 0.20875862006227427, 'rank': 3.825253829295861, 'census': 8.36066326603864, 'delta1': 0.823093421640957, 'delta2': 0.9612919553508631, 'delta3': 0.9905766851363016}, {'rmse': 0.5228208437486869, 'mae': 0.38257214224224073, 'mre': 0.13333901934844494, 'gradient': 0.20937553426007913, 'rank': 3.8338748597599865, 'census': 8.422539813293623, 'delta1': 0.8104297036038862, 'delta2': 0.9564321197316427, 'delta3': 0.9893606414224388}, {'rmse': 0.5436144334380446, 'mae': 0.3990636989245714, 'mre': 0.1387918941169103, 'gradient': 0.2101729013765918, 'rank': 3.8447536134236633, 'census': 8.496094584838284, 'delta1': 0.7952986266221187, 'delta2': 0.9512193830670054, 'delta3': 0.9873210401553043}, {'rmse': 0.5664415763284849, 'mae': 0.4174953432440513, 'mre': 0.14502109491620466, 'gradient': 0.2112169800693658, 'rank': 3.860065079570184, 'census': 8.585662702002022, 'delta1': 0.7792919404808066, 'delta2': 0.9448978978355359, 'delta3': 0.984964354167819}, {'rmse': 0.5912727513305839, 'mae': 0.437581402378718, 'mre': 0.15167793815964134, 'gradient': 0.21265641008181657, 'rank': 3.8825190439271706, 'census': 8.69789057997321, 'delta1': 0.7596551915981111, 'delta2': 0.9385773996681402, 'delta3': 0.981930074329771}, {'rmse': 0.6194754077802475, 'mae': 0.4602312830751091, 'mre': 0.15929310976954425, 'gradient': 0.21458408599305287, 'rank': 3.911857254638943, 'census': 8.835369197857894, 'delta1': 0.7387607243870965, 'delta2': 0.930808263187689, 'delta3': 0.9791175506145946}, {'rmse': 0.6524958604383869, 'mae': 0.48689517720061526, 'mre': 0.16837280558047144, 'gradient': 0.21687186008225637, 'rank': 3.947958905317736, 'census': 8.995250145111232, 'delta1': 0.7141927315677966, 'delta2': 0.9212544379426397, 'delta3': 0.9756337592322536}]



len(metrics)

11

In [100]:
from IPython.display import FileLink
FileLink(r'pixel_former.zip')

In [99]:
!ls

PixelFormer  mmcv  noise  pixel_former	pixel_former.zip


In [96]:
%cd /kaggle/working

/kaggle/working


In [90]:
import json


with open("/kaggle/working/pixel_former/metrics.json", "w") as f:
    json.dump(metrics, f)

In [10]:
!pip install mmcv

Collecting mmcv
  Downloading mmcv-2.2.0.tar.gz (479 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m479.1/479.1 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting addict (from mmcv)
  Downloading addict-2.4.0-py3-none-any.whl.metadata (1.0 kB)
Collecting mmengine>=0.3.0 (from mmcv)
  Downloading mmengine-0.10.4-py3-none-any.whl.metadata (20 kB)
Downloading mmengine-0.10.4-py3-none-any.whl (451 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m451.7/451.7 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading addict-2.4.0-py3-none-any.whl (3.8 kB)
Building wheels for collected packages: mmcv
  Building wheel for mmcv (setup.py) ... [?25l\^C
[?25canceled
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [11]:
!python -c 'import torch;print(torch.__version__)'

2.1.2


In [12]:
%cd /kaggle/working

/kaggle/working


In [13]:
!git clone https://github.com/open-mmlab/mmcv.git
%cd mmcv

Cloning into 'mmcv'...
remote: Enumerating objects: 16902, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (52/52), done.[K
remote: Total 16902 (delta 40), reused 50 (delta 25), pack-reused 16825[K
Receiving objects: 100% (16902/16902), 13.89 MiB | 5.73 MiB/s, done.
Resolving deltas: 100% (12446/12446), done.
/kaggle/working/mmcv


In [14]:
!pip install -r requirements/optional.txt



In [12]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Mon_Apr__3_17:16:06_PDT_2023
Cuda compilation tools, release 12.1, V12.1.105
Build cuda_12.1.r12.1/compiler.32688072_0


In [13]:
!gcc --version

gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



In [15]:
!pip install -e . -v

Using pip 23.3.2 from /opt/conda/lib/python3.10/site-packages/pip (python 3.10)
Obtaining file:///kaggle/working/mmcv
  Running command python setup.py egg_info
    from pkg_resources import DistributionNotFound, get_distribution, parse_version
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting addict (from mmcv==2.2.0)
  Obtaining dependency information for addict from https://files.pythonhosted.org/packages/6a/00/b08f23b7d7e1e14ce01419a467b583edbb93c6cdb8654e54a9cc579cd61f/addict-2.4.0-py3-none-any.whl.metadata
  Using cached addict-2.4.0-py3-none-any.whl.metadata (1.0 kB)
Collecting mmengine>=0.3.0 (from mmcv==2.2.0)
  Obtaining dependency information for mmengine>=0.3.0 from https://files.pythonhosted.org/packages/0b/03/e8a1da1e73d6d9ba3ada49780c0c27afcea4607539ccf9a4be75e2b08533/mmengine-0.10.4-py3-none-any.whl.metadata
  Using cached mmengine-0.10.4-py3-none-any.whl.metadata (20 kB)
Using cached mmengine-0.10.4-py3-none-any.whl (451 kB)
Using cached addict-2.4.0-py3-

In [16]:
!python .dev_scripts/check_installation.py

Start checking the installation of mmcv ...
CPU ops were compiled successfully.
CUDA ops were compiled successfully.
mmcv has been installed successfully.

Environment information:
------------------------------------------------------------
sys.platform: linux
Python: 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0]
CUDA available: True
MUSA available: False
numpy_random_seed: 2147483648
GPU 0: Tesla P100-PCIE-16GB
CUDA_HOME: /usr/local/cuda
NVCC: Cuda compilation tools, release 12.1, V12.1.105
GCC: gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0
PyTorch: 2.1.2
PyTorch compiling details: PyTorch built with:
  - GCC 9.4
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2023.1-Product Build 20230303 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.1.1 (Git Hash 64f6bcbcbab628e96f33a62c3e975f8535a7bde4)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capabili

In [18]:
!pip list

Package                                  Version             Editable project location
---------------------------------------- ------------------- --------------------------------
absl-py                                  1.4.0
accelerate                               0.29.3
access                                   1.1.9
addict                                   2.4.0
affine                                   2.4.0
aiobotocore                              2.12.3
aiofiles                                 22.1.0
aiohttp                                  3.9.1
aiohttp-cors                             0.7.0
aioitertools                             0.11.0
aiorwlock                                1.3.0
aiosignal                                1.3.1
aiosqlite                                0.19.0
albumentations                           1.4.0
alembic                                  1.13.1
altair                                   5.3.0
annotated-types                          0.6.0
annoy         