In [1]:
import os
os.getcwd()

'/root/autodl-tmp/ViewDiff/viewdiff'

In [2]:

from dataclasses import dataclass, asdict
from typing import Union, Optional, Literal, Tuple
import os
from pathlib import Path
import json
import tyro
import copy
from tqdm.auto import tqdm

import numpy as np
import torch
import torch.utils.checkpoint

from torch.utils.tensorboard import SummaryWriter

from accelerate.utils import set_seed
from accelerate.logging import get_logger
from diffusers import DPMSolverMultistepScheduler, UniPCMultistepScheduler, DDPMScheduler, DDIMScheduler

from model.custom_unet_2d_condition import (
    UNet2DConditionCrossFrameInExistingAttnModel,
)
from model.util import (
    replace_self_attention_with_cross_frame_attention,
    add_pose_cond_to_attention_layers,
    update_cross_frame_attention_config,
    update_last_layer_mode,
    update_vol_rend_inject_noise_sigma,
    update_n_novel_images,
    CrossFrameAttentionConfig,
    ModelConfig,
)
from model.custom_stable_diffusion_pipeline import CustomStableDiffusionPipeline
from model.custom_stable_instructPix2pix_pipeline import CustomInstructPix2pixDiffusionPipeline

from .io_util import (
    setup_output_directories,
    make_output_directories,
    convert_to_tensorboard_dict,
    SaveConfig
)

from metrics.image_metrics import load_lpips_vgg_model

from .model.util import (
    replace_self_attention_with_cross_frame_attention,
    update_last_layer_mode,
    update_vol_rend_inject_noise_sigma,
    update_n_novel_images,
    update_cross_frame_attention_config,
    add_pose_cond_to_attention_layers,
    collapse_prompt_to_batch_dim,
    collapse_tensor_to_batch_dim,
    expand_output_to_k,
    expand_tensor_to_k,
    tokenize_captions,
    ModelConfig,
    CrossFrameAttentionConfig,
    build_cross_attention_kwargs,
)



from .train_util import FinetuneConfig
from diffusers.loaders import LoraLoaderMixin

from dacite import from_dict, Config

from .train import test_step

logger = get_logger(__name__, log_level="INFO")

ImportError: attempted relative import with no known parent package

In [None]:
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [3]:
class dataconfig:
    root_dir ="/root/autodl-tmp/mvs_training/dtu/"
    split = "val"

    target_light = 6
    n_views:int=3 
    levels:int=3 
    depth_interval:int =2.65
    img_wh:int=None
    abs_error:Optional[str] ="abs"
    output_total:Optional[bool]=False
    threshold: Optional[int] = 4.7
    prompt_dir: Optional[str] = "/root/autodl-tmp/mvs_training/dtu/co3d_blip2_captions_final.json"


In [4]:
from torch.utils.data import Dataset
import sys
sys.path.append('/root/autodl-tmp/project/dp_simple/')
from CasMVSNet_pl.datasets.utils import read_pfm
import os
import numpy as np
import cv2
from PIL import Image
import torch
from torchvision import transforms as T
class DTUDataset(Dataset):
    def __init__(self, config):
        """
        img_wh should be set to a tuple ex: (1152, 864) to enable test mode!
        """

        self.root_dir = config.root_dir
        self.split = config.split
        assert self.split in ['train', 'val', 'test'], \
            'split must be either "train", "val" or "test"!'
        
        
        self.light_class = config.target_light
        self.img_wh = None

        
        self.threshold = config.threshold
        self.build_metas()
        self.n_views = config.n_views
        self.levels = config.levels # FPN levels
        self.depth_interval = config.depth_interval
        self.build_proj_mats()
        self.define_transforms()
        self.output_total = config.output_total
        prompt_dir = config.prompt_dir
        if prompt_dir != None:
            import json
            captions = json.load(open(prompt_dir))
        self.prompt_dir =captions
        
      
        
        
        
    def build_metas(self):
        self.metas = []
        with open(f'/root/autodl-tmp/project/dp_simple/CasMVSNet_pl/datasets/lists/dtu/{self.split}.txt') as f:
            self.scans = [line.rstrip() for line in f.readlines()]
        output_pkl = f'/root/autodl-tmp/project/dp_simple/CasMVSNet_pl/datasets/lists/dtu/{self.split}_abs.pkl'
        import pickle
        with open(output_pkl, 'rb') as f:
            self.output_pkl = pickle.load(f)
        # light conditions 0-6 for training
        # light condition 3 for testing (the brightest?)
        outputs_total = {}
        for scan in self.output_pkl.keys():
            scan_index = scan.split('_')[0]
            if scan_index not in outputs_total:
                outputs_total[scan_index] = []
            outputs_total[scan_index].append(self.output_pkl[scan])
        for scan in outputs_total.keys():
            outputs_total[scan] = np.mean(np.array(outputs_total[scan]), axis=0)
            print(f"scan {scan} mean output: {outputs_total[scan]}")
        self.total_pkl = outputs_total


        light_idxs = list(range(7))

        pair_file = "Cameras/pair.txt"
        for scan in self.scans:
            with open(os.path.join(self.root_dir, pair_file)) as f:
                num_viewpoint = int(f.readline())
                # viewpoints (49)
                for _ in range(num_viewpoint):
                    ref_view = int(f.readline().rstrip())
                    src_views = [int(x) for x in f.readline().rstrip().split()[1::2]]
                    

                    for light_idx in light_idxs:
                        output_key = f"{scan}_{ref_view}_{src_views[0]}_{src_views[1]}"
                        losses = self.output_pkl[output_key]
                        if np.argmin(losses)==self.light_class and self.split=="train":
                            self.metas += [(scan, ref_view,light_idx, src_views,int(np.argmin(losses)))]
                        elif self.split!="train":
                            if light_idx!=0:
                                continue
                            else:
                                self.metas += [(scan, ref_view,light_idx, src_views,int(np.argmin(losses)))]
                                
                                
                           
                         
    def build_proj_mats(self):
        proj_mats = []
        for vid in range(49): # total 49 view ids
            if self.img_wh is None:
                proj_mat_filename = os.path.join(self.root_dir,
                                                 f'Cameras/train/{vid:08d}_cam.txt')
            else:
                proj_mat_filename = os.path.join(self.root_dir,
                                                 f'Cameras/{vid:08d}_cam.txt')
            intrinsics, extrinsics, depth_min = \
                self.read_cam_file(proj_mat_filename)
            if self.img_wh is not None: # resize the intrinsics to the coarsest level
                intrinsics[0] *= self.img_wh[0]/1600/4
                intrinsics[1] *= self.img_wh[1]/1200/4
            K = intrinsics
            R = extrinsics
            # multiply intrinsics and extrinsics to get projection matrix
            proj_mat_ls = []
            for l in reversed(range(self.levels)):
                proj_mat_l = np.eye(4)
                proj_mat_l[:3, :4] = intrinsics @ extrinsics[:3, :4]
                intrinsics[:2] *= 2 # 1/4->1/2->1
                proj_mat_ls += [torch.FloatTensor(proj_mat_l)]
            # (self.levels, 4, 4) from fine to coarse
            proj_mat_ls = torch.stack(proj_mat_ls[::-1])
           
            proj_mats += [(proj_mat_ls, depth_min,K,R)]

        self.proj_mats = proj_mats

    def read_cam_file(self, filename):
        with open(filename) as f:
            lines = [line.rstrip() for line in f.readlines()]
        # extrinsics: line [1,5), 4x4 matrix
        extrinsics = np.fromstring(' '.join(lines[1:5]), dtype=np.float32, sep=' ')
        extrinsics = extrinsics.reshape((4, 4))
        # intrinsics: line [7-10), 3x3 matrix
        intrinsics = np.fromstring(' '.join(lines[7:10]), dtype=np.float32, sep=' ')
        intrinsics = intrinsics.reshape((3, 3))
        # depth_min & depth_interval: line 11
        depth_min = float(lines[11].split()[0])
        return intrinsics, extrinsics, depth_min

    def read_depth(self, filename):
        depth = np.array(read_pfm(filename)[0], dtype=np.float32) # (1200, 1600)
        if self.img_wh is None:
            depth = cv2.resize(depth, None, fx=0.5, fy=0.5,
                            interpolation=cv2.INTER_NEAREST) # (600, 800)
            depth_0 = depth[44:556, 80:720] # (512, 640)
        else:
            depth_0 = cv2.resize(depth, self.img_wh,
                                 interpolation=cv2.INTER_NEAREST)
        depth_1 = cv2.resize(depth_0, None, fx=0.5, fy=0.5,
                             interpolation=cv2.INTER_NEAREST)
        depth_2 = cv2.resize(depth_1, None, fx=0.5, fy=0.5,
                             interpolation=cv2.INTER_NEAREST)

        depths = {"level_0": torch.FloatTensor(depth_0),
                  "level_1": torch.FloatTensor(depth_1),
                  "level_2": torch.FloatTensor(depth_2)}
        
        return depths

    def read_mask(self, filename):
        mask = cv2.imread(filename, 0) # (1200, 1600)
       
        if self.img_wh is None:
            mask = cv2.resize(mask, None, fx=0.5, fy=0.5,
                            interpolation=cv2.INTER_NEAREST) # (600, 800)
            mask_0 = mask[44:556, 80:720] # (512, 640)
        else:
            mask_0 = cv2.resize(mask, self.img_wh,
                                interpolation=cv2.INTER_NEAREST)
        mask_1 = cv2.resize(mask_0, None, fx=0.5, fy=0.5,
                            interpolation=cv2.INTER_NEAREST)
        mask_2 = cv2.resize(mask_1, None, fx=0.5, fy=0.5,
                            interpolation=cv2.INTER_NEAREST)

        masks = {"level_0": torch.BoolTensor(mask_0),
                 "level_1": torch.BoolTensor(mask_1),
                 "level_2": torch.BoolTensor(mask_2)}

        return masks

    def define_transforms(self):
        if self.split == 'train': # you can add augmentation here
            self.transform = T.Compose([T.ToTensor(),
                                        T.Normalize(mean=[0.485, 0.456, 0.406], 
                                                    std=[0.229, 0.224, 0.225]),
                                       ])
        else:
            self.transform = T.Compose([T.ToTensor(),
                                        T.Normalize(mean=[0.485, 0.456, 0.406], 
                                                    std=[0.229, 0.224, 0.225]),
                                       ])
        self.unpreprocess = T.Compose([
            T.Normalize(mean=[0, 0, 0], std=[1/0.229, 1/0.224, 1/0.225]),
            T.Normalize(mean=[-0.485, -0.456, -0.406], std=[1, 1, 1]),
        ])
    
    def decode_batch(self, batch):
        imgs = batch['imgs']
        proj_mats = batch['proj_mats']
        depths = batch['depths']
        masks = batch['masks']
        init_depth_min = batch['init_depth_min']
        depth_interval = batch['depth_interval']
        return imgs, proj_mats, depths, masks, init_depth_min, depth_interval

    def __len__(self):
        return len(self.metas)


    


    def  __getitem__(self, idx):
       
        scan, ref_view,light_idx, src_views,target_light = self.metas[idx]
        # use only the reference view and first nviews-1 source views
        view_ids = [ref_view] + src_views[:self.n_views-1]

        # output_key = f"{scan}_{ref_view}_{src_views[0]}_{src_views[1]}"
        # if self.total_pkl:
        #     target_light = self.total_pkl[scan]
        #     target_light = np.argmin(target_light)
        # else:
        #     target_light = self.output_pkl[output_key]
        #     target_light = np.argmin(target_light)

        

        sample = {}
        imgs = []
        cams = []
        proj_mats = []
        target_imgs = []
        Ks = []
        Rs = []
        intensity_stats =[]
        prompt = str(np.random.choice(self.prompt_dir[scan][str(ref_view)],1)[0])
         
        sample['prompt'] = [f"modify the lightness of image to light_class_{self.light_class} style"]
        for i, vid in enumerate(view_ids):
        # NOTE that the id in image file names is from 1 to 49 (not 0~48)
        
            img_filename = os.path.join(self.root_dir,
                            f'Rectified/{scan}_train/rect_{vid+1:03d}_{light_idx}_r5000.png')
            target_filename = os.path.join(self.root_dir,
                            f'Rectified/{scan}_train/rect_{vid+1:03d}_{self.light_class}_r5000.png')
            mask_filename = os.path.join(self.root_dir,
                            f'Depths/{scan}/depth_visual_{vid:04d}.png')
            depth_filename = os.path.join(self.root_dir,
                            f'Depths/{scan}/depth_map_{vid:04d}.pfm')
    

            img = Image.open(img_filename)
            target_img = Image.open(target_filename)
            if self.img_wh is not None:
                img = img.resize(self.img_wh, Image.BILINEAR)
                target_img = target_img.resize(self.img_wh, Image.BILINEAR)

            img = self.transform(img)
            target_img = self.transform(target_img)
            imgs += [img]
            target_imgs += [target_img]

            proj_mat_ls, depth_min,K,R = self.proj_mats[vid]
            Ks += [K]
            Rs += [R]
        



            if i == 0:  # reference view
                
                sample['init_depth_min'] = torch.FloatTensor([depth_min])
                
                sample['masks'] = self.read_mask(mask_filename)
                for key in sample['masks']:
                    sample['masks'][key] = sample['masks'][key]
                sample['depths'] = self.read_depth(depth_filename)
                for key in sample['depths']:
                    sample['depths'][key] = sample['depths'][key]
                sample["depth"] = sample["depths"]["level_0"]
                ref_proj_inv = torch.inverse(proj_mat_ls)
            else:
                
                proj_mats += [proj_mat_ls @ ref_proj_inv]
            var, mean = torch.var_mean(img)
            intensity_stat = torch.stack([mean, var], dim=0)
            intensity_stats.append(intensity_stat)
    
    
        imgs = torch.stack(imgs) # (V, 3, H, W)
        target_imgs = torch.stack(target_imgs)
        proj_mats = torch.stack(proj_mats)[:,:,:3] # (V-1, self.levels, 3, 4) from fine to coarse
        
        imgs = self.unpreprocess(imgs)
        target_imgs = self.unpreprocess(target_imgs)
       
        Ks = np.stack(Ks)
        Rs = np.stack(Rs)
        sample['pose'] = torch.tensor(Rs)
        sample['K'] = torch.tensor(Ks)
        sample['images'] = imgs
        sample["intensity_stats"] = torch.stack(intensity_stats)
        sample['proj_mats'] = proj_mats
        sample['depth_interval'] = torch.FloatTensor([self.depth_interval])
        sample['scan_vid'] = (scan, ref_view)
        

        sample['target_imgs'] = target_imgs
        sample["bbox"] =torch.tensor([[-1, -1, -1], [1, 1, 1]], dtype=torch.float32)



        return sample







In [9]:
(val_data[0]["images"]-val_data[0]["target_imgs"]).abs().mean()

tensor(0.2805)

In [5]:
val_data = DTUDataset(dataconfig)

scan scan3 mean output: [3.05263341 2.92222905 2.79727794 2.86347311 2.84872966 2.86011306
 2.89029897]
scan scan5 mean output: [1.27925969 1.21662743 1.19024779 1.17714057 1.17585228 1.13693983
 1.13944231]
scan scan17 mean output: [4.23918102 4.3646479  4.26873698 4.152941   4.16164234 4.44407791
 4.15546879]
scan scan21 mean output: [6.7044504  6.78296111 6.79752936 6.78882487 6.74292021 6.64370545
 6.71224226]
scan scan28 mean output: [8.33500304 8.26974845 8.08564879 7.80853114 7.63656337 7.53135754
 7.33638762]
scan scan35 mean output: [1.18667069 1.32775589 1.20760566 1.06743625 1.15983699 0.91010288
 0.87788923]
scan scan37 mean output: [21.1885944  21.17826699 21.25191359 21.18731575 20.97766127 20.14655309
 19.91200681]
scan scan38 mean output: [1.69360119 1.60652492 1.57134287 1.54112932 1.56236016 1.63116413
 1.62020702]
scan scan40 mean output: [3.64535844 3.69948315 3.71793726 3.69029421 3.68635351 3.71370282
 3.58273512]
scan scan43 mean output: [2.2589177  2.24700747 2.

In [6]:
class runfig:
    pretrained_model_name_or_path = "/root/autodl-tmp/ViewDiff/output_var_second/all/subset_all/input_3/train/class6/saved_model_from_checkpoint-15000/"
    n_input_images =3
    n_output_noise =3

In [7]:
config_path = config_path = os.path.join(runfig.pretrained_model_name_or_path, "config.json")
if not os.path.isfile(str(config_path)):
        raise ValueError("cannot find config.json in ", config_path)
with open(config_path, "r") as f:
    config_data = json.load(f)
finetune_config = from_dict(FinetuneConfig, data=config_data, config=Config(cast=[tuple, int]))
runfig.cross_frame_attention = finetune_config.cross_frame_attention
runfig.model = finetune_config.model


In [8]:
pipeline = CustomInstructPix2pixDiffusionPipeline.from_pretrained(
        runfig.pretrained_model_name_or_path
    )
pipeline.scheduler.config.prediction_type = finetune_config.training.noise_prediction_type

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Some weights of the model checkpoint were not used when initializing UNet2DConditionCrossFrameInExistingAttnModel: 
 ['down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor.temb_proj.0.bias, down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor.temb_proj.0.weight, down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor.temb_proj.2.bias, down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor.temb_proj.2.weight, down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor.to_k_lora.down.weight, down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor.to_k_lora.up.weight, down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor.to_out_lora.down.weight, down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor.to_out_lora.up.weight, down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor.to_q_lora.down.weight, down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor.to_q_lora.up.weight, down_blocks.0.attentions.0.tr

In [9]:
replace_self_attention_with_cross_frame_attention(
                unet=pipeline.unet,
                n_input_images=runfig.n_input_images,
                to_k_other_frames=runfig.cross_frame_attention.to_k_other_frames,
                with_self_attention=runfig.cross_frame_attention.with_self_attention,
                random_others=runfig.cross_frame_attention.random_others,
                use_lora_in_cfa="cfa" in runfig.model.pose_cond_mode or "sa" in runfig.model.pose_cond_mode,
                use_temb_in_lora=runfig.cross_frame_attention.use_temb_cond,
                temb_out_size=8,
                pose_cond_dim=runfig.model.pose_cond_dim,
                rank=runfig.model.pose_cond_lora_rank,
            )

({},
 [Parameter containing:
  tensor([[ 0.0085, -0.0219,  0.0140,  ..., -0.0102, -0.0116,  0.0034],
          [-0.0202, -0.0160,  0.0122,  ..., -0.0062, -0.0178, -0.0263],
          [-0.0167,  0.0100,  0.0129,  ...,  0.0100,  0.0184, -0.0160],
          ...,
          [ 0.0174, -0.0044, -0.0089,  ...,  0.0210, -0.0241,  0.0076],
          [-0.0103,  0.0065,  0.0131,  ..., -0.0170, -0.0080,  0.0261],
          [ 0.0270,  0.0232, -0.0066,  ...,  0.0250,  0.0082, -0.0013]],
         requires_grad=True),
  Parameter containing:
  tensor([-2.6313e-03, -2.4328e-02,  1.2488e-02, -1.7717e-02,  2.6822e-02,
          -1.9169e-02, -8.4139e-03,  8.1025e-03,  2.6243e-02, -1.4524e-02,
          -2.0218e-02, -1.5498e-02, -2.4912e-02,  2.3118e-02,  2.0096e-02,
          -2.4976e-02,  1.6888e-02,  8.9045e-03, -2.1103e-02, -1.9572e-03,
           5.5837e-03,  8.4881e-03, -4.9214e-03, -1.1827e-02, -1.1982e-03,
          -1.0095e-02, -1.0001e-02, -2.1398e-02, -2.4084e-02,  1.4497e-02,
          -2.6465e-

In [10]:
runfig.cross_frame_attention.last_layer_mode,finetune_config.training.changed_cfa_last_layer

('zero-conv', 'no_residual_connection')

In [11]:
runfig.cross_frame_attention.with_self_attention,runfig.cross_frame_attention.random_others,

(True, True)

In [12]:
def update_cfa_config(
   
    pipeline: CustomInstructPix2pixDiffusionPipeline,
):
    if runfig.cross_frame_attention.mode == "add_in_existing_block":
        update_cross_frame_attention_config(
            pipeline.unet,
            runfig.n_input_images,
            runfig.cross_frame_attention.to_k_other_frames,
            runfig.cross_frame_attention.with_self_attention,
            runfig.cross_frame_attention.random_others,
            change_self_attention_layers=False,  # should have custom cfa layers
        )
    elif runfig.cross_frame_attention.mode == "pretrained":
        update_cross_frame_attention_config(
            pipeline.unet,
            3,
            2,
            runfig.cross_frame_attention.with_self_attention,
            runfig.cross_frame_attention.random_others,
            change_self_attention_layers=True,  # should have cfa is sa layers
        )
    else:
        raise NotImplementedError(
            f"did not implement different n_input_images for cfa.mode={runfig.cross_frame_attention.mode}"
        )

In [13]:
if finetune_config.training.changed_cfa_last_layer != runfig.cross_frame_attention.last_layer_mode:
        print("Change last-layer-mode to", finetune_config.training.changed_cfa_last_layer)
        update_last_layer_mode(
            pipeline.unet,
            finetune_config.training.changed_cfa_last_layer,
        )
update_vol_rend_inject_noise_sigma(
        pipeline.unet, 0.0
    )
    # disable n_novel_images
update_n_novel_images(
        pipeline.unet, 0

    )
update_cfa_config(pipeline)

Change last-layer-mode to no_residual_connection


In [14]:
if runfig.model.pose_cond_mode != "none":
        # Set correct lora layers
        unet_lora_attn_procs, unet_lora_parameters = add_pose_cond_to_attention_layers(
            pipeline.unet,
            rank=runfig.model.pose_cond_lora_rank,
            pose_cond_dim=runfig.model.pose_cond_dim,
            only_cross_attention="sa" not in runfig.model.pose_cond_mode,
        )

        if unet_lora_parameters is not None:
            in_dir = os.path.join(runfig.pretrained_model_name_or_path, "unet")
            try:
                lora_state_dict, network_alpha = LoraLoaderMixin.lora_state_dict(in_dir, weight_name="pytorch_lora_weights.safetensors")
            except:
                lora_state_dict, network_alpha = LoraLoaderMixin.lora_state_dict(in_dir, weight_name="pytorch_lora_weights.bin")
            lora_state_dict = {k.replace("unet.", ""): v for k, v in lora_state_dict.items()}
            pipeline.unet.load_state_dict(lora_state_dict, strict=False)
            print("Loaded LoRA weights into model")

Loaded LoRA weights into model


In [15]:
pipeline = pipeline.to("cuda")

In [16]:
runfig.model.pose_cond_mode

'sa-ca'

In [17]:
import torch
@torch.no_grad()
def process_batch(
    
   
    pipeline,
   
    batch,
    guidance_scale=16,
    image_guidance_scale: float = 1.0,
    
):
    
    model_config=runfig.model
    cfa_config=runfig.cross_frame_attention
    io_config=runfig
    orig_hw=(512, 640)
    num_inference_steps=50
    n_repeat_generation=1
    generator = torch.Generator("cuda").manual_seed(42)

    # combine
    
    batch["images"] = batch["images"].to("cuda").unsqueeze(0)
    batch["target_imgs"] = batch["target_imgs"].to("cuda").unsqueeze(0) 
    batch["pose"] = batch["pose"].to("cuda").unsqueeze(0)
    batch["K"] = batch["K"].to("cuda").unsqueeze(0)
    batch["intensity_stats"] = batch["intensity_stats"].to("cuda").unsqueeze(0)
    batch["bbox"] = batch["bbox"].to("cuda").unsqueeze(0)

    # check if need to change n_input_images
    if runfig.n_input_images != batch["pose"].shape[1]:
        runfig.n_input_images = batch["pose"].shape[1]
        runfig.cross_frame_attention.to_k_other_frames = batch["pose"].shape[1] - 1
        runfig.model.n_input_images = batch["pose"].shape[1]
        update_cfa_config(runfig, pipeline)

    # alwasy set to 0
    batch["intensity_stats"] *= 0

    # create images
    batch_size = len(batch["prompt"])

    batch["images"] = 2*batch["images"]-1
    batch["target_imgs"] = 2*batch["target_imgs"]-1
    # parse batch
    # collapse K dimension into batch dimension (no concatenation happening)
    batch["prompt"] = [cap for cap in batch["prompt"]]
    prompt = collapse_prompt_to_batch_dim(batch["prompt"],3)
   
    
    _, pose = collapse_tensor_to_batch_dim(batch["pose"])
    _, K = collapse_tensor_to_batch_dim(batch["K"])
    _, intensity_stats = collapse_tensor_to_batch_dim(batch["intensity_stats"])
    bbox = batch["bbox"]

    _, known_images = collapse_tensor_to_batch_dim(batch["images"])
    known_images = known_images.to(pipeline.device)
    known_images = known_images.squeeze(1)
    print(known_images.shape)

    K = K.squeeze(1)[..., :3, :3]
    pose = pose.squeeze(1)
    intensity_stats = intensity_stats.squeeze(1)

    # build cross_attention_kwargs
    cross_attention_kwargs = build_cross_attention_kwargs(
        model_config=runfig.model,
        cfa_config=runfig.cross_frame_attention,
        pose=pose,
        K=K,
        intensity_stats=intensity_stats,
        bbox=bbox,
        orig_hw=orig_hw,
    )
    if "pose_cond" in cross_attention_kwargs:
            cross_attention_kwargs["pose_cond"] = torch.cat([cross_attention_kwargs["pose_cond"]] * 3)
    if "unproj_reproj_kwargs" in cross_attention_kwargs:
        proj_kwargs = cross_attention_kwargs["unproj_reproj_kwargs"]
        proj_kwargs["pose"] = torch.cat([proj_kwargs["pose"]] * 3)
        proj_kwargs["K"] = torch.cat([proj_kwargs["K"]] * 3)
        proj_kwargs["bbox"] = torch.cat([proj_kwargs["bbox"]] * 3)

    outputs = []
    all_psnrs = []
    all_lpipses = []
    all_ssims = []
    for _ in range(n_repeat_generation):
        output = pipeline(
            prompt=prompt,
            height=orig_hw[0],
            width=orig_hw[1],
            known_images=known_images,
            output_type="pt",  # return tensor normalized to [0, 1]
            generator=generator,
            cross_attention_kwargs=cross_attention_kwargs,
            guidance_scale=guidance_scale,
            image_guidance_scale=image_guidance_scale,
            decode_all_timesteps=True,
            num_inference_steps=num_inference_steps,
            n_images_per_batch=model_config.n_input_images,
        )

        # re-create K dimension from batch dimension
        output.images = output.images.unsqueeze(1)
        expand_output_to_k(output, batch_size, model_config.n_input_images)

        outputs.append(output)
   



    

    return outputs



In [18]:
from torchvision.utils import save_image
import matplotlib.pyplot as plt
# batch = val_data[0]
# save_image(outputs[0].images[0], "output.png")
# save_image(batch["images"], "input.png")
# save_image(batch["target_imgs"], "target.png")

# plt.imshow(plt.imread("output.png"))
# plt.show()
# plt.imshow(plt.imread("input.png"))
# plt.show()
# plt.imshow(plt.imread("target.png"))
# plt.show()

In [19]:
from pandas import DataFrame
df = DataFrame(val_data.metas,columns=["scan","ref_view","light_idx","src_views","target_light"])    
df.head()                                       

Unnamed: 0,scan,ref_view,light_idx,src_views,target_light
0,scan3,0,0,"[10, 1, 9, 12, 11, 13, 2, 8, 14, 27]",6
1,scan3,1,0,"[9, 10, 2, 0, 8, 13, 14, 12, 7, 15]",0
2,scan3,2,0,"[8, 1, 7, 9, 3, 15, 14, 16, 6, 10]",5
3,scan3,3,0,"[7, 6, 2, 4, 8, 5, 17, 16, 1, 15]",2
4,scan3,4,0,"[5, 6, 3, 7, 18, 2, 17, 8, 16, 1]",1


In [20]:
df["scan"].unique()

array(['scan3', 'scan5', 'scan17', 'scan21', 'scan28', 'scan35', 'scan37',
       'scan38', 'scan40', 'scan43', 'scan56', 'scan59', 'scan66',
       'scan67', 'scan82', 'scan86', 'scan106', 'scan117'], dtype=object)

In [21]:
from math import sqrt
import sys
sys.path.append('/root/autodl-tmp/project/dp_simple/')
#import ViT
from torchvision import transforms as T
from CasMVSNet_pl.models.mvsnet import CascadeMVSNet
from CasMVSNet_pl.utils import load_ckpt
from CasMVSNet_pl.datasets.dtu import DTUDataset  
from CasMVSNet_pl.utils import *
from CasMVSNet_pl.datasets.dtu import DTUDataset 
from CasMVSNet_pl.metrics import *  
from inplace_abn import ABN

import pytorch_ssim
import pytorch_lightning as pl
import pytorch_ssim
import pytorch_lightning as pl
import sys
sys.path.append('/root/autodl-tmp/D3Dnet/code')

import matplotlib.pyplot as plt

import functools
import torch.nn.functional as F
import torch
from collections import namedtuple
from torchvision import models
import torch.nn as nn
import sys
from einops import rearrange
from torchvision import models
import sys

from CasMVSNet_pl.datasets.utils import save_pfm, read_pfm
import cv2
import torch
import os, shutil
import numpy as np
from tqdm import tqdm
from argparse import ArgumentParser

# for depth prediction
from CasMVSNet_pl.models.mvsnet import CascadeMVSNet
from CasMVSNet_pl.utils import load_ckpt
from inplace_abn import ABN

# for point cloud fusion
from numba import jit
from plyfile import PlyData, PlyElement

torch.backends.cudnn.benchmark = True # this increases inference speed a little

In [22]:
torch.backends.cudnn.benchmark = True # this increases inference speed a little

def get_opts():
    parser = ArgumentParser()
    parser.add_argument('--root_dir', type=str,
                        default='/root/autodl-tmp/mvs_training/dtu',
                        help='root directory of dtu dataset')
    parser.add_argument('--dataset_name', type=str, default='dtu',
                        choices=['dtu', 'tanks', 'blendedmvs'],
                        help='which dataset to train/val')
    parser.add_argument('--split', type=str, default='train',
                        help='which split to evaluate')
    parser.add_argument('--scan', type=str, default='scan7',
                        help='specify scan to evaluate (must be in the split)')
    parser.add_argument('--cpu', default=False, action='store_true',
                        help='''use cpu to do depth inference.
                                WARNING: It is going to be EXTREMELY SLOW!
                                about 37s/view, so in total 30min/scan. 
                             ''')
    # for depth prediction
    parser.add_argument('--n_views', type=int, default=3,
                        help='number of views (including ref) to be used in testing')
    parser.add_argument('--depth_interval', type=float, default=2.65,
                        help='depth interval unit in mm')
    parser.add_argument('--n_depths', nargs='+', type=int, default=[8,32,48],
                        help='number of depths in each level')
    parser.add_argument('--interval_ratios', nargs='+', type=float, default=[1.0,2.0,4.0],
                        help='depth interval ratio to multiply with --depth_interval in each level')
    parser.add_argument('--num_groups', type=int, default=1, choices=[1, 2, 4, 8],
                        help='number of groups in groupwise correlation, must be a divisor of 8')
    parser.add_argument('--img_wh', nargs="+", type=int, default=[640,512],
                        help='resolution (img_w, img_h) of the image, must be multiples of 32')
    parser.add_argument('--ckpt_path', type=str, default='/root/autodl-tmp/project/dp_simple/CasMVSNet_pl/ckpts/_ckpt_epoch_10.ckpt',
                        help='pretrained checkpoint path to load')
    parser.add_argument('--save_visual', default=False, action='store_true',
                        help='save depth and proba visualization or not')

    # for point cloud fusion
    parser.add_argument('--conf', type=float, default=0.999,
                        help='min confidence for pixel to be valid')
    parser.add_argument('--min_geo_consistent', type=int, default=5,
                        help='min number of consistent views for pixel to be valid')
    parser.add_argument('--max_ref_views', type=int, default=400,
                        help='max number of ref views (to limit RAM usage)')
    parser.add_argument('--skip', type=int, default=1,
                        help='''how many points to skip when creating the point cloud.
                                Larger = fewer points and smaller file size.
                                Ref: skip=10 creates ~= 3M points = 50MB file
                                     skip=1 creates ~= 30M points = 500MB file
                             ''')
    args, _ = parser.parse_known_args()
    return args
  
    

In [23]:
def decode_batch(batch):
    imgs = batch['images']
    proj_mats = batch['proj_mats']
    init_depth_min = batch['init_depth_min'].item()
    depth_interval = batch['depth_interval'].item()
    scan, vid = batch['scan_vid']
    return imgs, proj_mats, init_depth_min, depth_interval, \
           scan, vid


# define read_image and read_proj_mat for each dataset

def read_image(dataset_name, root_dir, scan, vid):
    if dataset_name == 'dtu':
        return cv2.imread(os.path.join(root_dir,
                    f'Rectified/{scan}_train/rect_{vid+1:03d}_3_r5000.png'))
    if dataset_name == 'tanks':
        return cv2.imread(os.path.join(root_dir, scan,
                    f'images/{vid:08d}.jpg'))
    if dataset_name == 'blendedmvs':
        return cv2.imread(os.path.join(root_dir, scan,
                    f'blended_images/{vid:08d}.jpg'))


def read_refined_image(dataset_name, scan, vid):
    return cv2.imread(f'results/{dataset_name}/image_refined/{scan}/{vid:08d}.png')


def save_refined_image(image_refined, dataset_name, scan, vid):
    cv2.imwrite(f'results/{dataset_name}/image_refined/{scan}/{vid:08d}.png',
                image_refined)


def read_proj_mat(dataset_name, dataset, scan, vid):
    if dataset_name == 'dtu':
        return dataset.proj_mats[vid][0][0].numpy()
    if dataset_name in ['tanks', 'blendedmvs']:
        return dataset.proj_mats[scan][vid][0][0].numpy()


@jit(nopython=True, fastmath=True)
def xy_ref2src(xy_ref, depth_ref, P_world2ref,
               depth_src, P_world2src, img_wh):
    # create ref grid and project to ref 3d coordinate using depth_ref
    xyz_ref = np.vstack((xy_ref, np.ones_like(xy_ref[:1]))) * depth_ref
    xyz_ref_h = np.vstack((xyz_ref, np.ones_like(xy_ref[:1])))

    P = (P_world2src @ np.ascontiguousarray(np.linalg.inv(P_world2ref)))[:3]
    # project to src 3d coordinate using P_world2ref and P_world2src
    xyz_src_h = P @ xyz_ref_h.reshape(4,-1)
    xy_src = xyz_src_h[:2]/xyz_src_h[2:3]
    xy_src = xy_src.reshape(2, img_wh[1], img_wh[0])

    return xy_src


@jit(nopython=True, fastmath=True)
def xy_src2ref(xy_ref, xy_src, depth_ref, P_world2ref,
               depth_src2ref, P_world2src, img_wh):
    # project xy_src back to ref view using the sampled depth
    xyz_src = np.vstack((xy_src, np.ones_like(xy_src[:1]))) * depth_src2ref
    xyz_src_h = np.vstack((xyz_src, np.ones_like(xy_src[:1])))
    P = (P_world2ref @ np.ascontiguousarray(np.linalg.inv(P_world2src)))[:3]
    xyz_ref_h = P @ xyz_src_h.reshape(4,-1)
    depth_ref_reproj = xyz_ref_h[2].reshape(img_wh[1], img_wh[0])
    xy_ref_reproj = xyz_ref_h[:2]/xyz_ref_h[2:3]
    xy_ref_reproj = xy_ref_reproj.reshape(2, img_wh[1], img_wh[0])

    # check |p_reproj-p_1| < 1
    pixel_diff = xy_ref_reproj - xy_ref
    mask_pixel_reproj = (pixel_diff[0]**2+pixel_diff[1]**2)<1

    # check |d_reproj-d_1| / d_1 < 0.01
    mask_depth_reproj = np.abs((depth_ref_reproj-depth_ref)/depth_ref)<0.01

    mask_geo = mask_pixel_reproj & mask_depth_reproj

    return depth_ref_reproj, mask_geo


def check_geo_consistency(depth_ref, P_world2ref,
                          depth_src, P_world2src,
                          image_ref, image_src,
                          img_wh):
    """
    Check the geometric consistency between ref and src views.
    """
    xy_ref = np.mgrid[:img_wh[1],:img_wh[0]][::-1].astype(np.float32)
    xy_src = xy_ref2src(xy_ref, depth_ref, P_world2ref,
                        depth_src, P_world2src, img_wh)

    # Sample the depth of xy_src using bilinear interpolation
    depth_src2ref = cv2.remap(depth_src,
                              xy_src[0].astype(np.float32),
                              xy_src[1].astype(np.float32),
                              interpolation=cv2.INTER_LINEAR)

    image_src2ref = cv2.remap(image_src,
                              xy_src[0].astype(np.float32),
                              xy_src[1].astype(np.float32),
                              interpolation=cv2.INTER_LINEAR)

    depth_ref_reproj, mask_geo = \
        xy_src2ref(xy_ref, xy_src, depth_ref, P_world2ref, 
                   depth_src2ref, P_world2src, img_wh)

    depth_ref_reproj[~mask_geo] = 0
    image_src2ref[~mask_geo] = 0
    
    return depth_ref_reproj, mask_geo, image_src2ref

In [24]:
def abs_error(depth_pred, depth_gt, mask):
    depth_pred, depth_gt = depth_pred[mask], depth_gt[mask]
    return np.abs(depth_pred - depth_gt)

def acc_threshold(depth_pred, depth_gt, mask, threshold):
    """
    computes the percentage of pixels whose depth error is less than @threshold
    """
    errors = abs_error(depth_pred, depth_gt, mask)
    acc_mask = errors < threshold
    return acc_mask.mean()

def return_log(result1,result2,gt_depth,mask):
    depth_pred = result1["depth_0"][0].cpu().numpy()
    ori_pred = result2["depth_0"][0].cpu().numpy()

    print(depth_pred.shape, ori_pred.shape, gt_depth.shape, mask.shape)
    

    abs_error1 = abs_error(depth_pred, gt_depth, mask).mean()
    abs_error2 = abs_error(ori_pred, gt_depth, mask).mean()
    print(f"depth modified is {abs_error1},original error is {abs_error2} ")
    abs_diff = abs_error1 - abs_error2
    abs_ratio = abs_error1 / abs_error2

    acc1mm1 = acc_threshold(depth_pred, gt_depth, mask, 1)
    acc1mm2 = acc_threshold(ori_pred, gt_depth, mask, 1)
    acc_diff = acc1mm1 - acc1mm2
    acc_ratio = acc1mm1 / (acc1mm2+1e-7)

    acc2mm1 = acc_threshold(depth_pred, gt_depth, mask, 2)
    acc2mm2 = acc_threshold(ori_pred, gt_depth, mask, 2)
    acc_diff2 = acc2mm1 - acc2mm2
    acc_ratio2 = acc2mm1 / (acc2mm2+1e-7)

    acc3mm1 = acc_threshold(depth_pred, gt_depth, mask, 3)
    acc3mm2 = acc_threshold(ori_pred, gt_depth, mask, 3)
    acc_diff3 = acc3mm1 - acc3mm2
    acc_ratio3 = acc3mm1 / (acc3mm2+1e-7)

    acc4mm1 = acc_threshold(depth_pred, gt_depth, mask, 4)
    acc4mm2 = acc_threshold(ori_pred, gt_depth, mask, 4)
    acc_diff4 = acc4mm1 - acc4mm2
    acc_ratio4 = acc4mm1 / (acc4mm2+1e-7)

    return {"abs_diff":abs_diff,"abs_ratio":abs_ratio,"acc_diff1":acc_diff,"acc_ratio1":acc_ratio,
            "acc_diff2":acc_diff2,"acc_ratio2":acc_ratio2,"acc_diff3":acc_diff3,"acc_ratio3":acc_ratio3,
            "acc_diff4":acc_diff4,"acc_ratio4":acc_ratio4}



In [25]:
args = get_opts()

In [26]:
model = CascadeMVSNet(n_depths=args.n_depths,
                        interval_ratios=args.interval_ratios,
                        num_groups=args.num_groups,
                        norm_act=ABN)
device = 'cpu' if args.cpu else 'cuda:0'
model.to(device)
load_ckpt(model, args.ckpt_path)
model.eval()


CascadeMVSNet(
  (feature): FeatureNet(
    (conv0): Sequential(
      (0): ConvBnReLU(
        (conv): Conv2d(3, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): ABN(8, eps=1e-05, momentum=0.1, affine=True, activation=leaky_relu[0.01])
      )
      (1): ConvBnReLU(
        (conv): Conv2d(8, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): ABN(8, eps=1e-05, momentum=0.1, affine=True, activation=leaky_relu[0.01])
      )
    )
    (conv1): Sequential(
      (0): ConvBnReLU(
        (conv): Conv2d(8, 16, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2), bias=False)
        (bn): ABN(16, eps=1e-05, momentum=0.1, affine=True, activation=leaky_relu[0.01])
      )
      (1): ConvBnReLU(
        (conv): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): ABN(16, eps=1e-05, momentum=0.1, affine=True, activation=leaky_relu[0.01])
      )
      (2): ConvBnReLU(
        (conv): Conv2d(16, 16, k

In [27]:
val_data[0]["depths"]["level_0"].shape

torch.Size([512, 640])

In [28]:
transform = T.Compose([T.Normalize(mean=[0.485, 0.456, 0.406], 
std=[0.229, 0.224, 0.225])])

In [29]:
refine = True
read_gt = False

In [30]:
args.save_visual=True   

In [31]:

print('Creating depth and confidence predictions...')
for scan in ["scan106"]:
    depth_dir = f'./results/{args.dataset_name}/depth'
    depth_dir = os.path.join(depth_dir, scan)

    img_dir = f'./results/{args.dataset_name}/image_modified'
    img_dir = os.path.join(img_dir, scan)

    os.makedirs(depth_dir, exist_ok=True)
    os.makedirs(img_dir, exist_ok=True)

    abs_ratio = []
    acc_ratio1 = []
    acc_ratio2 = []
    acc_ratio3 = []
    acc_ratio4 = []
    acc_diff1 = []
    acc_diff2 = []
    acc_diff3 = []
    acc_diff4 = []
    abs_diff = []

    data_range = [i for i, x in enumerate(val_data.metas) if x[0] == scan]
    print(f'Processing {scan} with {len(data_range)} views')
    for i in tqdm(data_range):
        batch =  val_data[i]
       
        imgs, proj_mats, init_depth_min, depth_interval, \
            scan, vid = decode_batch(batch)
        proj_mats = proj_mats.unsqueeze(0).to("cuda")
        imgs = imgs.unsqueeze(0).to(device)
           
       
        
        
        
        os.makedirs(os.path.join(depth_dir, scan), exist_ok=True)

        with torch.no_grad():
           
            
            
            if refine == True:
                # whether image exist or not
                modified_path = os.path.join(img_dir, f'{vid:04d}_class6.npy')
                if os.path.exists(modified_path):
                    np_array = np.load(modified_path)
                    modified_imgs = torch.tensor(np_array).unsqueeze(0).cuda()
                    results_modified = model(transform(modified_imgs), proj_mats, init_depth_min, depth_interval)
                else:
                    modified_imgs = process_batch(pipeline,batch)[0].images
                    results_modified = model(transform(modified_imgs), proj_mats, init_depth_min, depth_interval)
                imgs_original = imgs[0]
                pred_imgs = modified_imgs[0]
                torch.stack([imgs_original, pred_imgs], dim=0)
                save_image(torch.cat([imgs_original, pred_imgs], dim=0), 
                           os.path.join(img_dir, f'{vid:04d}_class6.png'))
                np.save(os.path.join(img_dir, f'{vid:04d}_class6.npy'), pred_imgs.cpu().numpy())
            
                results_ori = model(transform(imgs), proj_mats, init_depth_min, depth_interval)

                metric_logs = return_log(results_modified,
                                        results_ori,
                                        val_data[i]["depths"]["level_0"].numpy(),
                                        val_data[i]["masks"]["level_0"].numpy())
                abs_ratio.append(metric_logs["abs_ratio"])
                acc_ratio1.append(metric_logs["acc_ratio1"])
                acc_ratio2.append(metric_logs["acc_ratio2"])
                acc_ratio3.append(metric_logs["acc_ratio3"])
                acc_ratio4.append(metric_logs["acc_ratio4"])
                acc_diff1.append(metric_logs["acc_diff1"])
                acc_diff2.append(metric_logs["acc_diff2"])
                acc_diff3.append(metric_logs["acc_diff3"])
                acc_diff4.append(metric_logs["acc_diff4"])
                abs_diff.append(metric_logs["abs_diff"])

                # print output
                sys.stdout.write(f'\r{scan} {vid:04d} '
                                f'abs_diff: {np.mean(abs_diff)} '
                                f'abs_ratio: {np.mean(abs_ratio)} '
                                f'acc_diff1: {np.mean(acc_diff1)} '
                                f'acc_ratio1: {np.mean(acc_ratio1)} '
                                f'acc_diff2: {np.mean(acc_diff2)} '
                                f'acc_ratio2: {np.mean(acc_ratio2)} '
                                f'acc_diff3: {np.mean(acc_diff3)} '
                                f'acc_ratio3: {np.mean(acc_ratio3)} '
                                f'acc_diff4: {np.mean(acc_diff4)} '
                                f'acc_ratio4: {np.mean(acc_ratio4)} ')
                
                sys.stdout.flush()
            else:
                results_ori = model(transform(imgs), proj_mats, init_depth_min, depth_interval)


            
        if refine == True:
            depth = results_modified['depth_0'][0].cpu().numpy()
            depth = np.nan_to_num(depth)
            proba = results_modified['confidence_2'][0].cpu().numpy()
            proba = np.nan_to_num(proba)
            save_pfm(os.path.join(depth_dir, f'{scan}/depth_refined_{vid:04d}.pfm'), depth)
            save_pfm(os.path.join(depth_dir, f'{scan}/proba_refined_{vid:04d}.pfm'), proba)
        else:   
            depth = results_ori['depth_0'][0].cpu().numpy()
            depth = np.nan_to_num(depth) # change nan to 0
            proba = results_ori['confidence_2'][0].cpu().numpy() # NOTE: this is 1/4 scale!
            proba = np.nan_to_num(proba) # change nan to 0
            save_pfm(os.path.join(depth_dir, f'{scan}/depth_{vid:04d}.pfm'), depth)
            save_pfm(os.path.join(depth_dir, f'{scan}/proba_{vid:04d}.pfm'), proba)
        if args.save_visual:
            mi = np.min(depth[depth>0])
            ma = np.max(depth)
            depth = (depth-mi)/(ma-mi+1e-8)
            depth = (255*depth).astype(np.uint8)
            depth_img = cv2.applyColorMap(depth, cv2.COLORMAP_JET)
            
            cv2.imwrite(os.path.join(depth_dir, f'{scan}/depth_visual_{vid:04d}.jpg'),
                        depth_img)
            cv2.imwrite(os.path.join(depth_dir, f'{scan}/proba_visual_{vid:04d}.jpg'),
                        (255*(proba>args.conf)).astype(np.uint8))
        del imgs, proj_mats, results_ori
    
    torch.cuda.empty_cache()

Creating depth and confidence predictions...
Processing scan106 with 49 views


  0%|          | 0/49 [00:00<?, ?it/s]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 12.391034126281738,original error is 3.7918593883514404 
scan106 0000 abs_diff: 8.599174499511719 abs_ratio: 3.267798900604248 acc_diff1: -0.2826278077156093 acc_ratio1: 0.5943789475340991 acc_diff2: -0.19801939607067864 acc_ratio2: 0.7561123831708602 acc_diff3: -0.1485042978376433 acc_ratio3: 0.8257505952749943 acc_diff4: -0.12468967653476304 acc_ratio4: 0.8576279551688266 

  2%|▏         | 1/49 [00:03<02:56,  3.67s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 3.114622116088867,original error is 1.9095314741134644 
scan106 0001 abs_diff: 4.902132511138916 abs_ratio: 2.4494457244873047 acc_diff1: -0.2844683343831733 acc_ratio1: 0.6065192344131805 acc_diff2: -0.18622738480272183 acc_ratio2: 0.7780471944761118 acc_diff3: -0.12643567254952282 acc_ratio3: 0.8557358625133635 acc_diff4: -0.09665383079381018 acc_ratio4: 0.8919993651788007 

  4%|▍         | 2/49 [00:05<01:53,  2.42s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 3.692985773086548,original error is 1.7375462055206299 
scan106 0002 abs_diff: 3.9199016094207764 abs_ratio: 2.3414313793182373 acc_diff1: -0.2857470917445683 acc_ratio1: 0.6085831449312121 acc_diff2: -0.1807002984306126 acc_ratio2: 0.7871294096041369 acc_diff3: -0.1199816726901232 acc_ratio3: 0.8646265015750637 acc_diff4: -0.09139592380356105 acc_ratio4: 0.899108557576854 

  6%|▌         | 3/49 [00:06<01:32,  2.02s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 4.639885902404785,original error is 3.7749674320220947 
scan106 0003 abs_diff: 3.156155824661255 abs_ratio: 2.0633535385131836 acc_diff1: -0.27953153877258996 acc_ratio1: 0.6236442933521972 acc_diff2: -0.17112123129650342 acc_ratio2: 0.799736161232665 acc_diff3: -0.1119216037679408 acc_ratio3: 0.8744136457263529 acc_diff4: -0.08297620453391188 acc_ratio4: 0.9088173454256536 

  8%|▊         | 4/49 [00:08<01:22,  1.84s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 13.902609825134277,original error is 11.890796661376953 
scan106 0004 abs_diff: 2.9272873401641846 abs_ratio: 1.8845208883285522 acc_diff1: -0.2650882055955809 acc_ratio1: 0.6461344634561652 acc_diff2: -0.15162467484672737 acc_ratio2: 0.8222298803460811 acc_diff3: -0.09925006642916281 acc_ratio3: 0.8882525679098107 acc_diff4: -0.07427662745571026 acc_ratio4: 0.9180192052823408 

 10%|█         | 5/49 [00:09<01:16,  1.74s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 11.308504104614258,original error is 8.902156829833984 
scan106 0005 abs_diff: 2.840463876724243 abs_ratio: 1.782152533531189 acc_diff1: -0.2672670994942361 acc_ratio1: 0.6476739346816544 acc_diff2: -0.14749502836297648 acc_ratio2: 0.8278028203333562 acc_diff3: -0.09813164540829612 acc_ratio3: 0.8898070746102805 acc_diff4: -0.07546574152303577 acc_ratio4: 0.9168837631177023 

 12%|█▏        | 6/49 [00:11<01:12,  1.68s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 4.116981506347656,original error is 2.5493357181549072 
scan106 0006 abs_diff: 2.658632755279541 abs_ratio: 1.7582626342773438 acc_diff1: -0.26855513923401336 acc_ratio1: 0.6479139175906019 acc_diff2: -0.1460204231775369 acc_ratio2: 0.829894267545983 acc_diff3: -0.09447041578798922 acc_ratio3: 0.8939675077422543 acc_diff4: -0.07144080623899714 acc_ratio4: 0.9213096519169565 

 14%|█▍        | 7/49 [00:12<01:08,  1.64s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 3.099046230316162,original error is 1.9533439874649048 
scan106 0007 abs_diff: 2.4695165157318115 abs_ratio: 1.7367966175079346 acc_diff1: -0.27251239867183785 acc_ratio1: 0.6445507033387343 acc_diff2: -0.14604941290231394 acc_ratio2: 0.8304418251217239 acc_diff3: -0.09291518296815393 acc_ratio3: 0.895936204103272 acc_diff4: -0.06919988872081212 acc_ratio4: 0.9238856513031894 

 16%|█▋        | 8/49 [00:14<01:05,  1.61s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.558220624923706,original error is 1.588063359260559 
scan106 0008 abs_diff: 2.3029210567474365 abs_ratio: 1.7228087186813354 acc_diff1: -0.27506128461033214 acc_ratio1: 0.6431079936547399 acc_diff2: -0.14645608766823415 acc_ratio2: 0.8308539629147247 acc_diff3: -0.09257598621244664 acc_ratio3: 0.8968334454763904 acc_diff4: -0.06827841425425107 acc_ratio4: 0.9252475103985852 

 18%|█▊        | 9/49 [00:16<01:03,  1.59s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.8737165927886963,original error is 1.5703096389770508 
scan106 0009 abs_diff: 2.202969789505005 abs_ratio: 1.7335309982299805 acc_diff1: -0.27613201191919035 acc_ratio1: 0.6419066972812588 acc_diff2: -0.14971243944798734 acc_ratio2: 0.8279572116778373 acc_diff3: -0.09492364319781554 acc_ratio3: 0.8947742700135761 acc_diff4: -0.06970944095771846 acc_ratio4: 0.9240710734345173 

 20%|██        | 10/49 [00:17<01:05,  1.67s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 3.7178595066070557,original error is 2.0933971405029297 
scan106 0010 abs_diff: 2.1503782272338867 abs_ratio: 1.7373912334442139 acc_diff1: -0.27454696276100976 acc_ratio1: 0.6414694456891956 acc_diff2: -0.14963935672865283 acc_ratio2: 0.8276423146488354 acc_diff3: -0.09429123864307355 acc_ratio3: 0.8953874054668951 acc_diff4: -0.0690162694942284 acc_ratio4: 0.9248199177239038 

 22%|██▏       | 11/49 [00:19<01:02,  1.64s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 3.4005861282348633,original error is 1.8767772912979126 
scan106 0011 abs_diff: 2.0981640815734863 abs_ratio: 1.7436026334762573 acc_diff1: -0.2729927895168346 acc_ratio1: 0.6445995019993563 acc_diff2: -0.14680210864354967 acc_ratio2: 0.8311140475411957 acc_diff3: -0.09222679112874538 acc_ratio3: 0.8977952535930606 acc_diff4: -0.06771801417274288 acc_ratio4: 0.9263275467342419 

 24%|██▍       | 12/49 [00:21<00:59,  1.62s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 4.231178283691406,original error is 1.9870718717575073 
scan106 0012 abs_diff: 2.1093904972076416 abs_ratio: 1.7732758522033691 acc_diff1: -0.27246817552906577 acc_ratio1: 0.645428339352783 acc_diff2: -0.14740220334509424 acc_ratio2: 0.8304287141972349 acc_diff3: -0.09408478255322364 acc_ratio3: 0.8957918945324918 acc_diff4: -0.07008362858366465 acc_ratio4: 0.9238385176787934 

 27%|██▋       | 13/49 [00:22<00:57,  1.60s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 4.697841167449951,original error is 2.120145559310913 
scan106 0013 abs_diff: 2.14284086227417 abs_ratio: 1.8048855066299438 acc_diff1: -0.27112133379711456 acc_ratio1: 0.646386845984965 acc_diff2: -0.14682023538675473 acc_ratio2: 0.8306227726942542 acc_diff3: -0.09421672272612469 acc_ratio3: 0.8954393396598684 acc_diff4: -0.07086441141126491 acc_ratio4: 0.9228975989577375 

 29%|██▊       | 14/49 [00:24<00:55,  1.59s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.9092884063720703,original error is 1.477778673171997 
scan106 0014 abs_diff: 2.095418691635132 abs_ratio: 1.8158057928085327 acc_diff1: -0.27164022643464486 acc_ratio1: 0.6467669690195093 acc_diff2: -0.14709732907752315 acc_ratio2: 0.8308694663763115 acc_diff3: -0.09379681505855367 acc_ratio3: 0.8962126887674361 acc_diff4: -0.06980221351772513 acc_ratio4: 0.9242172047650211 

 31%|███       | 15/49 [00:25<00:53,  1.58s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 3.707930564880371,original error is 1.5906322002410889 
scan106 0015 abs_diff: 2.0967860221862793 abs_ratio: 1.8480119705200195 acc_diff1: -0.2730913808228551 acc_ratio1: 0.645564594951272 acc_diff2: -0.1476597199478685 acc_ratio2: 0.8303786755724939 acc_diff3: -0.0939770263713704 acc_ratio3: 0.8960770750094519 acc_diff4: -0.06992176765073461 acc_ratio4: 0.9241307747221046 

 33%|███▎      | 16/49 [00:27<00:51,  1.57s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 3.3369882106781006,original error is 2.547083854675293 
scan106 0016 abs_diff: 2.0199108123779297 abs_ratio: 1.8163713216781616 acc_diff1: -0.2715553132486029 acc_ratio1: 0.6477116646281695 acc_diff2: -0.14600358742198088 acc_ratio2: 0.8323161734217219 acc_diff3: -0.0932677329282698 acc_ratio3: 0.8968785079545742 acc_diff4: -0.06953024906020178 acc_ratio4: 0.9245569884163233 

 35%|███▍      | 17/49 [00:28<00:49,  1.56s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.8527886867523193,original error is 1.354295015335083 
scan106 0017 abs_diff: 1.9909430742263794 abs_ratio: 1.8324881792068481 acc_diff1: -0.27066804079234175 acc_ratio1: 0.6492135970955057 acc_diff2: -0.14471626517692038 acc_ratio2: 0.8339792476206137 acc_diff3: -0.09249942073706709 acc_ratio3: 0.8978277486659513 acc_diff4: -0.06942447939306656 acc_ratio4: 0.9247658321633251 

 37%|███▋      | 18/49 [00:30<00:48,  1.55s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.642282247543335,original error is 1.6392204761505127 
scan106 0018 abs_diff: 1.9389493465423584 abs_ratio: 1.8208791017532349 acc_diff1: -0.27093244642653963 acc_ratio1: 0.6502052014539965 acc_diff2: -0.1440567466323344 acc_ratio2: 0.8351737057436912 acc_diff3: -0.09154333384146557 acc_ratio3: 0.8990526371319045 acc_diff4: -0.0681917498631067 acc_ratio4: 0.9261811624358406 

 39%|███▉      | 19/49 [00:31<00:46,  1.55s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 3.3543238639831543,original error is 2.218082904815674 
scan106 0019 abs_diff: 1.8988139629364014 abs_ratio: 1.805448293685913 acc_diff1: -0.27095501342913797 acc_ratio1: 0.6513309909011923 acc_diff2: -0.1423666695875605 acc_ratio2: 0.8373987567697407 acc_diff3: -0.08976149444931177 acc_ratio3: 0.9011248514235921 acc_diff4: -0.06663532799483152 acc_ratio4: 0.9279203527282502 

 41%|████      | 20/49 [00:33<00:45,  1.55s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.3832926750183105,original error is 1.399966835975647 
scan106 0020 abs_diff: 1.8552193641662598 abs_ratio: 1.8005409240722656 acc_diff1: -0.27336089761043236 acc_ratio1: 0.6490278509796626 acc_diff2: -0.14422977599946382 acc_ratio2: 0.835899531230907 acc_diff3: -0.08952741213594673 acc_ratio3: 0.9016266497555518 acc_diff4: -0.06548677815756085 acc_ratio4: 0.9292661317812452 

 43%|████▎     | 21/49 [00:35<00:43,  1.55s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.6448371410369873,original error is 1.190735936164856 
scan106 0021 abs_diff: 1.8369866609573364 abs_ratio: 1.8196609020233154 acc_diff1: -0.275537074146736 acc_ratio1: 0.6473996564897176 acc_diff2: -0.14525055777513315 acc_ratio2: 0.8350518870130318 acc_diff3: -0.08969505452193567 acc_ratio3: 0.9015851323140005 acc_diff4: -0.06535341394897003 acc_ratio4: 0.9295047170131336 

 45%|████▍     | 22/49 [00:36<00:41,  1.55s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.8815035820007324,original error is 1.1774959564208984 
scan106 0022 abs_diff: 1.8312050104141235 abs_ratio: 1.8469427824020386 acc_diff1: -0.2771992434362513 acc_ratio1: 0.645871767606137 acc_diff2: -0.14581522839208863 acc_ratio2: 0.8345444615200573 acc_diff3: -0.08980187190284125 acc_ratio3: 0.9015581875640113 acc_diff4: -0.06552922738361404 acc_ratio4: 0.9293966037858329 

 47%|████▋     | 23/49 [00:38<00:40,  1.55s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.8576388359069824,original error is 1.415492057800293 
scan106 0023 abs_diff: 1.814994215965271 abs_ratio: 1.8541046380996704 acc_diff1: -0.2779072792594192 acc_ratio1: 0.6452863660081755 acc_diff2: -0.14555901015762618 acc_ratio2: 0.8348686034746104 acc_diff3: -0.08911695155713888 acc_ratio3: 0.9023081479261156 acc_diff4: -0.06477346120527512 acc_ratio4: 0.9302078375518498 

 49%|████▉     | 24/49 [00:39<00:38,  1.55s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 3.5508604049682617,original error is 1.5944589376449585 
scan106 0024 abs_diff: 1.8206504583358765 abs_ratio: 1.8690205812454224 acc_diff1: -0.2777380563658615 acc_ratio1: 0.6456454450796109 acc_diff2: -0.14493474827544178 acc_ratio2: 0.835616185703397 acc_diff3: -0.08870953331120145 acc_ratio3: 0.9027953570616951 acc_diff4: -0.06462491537724556 acc_ratio4: 0.9304056034888315 

 51%|█████     | 25/49 [00:41<00:37,  1.56s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 3.9640355110168457,original error is 1.9551533460617065 
scan106 0025 abs_diff: 1.827890157699585 abs_ratio: 1.875115156173706 acc_diff1: -0.276254342547963 acc_ratio1: 0.6472068017099856 acc_diff2: -0.14470074502075517 acc_ratio2: 0.8357020890871529 acc_diff3: -0.08956376735917676 acc_ratio3: 0.9018062017267243 acc_diff4: -0.06608064623537033 acc_ratio4: 0.9288427162640264 

 53%|█████▎    | 26/49 [00:42<00:36,  1.60s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 3.025616407394409,original error is 1.3655098676681519 
scan106 0026 abs_diff: 1.8216758966445923 abs_ratio: 1.8877309560775757 acc_diff1: -0.2751330649586915 acc_ratio1: 0.6489180960843032 acc_diff2: -0.14370248692684418 acc_ratio2: 0.8368671245460865 acc_diff3: -0.08943418469637963 acc_ratio3: 0.9019742159202114 acc_diff4: -0.06612446191021676 acc_ratio4: 0.9288263241384064 

 55%|█████▌    | 27/49 [00:44<00:36,  1.64s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.35486102104187,original error is 0.9622334837913513 
scan106 0027 abs_diff: 1.8063527345657349 abs_ratio: 1.907715082168579 acc_diff1: -0.2760961430809126 acc_ratio1: 0.6487027133819384 acc_diff2: -0.14411640810007875 acc_ratio2: 0.8367268413722772 acc_diff3: -0.08973164111694909 acc_ratio3: 0.9018168917303335 acc_diff4: -0.06621774272699847 acc_ratio4: 0.9288298483261109 

 57%|█████▋    | 28/49 [00:46<00:34,  1.62s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.6259214878082275,original error is 1.0121707916259766 
scan106 0028 abs_diff: 1.7997113466262817 abs_ratio: 1.931391954421997 acc_diff1: -0.27615236203710974 acc_ratio1: 0.6494046851428491 acc_diff2: -0.14399849618985414 acc_ratio2: 0.8370510631952306 acc_diff3: -0.08938953479536159 acc_ratio3: 0.9022761544548235 acc_diff4: -0.06581523115832709 acc_ratio4: 0.9293135965130748 

 59%|█████▉    | 29/49 [00:47<00:32,  1.62s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.0011484622955322,original error is 0.9604710936546326 
scan106 0029 abs_diff: 1.7744101285934448 abs_ratio: 1.9364625215530396 acc_diff1: -0.27515047823990063 acc_ratio1: 0.6515327062379898 acc_diff2: -0.1430473966982804 acc_ratio2: 0.8383100382645935 acc_diff3: -0.08889853469728821 acc_ratio3: 0.9029033833942314 acc_diff4: -0.06545412024004088 acc_ratio4: 0.9297581682576384 

 61%|██████    | 30/49 [00:49<00:30,  1.61s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 1.9931753873825073,original error is 1.1177018880844116 
scan106 0030 abs_diff: 1.7454121112823486 abs_ratio: 1.9315211772918701 acc_diff1: -0.2737692254167603 acc_ratio1: 0.6538703363340604 acc_diff2: -0.1417994945756243 acc_ratio2: 0.8398449924756705 acc_diff3: -0.08827587786104063 acc_ratio3: 0.9036506479446792 acc_diff4: -0.06496901864950567 acc_ratio4: 0.9303252477696715 

 63%|██████▎   | 31/49 [00:51<00:28,  1.60s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.8970789909362793,original error is 1.2286248207092285 
scan106 0031 abs_diff: 1.7430074214935303 abs_ratio: 1.9448480606079102 acc_diff1: -0.27268146543561067 acc_ratio1: 0.6556433260954716 acc_diff2: -0.14086537903865143 acc_ratio2: 0.8409503727302825 acc_diff3: -0.08813845988096522 acc_ratio3: 0.9038436316797144 acc_diff4: -0.06512314552142391 acc_ratio4: 0.9301934189572089 

 65%|██████▌   | 32/49 [00:52<00:27,  1.59s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.469144105911255,original error is 1.5264010429382324 
scan106 0032 abs_diff: 1.718756914138794 abs_ratio: 1.9349322319030762 acc_diff1: -0.2729019697859609 acc_ratio1: 0.6558474461918669 acc_diff2: -0.1403702848759474 acc_ratio2: 0.8415830001957528 acc_diff3: -0.08773832091420565 acc_ratio3: 0.9043144865912381 acc_diff4: -0.06487971438330357 acc_ratio4: 0.9304791564576569 

 67%|██████▋   | 33/49 [00:54<00:25,  1.58s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.3133723735809326,original error is 1.2273889780044556 
scan106 0033 abs_diff: 1.700145959854126 abs_ratio: 1.9334574937820435 acc_diff1: -0.273326299931867 acc_ratio1: 0.6559880454593374 acc_diff2: -0.1402330851213879 acc_ratio2: 0.8418877378892394 acc_diff3: -0.08756081552236923 acc_ratio3: 0.9045706762981494 acc_diff4: -0.06454494566475723 acc_ratio4: 0.9308665936020137 

 69%|██████▉   | 34/49 [00:55<00:23,  1.57s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 1.9062126874923706,original error is 1.3557846546173096 
scan106 0034 abs_diff: 1.6672967672348022 abs_ratio: 1.9183868169784546 acc_diff1: -0.2744448835762567 acc_ratio1: 0.6546092914708812 acc_diff2: -0.14108889290164425 acc_ratio2: 0.841085153830168 acc_diff3: -0.08750981619716953 acc_ratio3: 0.9047108078969425 acc_diff4: -0.0638865310577719 acc_ratio4: 0.9316080464360923 

 71%|███████▏  | 35/49 [00:57<00:21,  1.56s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.2034316062927246,original error is 0.9404058456420898 
scan106 0035 abs_diff: 1.656067132949829 abs_ratio: 1.9301834106445312 acc_diff1: -0.2763370920628094 acc_ratio1: 0.6529335126150309 acc_diff2: -0.1422511432782051 acc_ratio2: 0.840095197164763 acc_diff3: -0.0881137013050867 acc_ratio3: 0.9042079553338086 acc_diff4: -0.06401564147463715 acc_ratio4: 0.9315535660119012 

 73%|███████▎  | 36/49 [00:58<00:20,  1.55s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.067241668701172,original error is 1.398384928703308 
scan106 0036 abs_diff: 1.6293857097625732 abs_ratio: 1.9179706573486328 acc_diff1: -0.2780562393811271 acc_ratio1: 0.6514316482141282 acc_diff2: -0.14297711066521696 acc_ratio2: 0.8394882028013863 acc_diff3: -0.08813431794515063 acc_ratio3: 0.9042767657862145 acc_diff4: -0.06366689623462661 acc_ratio4: 0.931971172991173 

 76%|███████▌  | 37/49 [01:00<00:18,  1.55s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.5924768447875977,original error is 1.5159491300582886 
scan106 0037 abs_diff: 1.6148368120193481 abs_ratio: 1.912501335144043 acc_diff1: -0.27969862712572485 acc_ratio1: 0.6500053575932782 acc_diff2: -0.1437523375205951 acc_ratio2: 0.8388357740647652 acc_diff3: -0.08812490339880417 acc_ratio3: 0.9043681618750699 acc_diff4: -0.06350726538539549 acc_ratio4: 0.9321838010574744 

 78%|███████▊  | 38/49 [01:02<00:17,  1.59s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 3.152177572250366,original error is 1.894791841506958 
scan106 0038 abs_diff: 1.6056714057922363 abs_ratio: 1.9061193466186523 acc_diff1: -0.28115067512329095 acc_ratio1: 0.6485729708897299 acc_diff2: -0.14489409561845942 acc_ratio2: 0.8376993407503255 acc_diff3: -0.08890072707243492 acc_ratio3: 0.9036006800313894 acc_diff4: -0.06392265874567753 acc_ratio4: 0.9317798017534918 

 80%|███████▉  | 39/49 [01:03<00:15,  1.57s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.5411415100097656,original error is 1.3978984355926514 
scan106 0039 abs_diff: 1.5941107273101807 abs_ratio: 1.9039119482040405 acc_diff1: -0.282757019337033 acc_ratio1: 0.6472826907017468 acc_diff2: -0.1453701527923097 acc_ratio2: 0.837350133680987 acc_diff3: -0.08893571084841004 acc_ratio3: 0.903632273156024 acc_diff4: -0.06379460616351364 acc_ratio4: 0.9319534519098129 

 82%|████████▏ | 40/49 [01:05<00:13,  1.55s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.2010130882263184,original error is 1.1197839975357056 
scan106 0040 abs_diff: 1.5816013813018799 abs_ratio: 1.9054157733917236 acc_diff1: -0.28419178751905666 acc_ratio1: 0.6458885952763603 acc_diff2: -0.14615226060820574 acc_ratio2: 0.8367050127609943 acc_diff3: -0.08909522949153907 acc_ratio3: 0.9035565417009962 acc_diff4: -0.06379429263969302 acc_ratio4: 0.9320091390362661 

 84%|████████▎ | 41/49 [01:06<00:12,  1.55s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 1.9479936361312866,original error is 1.0129083395004272 
scan106 0041 abs_diff: 1.5662082433700562 abs_ratio: 1.9058386087417603 acc_diff1: -0.28579709771333456 acc_ratio1: 0.6442470125876273 acc_diff2: -0.1472941874447334 acc_ratio2: 0.8357069776229201 acc_diff3: -0.08927801718095844 acc_ratio3: 0.9034712260759468 acc_diff4: -0.06367353179662084 acc_ratio4: 0.9321969724551105 

 86%|████████▌ | 42/49 [01:08<00:11,  1.64s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 1.773020625114441,original error is 0.9461953043937683 
scan106 0042 abs_diff: 1.549013376235962 abs_ratio: 1.9050943851470947 acc_diff1: -0.28683679122556427 acc_ratio1: 0.6433321711322406 acc_diff2: -0.1477376915144674 acc_ratio2: 0.8354208935096615 acc_diff3: -0.08909299564479692 acc_ratio3: 0.9037569736563558 acc_diff4: -0.06322164996111951 acc_ratio4: 0.9327192925210551 

 88%|████████▊ | 43/49 [01:10<00:09,  1.62s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 1.5927387475967407,original error is 0.7932613492012024 
scan106 0043 abs_diff: 1.5319784879684448 abs_ratio: 1.9074294567108154 acc_diff1: -0.28673270613940216 acc_ratio1: 0.643887656211172 acc_diff2: -0.14735154246068516 acc_ratio2: 0.8359975827847562 acc_diff3: -0.08862838898127792 acc_ratio3: 0.9043263307267836 acc_diff4: -0.06278981140379986 acc_ratio4: 0.9332195383768727 

 90%|████████▉ | 44/49 [01:11<00:08,  1.61s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 1.7515149116516113,original error is 0.9110279679298401 
scan106 0044 abs_diff: 1.516611933708191 abs_ratio: 1.9077658653259277 acc_diff1: -0.28673026245117755 acc_ratio1: 0.6444257135030433 acc_diff2: -0.14669764666427856 acc_ratio2: 0.8368344054957261 acc_diff3: -0.08801866267259681 acc_ratio3: 0.9050334243090907 acc_diff4: -0.062245603741967255 acc_ratio4: 0.9338259108813228 

 92%|█████████▏| 45/49 [01:13<00:06,  1.60s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.009265422821045,original error is 0.858924388885498 
scan106 0045 abs_diff: 1.508649468421936 abs_ratio: 1.9171465635299683 acc_diff1: -0.28638426219333807 acc_ratio1: 0.6454096193077409 acc_diff2: -0.14594920215522772 acc_ratio2: 0.8377655383977151 acc_diff3: -0.08762411247355882 acc_ratio3: 0.9055059026178093 acc_diff4: -0.06208624085745344 acc_ratio4: 0.9340299532384335 

 94%|█████████▍| 46/49 [01:14<00:04,  1.59s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 2.0982420444488525,original error is 1.070530891418457 
scan106 0046 abs_diff: 1.4984166622161865 abs_ratio: 1.9180583953857422 acc_diff1: -0.28569730001169086 acc_ratio1: 0.6467408514632952 acc_diff2: -0.14488360626400756 acc_ratio2: 0.8390290550129754 acc_diff3: -0.08679950015442671 acc_ratio3: 0.9064231476114548 acc_diff4: -0.06151223849058253 acc_ratio4: 0.9346551053286685 

 96%|█████████▌| 47/49 [01:16<00:03,  1.59s/it]

(512, 640) (512, 640) (512, 640) (512, 640)
depth modified is 1.6372215747833252,original error is 0.9751362204551697 
scan106 0047 abs_diff: 1.4809931516647339 abs_ratio: 1.9130773544311523 acc_diff1: -0.28473383560977816 acc_ratio1: 0.6484306366913034 acc_diff2: -0.14406806400934802 acc_ratio2: 0.8400614693316054 acc_diff3: -0.0863861215958589 acc_ratio3: 0.906931827248536 acc_diff4: -0.061219782074734996 acc_ratio4: 0.93500383231739 

 98%|█████████▊| 48/49 [01:17<00:01,  1.58s/it]

In [None]:
# Step 2. Perform depth filtering and fusion
point_dir = f'results/{args.dataset_name}/points'
os.makedirs(point_dir, exist_ok=True)
print('Fusing point clouds...')

for scan in ["scan106"]:
    print(f'Processing {scan} ...')
    
    # buffers for the final vertices of this scan
    vs = []
    v_colors = []
    # buffers storing the refined data of each ref view
    os.makedirs(f'results/{args.dataset_name}/image_refined/{scan}', exist_ok=True)
    image_refined = set()
    depth_refined = {}
    for meta in tqdm(list(filter(lambda x: x[0]==scan and x[2]==0, val_data.metas))[:args.max_ref_views]):
       
        try:
            ref_vid = meta[1]
            if ref_vid in image_refined: # not yet refined actually
                image_ref = read_refined_image(args.dataset_name, scan, ref_vid)
                depth_ref = depth_refined[ref_vid]
            else:
                if refine:
                    img_dir = f'./results/{args.dataset_name}/image_modified/{scan}'
                    image_ref = np.load(os.path.join(img_dir, f'{ref_vid:04d}_class6.npy'))[0]
                    print(image_ref.shape)
                    image_ref *= 255
                    image_ref = image_ref.transpose(1,2,0)
                    image_ref = image_ref.astype(np.uint8)
                    image_ref = cv2.resize(image_ref, tuple(args.img_wh))
                    plt.imshow(image_ref)
                    plt.show()
                    print(image_ref.shape)

                else:
                    image_ref = read_image(args.dataset_name, args.root_dir, scan, ref_vid)
                    image_ref = cv2.resize(image_ref, tuple(args.img_wh),
                                            interpolation=cv2.INTER_LINEAR)[:,:,::-1] # to RGB
                
                if read_gt:
                    depth_ref = read_pfm(f'{args.root_dir}/Depths/{scan}/depth_map_{ref_vid:04d}.pfm')[0]
                    depth_ref = cv2.resize(depth_ref, tuple(args.img_wh),
                                            interpolation=cv2.INTER_LINEAR)
                else:
                    if refine:
                        depth_ref = read_pfm(f'results/{args.dataset_name}/depth/' \
                                                f'{scan}/{scan}/depth_refined_{ref_vid:04d}.pfm')[0]
                        print(depth_ref.shape)
                    else:
                        depth_ref = read_pfm(f'results/{args.dataset_name}/depth/' \
                                            f'{scan}/{scan}/depth_{ref_vid:04d}.pfm')[0]
            if read_gt:
                proba_ref = np.ones_like(depth_ref)
            proba_ref = read_pfm(f'results/{args.dataset_name}/depth/' \
                                    f'{scan}/{scan}/proba_{ref_vid:04d}.pfm')[0]
            proba_ref = cv2.resize(proba_ref, None, fx=4, fy=4,
                                    interpolation=cv2.INTER_LINEAR)
            mask_conf = proba_ref > args.conf # confidence mask
            P_world2ref = read_proj_mat(args.dataset_name, val_data, scan, ref_vid)
            
            src_vids = meta[3]
            mask_geos = []
            depth_ref_reprojs = [depth_ref]
            image_src2refs = [image_ref]
            # for each src view, check the consistency and refine depth
            for src_vid in src_vids:
                if src_vid in image_refined: # use refined data of previous runs
                    image_src = read_refined_image(args.dataset_name, scan, src_vid)
                    depth_src = depth_refined[src_vid]
                else:
                    if refine:
                        img_dir = f'./results/{args.dataset_name}/image_modified/{scan}'
                        image_src = np.load(os.path.join(img_dir, f'{src_vid:04d}_class6.npy'))[0]
                        print(image_src.shape)
                        image_src *= 255
                        image_src = image_src.transpose(1,2,0)
                        image_src= image_src.astype(np.uint8)
                        image_src = cv2.resize(image_src, tuple(args.img_wh))
                    else: 
                        image_src = read_image(args.dataset_name, args.root_dir, scan, src_vid)
                        image_src = cv2.resize(image_src, tuple(args.img_wh),
                                                interpolation=cv2.INTER_LINEAR)[:,:,::-1] # to RGB




                    if read_gt:
                        depth_src = read_pfm(f'{args.root_dir}/Depths/{scan}/depth_map_{src_vid:04d}.pfm')[0]
                        depth_src = cv2.resize(depth_src, tuple(args.img_wh),
                                                interpolation=cv2.INTER_LINEAR)
                    else:
                        if refine:
                            depth_src = read_pfm(f'results/{args.dataset_name}/depth/' \
                                                f'{scan}/{scan}/depth_refined_{src_vid:04d}.pfm')[0]
                        else:
                            depth_src = read_pfm(f'results/{args.dataset_name}/depth/' \
                                                f'{scan}/{scan}/depth_{src_vid:04d}.pfm')[0]
                        

                   
                    depth_refined[src_vid] = depth_src
                P_world2src = read_proj_mat(args.dataset_name, val_data, scan, src_vid)
                depth_ref_reproj, mask_geo, image_src2ref = \
                    check_geo_consistency(depth_ref, P_world2ref,
                                            depth_src, P_world2src,
                                            image_ref, image_src, tuple(args.img_wh))
                depth_ref_reprojs += [depth_ref_reproj]
                image_src2refs += [image_src2ref]
                mask_geos += [mask_geo]
            mask_geo_sum = np.sum(mask_geos, 0)
            mask_geo_final = mask_geo_sum >= args.min_geo_consistent
            depth_refined[ref_vid] = \
                (np.sum(depth_ref_reprojs, 0)/(mask_geo_sum+1)).astype(np.float32)
            image_refined_ = \
                np.sum(image_src2refs, 0)/np.expand_dims((mask_geo_sum+1), -1)

            image_refined.add(ref_vid)
            save_refined_image(image_refined_, args.dataset_name, scan, ref_vid)
            mask_final = mask_conf & mask_geo_final
            
            # create the final points
            xy_ref = np.mgrid[:args.img_wh[1],:args.img_wh[0]][::-1]
            xyz_ref = np.vstack((xy_ref, np.ones_like(xy_ref[:1]))) * depth_refined[ref_vid]
            xyz_ref = xyz_ref.transpose(1,2,0)[mask_final].T # (3, N)
            color = image_refined_[mask_final] # (N, 3)
            xyz_ref_h = np.vstack((xyz_ref, np.ones_like(xyz_ref[:1])))
            xyz_world = (np.linalg.inv(P_world2ref) @ xyz_ref_h).T # (N, 4)
            xyz_world = xyz_world[::args.skip, :3]
            color = color[::args.skip]
            
            # append to buffers
            vs += [xyz_world]
            v_colors += [color]

        except Exception as e:
            # some scenes might not have depth prediction due to too few valid src views
            
            print(f'Error: {e}')
    # clear refined buffer
    image_refined.clear()
    depth_refined.clear()
    shutil.rmtree(f'results/{args.dataset_name}/image_refined/{scan}')

    # process all points in the buffers
    vs = np.ascontiguousarray(np.vstack(vs).astype(np.float32))
    v_colors = np.vstack(v_colors).astype(np.uint8)
    print(f'{scan} contains {len(vs)/1e6:.2f} M points')
    vs.dtype = [('x', 'f4'), ('y', 'f4'), ('z', 'f4')]
    v_colors.dtype = [('red', 'u1'), ('green', 'u1'), ('blue', 'u1')]

    vertex_all = np.empty(len(vs), vs.dtype.descr+v_colors.dtype.descr)
    for prop in vs.dtype.names:
        vertex_all[prop] = vs[prop][:, 0]
    for prop in v_colors.dtype.names:
        vertex_all[prop] = v_colors[prop][:, 0]
    if read_gt:
        el = PlyElement.describe(vertex_all, 'vertex')
        PlyData([el]).write(f'{point_dir}/{scan}_gt.ply')
    elif refine:
        el = PlyElement.describe(vertex_all, 'vertex')
        PlyData([el]).write(f'{point_dir}/{scan}_refine.ply')
    
    else:
        el = PlyElement.describe(vertex_all, 'vertex')
        PlyData([el]).write(f'{point_dir}/{scan}.ply')
    del vertex_all, vs, v_colors
shutil.rmtree(f'results/{args.dataset_name}/image_refined')

print('Done!')

NameError: name 'args' is not defined