In [1]:
#!/bin/python3 python3.10

import os
#Set directory to dust3r
os.chdir("/dust3r")
print(os.getcwd())

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import open3d as o3d
import torch
import json
from masked_dust3r.scripts.utils.image import *
from masked_dust3r.scripts.utils.constraint import *

from dust3r.inference import inference_with_mask
from dust3r.model import AsymmetricCroCo3DStereo
from dust3r.utils.image import load_images
from dust3r.image_pairs import make_pairs
from dust3r.cloud_opt import global_aligner, GlobalAlignerMode

DATA_PATH = "/dust3r/masked_dust3r/data/jackal_training_data_0"
IMG_FILE_EXTENSION = ".png"
MASK_FILE_EXTENSION = ".png"
GAUSSIAN_SIGMA = 3.0
INIT_FRAMES = 10
RECURRING_FRAMES = 5
TOTAL_IMGS = 11

IS_FOCAL_FIXED = False
IS_BEST_FIT_PLANE = True
IS_ZERO_Z = True
FOCAL_LENGTH = 4.74

device = 'cuda'
batch_size = 1
schedule = 'cosine'
lr = 0.01
niter = 300

with open(f"{DATA_PATH}/transforms.json") as f:
    transforms = json.load(f)

/dust3r
Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [2]:
# Load the model

model_name = "checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth"
# you can put the path to a local checkpoint in model_name if needed
model = AsymmetricCroCo3DStereo.from_pretrained(model_name).to(device)

... loading model from checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth
instantiating : AsymmetricCroCo3DStereo(enc_depth=24, dec_depth=12, enc_embed_dim=1024, dec_embed_dim=768, enc_num_heads=16, dec_num_heads=12, pos_embed='RoPE100', patch_embed_cls='PatchEmbedDust3R', img_size=(512, 512), head_type='dpt', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), landscape_only=False)
<All keys matched successfully>


In [3]:
new_img_index = 10
print("Looking at frame {}...".format(new_img_index))
images_array = []
masks_array = []

preset_focal = [transforms["fl_x"] for _ in range(RECURRING_FRAMES+1)]
preset_pose = []
preset_mask = [True for _ in range(RECURRING_FRAMES+1)]
preset_mask[0] = False

images_array.append(os.path.join(DATA_PATH,"masked_images/{}{}".format(new_img_index,IMG_FILE_EXTENSION)))
masks_array.append(os.path.join(DATA_PATH,"masks/{}{}".format(new_img_index,MASK_FILE_EXTENSION)))
preset_pose.append(np.eye(4))

for i in range(-RECURRING_FRAMES,0):
    images_array.append(os.path.join(DATA_PATH,transforms["frames"][i]["file_path"]))
    masks_array.append(os.path.join(DATA_PATH,transforms["frames"][i]["mask_path"]))
    preset_pose.append(np.array(transforms["frames"][i]["transform_matrix"]))
    print("Using {}...".format(transforms["frames"][i]["file_path"]))
preset_pose[0] = preset_pose[-1]

images = load_images(images_array, size=512, verbose=True)
_,_,H,W = images[0]["img"].shape
masks = load_masks(masks_array, H, W, device)

Looking at frame 10...
Using masked_images/45.png...
Using masked_images/46.png...
Using masked_images/47.png...
Using masked_images/48.png...
Using masked_images/49.png...
>> Loading a list of 6 images
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/10.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/45.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/46.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/47.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/48.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/49.png with resolution 1280x720 --> 512x288
 (Found 6 images)


In [4]:
pairs = make_pairs(images, scene_graph='oneref-0', prefilter=None, symmetrize=True)
output = inference_with_mask(pairs, model, device, masks, GAUSSIAN_SIGMA, batch_size=batch_size)

>> Inference with model on 10 image pairs


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
100%|██████████| 10/10 [00:06<00:00,  1.60it/s]


In [5]:
scene = global_aligner(output, device=device, mode=GlobalAlignerMode.ModularPointCloudOptimizer)
scene.preset_focal(preset_focal, [True for _ in range(RECURRING_FRAMES+1)])
scene.preset_pose(preset_pose, preset_mask)

 (setting focal #0 = 481.6600341796875)
 (setting focal #1 = 481.6600341796875)
 (setting focal #2 = 481.6600341796875)
 (setting focal #3 = 481.6600341796875)
 (setting focal #4 = 481.6600341796875)
 (setting focal #5 = 481.6600341796875)
 (setting pose #1 = [ 0.12649737 -0.3028121   0.27151915])
 (setting pose #2 = [ 0.1903113  -0.24278216  0.23373555])
 (setting pose #3 = [ 0.1713087  -0.25071609  0.24321499])
 (setting pose #4 = [ 0.15801997 -0.26054999  0.26193309])
 (setting pose #5 = [ 0.14215846 -0.27457073  0.27348831])


In [6]:
print(scene.im_poses[1])
print(scene.get_im_poses()[1])

Parameter containing:
tensor([ 0.0130,  0.6145,  0.7229, -0.3157,  0.1191, -0.2645,  0.2402],
       device='cuda:0')
tensor([[-0.8004,  0.4724, -0.3692,  0.1265],
        [-0.4404, -0.0454,  0.8967, -0.3028],
        [ 0.4068,  0.8802,  0.2443,  0.2715],
        [ 0.0000,  0.0000,  0.0000,  1.0000]], device='cuda:0',
       grad_fn=<SelectBackward0>)


In [13]:
def signed_expm1(x):
    sign = torch.sign(x)
    return sign * torch.expm1(torch.abs(x))

In [17]:
print(scene.im_poses[0])

Parameter containing:
tensor([ 0.0360,  0.7223, -0.3021, -0.6396,  0.1253, -0.2360,  0.2437],
       device='cuda:0', requires_grad=True)


In [15]:
for param, pose in zip(scene.im_poses, scene.get_im_poses()):
    print(pose[0,3]/signed_expm1(param[4]))
    print(pose[1,3]/signed_expm1(param[5]))
    print(pose[2,3]/signed_expm1(param[6]))

im_pose =  scene.get_im_poses()
print(im_pose)


tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor(1., device='cuda:0', grad_fn=<DivBackward0>)
tensor([[[-0.1980, -0.3268, -0.9241,  0.1335],
         [ 0.4284

In [8]:
for i in scene.im_poses:
    print(i)

Parameter containing:
tensor([ 0.7878,  0.1551, -1.0806,  0.6953, -1.3535,  1.3133,  0.7201],
       device='cuda:0', requires_grad=True)
Parameter containing:
tensor([ 0.0130,  0.6145,  0.7229, -0.3157,  0.1191, -0.2645,  0.2402],
       device='cuda:0')
Parameter containing:
tensor([ 0.0158,  0.5677,  0.6392, -0.5186,  0.1742, -0.2174,  0.2100],
       device='cuda:0')
Parameter containing:
tensor([ 0.0148,  0.5760,  0.6665, -0.4730,  0.1581, -0.2237,  0.2177],
       device='cuda:0')
Parameter containing:
tensor([ 0.0100,  0.6001,  0.6787, -0.4232,  0.1467, -0.2315,  0.2326],
       device='cuda:0')
Parameter containing:
tensor([ 0.0086,  0.6151,  0.6962, -0.3700,  0.1329, -0.2426,  0.2418],
       device='cuda:0')


In [9]:
loss = scene.compute_global_alignment(init="mst", niter=niter, schedule=schedule, lr=lr)

 init edge (0*,1*) score=1.0644137859344482
 init edge (2*,0) score=1.0251412391662598


 init edge (5*,0) score=1.0133864879608154
 init edge (3*,0) score=1.00983464717865
 init edge (4*,0) score=1.0085440874099731
 init loss = 0.000306908565107733
Global alignement - optimizing for:
['pw_poses', 'im_depthmaps.0', 'im_depthmaps.1', 'im_depthmaps.2', 'im_depthmaps.3', 'im_depthmaps.4', 'im_depthmaps.5', 'im_poses.0']


100%|██████████| 300/300 [00:19<00:00, 15.28it/s, lr=1.27413e-06 loss=2.0939e-05] 


In [10]:
for i in scene.im_poses:
    print(i)

Parameter containing:
tensor([ 0.0360,  0.7223, -0.3021, -0.6396,  0.1253, -0.2360,  0.2437],
       device='cuda:0', requires_grad=True)
Parameter containing:
tensor([ 0.0130,  0.6145,  0.7229, -0.3157,  0.1191, -0.2645,  0.2402],
       device='cuda:0')
Parameter containing:
tensor([ 0.0158,  0.5677,  0.6392, -0.5186,  0.1742, -0.2174,  0.2100],
       device='cuda:0')
Parameter containing:
tensor([ 0.0148,  0.5760,  0.6665, -0.4730,  0.1581, -0.2237,  0.2177],
       device='cuda:0')
Parameter containing:
tensor([ 0.0100,  0.6001,  0.6787, -0.4232,  0.1467, -0.2315,  0.2326],
       device='cuda:0')
Parameter containing:
tensor([ 0.0086,  0.6151,  0.6962, -0.3700,  0.1329, -0.2426,  0.2418],
       device='cuda:0')


In [11]:
im_pose =  scene.get_im_poses()
print(im_pose)

tensor([[[-0.1980, -0.3268, -0.9241,  0.1335],
         [ 0.4284,  0.8191, -0.3815, -0.2661],
         [ 0.8816, -0.4714, -0.0222,  0.2760],
         [ 0.0000,  0.0000,  0.0000,  1.0000]],

        [[-0.8004,  0.4724, -0.3692,  0.1265],
         [-0.4404, -0.0454,  0.8967, -0.3028],
         [ 0.4068,  0.8802,  0.2443,  0.2715],
         [ 0.0000,  0.0000,  0.0000,  1.0000]],

        [[-0.4616,  0.6809, -0.5686,  0.1903],
         [-0.6449,  0.1825,  0.7421, -0.2428],
         [ 0.6091,  0.7093,  0.3549,  0.2337],
         [ 0.0000,  0.0000,  0.0000,  1.0000]],

        [[-0.5520,  0.6477, -0.5252,  0.1713],
         [-0.6134,  0.1112,  0.7819, -0.2507],
         [ 0.5648,  0.7538,  0.3359,  0.2432],
         [ 0.0000,  0.0000,  0.0000,  1.0000]],

        [[-0.6415,  0.5865, -0.4944,  0.1580],
         [-0.5625,  0.0785,  0.8231, -0.2605],
         [ 0.5216,  0.8061,  0.2796,  0.2619],
         [ 0.0000,  0.0000,  0.0000,  1.0000]],

        [[-0.7260,  0.5258, -0.4432,  0.1422],
   

In [12]:

imgs = scene.imgs
focals = scene.get_focals()
poses = scene.get_im_poses()
pts3d = scene.get_pts3d()
confidence_masks = scene.get_masks()

if (confidence_masks[0]!=0).all():
    print("No confidence in Frame {}".format(new_img_index))       
    pass

new_tf = poses[0].detach().cpu().numpy().tolist()
if abs(new_tf[2][3]) > 0.1:
    pass
new_tf[2][3] = 0

new_frame = {
    "file_path" : "/".join(images_array[0].split("/")[-2:]),
    "transform_matrix" : new_tf,
    "mask_path" : "/".join(masks_array[0].split("/")[-2:])
}
#transforms["frames"].append(new_frame)
