In [1]:
import os
# Work from the DUSt3R checkout: later cells use paths relative to this
# directory (e.g. the "checkpoints/..." model path).
REPO_ROOT = "/dust3r"
os.chdir(REPO_ROOT)
# Echo the working directory so the switch is visible in the cell output.
print(os.getcwd())


/dust3r


In [2]:
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import open3d as o3d
import torch

from dust3r.inference import inference, inference_with_mask
from dust3r.model import AsymmetricCroCo3DStereo
from dust3r.utils.image import load_images
from dust3r.image_pairs import make_pairs
from dust3r.cloud_opt import global_aligner, GlobalAlignerMode
from masked_dust3r.scripts.utils.math import *

# --- Dataset layout ---------------------------------------------------------
# Root folder of the capture; the cells below read "masked_images/" and
# "masks/" from it and write "pointclouds/" and "transforms.json" into it.
DATA_PATH = "/dust3r/masked_dust3r/data/jackal_training_data_0"
IMG_FILE_EXTENSION = ".png"
MASK_FILE_EXTENSION = ".png"
INIT_FRAMES = 15     # not referenced by the cells visible in this notebook
FOCAL_LENGTH = 474   # not referenced by the visible cells; NOTE(review): presumably pixels - confirm

# --- Inference / optimisation settings --------------------------------------
# Prefer the GPU, but fall back to the CPU instead of failing at model load
# when CUDA is not available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 1       # pairs per forward pass
schedule = 'cosine'  # learning-rate schedule passed to compute_global_alignment
lr = 0.01            # learning rate for the global alignment
niter = 300          # iterations per global-alignment run


Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [None]:
# Checkpoint to load; point model_name at any other local checkpoint if needed.
model_name = "checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth"

# Build the network from the checkpoint, then move it to the configured device.
model = AsymmetricCroCo3DStereo.from_pretrained(model_name)
model = model.to(device)

... loading model from checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth
instantiating : AsymmetricCroCo3DStereo(enc_depth=24, dec_depth=12, enc_embed_dim=1024, dec_embed_dim=768, enc_num_heads=16, dec_num_heads=12, pos_embed='RoPE100', patch_embed_cls='PatchEmbedDust3R', img_size=(512, 512), head_type='dpt', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), landscape_only=False)
<All keys matched successfully>


In [None]:
# Number of consecutive frames (0.png, 1.png, ...) to load from DATA_PATH.
NUM_FRAMES = 50

# Frame i lives at masked_images/<i>.png with its mask at masks/<i>.png.
images_array = [
    os.path.join(DATA_PATH, "masked_images/{}{}".format(i, IMG_FILE_EXTENSION))
    for i in range(NUM_FRAMES)
]
masks_array = [
    os.path.join(DATA_PATH, "masks/{}{}".format(i, MASK_FILE_EXTENSION))
    for i in range(NUM_FRAMES)
]
images = load_images(images_array, size=512, verbose=True)

# Images and masks are paired by position, so a skipped image would silently
# shift every following mask onto the wrong frame. Fail loudly instead.
if len(images) != len(masks_array):
    raise ValueError(
        "load_images returned {} images for {} mask paths".format(len(images), len(masks_array))
    )

masks = []
for image, mask_path in zip(images, masks_array):
    # Resize each mask to the resolution of its (already resized) image tensor.
    _, _, H, W = image["img"].shape
    with Image.open(mask_path) as mask_file:
        # NOTE(review): resize() uses PIL's default resampling filter, which can
        # produce intermediate grey values along mask edges - confirm that
        # inference_with_mask tolerates non-binary masks.
        mask = mask_file.convert('L').resize((W, H))

    # 8-bit greyscale -> float tensor scaled by 1/255, on the inference device.
    mask = torch.tensor(np.array(mask)).to(device) / 255
    masks.append(mask)

>> Loading a list of 50 images
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/0.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/1.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/2.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/3.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/4.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/5.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/6.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/7.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/d

In [None]:
# Build the image pairs the network is run on ('swin' scene graph, symmetrized).
# For the 50 loaded frames the cell output reports 300 pairs.
pairs = make_pairs(images, scene_graph='swin', prefilter=None, symmetrize=True)
# Run the model on every pair, handing over the per-frame masks loaded above.
# NOTE(review): the meaning of the positional 1.0 is not visible from this
# notebook - confirm against the signature of inference_with_mask.
output = inference_with_mask(pairs, model, device, masks, 1.0, batch_size=batch_size)

>> Inference with model on 300 image pairs


  0%|          | 0/300 [00:00<?, ?it/s]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
100%|██████████| 300/300 [02:05<00:00,  2.39it/s]


In [None]:
# Stage 1: global alignment with the modular point-cloud optimizer. Its poses
# and focals seed the second stage below.
init_scene = global_aligner(output, device=device, mode=GlobalAlignerMode.ModularPointCloudOptimizer)
# Use the configured `schedule` (as stage 2 does) rather than a second
# hard-coded literal, and keep this loss under its own name so it is not
# overwritten by the stage-2 loss.
init_loss = init_scene.compute_global_alignment(init="mst", niter=niter, schedule=schedule, lr=lr)

# Stage 2: alignment with the plane optimizer and its additional weights.
scene = global_aligner(
    output,
    device=device,
    mode=GlobalAlignerMode.PlanePointCloudOptimizer,
    weight_focal=1,
    weight_z=0.1,
    weight_rot=0.1,
    weight_trans_smoothness=0.001,
    weight_rot_smoothness=0.001,
)
# Seed stage 2 with the stage-1 result: poses go through calculate_new_params,
# focals are taken over unchanged.
scene.im_poses = calculate_new_params(init_scene.im_poses, device)
scene.im_focals = init_scene.im_focals
# NOTE(review): init="mst" prints the "init edge" log again for this run, so
# the seeded poses above may be re-initialised - confirm this is intended.
loss = scene.compute_global_alignment(init="mst", niter=niter, schedule=schedule, lr=lr)

 init edge (28*,31*) score=1.2249221801757812
 init edge (28,30*) score=1.2192926406860352
 init edge (29*,31) score=1.192214012145996
 init edge (30,33*) score=1.1820712089538574
 init edge (29,32*) score=1.1806654930114746
 init edge (27*,30) score=1.168571949005127
 init edge (28,26*) score=1.1417688131332397
 init edge (31,34*) score=1.1411017179489136
 init edge (32,35*) score=1.1073496341705322
 init edge (28,25*) score=1.0960865020751953
 init edge (23*,26) score=1.0948997735977173
 init edge (36*,34) score=1.0783061981201172
 init edge (39*,36) score=1.0661605596542358
 init edge (38*,35) score=1.048963189125061
 init edge (37*,34) score=1.0472750663757324
 init edge (20*,23) score=1.1546061038970947
 init edge (42*,39) score=1.1076107025146484
 init edge (42,45*) score=1.2190945148468018
 init edge (17*,20) score=1.2178072929382324
 init edge (17,19*) score=1.2044249773025513
 init edge (19,21*) score=1.2003003358840942
 init edge (43*,45) score=1.192771077156067
 init edge (1

100%|██████████| 300/300 [04:48<00:00,  1.04it/s, lr=1.27413e-06 loss=5.28426e-05]


 init edge (28*,31*) score=1.2249221801757812
 init edge (28,30*) score=1.2192926406860352
 init edge (29*,31) score=1.192214012145996
 init edge (30,33*) score=1.1820712089538574
 init edge (29,32*) score=1.1806654930114746
 init edge (27*,30) score=1.168571949005127
 init edge (28,26*) score=1.1417688131332397
 init edge (31,34*) score=1.1411017179489136
 init edge (32,35*) score=1.1073496341705322
 init edge (28,25*) score=1.0960865020751953
 init edge (23*,26) score=1.0948997735977173
 init edge (36*,34) score=1.0783061981201172
 init edge (39*,36) score=1.0661605596542358
 init edge (38*,35) score=1.048963189125061
 init edge (37*,34) score=1.0472750663757324
 init edge (20*,23) score=1.1546061038970947
 init edge (42*,39) score=1.1076107025146484
 init edge (42,45*) score=1.2190945148468018
 init edge (17*,20) score=1.2178072929382324
 init edge (17,19*) score=1.2044249773025513
 init edge (19,21*) score=1.2003003358840942
 init edge (43*,45) score=1.192771077156067
 init edge (1

100%|██████████| 300/300 [05:21<00:00,  1.07s/it, lr=1.27413e-06 loss=7.02729]


In [None]:
# Pull the per-frame results out of `scene`; the export cells below consume them.
imgs = scene.imgs
focals = scene.get_focals()
poses = scene.get_im_poses()
pts3d = scene.get_pts3d()
# Used below as per-frame masks selecting which points/frames are exported.
confidence_masks = scene.get_masks()

In [None]:
# Export one coloured .ply point cloud per frame, keeping only the points
# selected by that frame's confidence mask.
pointcloud_dir = os.path.join(DATA_PATH, "pointclouds")

# Start from an empty output folder so clouds from a previous run cannot
# linger. Sub-directories are left alone (os.remove cannot delete them).
os.makedirs(pointcloud_dir, exist_ok=True)
for file in os.listdir(pointcloud_dir):
    file_path = os.path.join(pointcloud_dir, file)
    if os.path.isdir(file_path):
        continue
    os.remove(file_path)

for i in range(len(images)):
    # Flatten points and colours to (N, 3) and the mask to (N,) so they line
    # up element by element.
    pointcloud = pts3d[i].detach().cpu().numpy().reshape(-1, 3)
    color = imgs[i].reshape(-1, 3)
    confidence_mask = confidence_masks[i].detach().cpu().numpy().reshape(-1).astype(bool)

    # Boolean indexing selects all kept points at once instead of looping over
    # every pixel in Python. Open3D stores doubles, so hand over float64.
    masked_pointcloud = np.asarray(pointcloud[confidence_mask], dtype=np.float64)
    masked_color = np.asarray(color[confidence_mask], dtype=np.float64)

    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(masked_pointcloud)
    pcd.colors = o3d.utility.Vector3dVector(masked_color)
    o3d.io.write_point_cloud(os.path.join(pointcloud_dir, "pointcloud{i}.ply".format(i=i)), pcd)



In [None]:
import json

# Create the transforms.json file describing the camera and per-frame poses.
#TODO: Per frame camera model?
transform = {}
transform["camera_model"] = "OPENCV"

# One shared focal length for all frames: the average over the per-frame focals.
# NOTE(review): the focals come from `scene`, whose images were loaded with
# size=512, while w/h below are read from the original file on disk
# (1280x720 in the load log) - confirm whether the focal must be rescaled.
average_focal = focals.sum() / len(focals)
transform["fl_x"] = average_focal.item()
transform["fl_y"] = average_focal.item()

# Image size comes from the first frame on disk. Only the header is needed,
# so close the file right away instead of leaving the handle open.
with Image.open(images_array[0]) as img:
    width, height = img.size
transform["w"] = width
transform["h"] = height
# Principal point assumed at the image centre.
transform["cx"] = width // 2
transform["cy"] = height // 2

transform["frames"] = []

for i in range(len(poses)):
    # Skip frames whose confidence mask is entirely zero.
    if not confidence_masks[i].any():
        continue
    transform["frames"].append({
        # Paths are stored relative to DATA_PATH, where transforms.json is written.
        "file_path": os.path.relpath(images_array[i], DATA_PATH),
        "transform_matrix": poses[i].detach().cpu().numpy().tolist(),
        "mask_path": os.path.relpath(masks_array[i], DATA_PATH),
    })

# Save transform file
with open("{}/transforms.json".format(DATA_PATH), 'w') as f:
    json.dump(transform, f, indent=4)

: 