In [10]:
import os
# Work from the DUSt3R checkout: the checkpoint path used further down is
# relative ("checkpoints/..."), so it is resolved against this directory.
working_dir = "/dust3r"
os.chdir(working_dir)
print(os.getcwd())

/dust3r


In [11]:
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import torch
import json
import open3d as o3d

from dust3r.inference import inference_with_mask
from dust3r.model import AsymmetricCroCo3DStereo
from dust3r.utils.image import load_images
from dust3r.image_pairs import make_pairs
from dust3r.cloud_opt import global_aligner, GlobalAlignerMode
from dust3r.cloud_opt.base_opt import global_alignment_loop
from masked_dust3r.scripts.utils.math import *
from masked_dust3r.scripts.utils.image import *


# --- Dataset layout ---------------------------------------------------------
# Dataset root. The notebook reads "masked_images/<idx><ext>" and
# "masks/<idx><ext>" below it and writes "pointclouds/" and "transforms.json"
# into it.
DATA_PATH = "/dust3r/masked_dust3r/data/jackal_drive"
# Extensions appended to the frame index to build image / mask file names.
IMG_FILE_EXTENSION = ".png"
MASK_FILE_EXTENSION = ".png"
# Forwarded to inference_with_mask(); presumably the sigma of a Gaussian
# applied to the masks -- confirm against that function.
GAUSSIAN_SIGMA = 1.0
# Number of frames (indices 0 .. INIT_FRAMES-1) loaded for the initial
# reconstruction.
INIT_FRAMES = 30
# NOTE(review): the next three values are not referenced in the cells visible
# here; they look like settings for an incremental/sliding-window stage --
# confirm against the rest of the notebook.
NEW_FRAMES = 10
PREVIOUS_FRAMES = 40
TOTAL_FRAMES = 300

# --- Loss weights for the initial alignment ---------------------------------
# Assigned to the scene before the second optimisation pass; the first pass
# uses 0 for the focal weight and one tenth of the Z / ROT values.
INIT_WEIGHT_FOCAL = 0
INIT_WEIGHT_Z = 0.01
INIT_WEIGHT_ROT = 0.01
# The two smoothness weights are currently not applied: the assignments that
# would use them are commented out in the alignment cell.
INIT_WEIGHT_TRANS_SMOOTHNESS = 0.00001
INIT_WEIGHT_ROT_SMOOTHNESS = 0.00001

# --- Loss weights for newly added frames ------------------------------------
# NOTE(review): not referenced in the cells visible here -- confirm usage.
NEW_WEIGHT_FOCAL = 0.1
NEW_WEIGHT_Z = 0.1
NEW_WEIGHT_ROT = 0.1
NEW_WEIGHT_TRANS_SMOOTHNESS = 0.00001
NEW_WEIGHT_ROT_SMOOTHNESS = 0.00001

# NOTE(review): neither value is referenced in the cells visible here, and the
# unit of FOCAL_LENGTH is not stated -- confirm before relying on them.
IS_FOCAL_FIXED = True
FOCAL_LENGTH = 4.74

# --- Inference / optimisation settings --------------------------------------
# Device the model is moved to and that is passed to inference and alignment.
device = 'cuda'
# Batch size passed to inference_with_mask().
batch_size = 1
# Learning-rate schedule, learning rate and iteration count passed to both
# global-alignment passes.
schedule = 'cosine'
lr = 0.01
niter = 300

In [12]:
# Load the pretrained DUSt3R network and move it onto `device`.
# `model_name` is resolved relative to the current working directory; point it
# at any other local checkpoint if needed.
model_name = "checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth"
model = AsymmetricCroCo3DStereo.from_pretrained(model_name)
model = model.to(device)

... loading model from checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth
instantiating : AsymmetricCroCo3DStereo(enc_depth=24, dec_depth=12, enc_embed_dim=1024, dec_embed_dim=768, enc_num_heads=16, dec_num_heads=12, pos_embed='RoPE100', patch_embed_cls='PatchEmbedDust3R', img_size=(512, 512), head_type='dpt', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), landscape_only=False)
<All keys matched successfully>


In [13]:
# File lists for the first INIT_FRAMES frames; image and mask of a frame share
# the same numeric stem.
images_array = [
    os.path.join(DATA_PATH, "masked_images/{}{}".format(frame_idx, IMG_FILE_EXTENSION))
    for frame_idx in range(INIT_FRAMES)
]
masks_array = [
    os.path.join(DATA_PATH, "masks/{}{}".format(frame_idx, MASK_FILE_EXTENSION))
    for frame_idx in range(INIT_FRAMES)
]

# load_images() resizes the inputs (1280x720 --> 512x288 in the recorded run).
images = load_images(images_array, size=512, verbose=True)

# The masks are loaded at the resolution of the resized images, taken from the
# last two dimensions of the first image tensor.
_, _, H, W = images[0]["img"].shape
masks = load_masks(masks_array, H, W, device)

>> Loading a list of 30 images
 - adding /dust3r/masked_dust3r/data/jackal_drive/masked_images/0.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_drive/masked_images/1.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_drive/masked_images/2.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_drive/masked_images/3.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_drive/masked_images/4.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_drive/masked_images/5.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_drive/masked_images/6.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_drive/masked_images/7.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_drive/masked_images/8.png with resolution 1280x720 --> 512x288
 - add

In [14]:
# Build symmetrized image pairs with the 'swin-2' scene graph (120 pairs for
# the 30 frames of the recorded run) and run the masked pairwise inference.
pairs = make_pairs(
    images,
    scene_graph='swin-2',
    prefilter=None,
    symmetrize=True,
)
output = inference_with_mask(
    pairs, model, device, masks, GAUSSIAN_SIGMA,
    batch_size=batch_size,
)

>> Inference with model on 120 image pairs


100%|██████████| 120/120 [00:35<00:00,  3.35it/s]


In [15]:
# Two-pass global alignment with the PlanePointCloudOptimizer.
#
# Disabled alternative, kept as a note: run GlobalAlignerMode.ModularPointCloudOptimizer
# first (init="mst") and seed this scene from it via
# `scene.im_poses = calculate_new_params(init_scene.im_poses, device)` and
# `scene.im_focals = init_scene.im_focals`.

# Pass 1: focal and smoothness weights are 0, Z / rotation weights are one
# tenth of their INIT_* values.
stage1_weights = dict(
    weight_focal=0,
    weight_z=INIT_WEIGHT_Z * 0.1,
    weight_rot=INIT_WEIGHT_ROT * 0.1,
    weight_trans_smoothness=0,
    weight_rot_smoothness=0,
)
scene = global_aligner(
    output,
    device=device,
    mode=GlobalAlignerMode.PlanePointCloudOptimizer,
    **stage1_weights,
)
loss = scene.compute_global_alignment(init="mst", niter=niter, schedule=schedule, lr=lr)

# Focals after the first pass, for comparison with the final values.
print(scene.get_focals())

# Pass 2: switch to the full INIT_* weights and run the optimisation loop
# again on the same scene object. The smoothness weights
# (INIT_WEIGHT_TRANS_SMOOTHNESS / INIT_WEIGHT_ROT_SMOOTHNESS) are currently
# not applied, so they keep the 0 passed to the constructor.
scene.weight_focal = INIT_WEIGHT_FOCAL
scene.weight_z = INIT_WEIGHT_Z
scene.weight_rot = INIT_WEIGHT_ROT
loss = global_alignment_loop(scene, niter=niter, schedule=schedule, lr=lr)


 init edge (4*,2*) score=1.4069451093673706
 init edge (4,3*) score=1.3453327417373657
 init edge (5*,3) score=1.3356707096099854
 init edge (6*,4) score=1.3290936946868896
 init edge (7*,5) score=1.2905136346817017
 init edge (9*,7) score=1.2705857753753662
 init edge (3,1*) score=1.2342967987060547
 init edge (8*,6) score=1.2284821271896362
 init edge (2,0*) score=1.1783502101898193
 init edge (11*,9) score=1.1729549169540405
 init edge (10*,8) score=1.1484359502792358
 init edge (11,12*) score=1.1280802488327026
 init edge (11,13*) score=1.1078684329986572
 init edge (12,14*) score=1.0878560543060303
 init edge (15*,13) score=1.0298125743865967
 init edge (28*,0) score=1.2418574094772339
 init edge (27*,28) score=1.217063069343567
 init edge (27,25*) score=1.2628129720687866
 init edge (27,29*) score=1.2180676460266113
 init edge (23*,25) score=1.3433653116226196
 init edge (23,21*) score=1.3883497714996338
 init edge (19*,21) score=1.3465063571929932
 init edge (19,17*) score=1.169

100%|██████████| 300/300 [01:34<00:00,  3.17it/s, lr=1.27413e-06 loss=0.000340773]


tensor([[480.2152],
        [531.8730],
        [527.9482],
        [499.3434],
        [532.2170],
        [574.3369],
        [587.7007],
        [553.4855],
        [524.9421],
        [525.9062],
        [452.9351],
        [467.5393],
        [423.8594],
        [410.4176],
        [347.7339],
        [389.5172],
        [454.7909],
        [464.5722],
        [448.0027],
        [532.2039],
        [542.5788],
        [575.4208],
        [534.7335],
        [531.6117],
        [516.3522],
        [515.9739],
        [480.7349],
        [487.4694],
        [482.9661],
        [513.5150]], device='cuda:0', grad_fn=<ExpBackward0>)
Global alignement - optimizing for:
['pw_poses', 'im_depthmaps.0', 'im_depthmaps.1', 'im_depthmaps.2', 'im_depthmaps.3', 'im_depthmaps.4', 'im_depthmaps.5', 'im_depthmaps.6', 'im_depthmaps.7', 'im_depthmaps.8', 'im_depthmaps.9', 'im_depthmaps.10', 'im_depthmaps.11', 'im_depthmaps.12', 'im_depthmaps.13', 'im_depthmaps.14', 'im_depthmaps.15', 'im_depthmaps.1

100%|██████████| 300/300 [01:34<00:00,  3.19it/s, lr=1.27413e-06 loss=0.000285791]


In [16]:
# Collect the optimised quantities from the scene for the export cells below.
focals = scene.get_focals()
print(focals)

imgs = scene.imgs
poses = scene.get_im_poses()
pts3d = scene.get_pts3d()
confidence_masks = scene.get_masks()

tensor([[485.2377],
        [516.5031],
        [518.0402],
        [505.9385],
        [532.1113],
        [551.9042],
        [566.1036],
        [539.4161],
        [516.5307],
        [516.5997],
        [465.1650],
        [478.6691],
        [440.7426],
        [426.0716],
        [325.7993],
        [401.3805],
        [444.3559],
        [464.6347],
        [460.9888],
        [517.5455],
        [523.2043],
        [550.7404],
        [521.0326],
        [518.1054],
        [506.8924],
        [502.6686],
        [481.2789],
        [483.7465],
        [478.5030],
        [496.5066]], device='cuda:0', grad_fn=<ExpBackward0>)


In [17]:
# Export one coloured .ply per frame into <DATA_PATH>/pointclouds, keeping only
# the points selected by that frame's mask from scene.get_masks().
pointcloud_dir = os.path.join(DATA_PATH, "pointclouds")

# Start from a clean output folder. Only regular files are deleted:
# os.remove() cannot delete directories, so a stray sub-directory would
# otherwise abort the cell.
os.makedirs(pointcloud_dir, exist_ok=True)
for file in os.listdir(pointcloud_dir):
    stale_path = os.path.join(pointcloud_dir, file)
    if os.path.isfile(stale_path):
        os.remove(stale_path)

for i in range(len(images)):
    # Flatten everything to one row per pixel so the three arrays line up.
    pointcloud = pts3d[i].detach().cpu().numpy().reshape(-1, 3)
    color = np.asarray(imgs[i]).reshape(-1, 3)
    confidence_mask = confidence_masks[i].detach().cpu().numpy().reshape(-1).astype(bool)

    # Boolean-mask indexing selects the same rows, in the same order, as
    # testing every pixel in a Python loop, without the per-point overhead.
    # Open3D's Vector3dVector takes float64 (n, 3) arrays.
    masked_pointcloud = pointcloud[confidence_mask].astype(np.float64)
    # NOTE(review): Open3D expects colours in [0, 1]; assumes scene.imgs is
    # already in that range -- confirm.
    masked_color = color[confidence_mask].astype(np.float64)

    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(masked_pointcloud)
    pcd.colors = o3d.utility.Vector3dVector(masked_color)
    o3d.io.write_point_cloud(os.path.join(pointcloud_dir, "pointcloud{}.ply".format(i)), pcd)



In [18]:
# Create the transforms file (<DATA_PATH>/transforms.json): one shared OPENCV
# camera plus one entry (image path, pose, mask path) per usable frame.
# TODO: Per frame camera model?

# Size of the original image files; the context manager closes the file handle.
with Image.open(images_array[0]) as img:
    width, height = img.size

# The optimised focals are expressed in pixels of the resized network input
# (H x W from load_images, e.g. 512x288), whereas w / h below describe the
# original files (e.g. 1280x720). Scale by the resize factor so the intrinsics
# match the images that the frames reference. The long side is used because
# load_images may centre-crop the short side, which does not affect the focal.
resize_scale = max(width, height) / max(H, W)
average_focal = focals.mean().item() * resize_scale

transforms = {
    "camera_model": "OPENCV",
    "fl_x": average_focal,
    "fl_y": average_focal,
    "w": width,
    "h": height,
    # Principal point assumed at the image centre.
    "cx": width//2,
    "cy": height//2,
    "frames": [],
}

for i in range(len(poses)):
    # Skip frames whose mask selects no pixel at all.
    if not confidence_masks[i].any():
        continue
    transforms["frames"].append({
        # Paths are stored relative to DATA_PATH: "<folder>/<file>".
        "file_path": "/".join(images_array[i].split("/")[-2:]),
        "transform_matrix": poses[i].detach().cpu().numpy().tolist(),
        "mask_path": "/".join(masks_array[i].split("/")[-2:]),
    })

# Save transform file
with open(os.path.join(DATA_PATH, "transforms.json"), 'w') as f:
    json.dump(transforms, f, indent=4)