In [13]:
import os
# Run from the repository root so that relative paths used later in the
# notebook (e.g. the "checkpoints/..." model path) resolve against /dust3r.
os.chdir("/dust3r")
print(os.getcwd())  # echo the working directory to confirm the change

/dust3r


In [14]:
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import torch
import json
import re
import open3d as o3d

from dust3r.inference import inference_with_mask, create_gaussian_kernel
from dust3r.model import AsymmetricCroCo3DStereo
from dust3r.utils.image import load_images
from dust3r.image_pairs import make_pairs
from dust3r.cloud_opt import global_aligner, GlobalAlignerMode
from dust3r.cloud_opt.base_opt import global_alignment_loop
from masked_dust3r.scripts.utils.math import *
from masked_dust3r.scripts.utils.image import *


# --- Dataset location ---------------------------------------------------------
# Root folder of the capture. The cells below read "masked_images/" and
# "masks/" from it and write "pointclouds/" and "transforms.json" into it.
DATA_PATH = "/dust3r/masked_dust3r/data/jackal_training_data_0"
IMG_FILE_EXTENSION = ".png"
MASK_FILE_EXTENSION = ".png"

# --- Frame scheduling ---------------------------------------------------------
# Presumably the size of the initial alignment window (INIT_FRAMES) and the
# parameters of an incremental "add new frames" loop — TODO confirm.
# NOTE(review): NEW_FRAMES, PREVIOUS_FRAMES and TOTAL_FRAMES are not read by
# any cell visible in this part of the notebook.
INIT_FRAMES = 50
NEW_FRAMES = 10
PREVIOUS_FRAMES = 40
TOTAL_FRAMES = 300

# --- Loss weights for the initial alignment -----------------------------------
# Passed to global_aligner(...) as weight_focal / weight_z / weight_rot /
# weight_trans_smoothness / weight_rot_smoothness.
# The trailing "* 0" makes the value evaluate to 0 while keeping the previously
# used magnitude visible, so every INIT_* weight is currently 0.
INIT_WEIGHT_FOCAL = 0.01 * 0
INIT_WEIGHT_Z = 0.1 * 0
INIT_WEIGHT_ROT = 0.1 * 0
INIT_WEIGHT_TRANS_SMOOTHNESS = 0.001 * 0
INIT_WEIGHT_ROT_SMOOTHNESS = 0.001 * 0

# --- Loss weights for newly added frames --------------------------------------
# NOTE(review): not read by any cell visible here; presumably the counterpart
# of the INIT_* weights for frames added after the initial window — confirm.
# Only the focal weight is zeroed via "* 0".
NEW_WEIGHT_FOCAL = 0.1 * 0
NEW_WEIGHT_Z = 0.1
NEW_WEIGHT_ROT = 0.1
NEW_WEIGHT_TRANS_SMOOTHNESS = 0.0001
NEW_WEIGHT_ROT_SMOOTHNESS = 0.00001

# True: the transforms cell writes one mean fl_x/fl_y/cx/cy for all frames.
# False: every frame entry carries its own fl_x/fl_y/cx/cy.
USE_COMMON_INTRINSICS = False

# --- Runtime / optimiser settings ---------------------------------------------
# device and batch_size are used by the inference cell; schedule, lr and niter
# are forwarded to scene.compute_global_alignment(...).
device = 'cuda'
batch_size = 1
schedule = 'cosine'
lr = 0.01
niter = 300 

In [15]:
# Kernel handed to inference_with_mask together with the per-frame masks.
# Only one kernel is ever used, so only the selected one is built. The default
# (USE_GAUSSIAN_KERNEL = False) yields SIZE == 1 and a 1x1 kernel of ones.
GAUSSIAN_SIGMA = 21.0
USE_GAUSSIAN_KERNEL = False  # set to True to use the Gaussian kernel instead

if USE_GAUSSIAN_KERNEL:
    # Kernel side length is three times sigma, truncated to an int.
    SIZE = int(GAUSSIAN_SIGMA * 3)
    kernel = create_gaussian_kernel(SIZE, GAUSSIAN_SIGMA).to(device)
else:
    # 1x1 kernel of ones — presumably a no-op on the masks
    # (TODO confirm against inference_with_mask).
    SIZE = 1
    kernel = torch.ones(SIZE, SIZE).to(device)

In [16]:
# Load the pretrained DUSt3R model and move it to the configured device.

model_name = "checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth"
# model_name is a relative path, so it is resolved against the current working
# directory; point it at another local checkpoint if needed.
model = AsymmetricCroCo3DStereo.from_pretrained(model_name).to(device)

... loading model from checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth


instantiating : AsymmetricCroCo3DStereo(enc_depth=24, dec_depth=12, enc_embed_dim=1024, dec_embed_dim=768, enc_num_heads=16, dec_num_heads=12, pos_embed='RoPE100', patch_embed_cls='PatchEmbedDust3R', img_size=(512, 512), head_type='dpt', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), landscape_only=False)
<All keys matched successfully>


In [17]:
# Build the file lists for the initial window of frames. Frames are stored as
# "<index><extension>" under masked_images/ and masks/ inside DATA_PATH.
# The window length comes from INIT_FRAMES (config cell) rather than a second,
# hard-coded copy of the same number.
frame_indices = range(INIT_FRAMES)
images_array = [
    os.path.join(DATA_PATH, "masked_images/{}{}".format(idx, IMG_FILE_EXTENSION))
    for idx in frame_indices
]
masks_array = [
    os.path.join(DATA_PATH, "masks/{}{}".format(idx, MASK_FILE_EXTENSION))
    for idx in frame_indices
]

images = load_images(images_array, size=512, verbose=True)

# H and W are read from the loaded (resized) image tensor and handed to
# load_masks, presumably so the masks match the network input resolution
# rather than the resolution of the files on disk.
_, _, H, W = images[0]["img"].shape
masks = load_masks(masks_array, H, W, device)

>> Loading a list of 50 images
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/0.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/1.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/2.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/3.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/4.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/5.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/6.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/data/jackal_training_data_0/masked_images/7.png with resolution 1280x720 --> 512x288
 - adding /dust3r/masked_dust3r/d

In [18]:
# Pair up the loaded frames and run the network on every pair.
# NOTE(review): 'swin-1' presumably links each frame to its immediate neighbour
# in a sliding window; with symmetrize=True the run below reports 100 pairs for
# the 50 loaded frames.
pairs = make_pairs(images, scene_graph='swin-1', prefilter=None, symmetrize=True)
# The per-frame masks and the kernel are forwarded to the masked inference variant.
output = inference_with_mask(pairs, model, device, masks, kernel, batch_size=batch_size)
del pairs  # the pair list is not used again once `output` exists

>> Inference with model on 100 image pairs


100%|██████████| 100/100 [00:29<00:00,  3.35it/s]


In [19]:
# Global alignment of the initial window.
# The pairwise predictions in `output` are fused into a single scene by the
# PlanePointCloudOptimizer mode; the INIT_WEIGHT_* constants from the config
# cell are passed as the weights of its extra terms.
#
# Alternatives tried earlier and currently disabled:
#   * aligning an unweighted `init_scene` first and seeding this scene from its
#     poses / focals (calculate_new_params);
#   * fixing every focal to the average of scene.get_focals() through
#     scene.preset_focal(...) and re-running the alignment;
#   * assigning the scene.weight_* attributes after construction and continuing
#     with global_alignment_loop(scene, ...).
scene = global_aligner(
    output,
    device=device,
    mode=GlobalAlignerMode.PlanePointCloudOptimizer,
    weight_focal=INIT_WEIGHT_FOCAL,
    weight_z=INIT_WEIGHT_Z,
    weight_rot=INIT_WEIGHT_ROT,
    weight_trans_smoothness=INIT_WEIGHT_TRANS_SMOOTHNESS,
    weight_rot_smoothness=INIT_WEIGHT_ROT_SMOOTHNESS,
)

# Initialise with "mst" and optimise for `niter` iterations using the
# configured schedule and learning rate; the returned value is kept in `loss`.
loss = scene.compute_global_alignment(
    init="mst",
    niter=niter,
    schedule=schedule,
    lr=lr,
)


 init edge (31*,32*) score=1.5550881624221802
 init edge (32,33*) score=1.5348498821258545
 init edge (31,30*) score=1.511637568473816
 init edge (30,29*) score=1.5006829500198364
 init edge (33,34*) score=1.4510823488235474
 init edge (35*,34) score=1.4219846725463867
 init edge (36*,35) score=1.3642923831939697
 init edge (29,28*) score=1.3519669771194458
 init edge (27*,28) score=1.3290653228759766
 init edge (37*,36) score=1.229400873184204
 init edge (26*,27) score=1.3301632404327393
 init edge (26,25*) score=1.2620265483856201
 init edge (25,24*) score=1.244730830192566
 init edge (23*,24) score=1.2740055322647095
 init edge (22*,23) score=1.3368784189224243
 init edge (21*,22) score=1.3375743627548218
 init edge (20*,21) score=1.4055798053741455
 init edge (19*,20) score=1.5015896558761597
 init edge (19,18*) score=1.4620022773742676
 init edge (18,17*) score=1.4963304996490479
 init edge (17,16*) score=1.4511840343475342
 init edge (15*,16) score=1.4046778678894043
 init edge (

100%|██████████| 300/300 [02:15<00:00,  2.21it/s, lr=1.27413e-06 loss=5.27541e-05]


In [20]:
# Pull the optimised quantities out of the scene once; the export cells below
# index all of them per frame.
imgs = scene.imgs                        # per-frame colours for the point clouds
focals = scene.get_focals()              # printed in the next cell
poses = scene.get_im_poses()             # inverted and written to transforms.json
pts3d = scene.get_pts3d()                # per-frame points, reshaped to (-1, 3) on export
confidence_masks = scene.get_masks()     # per-frame mask selecting which points are exported
intrinsics = scene.get_intrinsics()      # indexed as [frame, row, col] for fl_x/fl_y/cx/cy

In [21]:
# Inspect the optimised per-frame focal lengths.
# NOTE(review): the values printed below range from roughly 520 to 672 across
# the 50 frames; if all frames come from one camera this spread may be worth a
# look (see USE_COMMON_INTRINSICS and the focal weight in the config cell).
print(focals)

tensor([[654.3383],
        [574.3328],
        [529.8439],
        [535.4302],
        [539.2348],
        [526.7123],
        [523.6014],
        [528.8171],
        [536.1041],
        [538.9911],
        [544.0812],
        [629.2317],
        [626.0013],
        [628.2543],
        [595.1767],
        [543.0217],
        [541.6720],
        [534.4133],
        [535.7574],
        [526.4504],
        [540.6248],
        [603.9789],
        [606.9436],
        [656.5286],
        [672.1915],
        [668.3292],
        [634.0631],
        [621.3911],
        [586.8094],
        [555.3392],
        [523.9168],
        [521.4434],
        [520.7944],
        [532.0732],
        [523.2665],
        [533.1567],
        [549.1788],
        [639.7050],
        [642.2066],
        [629.8137],
        [551.8750],
        [533.1238],
        [543.5737],
        [526.3262],
        [524.2938],
        [533.7226],
        [549.5974],
        [609.3350],
        [642.0872],
        [625.8572]],

In [22]:
# Export one coloured point cloud per frame to <DATA_PATH>/pointclouds.
pointcloud_dir = os.path.join(DATA_PATH, "pointclouds")

# Start from a clean output folder: create it if it is missing and delete the
# files left over from a previous run. Sub-directories are left untouched
# instead of making os.remove raise.
os.makedirs(pointcloud_dir, exist_ok=True)
for file in os.listdir(pointcloud_dir):
    stale_path = os.path.join(pointcloud_dir, file)
    if os.path.isfile(stale_path) or os.path.islink(stale_path):
        os.remove(stale_path)

for i in range(len(images)):
    # Flatten points, colours and mask to one row / entry per pixel.
    pointcloud = pts3d[i].detach().cpu().numpy().reshape(-1, 3)
    # Assumes scene.imgs holds host-side arrays (they are reshaped directly,
    # without a .cpu() call) — TODO confirm.
    color = np.asarray(imgs[i].reshape(-1, 3))
    confidence_mask = confidence_masks[i].detach().cpu().numpy().reshape(-1).astype(bool)

    # Boolean indexing keeps exactly the entries whose mask value is truthy,
    # replacing a per-pixel Python loop. Open3D's vector types store doubles,
    # so convert once here.
    masked_pointcloud = pointcloud[confidence_mask].astype(np.float64)
    masked_color = color[confidence_mask].astype(np.float64)

    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(masked_pointcloud)
    pcd.colors = o3d.utility.Vector3dVector(masked_color)

    # The output file is numbered after the source image ("<n>.png" -> n).
    frame_id = int(os.path.splitext(os.path.basename(images_array[i]))[0])
    o3d.io.write_point_cloud(
        os.path.join(pointcloud_dir, "pointcloud{}.ply".format(frame_id)),
        pcd,
    )

In [27]:
#Create transform file
# Writes <DATA_PATH>/transforms.json: the camera model, the image size and one
# entry per frame holding its image path, mask path, 4x4 matrix and (unless
# USE_COMMON_INTRINSICS is set) its own intrinsics.

# Size of the first image file on disk. The handle is closed as soon as the
# size has been read.
with Image.open(images_array[0]) as img:
    width, height = img.size

# NOTE(review): w/h below are the on-disk image size, while the intrinsics come
# from the optimised scene, which presumably works at the resized network
# resolution (load_images(..., size=512)). With RESCALE_FACTOR = 1 the two are
# not rescaled to match — confirm whether width/512 is intended.
RESCALE_FACTOR = 1 #width/512
FIXED_FOCAL_LENGTH = 4.74  # NOTE(review): not used anywhere in this cell

transforms = {}
transforms["camera_model"] = "OPENCV"
if USE_COMMON_INTRINSICS:
    # One shared intrinsic set: the mean over all frames.
    intrinsic_mean = intrinsics.mean(dim=0)
    transforms["fl_x"] = intrinsic_mean[0,0].item() * RESCALE_FACTOR
    transforms["fl_y"] = intrinsic_mean[1,1].item() * RESCALE_FACTOR
    transforms["w"] = width
    transforms["h"] = height
    transforms["cx"] = intrinsic_mean[0,2].item() * RESCALE_FACTOR
    transforms["cy"] = intrinsic_mean[1,2].item() * RESCALE_FACTOR

transforms["frames"] = []

# Axis flip (y and z negated). Only referenced by a currently disabled variant
# of the matrix computation below; kept so that variant can be restored.
OPENGL = np.array([[1, 0, 0, 0],
                    [0, -1, 0, 0],
                    [0, 0, -1, 0],
                    [0, 0, 0, 1]])

for i in range(len(poses)):
    # Skip frames whose confidence mask is zero everywhere.
    if (confidence_masks[i]==0).all():
        continue

    frame = {}
    # Paths are stored relative to DATA_PATH: "<folder>/<file>".
    frame["file_path"] = "/".join(images_array[i].split("/")[-2:])
    # The stored matrix is the inverse of the optimised pose.
    # (Disabled variant: OPENGL @ inverse, inverted again.)
    pose = poses[i].detach().cpu().numpy()
    frame["transform_matrix"] = np.linalg.inv(pose).tolist()
    frame["mask_path"] = "/".join(masks_array[i].split("/")[-2:])
    transforms["frames"].append(frame)

    if not USE_COMMON_INTRINSICS:
        # Per-frame intrinsics.
        frame["fl_x"] = intrinsics[i,0,0].item() * RESCALE_FACTOR
        frame["fl_y"] = intrinsics[i,1,1].item() * RESCALE_FACTOR
        frame["cx"] = intrinsics[i,0,2].item() * RESCALE_FACTOR
        frame["cy"] = intrinsics[i,1,2].item() * RESCALE_FACTOR
        # w/h are top-level keys, so the last exported frame's size wins.
        # Close each image right after reading its size instead of leaving
        # one open handle per frame.
        with Image.open(images_array[i]) as img:
            width, height = img.size
        transforms["w"] = width
        transforms["h"] = height

#Save transform file
with open(os.path.join(DATA_PATH, "transforms.json"), 'w') as f:
    json.dump(transforms, f, indent=4)

In [24]:
import roma

# Diagnostic cell: rebuild rigid transforms straight from the scene's raw pose
# parameters and print a per-frame ratio of two quaternion components.
all_poses = torch.stack(list(scene.im_poses))
# Columns 0..3 of each pose row are used as a quaternion and L2-normalised.
Q = all_poses[:,:4]
Q = torch.nn.functional.normalize(Q, p=2, dim=1)
# Columns 4..6 are passed through signed_expm1 and used as the translation
# (signed_expm1 presumably comes from one of the star imports — confirm).
T = signed_expm1(all_poses[:,4:7])
tf = roma.RigidUnitQuat(Q, T).normalize().to_homogeneous()

# Axis flip (y and z negated) as a float32 tensor on the compute device.
# NOTE: this rebinds the name OPENGL, which the transforms cell defines as a
# NumPy array.
OPENGL = torch.tensor([[1, 0, 0, 0],
                       [0, -1, 0, 0],
                       [0, 0, -1, 0],
                       [0, 0, 0, 1]], dtype=torch.float32).to(device)

tf = torch.matmul(tf, OPENGL)

# NOTE(review): the homogeneous matrices and the OPENGL product computed above
# are discarded here — `tf` is rebound to the quaternion form, and only this
# value feeds the print below.
tf = roma.RigidUnitQuat(Q, T).normalize()
# Ratio of the first to the fourth quaternion component for every frame
# (x / w if roma's XYZW ordering applies — confirm).
print(tf.linear[:,0]/tf.linear[:,3])

tensor([ 6.0194e-02,  1.3301e-02, -7.9469e-04,  2.2044e-02,  1.6840e-01,
        -1.2183e+00, -2.1366e-01,  3.1851e-02, -9.0499e-02, -7.3949e-02,
         4.7403e-03, -1.3288e-03,  7.4363e-03, -2.7734e-02, -1.8639e-02,
        -2.9648e-02, -1.5664e-02, -2.5353e-03, -1.6939e-02, -2.8425e-02,
        -1.6713e-02, -8.4597e-03, -3.1244e-03,  6.5929e-03,  4.6599e-02,
         1.5926e-02,  1.2407e-02, -7.0894e-04,  1.3250e-03,  4.6859e-03,
        -1.1399e-03, -9.5838e-04, -2.0674e-03,  3.2499e-03, -5.9040e-03,
        -8.2583e-03, -4.0776e-03,  8.7962e-03, -2.3547e-02, -8.8475e-04,
        -2.1960e-02, -1.4557e-02,  7.9809e-03,  1.3969e-02,  2.0686e-02,
         1.1537e-02,  2.2201e-02,  3.3209e-02,  4.0130e-02, -9.3726e-03],
       device='cuda:0', grad_fn=<DivBackward0>)
