In [1]:
import torch
import torchprofile
from torchinfo import summary

from dust3r.model import AsymmetricCroCo3DStereo
from dust3r.datasets.co3d import Co3d
from dust3r.datasets import get_data_loader
def build_dataset(dataset, batch_size, num_workers, test=False):
    """Wrap ``dataset`` in a data loader and log which split was built.

    Args:
        dataset: Dataset object, forwarded unchanged to ``get_data_loader``.
        batch_size: Forwarded to ``get_data_loader`` as ``batch_size``.
        num_workers: Forwarded to ``get_data_loader`` as ``num_workers``.
        test: Selects the log label ('Train' when falsy, 'Test' when truthy).
            Shuffling and ``drop_last`` are enabled only when it is falsy.

    Returns:
        The loader object returned by ``get_data_loader``.
    """
    # `test` is used as an index (False -> 0, True -> 1) to pick the label.
    split = ('Train', 'Test')[test]
    is_train = not test

    print(f'Building {split} Data loader for dataset: ', dataset)
    loader = get_data_loader(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_mem=True,
        shuffle=is_train,
        drop_last=is_train,
    )
    print(f"{split} dataset length: ", len(loader))
    return loader
inf = float('inf')

# Build the Co3d training split and pull one (view1, view2) pair from it; the
# pair is what the forward-pass cell feeds to the model.
dataset = Co3d(split='train', ROOT="data/co3d_subset_processed", resolution=512, aug_crop=16)
data_loader_train = build_dataset(dataset, batch_size=1, num_workers=1, test=False)
view1, view2 = next(iter(data_loader_train))

# Constructor arguments for the two-view network. The enc_* / dec_* entries set
# the depth, width and head count of the encoder and decoder respectively.
args = {
    'pos_embed': 'RoPE100',
    'img_size': (512, 512),
    'head_type': 'dpt',
    'output_mode': 'pts3d',
    'depth_mode': ('exp', -inf, inf),
    'conf_mode': ('exp', 1, inf),
    'enc_embed_dim': 1024,
    'enc_depth': 24,
    'enc_num_heads': 16,
    'dec_embed_dim': 768,
    'dec_depth': 12,
    'dec_num_heads': 12,
}
model = AsymmetricCroCo3DStereo(**args)
#summary(model,col_names=("input_size","output_size","num_params",),row_settings=('var_names','depth'), depth=10,input_size=(3,224,224))

# Leave the module itself as the cell's last expression so the notebook shows
# its repr. Referencing `model.__repr__` without calling it only displays the
# bound-method wrapper.
model

Building Train Data loader for dataset:  Co3d(19800 pairs,split='train',seed=None,resolutions=[512x512],transform=Compose( ToTensor() Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))))
Train dataset length:  19800


<bound method Module.__repr__ of AsymmetricCroCo3DStereo(
  (patch_embed): PatchEmbedDust3R(
    (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (mask_generator): RandomMask()
  (rope): RoPE2D()
  (enc_blocks): ModuleList(
    (0-23): 24 x Block(
      (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1024, out_features=3072, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
        (rope): RoPE2D()
      )
      (drop_path): Identity()
      (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (fc2): Linear(in_features=4096, out_features=1024, bias

In [2]:
# Forward pass used only to inspect output shapes. Autograd is switched off so
# the activation graph of the whole network is not kept in memory.
with torch.no_grad():
    pred1, pred2 = model(view1, view2)

# Shapes of the confidence map and the 3D point map predicted for the first view.
print(pred1['conf'].shape)
print(pred1['pts3d'].shape)

++++++++++++++++++ENCODER++++++++++++++++++
feat1 torch.Size([1, 1024, 1024])
feat2 torch.Size([1, 1024, 1024])
++++++++++++++++++DECODER++++++++++++++++++
0 _f1 torch.Size([1, 1024, 1024])
0 _f2 torch.Size([1, 1024, 1024])
decoder_embed f1 torch.Size([1, 1024, 768])
decoder_embed f2 torch.Size([1, 1024, 768])
1 _f1 torch.Size([1, 1024, 768])
1 _f2 torch.Size([1, 1024, 768])
2 _f1 torch.Size([1, 1024, 768])
2 _f2 torch.Size([1, 1024, 768])
3 _f1 torch.Size([1, 1024, 768])
3 _f2 torch.Size([1, 1024, 768])
4 _f1 torch.Size([1, 1024, 768])
4 _f2 torch.Size([1, 1024, 768])
5 _f1 torch.Size([1, 1024, 768])
5 _f2 torch.Size([1, 1024, 768])
6 _f1 torch.Size([1, 1024, 768])
6 _f2 torch.Size([1, 1024, 768])
7 _f1 torch.Size([1, 1024, 768])
7 _f2 torch.Size([1, 1024, 768])
8 _f1 torch.Size([1, 1024, 768])
8 _f2 torch.Size([1, 1024, 768])
9 _f1 torch.Size([1, 1024, 768])
9 _f2 torch.Size([1, 1024, 768])
10 _f1 torch.Size([1, 1024, 768])
10 _f2 torch.Size([1, 1024, 768])
11 _f1 torch.Size([1, 1024

In [3]:
import torch
from dust3r.dpt_custom import DPTOutputAdapter

# Stand-alone smoke test of the DPT output adapter on synthetic tokens.
# NOTE(review): this cell rebinds `args` and `model`, which the first cell also
# defines; re-running the earlier forward-pass cell afterwards would pick up
# this adapter instead. Confirm that is intended.
args = dict(
    num_channels=1,
    stride_level=1,
    patch_size=16,
    main_tasks=('rgb',),
    hooks=[2, 5, 8, 11],
    layer_dims=[96, 192, 384, 768],
    feature_dim=196,
    last_dim=32,
    use_bn=False,
    dim_tokens_enc=768,
    head_type='regression',
    output_width_ratio=1,
)
model = DPTOutputAdapter(**args)
model.init()

# Dummy input: 13 constant tensors of shape (1, 1024, 768); the k-th tensor
# (0-based) is filled with the value k + 1 so each one is distinguishable.
encoder_tokens = [torch.ones(1, 1024, 768) + layer_idx for layer_idx in range(13)]
image_size = (512, 512)

# Run the adapter and report the shape of what it returns.
output = model(encoder_tokens, image_size)
print("output", output.shape)

layers0 torch.Size([1, 1024, 768])
layers1 torch.Size([1, 1024, 768])
layers2 torch.Size([1, 1024, 768])
layers3 torch.Size([1, 1024, 768])
++++++++++++++++++++++++++++++++
adapt_tokens0 torch.Size([1, 1024, 768])
adapt_tokens1 torch.Size([1, 1024, 768])
adapt_tokens2 torch.Size([1, 1024, 768])
adapt_tokens3 torch.Size([1, 1024, 768])
++++++++++++++++++++++++++++++++
reshape0 torch.Size([1, 768, 32, 32])
reshape1 torch.Size([1, 768, 32, 32])
reshape2 torch.Size([1, 768, 32, 32])
reshape3 torch.Size([1, 768, 32, 32])
++++++++++++++++++++++++++++++++
act_postprocess0 torch.Size([1, 96, 128, 128])
act_postprocess1 torch.Size([1, 192, 64, 64])
act_postprocess2 torch.Size([1, 384, 32, 32])
act_postprocess3 torch.Size([1, 768, 16, 16])
++++++++++++++++++++++++++++++++
rn0 torch.Size([1, 196, 128, 128])
rn1 torch.Size([1, 196, 64, 64])
rn2 torch.Size([1, 196, 32, 32])
rn3 torch.Size([1, 196, 16, 16])
++++++++++++++++++++++++++++++++
layers3 torch.Size([1, 196, 16, 16])
path4 torch.Size([1, 19

In [6]:
import torch
from torch import nn

class Custom_Head(nn.Module):
    """Placeholder head module.

    The constructor accepts the configuration arguments listed below but, as
    written, stores none of them and registers no sub-modules or parameters.

    Args:
        last_dim: Accepted but currently unused.
        use_bn: Accepted but currently unused.
        dim_tokens_enc: Accepted but currently unused.
        output_width_ratio: Accepted but currently unused.
        **kwargs: Any further keyword arguments are accepted and ignored.
    """

    def __init__(self, last_dim, use_bn, dim_tokens_enc, output_width_ratio, **kwargs):
        # Only the base-class initialisation happens here.
        super(Custom_Head, self).__init__()
        

In [7]:
# Build 13 constant tensors of shape (1, 1024, 768); the tensor at position i
# is filled with the value i + 1.
encoder_tokens = []
for i in range(13):
    encoder_tokens.append(torch.ones(1, 1024, 768) + i)

# Print every tensor in order.
for tokens in encoder_tokens:
    print(tokens)

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]]])
tensor([[[2., 2., 2.,  ..., 2., 2., 2.],
         [2., 2., 2.,  ..., 2., 2., 2.],
         [2., 2., 2.,  ..., 2., 2., 2.],
         ...,
         [2., 2., 2.,  ..., 2., 2., 2.],
         [2., 2., 2.,  ..., 2., 2., 2.],
         [2., 2., 2.,  ..., 2., 2., 2.]]])
tensor([[[3., 3., 3.,  ..., 3., 3., 3.],
         [3., 3., 3.,  ..., 3., 3., 3.],
         [3., 3., 3.,  ..., 3., 3., 3.],
         ...,
         [3., 3., 3.,  ..., 3., 3., 3.],
         [3., 3., 3.,  ..., 3., 3., 3.],
         [3., 3., 3.,  ..., 3., 3., 3.]]])
tensor([[[4., 4., 4.,  ..., 4., 4., 4.],
         [4., 4., 4.,  ..., 4., 4., 4.],
         [4., 4., 4.,  ..., 4., 4., 4.],
         ...,
         [4., 4., 4.,  ..., 4., 4., 4.],
         [4., 4., 4.,  ..., 4., 4., 

In [8]:
import torch
from torch import nn

# Compare the learnable parameters of a kernel-size-1 Conv1d with those of a
# Linear layer that has the same input and output width.
input_size = 768
output_size = 768

# Random sample tensor; it is not consumed in this cell and is kept only so any
# later cell that references it keeps working. It is square so it would be a
# valid input for both layers (Conv1d reads channels from dim 1, Linear from
# the last dim).
input_tensor = torch.randn(1, input_size, input_size)

conv1d = nn.Conv1d(input_size, output_size, 1)
linear = nn.Linear(input_size, output_size)

# List each layer's parameter tensors by name and size.
for header, layer in (("conv1d parameter sizes:", conv1d),
                      ("\nlinear parameter sizes:", linear)):
    print(header)
    for name, param in layer.named_parameters():
        print(f"{name}: {param.size()}")

# Total element counts; the conv weight only carries an extra trailing
# dimension of size 1, so the totals are expected to match.
conv1d_params = sum(p.numel() for p in conv1d.parameters())
linear_params = sum(p.numel() for p in linear.parameters())

param_diff = conv1d_params - linear_params
print(f"The difference in parameter count between conv1d and linear is: {param_diff}")

conv1d parameter sizes:
weight: torch.Size([768, 768, 1])
bias: torch.Size([768])

linear parameter sizes:
weight: torch.Size([768, 768])
bias: torch.Size([768])
The difference in parameter count between conv1d and linear is: 0
