In [1]:
import torchprofile
from torchinfo import summary
from dust3r.model import AsymmetricCroCo3DStereo
from dust3r.datasets.co3d import Co3d
from dust3r.datasets import get_data_loader 
def build_dataset(dataset, batch_size, num_workers, test=False):
    split = ['Train', 'Test'][test]
    print(f'Building {split} Data loader for dataset: ', dataset)
    loader = get_data_loader(dataset,
                             batch_size=batch_size,
                             num_workers=num_workers,
                             pin_mem=True,
                             shuffle=not (test),
                             drop_last=not (test))

    print(f"{split} dataset length: ", len(loader))
    return loader
args = {}
inf = float('inf')
dataset = Co3d(split='train', ROOT="data/co3d_subset_processed", resolution=512, aug_crop=16)
data_loader_train = build_dataset(dataset,1, 1, test=False)
view1, view2 = next(iter(data_loader_train))


Building Train Data loader for dataset:  Co3d(19800 pairs,split='train',seed=None,resolutions=[512x512],transform=Compose( ToTensor() Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))))
Train dataset length:  19800


In [2]:
import torch
import torchprofile

args = {
    'pos_embed': 'RoPE100',
    'img_size': (512, 512),
    'head_type': 'dsca',
    'output_mode': 'pts3d',
    'depth_mode': ('exp', -inf, inf),
    'conf_mode': ('exp', 1, inf),
    'enc_embed_dim': 1024,
    'enc_depth': 24,
    'enc_num_heads': 16,
    'dec_embed_dim': 768,
    'dec_depth': 12,
    'dec_num_heads': 12
}
model = AsymmetricCroCo3DStereo(**args)
total_params = sum(p.numel() for p in model.parameters())
print("Total parameters:", total_params)

#model.__repr__
#summary(model,col_names=("input_size","output_size","num_params",),row_settings=('var_names','depth'), depth=10,input_size=(3,224,224))
# 모델 FLOPs 측정
flops = torchprofile.profile_macs(model, (view1, view2))
gflops = flops / 1e9

print(f"GFLOPs: {gflops}")
print(f"FLOPs: {flops}")


Total parameters: 566476808
13 13
GFLOPs: 1291.126442241
FLOPs: 1291126442241




In [3]:
args = {
    'pos_embed': 'RoPE100',
    'img_size': (512, 512),
    'head_type': 'dpt',
    'output_mode': 'pts3d',
    'depth_mode': ('exp', -inf, inf),
    'conf_mode': ('exp', 1, inf),
    'enc_embed_dim': 1024,
    'enc_depth': 24,
    'enc_num_heads': 16,
    'dec_embed_dim': 768,
    'dec_depth': 12,
    'dec_num_heads': 12
}
model = AsymmetricCroCo3DStereo(**args)
total_params = sum(p.numel() for p in model.parameters())
print("Total parameters:", total_params)

Total parameters: 571171208


In [2]:
pred1, pred2 = model(view1, view2)
print(pred1['conf'].shape)
print(pred1['pts3d'].shape)

print(pred2['conf'].shape)
print(pred2['pts3d_in_other_view'].shape)

+++++++++++++++++++DPThead+++++++++++++++++++
layers0 torch.Size([1, 1024, 1024])
layers1 torch.Size([1, 1024, 768])
layers2 torch.Size([1, 1024, 768])
layers3 torch.Size([1, 1024, 768])
++++++++++++++++++++++++++++++++
reshape0 torch.Size([1, 1024, 32, 32])
reshape1 torch.Size([1, 768, 32, 32])
reshape2 torch.Size([1, 768, 32, 32])
reshape3 torch.Size([1, 768, 32, 32])
++++++++++++++++++++++++++++++++
ks0 torch.Size([1, 96, 16, 16])
ks1 torch.Size([1, 192, 16, 16])
ks2 torch.Size([1, 384, 16, 16])
ks3 torch.Size([1, 768, 16, 16])
vs0 torch.Size([1, 96, 16, 16])
vs1 torch.Size([1, 192, 16, 16])
vs2 torch.Size([1, 384, 16, 16])
vs3 torch.Size([1, 768, 16, 16])
qs0 torch.Size([1, 96, 32, 32])
qs1 torch.Size([1, 192, 32, 32])
qs2 torch.Size([1, 384, 32, 32])
qs3 torch.Size([1, 768, 32, 32])
q torch.Size([1, 96, 1024])
k torch.Size([1, 96, 256])
v torch.Size([1, 96, 256])
q torch.Size([1, 192, 1024])
k torch.Size([1, 192, 256])
v torch.Size([1, 192, 256])
q torch.Size([1, 384, 1024])
k tor

In [None]:
print(pred1['conf'].shape)
print(pred1['pts3d'].shape)

print(pred2['conf'].shape)
print(pred2['pts3d_in_other_view'].shape)

torch.Size([1, 512, 512])
torch.Size([1, 512, 512, 3])
torch.Size([1, 512, 512])
torch.Size([1, 512, 512, 3])


++++++++++++++++++ENCODER++++++++++++++++++
feat1 torch.Size([1, 1024, 1024])
feat2 torch.Size([1, 1024, 1024])
++++++++++++++++++DECODER++++++++++++++++++
0 _f1 torch.Size([1, 1024, 1024])
0 _f2 torch.Size([1, 1024, 1024])
decoder_embed f1 torch.Size([1, 1024, 768])
decoder_embed f2 torch.Size([1, 1024, 768])
1 _f1 torch.Size([1, 1024, 768])
1 _f2 torch.Size([1, 1024, 768])
2 _f1 torch.Size([1, 1024, 768])
2 _f2 torch.Size([1, 1024, 768])
3 _f1 torch.Size([1, 1024, 768])
3 _f2 torch.Size([1, 1024, 768])
4 _f1 torch.Size([1, 1024, 768])
4 _f2 torch.Size([1, 1024, 768])
5 _f1 torch.Size([1, 1024, 768])
5 _f2 torch.Size([1, 1024, 768])
6 _f1 torch.Size([1, 1024, 768])
6 _f2 torch.Size([1, 1024, 768])
7 _f1 torch.Size([1, 1024, 768])
7 _f2 torch.Size([1, 1024, 768])
8 _f1 torch.Size([1, 1024, 768])
8 _f2 torch.Size([1, 1024, 768])
9 _f1 torch.Size([1, 1024, 768])
9 _f2 torch.Size([1, 1024, 768])
10 _f1 torch.Size([1, 1024, 768])
10 _f2 torch.Size([1, 1024, 768])
11 _f1 torch.Size([1, 1024, 768])
11 _f2 torch.Size([1, 1024, 768])
12 _f1 torch.Size([1, 1024, 768])
12 _f2 torch.Size([1, 1024, 768])
final_output before 14
final_output after 13
13
+++++++++++++++++++DPThead+++++++++++++++++++
adapt_tokens0 torch.Size([1, 1024, 1024])
adapt_tokens1 torch.Size([1, 1024, 768])
adapt_tokens2 torch.Size([1, 1024, 768])
adapt_tokens3 torch.Size([1, 1024, 768])
++++++++++++++++++++++++++++++++
reshape0 torch.Size([1, 1024, 32, 32])
reshape1 torch.Size([1, 768, 32, 32])
reshape2 torch.Size([1, 768, 32, 32])
reshape3 torch.Size([1, 768, 32, 32])
++++++++++++++++++++++++++++++++
act_postprocess0 torch.Size([1, 96, 128, 128])
act_postprocess1 torch.Size([1, 192, 64, 64])
act_postprocess2 torch.Size([1, 384, 32, 32])
act_postprocess3 torch.Size([1, 768, 16, 16])
++++++++++++++++++++++++++++++++
rn0 torch.Size([1, 256, 128, 128])
rn1 torch.Size([1, 256, 64, 64])
rn2 torch.Size([1, 256, 32, 32])
rn3 torch.Size([1, 256, 16, 16])
++++++++++++++++++++++++++++++++
layers3 torch.Size([1, 256, 16, 16])
path4 torch.Size([1, 256, 32, 32]) layer2 torch.Size([1, 256, 32, 32])
path3 torch.Size([1, 256, 64, 64]) layer1: torch.Size([1, 256, 64, 64])
path2 torch.Size([1, 256, 128, 128]) layer0: torch.Size([1, 256, 128, 128])
path1 torch.Size([1, 256, 256, 256])
+++++++++++++++++++DPThead+++++++++++++++++++
adapt_tokens0 torch.Size([1, 1024, 1024])
adapt_tokens1 torch.Size([1, 1024, 768])
adapt_tokens2 torch.Size([1, 1024, 768])
adapt_tokens3 torch.Size([1, 1024, 768])
++++++++++++++++++++++++++++++++
reshape0 torch.Size([1, 1024, 32, 32])
reshape1 torch.Size([1, 768, 32, 32])
reshape2 torch.Size([1, 768, 32, 32])
reshape3 torch.Size([1, 768, 32, 32])
++++++++++++++++++++++++++++++++
act_postprocess0 torch.Size([1, 96, 128, 128])
act_postprocess1 torch.Size([1, 192, 64, 64])
act_postprocess2 torch.Size([1, 384, 32, 32])
act_postprocess3 torch.Size([1, 768, 16, 16])
++++++++++++++++++++++++++++++++
rn0 torch.Size([1, 256, 128, 128])
rn1 torch.Size([1, 256, 64, 64])
rn2 torch.Size([1, 256, 32, 32])
rn3 torch.Size([1, 256, 16, 16])
++++++++++++++++++++++++++++++++
layers3 torch.Size([1, 256, 16, 16])
path4 torch.Size([1, 256, 32, 32]) layer2 torch.Size([1, 256, 32, 32])
path3 torch.Size([1, 256, 64, 64]) layer1: torch.Size([1, 256, 64, 64])
path2 torch.Size([1, 256, 128, 128]) layer0: torch.Size([1, 256, 128, 128])
path1 torch.Size([1, 256, 256, 256])
torch.Size([1, 512, 512])
torch.Size([1, 512, 512, 3])

헤더의 첫 입력은 인코더에서 나온 출력, 두번째 부터 디코더에서 나온 출력
원래는 reshape해주고 act_postprocess 에서 cnn과 cnntranspose를 해주고 (w,h)를 2배로 증가
그 결과를 layer_rn층을 통과하는데 이 층의 역할은 차원을 통일시켜주는 역할이다. (c)를 통일 시켜줌
다음에 fuse층을 통과하면 잔차연결을 통해 레이어를 통합,확장시켜준다. (c,w,h)+(c,w,h) = (c,w*2,h*2) 이 과정을 hierachical하게 반복하면서
최종적으로 (last_dim = feature_dim//2, 256, 256)이 나오게 된다.



adapt_tokens 에나온 출력을 활용해서 CA를 하는 방안을 생각해보자
그러면 그 결과를 reshape해서 다시 act_postprocess로 넣어주고...
크로스어텐션한 결과를 써주니까 레이어를 퓨전하는 과정을 생략해도 될 것 같다.



In [None]:
import torch
from dust3r.dpt_custom import DPTOutputAdapter

args = {
    'num_channels': 1,
    'stride_level': 1,
    'patch_size': 16,
    'main_tasks': ('rgb',),
    'hooks': [2, 5, 8, 11],
    'layer_dims': [96, 192, 384, 768],
    'feature_dim': 196,
    'last_dim': 32,
    'use_bn': False,
    'dim_tokens_enc': 768,
    'head_type': 'regression',
    'output_width_ratio': 1
}
model = DPTOutputAdapter(**args)
model.init()
#print(model.__repr__())
# Create dummy input
encoder_tokens = [(torch.ones(1,1024,768)+i) for i in range(13)]

image_size = (512, 512)

# Forward pass
output = model(encoder_tokens, image_size)

# Print the output
print("output",output.shape)

output torch.Size([1, 1, 512, 512])


In [None]:
import torch
from torch import nn

class Custom_Head(nn.Moudle):
    def __init__(self,
                 last_dim,
                 use_bn,
                 dim_tokens_enc,
                 output_width_ratio,
                 **kwargs):
        super().__init__()
        

AttributeError: module 'torch.nn' has no attribute 'Moudle'

In [None]:
encoder_tokens = [(torch.ones(1,1024,768)+i) for i in range(13)]
for i in range(13):
    print(encoder_tokens[i])

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]]])
tensor([[[2., 2., 2.,  ..., 2., 2., 2.],
         [2., 2., 2.,  ..., 2., 2., 2.],
         [2., 2., 2.,  ..., 2., 2., 2.],
         ...,
         [2., 2., 2.,  ..., 2., 2., 2.],
         [2., 2., 2.,  ..., 2., 2., 2.],
         [2., 2., 2.,  ..., 2., 2., 2.]]])
tensor([[[3., 3., 3.,  ..., 3., 3., 3.],
         [3., 3., 3.,  ..., 3., 3., 3.],
         [3., 3., 3.,  ..., 3., 3., 3.],
         ...,
         [3., 3., 3.,  ..., 3., 3., 3.],
         [3., 3., 3.,  ..., 3., 3., 3.],
         [3., 3., 3.,  ..., 3., 3., 3.]]])
tensor([[[4., 4., 4.,  ..., 4., 4., 4.],
         [4., 4., 4.,  ..., 4., 4., 4.],
         [4., 4., 4.,  ..., 4., 4., 4.],
         ...,
         [4., 4., 4.,  ..., 4., 4., 4.],
         [4., 4., 4.,  ..., 4., 4., 

In [None]:
# from torch import nn
# from torch import torch

# input_size = 768
# output_size = 768
# input_tensor = torch.randn(1, 768, 768)

# conv1d = nn.Conv1d(768, 768,1)
# linear = nn.Linear(768, 768)
# print("conv1d parameter sizes:")
# for name, param in conv1d.named_parameters():
#     print(f"{name}: {param.size()}")

# print("\nlinear parameter sizes:")
# for name, param in linear.named_parameters():
#     print(f"{name}: {param.size()}")

# conv1d_params = sum(p.numel() for p in conv1d.parameters())
# linear_params = sum(p.numel() for p in linear.parameters())

# param_diff = conv1d_params - linear_params
# print(f"The difference in parameter count between conv1d and linear is: {param_diff}")

conv1d parameter sizes:
weight: torch.Size([768, 768, 1])
bias: torch.Size([768])

linear parameter sizes:
weight: torch.Size([768, 768])
bias: torch.Size([768])
The difference in parameter count between conv1d and linear is: 0
