In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from ultralytics import YOLO
from torchvision import transforms
from PIL import Image
import numpy as np

In [2]:
img_path1 = '../data/rawdata_cropped/class1/2022-03-28_103204_1_T3_2346.jpg'
img_path2 = '../data/rawdata_cropped/class1/2022-03-28_103204_1_T5_2348.jpg'

# ToTensor turns a PIL image laid out as [H, W, C] into a float tensor
# [C, H, W] with values scaled to [0, 1].
transform = transforms.Compose([
    transforms.ToTensor()
])


def load_rgb(path):
    """Open the image at `path` and return it as an RGB PIL image.

    The file is opened in a context manager so the handle is released
    deterministically; `convert` returns a loaded copy, so the returned
    image remains usable after the file is closed.
    """
    with Image.open(path) as img:
        return img.convert('RGB')


# Load both images and convert them to tensors.
image1 = load_rgb(img_path1)
image2 = load_rgb(img_path2)

tensor1 = transform(image1)  # shape: (3, H, W)
tensor2 = transform(image2)

# torch.stack requires identical shapes; fail early with a readable message
# instead of the lower-level error raised from inside stack.
if tensor1.shape != tensor2.shape:
    raise ValueError(
        f"Image sizes differ: {tuple(tensor1.shape)} vs {tuple(tensor2.shape)}"
    )

# Optional: stack the two tensors into one batch of shape [2, 3, H, W].
batch = torch.stack([tensor1, tensor2])

# Print the dimensions as a sanity check.
print("Tensor1 shape:", tensor1.shape)
print("Tensor2 shape:", tensor2.shape)
print("Batch shape:", batch.shape)

Tensor1 shape: torch.Size([3, 1504, 1504])
Tensor2 shape: torch.Size([3, 1504, 1504])
Batch shape: torch.Size([2, 3, 1504, 1504])


In [6]:
# Build the model from its YAML definition and load the pretrained weights,
# then print the layer summary.
model = YOLO('yolo11x-dseg.yaml').load('weights/dual_yolo_pretrained.pt')
model.info(verbose=True)

# Prepare the tensor inputs: each image becomes a single-item batch.
# NOTE(review): the "blue"/"white" names presumably refer to the two capture
# conditions of the T3/T5 files — confirm against the dataset description.
blue_tensor = transform(image1).unsqueeze(0)   # [1, 3, H, W]
white_tensor = transform(image2).unsqueeze(0)  # [1, 3, H, W]

# Channel-wise concatenation needs matching batch and spatial dimensions;
# check explicitly so a size mismatch produces a readable error.
if blue_tensor.shape != white_tensor.shape:
    raise ValueError(
        "Both inputs must have the same shape, got "
        f"{tuple(blue_tensor.shape)} and {tuple(white_tensor.shape)}"
    )

# Stack the two RGB images along the channel axis to form one 6-channel input.
dual_tensor = torch.cat([blue_tensor, white_tensor], dim=1)  # [1, 6, H, W]

# Call the underlying module's forward directly, bypassing the predict()
# interface. eval() plus no_grad() runs inference without building an
# autograd graph.
model.model.eval()
with torch.no_grad():
    outputs = model.model(dual_tensor)

0 -1 1 Conv [64, 3, 2]
96 3
1 -1 1 Conv [128, 3, 2]
192 3
2 -1 2 C3k2 [256, False, 0.25]
384 3
3 -1 1 Conv [256, 3, 2]
384 3
4 -1 2 C3k2 [512, False, 0.25]
768 3
5 -1 1 Conv [512, 3, 2]
768 3
6 -1 2 C3k2 [512, True]
768 3
7 -1 1 Conv [1024, 3, 2]
768 3
8 -1 2 C3k2 [1024, True]
768 3
9 -1 1 SPPF [1024, 5]
768 3
10 -1 2 C2PSA [1024]
768 3
11 -1 1 Conv [64, 3, 2]
768 96
12 -1 1 Conv [128, 3, 2]
768 192
13 -1 2 C3k2 [256, False, 0.25]
768 384
14 -1 1 Conv [256, 3, 2]
768 384
15 -1 2 C3k2 [512, False, 0.25]
768 768
16 -1 1 Conv [512, 3, 2]
768 768
17 -1 2 C3k2 [512, True]
768 768
18 -1 1 Conv [1024, 3, 2]
768 768
19 -1 2 C3k2 [1024, True]
768 768
20 -1 1 SPPF [1024, 5]
768 768
21 -1 2 C2PSA [1024]
768 768
22 4 1 nn.Identity []
768 768
23 6 1 nn.Identity []
768 768
24 10 1 nn.Identity []
768 768
25 -1 1 nn.Upsample ['None', 2, 'nearest']
768 768
26 [-1, 23] 1 Concat [1]
768 1536
27 -1 2 C3k2 [512, False]
768 768
28 -1 1 nn.Upsample ['None', 2, 'nearest']
768 768
29 [-1, 22] 1 Concat [1]
768 

In [7]:
# Top-level structure of the raw forward result: number of elements, then
# the shape of the first element, one per line.
print(len(outputs), outputs[0].shape, sep="\n")

2
torch.Size([1, 39, 46389])


In [8]:
# Shapes of the first three tensors stored in outputs[1][0], one per line.
for level in range(3):
    print(outputs[1][0][level].shape)

torch.Size([1, 67, 188, 188])
torch.Size([1, 67, 94, 94])
torch.Size([1, 67, 47, 47])


In [9]:
# Shape of the second entry of outputs[1]. In the recorded run its last
# dimension (46389) equals 188*188 + 94*94 + 47*47, i.e. the total number of
# positions across the three maps in outputs[1][0].
# NOTE(review): presumably per-position mask coefficients — confirm against the head implementation.
outputs[1][1].shape

torch.Size([1, 32, 46389])

In [10]:
# Shape of the third entry of outputs[1]; the recorded run shows
# [1, 32, 376, 376] for a 1504x1504 input.
# NOTE(review): presumably the mask prototype tensor — confirm against the head implementation.
outputs[1][2].shape

torch.Size([1, 32, 376, 376])

In [11]:
# Displays the raw values of the third entry of outputs[1].
# NOTE(review): this renders a [1, 32, 376, 376] tensor repr; consider showing
# summary statistics (min / max / mean) or a small slice instead to keep the
# notebook output compact.
outputs[1][2]

tensor([[[[ 1.7710e-04,  7.1227e-05, -2.1646e-04,  ...,  1.2241e-04, -1.8428e-04,  1.4486e-04],
          [ 5.9316e-04,  8.9377e-04,  1.0883e-03,  ...,  8.8355e-04,  1.1299e-03,  1.0725e-03],
          [ 6.0623e-04,  1.2159e-03,  9.3081e-04,  ...,  1.1847e-03,  9.5595e-04,  1.1655e-03],
          ...,
          [ 9.6128e-04,  8.7854e-04,  1.5194e-03,  ...,  8.7184e-04,  1.2353e-03,  1.0161e-03],
          [ 4.8849e-04,  1.5596e-03,  8.4110e-04,  ...,  1.2496e-03,  8.7963e-04,  1.2062e-03],
          [ 3.8277e-04,  1.5843e-03,  1.3197e-03,  ...,  1.4526e-03,  1.5163e-03,  1.4957e-03]],

         [[ 1.4279e-03,  2.4088e-03,  2.4572e-03,  ...,  2.2151e-03,  2.5890e-03,  1.6382e-03],
          [ 8.7761e-04,  1.7414e-03,  1.3264e-03,  ...,  1.4527e-03,  1.5855e-03,  6.0589e-04],
          [ 1.1385e-03,  1.4630e-03,  1.4934e-03,  ...,  1.3499e-03,  1.5933e-03,  5.6698e-04],
          ...,
          [ 8.9044e-04,  1.4891e-03,  1.6814e-03,  ...,  1.5946e-03,  1.5676e-03,  6.5937e-04],
        