Input Image [Batch, 3, H, W]
       |
(0): Conv2d -> [Batch, 64, H/2, W/2]
       |
(1): Conv2d -> [Batch, 128, H/4, W/4]
       |
(2): C3k2 -> [Batch, 256, H/4, W/4]
       |
(3): Conv2d -> [Batch, 256, H/8, W/8]
       |
(4): C3k2 -> [Batch, 512, H/8, W/8]
       |
(5): Conv2d -> [Batch, 512, H/16, W/16]
       |
(6): C3k2 -> [Batch, 512, H/16, W/16]
       |
(7): Conv2d -> [Batch, 512, H/32, W/32]
       |
(8): C3k2 -> [Batch, 512, H/32, W/32]
       |
(9): SPPF -> [Batch, 512, H/32, W/32]
       |
(10): C2PSA -> [Batch, 512, H/32, W/32]
       |
(11): Upsample x2 -> [Batch, 512, H/16, W/16]
       |
(12): Concat with (6) output
       |
(13): C3k2 -> [Batch, 512, H/16, W/16]
       |
(14): Upsample x2 -> [Batch, 512, H/8, W/8]
       |
(15): Concat with (4) output
       |
(16): C3k2 -> [Batch, 256, H/8, W/8]
       |
(17): Conv2d (Downsample) -> [Batch, 256, H/16, W/16]
       |
(18): Concat with (13) output
       |
(19): C3k2 -> [Batch, 512, H/16, W/16]
       |
(20): Conv2d (Downsample) -> [Batch, 512, H/32, W/32]
       |
(21): Concat with (9) output
       |
(22): C3k2 -> [Batch, 512, H/32, W/32]
       |
(23): Detect
      - Small Objects: [Batch, 80, H/8, W/8]
      - Medium Objects: [Batch, 80, H/16, W/16]
      - Large Objects: [Batch, 80, H/32, W/32]


In [5]:
from ultralytics import YOLO

yolo = YOLO("yolo11m.pt")

yolo

YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C3k2(
        (cv1): Conv(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(192, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(256, eps=0.001, momentum=0.03, affine=True, track_

In [1]:
import torch
import torch.nn as nn
from ultralytics import YOLO

# YOLO 모델 불러오기 (YOLOv5 예시)
class YOLOv5WithCustomHead(nn.Module):
    def __init__(self, yolo):
        super(YOLOv5WithCustomHead, self).__init__()
        self.yolo = yolo.model  # YOLO 모델의 모든 레이어
        
        # YOLO의 중간층에서 사용할 레이어 인덱스를 확인해야 함
        self.custom_head = nn.Sequential(
            nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),  # Global Average Pooling
            nn.Flatten(),
            nn.Linear(128, 10)  # 예시로 10개 클래스를 분류하는 헤드
        )

    def forward(self, x):
        outputs = []  # 여러 텐서를 저장할 리스트
        
        # 중간 피처 맵을 추출하는 부분
        for i, layer in enumerate(self.yolo.model):
            x = layer(x)
            if isinstance(x, list):  # x가 리스트인 경우
                x = torch.cat(x, dim=1)  # 리스트의 텐서를 결합
            if i == 9:  # 예시로 9번째 레이어 출력 사용
                feature_map = x  # 중간 피처 맵 추출
                outputs.append(feature_map)  # 필요한 피처 맵 저장

        # 기존 YOLO 아웃풋 (객체 탐지)
        yolo_output = x

        # 새로운 헤드에서 중간 피처 맵 사용
        custom_output = self.custom_head(feature_map)
        
        return yolo_output, custom_output

# 기존 YOLOv5 모델 불러오기
yolo = YOLO('yolo11m.pt')

# 커스텀 모델 생성 (YOLOv5에 새로운 헤드 추가)
model_with_custom_head = YOLOv5WithCustomHead(yolo)

# 예시 입력 데이터
input_data = torch.randn(1, 3, 640, 640)

# 모델 실행
yolo_output, custom_output = model_with_custom_head(input_data)

# 결과 출력
print("YOLO Output Shape:", yolo_output.shape)
print("Custom Head Output Shape:", custom_output.shape)



TypeError: cat() received an invalid combination of arguments - got (Tensor, int), but expected one of:
 * (tuple of Tensors tensors, int dim, *, Tensor out)
 * (tuple of Tensors tensors, name dim, *, Tensor out)


In [9]:
yolo.model

DetectionModel(
  (model): Sequential(
    (0): Conv(
      (conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (1): Conv(
      (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (2): C3k2(
      (cv1): Conv(
        (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (cv2): Conv(
        (conv): Conv2d(192, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(256, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
   