In [None]:
import copy
import math

import torch
import torch.nn as nn
from torch.nn.init import constant_, xavier_uniform_

from ultralytics.utils.tal import TORCH_1_10, dist2bbox, dist2rbox, make_anchors

from ultralytics.nn.modules.block import DFL, BNContrastiveHead, ContrastiveHead, Proto
from ultralytics.nn.modules.conv import Conv, DWConv
from ultralytics.nn.modules.transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
from ultralytics.nn.modules.utils import bias_init_with_prob, linear_init


class Detect_onebone(nn.Module):
    """YOLOv8 Detect head for detection models."""

    dynamic = False  # force grid reconstruction
    export = False  # export mode
    end2end = False  # end2end
    max_det = 300  # max_det
    shape = None
    anchors = torch.empty(0)  # init
    strides = torch.empty(0)  # init

    def __init__(self, nc=80, ch=()):
        """Initializes the YOLOv8 detection layer with specified number of classes and channels."""
        super().__init__()
        self.nc = nc  # number of classes
        self.nl = len(ch)  # number of detection layers
        self.reg_max = 16  # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
        self.no = nc + self.reg_max * 4  # number of outputs per anchor
        self.stride = torch.zeros(self.nl)  # strides computed during build
        c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100))  # channels
        self.cv2 = nn.ModuleList(
            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
        )
        self.cv3 = nn.ModuleList(
            nn.Sequential(
                nn.Sequential(DWConv(x, x, 3), Conv(x, c3, 1)),
                nn.Sequential(DWConv(c3, c3, 3), Conv(c3, c3, 1)),
                nn.Conv2d(c3, self.nc, 1),
            )
            for x in ch
        )
        self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()

        if self.end2end:
            self.one2one_cv2 = copy.deepcopy(self.cv2)
            self.one2one_cv3 = copy.deepcopy(self.cv3)

    def forward(self, x):
        """Concatenates and returns predicted bounding boxes and class probabilities."""
        if self.end2end:
            return self.forward_end2end(x)

        for i in range(self.nl):
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
        if self.training:  # Training path
            return x
        y = self._inference(x)
        return y if self.export else (y, x)

    def forward_end2end(self, x):
        """
        Performs forward pass of the v10Detect module.

        Args:
            x (tensor): Input tensor.

        Returns:
            (dict, tensor): If not in training mode, returns a dictionary containing the outputs of both one2many and one2one detections.
                           If in training mode, returns a dictionary containing the outputs of one2many and one2one detections separately.
        """
        x_detach = [xi.detach() for xi in x]
        one2one = [
            torch.cat((self.one2one_cv2[i](x_detach[i]), self.one2one_cv3[i](x_detach[i])), 1) for i in range(self.nl)
        ]
        for i in range(self.nl):
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
        if self.training:  # Training path
            return {"one2many": x, "one2one": one2one}

        y = self._inference(one2one)
        y = self.postprocess(y.permute(0, 2, 1), self.max_det, self.nc)
        return y if self.export else (y, {"one2many": x, "one2one": one2one})

    def _inference(self, x):
        """Decode predicted bounding boxes and class probabilities based on multiple-level feature maps."""
        # Inference path
        shape = x[0].shape  # BCHW
        x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
        if self.dynamic or self.shape != shape:
            self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
            self.shape = shape

        if self.export and self.format in {"saved_model", "pb", "tflite", "edgetpu", "tfjs"}:  # avoid TF FlexSplitV ops
            box = x_cat[:, : self.reg_max * 4]
            cls = x_cat[:, self.reg_max * 4 :]
        else:
            box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)

        if self.export and self.format in {"tflite", "edgetpu"}:
            # Precompute normalization factor to increase numerical stability
            # See https://github.com/ultralytics/ultralytics/issues/7371
            grid_h = shape[2]
            grid_w = shape[3]
            grid_size = torch.tensor([grid_w, grid_h, grid_w, grid_h], device=box.device).reshape(1, 4, 1)
            norm = self.strides / (self.stride[0] * grid_size)
            dbox = self.decode_bboxes(self.dfl(box) * norm, self.anchors.unsqueeze(0) * norm[:, :2])
        else:
            dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides

        return torch.cat((dbox, cls.sigmoid()), 1)

    def bias_init(self):
        """Initialize Detect() biases, WARNING: requires stride availability."""
        m = self  # self.model[-1]  # Detect() module
        # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
        # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum())  # nominal class frequency
        for a, b, s in zip(m.cv2, m.cv3, m.stride):  # from
            a[-1].bias.data[:] = 1.0  # box
            b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
        if self.end2end:
            for a, b, s in zip(m.one2one_cv2, m.one2one_cv3, m.stride):  # from
                a[-1].bias.data[:] = 1.0  # box
                b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)

    def decode_bboxes(self, bboxes, anchors):
        """Decode bounding boxes."""
        return dist2bbox(bboxes, anchors, xywh=not self.end2end, dim=1)

    @staticmethod
    def postprocess(preds: torch.Tensor, max_det: int, nc: int = 80):
        """
        Post-processes YOLO model predictions.

        Args:
            preds (torch.Tensor): Raw predictions with shape (batch_size, num_anchors, 4 + nc) with last dimension
                format [x, y, w, h, class_probs].
            max_det (int): Maximum detections per image.
            nc (int, optional): Number of classes. Default: 80.

        Returns:
            (torch.Tensor): Processed predictions with shape (batch_size, min(max_det, num_anchors), 6) and last
                dimension format [x, y, w, h, max_class_prob, class_index].
        """
        batch_size, anchors, _ = preds.shape  # i.e. shape(16,8400,84)
        boxes, scores = preds.split([4, nc], dim=-1)
        index = scores.amax(dim=-1).topk(min(max_det, anchors))[1].unsqueeze(-1)
        boxes = boxes.gather(dim=1, index=index.repeat(1, 1, 4))
        scores = scores.gather(dim=1, index=index.repeat(1, 1, nc))
        scores, index = scores.flatten(1).topk(min(max_det, anchors))
        i = torch.arange(batch_size)[..., None]  # batch indices
        return torch.cat([boxes[i, index // nc], scores[..., None], (index % nc)[..., None].float()], dim=-1)

import torch

def test_detect_onebone():
    # 테스트 케이스 설정
    nc = 80  # 클래스 수
    ch = [64, 128, 256]  # 다양한 레이어의 채널 크기
    model = Detect_onebone(nc=nc, ch=ch)

    # 더미 입력 텐서 생성 (배치 크기, 채널 수, 높이, 너비 형태)
    batch_size = 2
    height, width = 80, 80
    dummy_input = [torch.rand(batch_size, c, height // (2 ** i), width // (2 ** i)) for i, c in enumerate(ch)]

    # 전방 패스
    model.eval()  # 비훈련 모드 설정
    output = model(dummy_input)

    # 출력 형태 확인
    if isinstance(output, tuple):
        preds = output[0]
    else:
        preds = output

    # preds 크기 확인
    assert isinstance(preds, torch.Tensor), "출력의 각 요소는 텐서여야 합니다."
    print("출력 형태:", preds.shape)  # preds의 형태를 출력하여 크기 확인

    # preds의 마지막 차원 크기 확인
    assert preds.shape[-1] == 4 + nc, "preds의 마지막 차원 크기는 4 + nc와 같아야 합니다."

    # 후처리 테스트
    max_det = 300
    processed = model.postprocess(preds, max_det, nc)

    # 후처리 출력 형태 확인
    assert processed.shape == (batch_size, min(max_det, preds.shape[1]), 6), \
        "후처리된 출력 형태가 예상과 다릅니다."

    print("테스트 통과!")

# 테스트 실행
test_detect_onebone()




출력 형태: torch.Size([2, 84, 8400])


RuntimeError: split_with_sizes expects split_sizes to sum exactly to 8400 (input tensor's size at dimension -1), but got split_sizes=[4, 80]

In [None]:
from ultralytics import YOLO
import torch.nn as nn
import torch

# YOLO 모델 로드
model = YOLO("yolo11m.pt")

# YOLO의 Detect 모듈 수정: 마지막 레이어를 6개의 파라미터 (cx, cy, w, h, angle)로 확장
class RotatedYOLO(nn.Module):
    def __init__(self, base_model):
        super(RotatedYOLO, self).__init__()
        self.base_model = base_model

        # YOLO의 Detect 레이어 가져오기
        self.detect_layer = self.base_model.model.model[-1]

        # YOLO의 기존 클래스 개수
        num_classes = self.detect_layer.nc

        # 새로운 Conv 레이어 추가: (cx, cy, w, h, angle) + obj_conf + class_probs
        in_channels = self.detect_layer.cv3[-1][2].out_channels  # 마지막 Conv 레이어의 출력 채널
        out_channels = 5 + 1 + num_classes  # (cx, cy, w, h, angle, obj_conf + class_probs)

        # 새로운 Conv2d 레이어 생성
        self.new_conv = nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1))

        # Detect 레이어의 cv3 수정
        self.detect_layer.cv3[-1].add_module('new_conv', self.new_conv)

    def forward(self, x):
        # YOLO의 forward 과정을 수행
        return self.base_model(x)

    def train(self, *args, **kwargs):
        # YOLO의 기본 train 메소드 호출
        self.base_model.train(*args, **kwargs)

        # 필요한 커스텀 손실 함수를 train 루틴에 포함하기 위해 YOLO의 train 메소드 수정
        self.base_model.loss = self.custom_loss  # 손실 함수를 커스텀으로 변경

    # 커스텀 손실 함수 정의
    def custom_loss(self, preds, targets):
        # 바운딩 박스 손실 계산 (cx, cy, w, h)
        bbox_loss = nn.MSELoss()(preds[:, :4], targets[:, :4])  # (cx, cy, w, h)

        # 각도 손실 계산 (angle)
        angle_loss = nn.MSELoss()(preds[:, 4], targets[:, 4])  # angle

        # 총 손실 계산 (기존 바운딩 박스 손실 + 각도 손실)
        total_loss = bbox_loss + 0.1 * angle_loss  # 0.1은 각도 손실 가중치
        return total_loss


# 기존 모델에 수정된 구조 적용
rotated_model = RotatedYOLO(model)

# 학습 시작 (rotated_model을 사용하여 훈련)
results = rotated_model.train(
    data='data.yaml',  # 데이터 설정 파일
    epochs=50,         # 학습 epoch 수
    imgsz=256,         # 입력 이미지 크기
    batch=16,          # 배치 크기
    name='rotated_ship_detection',  # 실험 이름
    device=0           # GPU 사용 설정
)


In [25]:
import torch
from ultralytics import YOLO

# YOLO 모델 로드
model = YOLO("yolo11m.pt")

# 모델의 전체 구조를 요약해서 출력
print(model)
# print(model.model)
# print(model.model.model)
# print(model.model.model[-1])



YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C3k2(
        (cv1): Conv(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(192, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(256, eps=0.001, momentum=0.03, affine=True, track_

In [8]:

# 모델의 레이어 이름과 모듈을 모두 출력
for name, module in model.model.named_modules():
    print(f"Layer Name: {name}, Module: {module}")


Layer Name: , Module: DetectionModel(
  (model): Sequential(
    (0): Conv(
      (conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (1): Conv(
      (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (2): C3k2(
      (cv1): Conv(
        (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (cv2): Conv(
        (conv): Conv2d(192, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(256, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): 

In [10]:
# YOLO의 Detect 모듈 출력
detect_layer = model.model.model[-1]  # Detect 레이어에 접근
print(detect_layer)


Detect(
  (cv2): ModuleList(
    (0): Sequential(
      (0): Conv(
        (conv): Conv2d(256, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
    )
    (1-2): 2 x Sequential(
      (0): Conv(
        (conv): Conv2d(512, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)

In [44]:
import torch
from ultralytics import YOLO

# YOLO 모델 로드
model = YOLO("yolo11m.pt")

import torch
from ultralytics import YOLO

# YOLO 모델 로드
model = YOLO("yolo11m.pt")

for i in range(len(model.model.model)):
    print(f"Layer {i}: {model.model.model[i]}")


Layer 0: Conv(
  (conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
  (act): SiLU(inplace=True)
)
Layer 1: Conv(
  (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
  (act): SiLU(inplace=True)
)
Layer 2: C3k2(
  (cv1): Conv(
    (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
    (act): SiLU(inplace=True)
  )
  (cv2): Conv(
    (conv): Conv2d(192, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(256, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
    (act): SiLU(inplace=True)
  )
  (m): ModuleList(
    (0): C3k(
      (cv1): Conv(
        (conv): Conv2d(64, 32, kernel_size=(1, 1), str

In [52]:
import torch
from ultralytics import YOLO

# YOLO 모델 로드
model = YOLO("yolo11m.pt")

# Backbone과 Head 모듈 가져오기
backbone = model.model.model[:-1]  # Head를 제외한 Backbone
head = model.model.model[-1]        # Head

# 입력 텐서 생성 (예: 1x3x256x256)
input_tensor = torch.randn(1, 3, 256, 256)

# Backbone에 입력 텐서를 전달하여 출력 텐서 생성
backbone_output = backbone(input_tensor)

TypeError: cat() received an invalid combination of arguments - got (Tensor, int), but expected one of:
 * (tuple of Tensors tensors, int dim, *, Tensor out)
 * (tuple of Tensors tensors, name dim, *, Tensor out)


In [40]:
import torch
from ultralytics import YOLO

# YOLO 모델 로드
model = YOLO("yolo11m.pt")

# 임의의 입력 텐서 생성 (배치 크기 1, 채널 수 3, 높이 256, 너비 256)
input_tensor = torch.randn(1, 3, 256, 256)

# 출력의 shape을 저장할 리스트
output_shapes = []

# 각 레이어의 출력을 캡처할 hook 함수 정의
def get_output_shape(module, input, output):
    # 출력의 shape을 추가하는 재귀 함수
    def append_shapes(output_item):
        if isinstance(output_item, (tuple, list)):
            for item in output_item:
                append_shapes(item)  # 재귀적으로 호출
        else:
            output_shapes.append(output_item.shape)  # 텐서인 경우 shape 저장

    append_shapes(output)  # 처음 호출

# 모델의 모든 레이어에 hook 등록
for layer in model.model.modules():
    if hasattr(layer, 'register_forward_hook'):
        layer.register_forward_hook(get_output_shape)

# Pass the input tensor through the model
with torch.no_grad():  # 기울기 계산을 하지 않도록 설정
    model(input_tensor)  # 출력이 필요 없으므로 할당하지 않음

# Print the output shapes
for i, shape in enumerate(output_shapes):
    print(f"Layer {i + 1} output shape: {shape}")



0: 256x256 (no detections), 41.0ms
Speed: 0.0ms preprocess, 41.0ms inference, 2.0ms postprocess per image at shape (1, 3, 256, 256)
Layer 1 output shape: torch.Size([1, 64, 320, 320])
Layer 2 output shape: torch.Size([1, 64, 320, 320])
Layer 3 output shape: torch.Size([1, 128, 160, 160])
Layer 4 output shape: torch.Size([1, 128, 160, 160])
Layer 5 output shape: torch.Size([1, 128, 160, 160])
Layer 6 output shape: torch.Size([1, 128, 160, 160])
Layer 7 output shape: torch.Size([1, 32, 160, 160])
Layer 8 output shape: torch.Size([1, 32, 160, 160])
Layer 9 output shape: torch.Size([1, 32, 160, 160])
Layer 10 output shape: torch.Size([1, 32, 160, 160])
Layer 11 output shape: torch.Size([1, 32, 160, 160])
Layer 12 output shape: torch.Size([1, 32, 160, 160])
Layer 13 output shape: torch.Size([1, 32, 160, 160])
Layer 14 output shape: torch.Size([1, 32, 160, 160])
Layer 15 output shape: torch.Size([1, 32, 160, 160])
Layer 16 output shape: torch.Size([1, 32, 160, 160])
Layer 17 output shape: t