In [7]:
import timm


In [10]:
timm.list_pretrained()
[nextvit for nextvit in timm.list_pretrained() if 'nextvit' in nextvit]

['nextvit_base.bd_in1k',
 'nextvit_base.bd_in1k_384',
 'nextvit_base.bd_ssld_6m_in1k',
 'nextvit_base.bd_ssld_6m_in1k_384',
 'nextvit_large.bd_in1k',
 'nextvit_large.bd_in1k_384',
 'nextvit_large.bd_ssld_6m_in1k',
 'nextvit_large.bd_ssld_6m_in1k_384',
 'nextvit_small.bd_in1k',
 'nextvit_small.bd_in1k_384',
 'nextvit_small.bd_ssld_6m_in1k',
 'nextvit_small.bd_ssld_6m_in1k_384']

In [11]:
import timm
model = timm.create_model('nextvit_small.bd_ssld_6m_in1k', pretrained=True)
model.pretrained_cfg

{'url': '',
 'hf_hub_id': 'timm/nextvit_small.bd_ssld_6m_in1k',
 'architecture': 'nextvit_small',
 'tag': 'bd_ssld_6m_in1k',
 'custom_load': False,
 'input_size': (3, 224, 224),
 'fixed_input_size': False,
 'interpolation': 'bicubic',
 'crop_pct': 0.95,
 'crop_mode': 'center',
 'mean': (0.485, 0.456, 0.406),
 'std': (0.229, 0.224, 0.225),
 'num_classes': 1000,
 'pool_size': (7, 7),
 'first_conv': 'stem.0.conv',
 'classifier': 'head.fc'}

In [7]:
import torch.nn as nn

# 기존 모델의 head 부분을 제외하고 새로운 모델을 정의합니다.
model_without_head = nn.Sequential(*list(model.children())[:-1])
model_without_head

Sequential(
  (0): Sequential(
    (0): ConvNormAct(
      (conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): ReLU(inplace=True)
    )
    (1): ConvNormAct(
      (conv): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): ReLU(inplace=True)
    )
    (2): ConvNormAct(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): ReLU(inplace=True)
    )
    (3): ConvNormAct(
      (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): ReLU(inplac

In [3]:
nn.Sequential(*list(model.children())[:1])

Sequential(
  (0): Sequential(
    (0): ConvNormAct(
      (conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): ReLU(inplace=True)
    )
    (1): ConvNormAct(
      (conv): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): ReLU(inplace=True)
    )
    (2): ConvNormAct(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): ReLU(inplace=True)
    )
    (3): ConvNormAct(
      (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): ReLU(inplac

In [2]:
import torch.nn as nn

# 기존 모델에서 head 부분을 제외
model_without_head = nn.Sequential(*list(model.children())[:-1])

# 새로운 detection head 정의 (예시로 Conv2d와 Linear 사용)
class DetectionHead(nn.Module):
    def __init__(self, in_features, num_classes, num_boxes):
        super(DetectionHead, self).__init__()
        # Detection-specific layers 예시 (Conv2d 또는 FC 레이어 등)
        self.conv = nn.Conv2d(in_features, 256, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(256 * 8 * 8, 1024)  # 적절한 입력 크기로 설정
        self.fc2 = nn.Linear(1024, num_classes * num_boxes * 4)  # Bounding box 예측 (좌표, 클래스)

    def forward(self, x):
        x = self.conv(x)
        x = x.view(x.size(0), -1)  # Flattening
        x = self.fc1(x)
        x = self.fc2(x)
        return x.view(x.size(0), -1, 4)  # 예시로 [batch, num_boxes, 4] 형태

# 새로운 detection head 추가
in_features = 1024  # 기존 모델의 마지막 레이어 출력 크기 확인 후 설정
num_classes = 20    # 예시로 20개 클래스
num_boxes = 5       # 예시로 5개 박스 예측

detection_head = DetectionHead(in_features, num_classes, num_boxes)

# 전체 모델
class ModelWithDetectionHead(nn.Module):
    def __init__(self, backbone, head):
        super(ModelWithDetectionHead, self).__init__()
        self.backbone = backbone
        self.head = head

    def forward(self, x):
        x = self.backbone(x)
        x = self.head(x)
        return x

# 모델 초기화
model_with_detection_head = ModelWithDetectionHead(model_without_head, detection_head)


NameError: name 'model' is not defined

In [3]:
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# you can specify the revision tag if you don't want the timm dependency
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50", revision="no_timm")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50", revision="no_timm")

inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs)

# convert outputs (bounding boxes and class logits) to COCO API
# let's only keep detections with score > 0.9
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
            f"Detected {model.config.id2label[label.item()]} with confidence "
            f"{round(score.item(), 3)} at location {box}"
    )


  from .autonotebook import tqdm as notebook_tqdm


Detected remote with confidence 0.998 at location [40.16, 70.81, 175.55, 117.98]
Detected remote with confidence 0.996 at location [333.24, 72.55, 368.33, 187.66]
Detected couch with confidence 0.995 at location [-0.02, 1.15, 639.73, 473.76]
Detected cat with confidence 0.999 at location [13.24, 52.05, 314.02, 470.93]
Detected cat with confidence 0.999 at location [345.4, 23.85, 640.37, 368.72]


In [5]:
processor

DetrImageProcessor {
  "do_convert_annotations": true,
  "do_normalize": true,
  "do_pad": true,
  "do_rescale": true,
  "do_resize": true,
  "format": "coco_detection",
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "DetrImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "pad_size": null,
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "longest_edge": 1333,
    "shortest_edge": 800
  }
}

In [12]:
model

DetrForObjectDetection(
  (model): DetrModel(
    (backbone): DetrConvModel(
      (conv_encoder): DetrConvEncoder(
        (model): ResNetBackbone(
          (embedder): ResNetEmbeddings(
            (embedder): ResNetConvLayer(
              (convolution): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
              (normalization): DetrFrozenBatchNorm2d()
              (activation): ReLU()
            )
            (pooler): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
          )
          (encoder): ResNetEncoder(
            (stages): ModuleList(
              (0): ResNetStage(
                (layers): Sequential(
                  (0): ResNetBottleNeckLayer(
                    (shortcut): ResNetShortCut(
                      (convolution): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
                      (normalization): DetrFrozenBatchNorm2d()
                    )
                    (layer): Seq

In [11]:
import torchsummary

torchsummary.summary(model, **inputs)

TypeError: summary() got an unexpected keyword argument 'pixel_values'