In [1]:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # автоматически инициализирует CUDA контекст
import torch
import cv2
import sys
import math
import time
import numpy as np
import yaml
import torch
import numpy as np
import onnx
import onnxruntime as ort


""" TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def load_engine(trt_runtime, engine_path):
    with open(engine_path, "rb") as f:
        engine_data = f.read()
    return trt_runtime.deserialize_cuda_engine(engine_data)

trt_runtime = trt.Runtime(TRT_LOGGER)
engine = load_engine(trt_runtime, "MCITrack.trt")
engine """




In [2]:
trt.__version__

'10.9.0.34'

In [3]:
import onnx

# Загрузите модель
model = onnx.load("MCITrack.onnx")

# Входы (inputs)
print("Входы модели:")
for input in model.graph.input:
    name = input.name
    shape = []
    for dim in input.type.tensor_type.shape.dim:
        if dim.dim_param:
            shape.append(dim.dim_param)
        else:
            shape.append(dim.dim_value)
    print(f"  {name}: {shape}")

# Выходы (outputs)
print("\nВыходы модели:")
for output in model.graph.output:
    name = output.name
    shape = []
    for dim in output.type.tensor_type.shape.dim:
        if dim.dim_param:
            shape.append(dim.dim_param)
        else:
            shape.append(dim.dim_value)
    print(f"  {name}: {shape}")

Входы модели:
  template_list: [1, 3, 112, 112]
  search_list: [1, 3, 112, 112]
  template_anno_list: [1, 3, 112, 112]
  onnx::Unsqueeze_3: [1, 3, 112, 112]
  onnx::Unsqueeze_4: [1, 3, 112, 112]
  onnx::Unsqueeze_5: [1, 3, 224, 224]
  onnx::Unsqueeze_6: [1, 4]
  onnx::Unsqueeze_7: [1, 4]
  onnx::Unsqueeze_8: [1, 4]
  onnx::Unsqueeze_9: [1, 4]
  onnx::Unsqueeze_10: [1, 4]

Выходы модели:
  pred_boxes: [1, 1, 4]
  score_map: [1, 1, 14, 14]
  size_map: [1, 2, 14, 14]
  offset_map: [1, 2, 14, 14]


In [19]:
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Инициализация TensorRT билдера и парсера ONNX
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(1) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
    # Загрузка ONNX-модели и парсинг в TensorRT
    with open("./MCITrack.onnx", "rb") as model_file:
        parser.parse(model_file.read())

    builder_config = builder.create_builder_config()
    #builder_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 32)
    #builder_config.set_tactic_sources(1 << int(trt.TacticSource.CUBLAS))
    #builder_config.set_flag(trt.BuilderFlag.FP16)
       
    serialized_network = builder.build_serialized_network(network, builder_config)
    with open("MCITrack.trt", "wb")  as f:
        f.write(serialized_network)

In [20]:
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def load_trt_engine(engine_path: str):
    with open(engine_path, 'rb') as file, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(file.read())
    return engine

In [21]:
ORT="MCITrack.trt"

In [22]:
engine = load_trt_engine(ORT)
context = engine.create_execution_context()

In [23]:
context

<tensorrt.tensorrt.IExecutionContext at 0x2803bdd07f0>

In [24]:
def initialize_trt():
    # Входы (пример для трех входов по 112x112 и одного по 224x224, плюс 5 входов по [1,4])
    input_size_template_list = np.prod([1, 3, 112, 112]) * np.dtype(np.float32).itemsize
    d_input_template_list = cuda.mem_alloc(int(input_size_template_list))
    
    input_size_search_list = np.prod([1, 3, 112, 112]) * np.dtype(np.float32).itemsize
    d_input_search_list = cuda.mem_alloc(int(input_size_search_list))
    
    input_size_template_anno_list = np.prod([1, 3, 112, 112]) * np.dtype(np.float32).itemsize
    d_input_template_anno_list = cuda.mem_alloc(int(input_size_template_anno_list))
    
    
    input_size_unsqueeze_3 = np.prod([1, 3, 112, 112]) * np.dtype(np.float32).itemsize
    d_input_unsqueeze_3 = cuda.mem_alloc(int(input_size_unsqueeze_3))
    
    
    input_size_unsqueeze_4 = np.prod([1, 3, 112, 112]) * np.dtype(np.float32).itemsize
    d_input_unsqueeze_4 = cuda.mem_alloc(int(input_size_unsqueeze_4))
    
    input_size_unsqueeze_5 = np.prod([1, 3, 224, 224]) * np.dtype(np.float32).itemsize
    d_input_unsqueeze_5 = cuda.mem_alloc(int(input_size_unsqueeze_5))
    
    # Пять входов по [1,4]
    input_size_unsqueeze_list = np.prod([1, 4]) * np.dtype(np.float32).itemsize
    d_input_unsqueeze_6 = cuda.mem_alloc(int(input_size_unsqueeze_list))
    d_input_unsqueeze_7 = cuda.mem_alloc(int(input_size_unsqueeze_list))
    d_input_unsqueeze_8 = cuda.mem_alloc(int(input_size_unsqueeze_list))
    d_input_unsqueeze_9 = cuda.mem_alloc(int(input_size_unsqueeze_list))
    d_input_unsqueeze_10 = cuda.mem_alloc(int(input_size_unsqueeze_list))
    
    # Выходы
    output_size_predboxes = np.prod([1, 1, 4]) * np.dtype(np.float32).itemsize
    d_output_predboxes = cuda.mem_alloc(int(output_size_predboxes))

    output_size_score = np.prod([1, 1, 14, 14]) * np.dtype(np.float32).itemsize
    d_output_score = cuda.mem_alloc(int(output_size_score))

    output_size_size = np.prod([1, 2, 14, 14]) * np.dtype(np.float32).itemsize
    d_output_size = cuda.mem_alloc(int(output_size_size))

    output_size_offset = np.prod([1, 2, 14, 14]) * np.dtype(np.float32).itemsize
    d_output_offset = cuda.mem_alloc(int(output_size_offset))

    # Вернуть всё в кортеже (в том порядке, как вам нужно)
    return (
        d_input_template_list,
        d_input_search_list,
        d_input_template_anno_list,
        d_input_unsqueeze_3,
        d_input_unsqueeze_4,
        d_input_unsqueeze_5,
        d_input_unsqueeze_6,
        d_input_unsqueeze_7,
        d_input_unsqueeze_8,
        d_input_unsqueeze_9,
        d_input_unsqueeze_10,
        d_output_predboxes,
        d_output_score,
        d_output_size,
        d_output_offset
    )

In [25]:
def predictV2_head(
    context,
    input_template_list,
    input_search_list,
    input_template_anno_list,
    input_unsqueeze_3,
    input_unsqueeze_4,
    input_unsqueeze_5,
    input_unsqueeze_6,
    input_unsqueeze_7,
    input_unsqueeze_8,
    input_unsqueeze_9,
    input_unsqueeze_10,
    d_input_template_list,
    d_input_search_list,
    d_input_template_anno_list,
    d_input_unsqueeze_3,
    d_input_unsqueeze_4,
    d_input_unsqueeze_5,
    d_input_unsqueeze_6,
    d_input_unsqueeze_7,
    d_input_unsqueeze_8,
    d_input_unsqueeze_9,
    d_input_unsqueeze_10,
    d_output_pred_boxes,
    d_output_score_map,
    d_output_size_map,
    d_output_offset_map
):
    # Prepare all inputs
    cuda.memcpy_htod(d_input_template_list, input_template_list.ravel())
    cuda.memcpy_htod(d_input_search_list, input_search_list.ravel())
    cuda.memcpy_htod(d_input_template_anno_list, input_template_anno_list.ravel())
    cuda.memcpy_htod(d_input_unsqueeze_3, input_unsqueeze_3.ravel())
    cuda.memcpy_htod(d_input_unsqueeze_4, input_unsqueeze_4.ravel())
    cuda.memcpy_htod(d_input_unsqueeze_5, input_unsqueeze_5.ravel())
    cuda.memcpy_htod(d_input_unsqueeze_6, input_unsqueeze_6.ravel())
    cuda.memcpy_htod(d_input_unsqueeze_7, input_unsqueeze_7.ravel())
    cuda.memcpy_htod(d_input_unsqueeze_8, input_unsqueeze_8.ravel())
    cuda.memcpy_htod(d_input_unsqueeze_9, input_unsqueeze_9.ravel())
    cuda.memcpy_htod(d_input_unsqueeze_10, input_unsqueeze_10.ravel())

    # Execute model with all bindings in correct order
    context.execute_v2(bindings=[
        int(d_input_template_list),     # 0
        int(d_input_search_list),       # 1
        int(d_input_template_anno_list),# 2
        int(d_input_unsqueeze_3),       # 3
        int(d_input_unsqueeze_4),       # 4
        int(d_input_unsqueeze_5),       # 5
        int(d_input_unsqueeze_6),       # 6
        int(d_input_unsqueeze_7),       # 7
        int(d_input_unsqueeze_8),       # 8
        int(d_input_unsqueeze_9),       # 9
        int(d_input_unsqueeze_10),      # 10
        int(d_output_pred_boxes),       # 11
        int(d_output_score_map),        # 12
        int(d_output_size_map),         # 13
        int(d_output_offset_map)        # 14
    ])

    # Fetch output data with new shapes
    output_pred_boxes = np.empty([1, 1, 4], dtype=np.float32)
    cuda.memcpy_dtoh(output_pred_boxes, d_output_pred_boxes)

    output_score_map = np.empty([1, 1, 14, 14], dtype=np.float32)
    cuda.memcpy_dtoh(output_score_map, d_output_score_map)

    output_size_map = np.empty([1, 2, 14, 14], dtype=np.float32)
    cuda.memcpy_dtoh(output_size_map, d_output_size_map)

    output_offset_map = np.empty([1, 2, 14, 14], dtype=np.float32)
    cuda.memcpy_dtoh(output_offset_map, d_output_offset_map)

    return output_pred_boxes, output_score_map, output_size_map, output_offset_map

In [26]:
data  = initialize_trt()

In [27]:
data

(<pycuda._driver.DeviceAllocation at 0x28048679fc0>,
 <pycuda._driver.DeviceAllocation at 0x28048f75840>,
 <pycuda._driver.DeviceAllocation at 0x28048f75a20>,
 <pycuda._driver.DeviceAllocation at 0x28048f75a80>,
 <pycuda._driver.DeviceAllocation at 0x28048f75ae0>,
 <pycuda._driver.DeviceAllocation at 0x28048f75b40>,
 <pycuda._driver.DeviceAllocation at 0x28048f75ba0>,
 <pycuda._driver.DeviceAllocation at 0x28048f757e0>,
 <pycuda._driver.DeviceAllocation at 0x28048f75c00>,
 <pycuda._driver.DeviceAllocation at 0x28048f75c60>,
 <pycuda._driver.DeviceAllocation at 0x28048f75cc0>,
 <pycuda._driver.DeviceAllocation at 0x28048f75d80>,
 <pycuda._driver.DeviceAllocation at 0x28048f75de0>,
 <pycuda._driver.DeviceAllocation at 0x28048f75ea0>,
 <pycuda._driver.DeviceAllocation at 0x28048f75f00>)

In [28]:
# Входы с размерами [1, 3, 112, 112]
template_list = np.random.rand(1, 3, 112, 112).astype(np.float32)
search_list = np.random.rand(1, 3, 112, 112).astype(np.float32)
template_anno_list = np.random.rand(1, 3, 112, 112).astype(np.float32)
onnx_Unsqueeze_3 = np.random.rand(1, 3, 112, 112).astype(np.float32)
onnx_Unsqueeze_4 = np.random.rand(1, 3, 112, 112).astype(np.float32)

# Вход с размером [1, 3, 224, 224]
onnx_Unsqueeze_5 = np.random.rand(1, 3, 224, 224).astype(np.float32)

# Входы с размером [1, 4]
onnx_Unsqueeze_6 = np.random.rand(1, 4).astype(np.float32)
onnx_Unsqueeze_7 = np.random.rand(1, 4).astype(np.float32)
onnx_Unsqueeze_8 = np.random.rand(1, 4).astype(np.float32)
onnx_Unsqueeze_9 = np.random.rand(1, 4).astype(np.float32)
onnx_Unsqueeze_10 = np.random.rand(1, 4).astype(np.float32)

In [29]:
predictV2_head(context,template_list,search_list,template_anno_list,onnx_Unsqueeze_3,onnx_Unsqueeze_4,onnx_Unsqueeze_5,onnx_Unsqueeze_6,onnx_Unsqueeze_7,onnx_Unsqueeze_8,onnx_Unsqueeze_9,onnx_Unsqueeze_10,
               data[0],data[1],data[2],data[3],data[4],data[5],data[6],data[7],data[8],data[9],data[10],data[11],data[12],data[13],data[14])

(array([[[0.61137617, 0.30011144, 0.61331606, 0.5224328 ]]], dtype=float32),
 array([[[[0.00200634, 0.00486906, 0.00525712, 0.00507968, 0.00560934,
           0.00545395, 0.00511253, 0.00554725, 0.00594276, 0.00574282,
           0.00530569, 0.00643427, 0.00829869, 0.0156656 ],
          [0.00586623, 0.14317323, 0.1445024 , 0.14997768, 0.1711544 ,
           0.1425818 , 0.13096015, 0.14212836, 0.16233288, 0.14584352,
           0.134955  , 0.15879007, 0.1850622 , 0.21613029],
          [0.00846663, 0.13219371, 0.13083975, 0.12932365, 0.14836118,
           0.12070306, 0.11661218, 0.13038355, 0.1523124 , 0.13976036,
           0.13045266, 0.14359589, 0.18131988, 0.2163243 ],
          [0.007554  , 0.09691673, 0.09970979, 0.09621135, 0.10238244,
           0.08856169, 0.08712081, 0.0983836 , 0.10861786, 0.10632068,
           0.10184842, 0.109666  , 0.13287713, 0.15201056],
          [0.00716   , 0.09172104, 0.09547081, 0.08911525, 0.09413879,
           0.08257841, 0.08016594, 0.0853309

In [None]:
import onnx

# Загрузите модель
model = onnx.load("MCITrack.onnx")



# Примените инференс формы
inferred_model = onnx.shape_inference.infer_shapes(model)

# Найдите нужную ноду
node_name = "Reshape_113"
node = None
for n in inferred_model.graph.node:
    if n.name == node_name:
        node = n
        break

def get_tensor_shape(graph, name):
    for v in list(graph.value_info) + list(graph.input) + list(graph.output):
        if v.name == name:
            return [d.dim_param if d.dim_param else d.dim_value for d in v.type.tensor_type.shape.dim]
    return None

if node is None:
    print(f"Node {node_name} not found!")
else:
    print(f"Node '{node_name}' найден.")
    print("  inputs:", node.input)
    print("  outputs:", node.output)

    for i, inp in enumerate(node.input):
        shape = get_tensor_shape(inferred_model.graph, inp)
        print(f"  Вход {i}: {inp} -> {shape}")

    for i, out in enumerate(node.output):
        shape = get_tensor_shape(inferred_model.graph, out)
        print(f"  Выход {i}: {out} -> {shape}")
# Найдите нужную ноду
node_name = "Reshape_113"
node = None
for n in model.graph.node:
    if n.name == node_name:
        node = n
        break

if node is None:
    print(f"Node {node_name} not found!")
else:
    print(f"Node '{node_name}' найден.")
    print("  inputs:", node.input)
    print("  outputs:", node.output)

    # Функция для получения размеров по имени tensor'a
    def get_tensor_shape(name):
        for v in model.graph.value_info:
            if v.name == name:
                return [d.dim_param if d.dim_param else d.dim_value
                        for d in v.type.tensor_type.shape.dim]
        for v in model.graph.input:
            if v.name == name:
                return [d.dim_param if d.dim_param else d.dim_value
                        for d in v.type.tensor_type.shape.dim]
        for v in model.graph.output:
            if v.name == name:
                return [d.dim_param if d.dim_param else d.dim_value
                        for d in v.type.tensor_type.shape.dim]
        return None

    for i, inp in enumerate(node.input):
        shape = get_tensor_shape(inp)
        print(f"  Вход {i}: {inp} -> {shape}")

    for i, out in enumerate(node.output):
        shape = get_tensor_shape(out)
        print(f"  Выход {i}: {out} -> {shape}")

Node 'Reshape_113' найден.
  inputs: ['onnx::Reshape_567', 'onnx::Reshape_4767']
  outputs: ['z']
  Вход 0: onnx::Reshape_567 -> [1, 5, 3, 112, 112]
  Вход 1: onnx::Reshape_4767 -> None
  Выход 0: z -> [5, 3, 112, 112]
Node 'Reshape_113' найден.
  inputs: ['onnx::Reshape_567', 'onnx::Reshape_4767']
  outputs: ['z']
  Вход 0: onnx::Reshape_567 -> None
  Вход 1: onnx::Reshape_4767 -> None
  Выход 0: z -> None


In [2]:
def cal_bbox(score_map_ctr, size_map, offset_map, return_score=True):
    
    # 2. Получаем размеры feature map
    feat_h, feat_w = score_map_ctr.shape[-2], score_map_ctr.shape[-1]
    
    # 3. Находим позицию с максимальным score (современный способ)
    max_score, flat_idx = torch.max(score_map_ctr.flatten(1), dim=1)
    idx = flat_idx.unsqueeze(1)
    idx_y = torch.div(flat_idx, feat_w, rounding_mode='floor')
    idx_x = flat_idx % feat_w
    
    # 4. Подготовка индексов для gather
    gather_idx = idx.unsqueeze(1).expand(-1, 2, -1)
    
    # 5. Обработка size_map (расширяем если 1 канал)
    if size_map.size(1) == 1:
        size_map = size_map.expand(-1, 2, -1, -1)
    
    # 6. Получаем размеры и смещения
    try:
        size = size_map.flatten(2).gather(2, gather_idx)
        offset = offset_map.flatten(2).gather(2, gather_idx).squeeze(-1)
    except RuntimeError as e:
        print("Ошибка размерностей:")
        print(f"score_map_ctr: {score_map_ctr.shape}")
        print(f"size_map: {size_map.shape}")
        print(f"offset_map: {offset_map.shape}")
        print(f"gather_idx: {gather_idx.shape}")
        raise
    
    # 7. Формируем bbox (cx, cy, w, h)
    bbox = torch.cat([
        (idx_x.to(torch.float) + offset[:, 0:1]) / feat_w,
        (idx_y.to(torch.float) + offset[:, 1:2]) / feat_h,
        size.squeeze(-1)
    ], dim=1)
    
    return (bbox, max_score) if return_score else bbox

""" def cal_bbox(score_map_ctr, size_map, offset_map, return_score=True):
        feat_sz = 14
        max_score, idx = torch.max(score_map_ctr.flatten(1), dim=1, keepdim=True) # score_map_ctr.flatten(1): torch.Size([32, 256]) idx: torch.Size([32, 1]) max_score: torch.Size([32, 1])
        idx_y = torch.div(idx, feat_sz, rounding_mode='floor')
        idx_x = idx % feat_sz
       
        

        idx = idx.unsqueeze(1).expand(idx.shape[0], 2, 1)
        size = size_map.flatten(2).gather(dim=2, index=idx) # size_map: torch.Size([32, 2, 16, 16])  size_map.flatten(2): torch.Size([32, 2, 256])
        offset = offset_map.flatten(2).gather(dim=2, index=idx).squeeze(-1)

        # bbox = torch.cat([idx_x - size[:, 0] / 2, idx_y - size[:, 1] / 2,
        #                   idx_x + size[:, 0] / 2, idx_y + size[:, 1] / 2], dim=1) / self.feat_sz
        # cx, cy, w, h
        bbox = torch.cat([(idx_x.to(torch.float) + offset[:, :1]) / feat_sz,
                          (idx_y.to(torch.float) + offset[:, 1:]) / feat_sz,
                          size.squeeze(-1)], dim=1)

        if return_score:
            return bbox, max_score
        return (bbox, max_score) if return_score else bbox """
        
class Preprocessor(object):
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view((1, 3, 1, 1)).to(self.device)
        self.std = torch.tensor([0.229, 0.224, 0.225]).view((1, 3, 1, 1)).to(self.device)
        self.mm_mean = torch.tensor([0.485, 0.456, 0.406, 0.485, 0.456, 0.406]).view((1, 6, 1, 1)).to(self.device)
        self.mm_std = torch.tensor([0.229, 0.224, 0.225, 0.229, 0.224, 0.225]).view((1, 6, 1, 1)).to(self.device)

    def process(self, img_arr: np.ndarray):
        if img_arr.shape[-1] == 6:
            mean = self.mm_mean
            std = self.mm_std
        else:
            mean = self.mean
            std = self.std
        # Deal with the image patch
        img_tensor = torch.tensor(img_arr).to(self.device).float().permute((2,0,1)).unsqueeze(dim=0)        
        img_tensor_norm = ((img_tensor / 255.0) - mean) / std  # (1,3,H,W)
        return img_tensor_norm
    
def hann1d(sz: int, centered = True) -> torch.Tensor:
    """1D cosine window."""
    if centered:
        return 0.5 * (1 - torch.cos((2 * math.pi / (sz + 1)) * torch.arange(1, sz + 1).float()))
    w = 0.5 * (1 + torch.cos((2 * math.pi / (sz + 2)) * torch.arange(0, sz//2 + 1).float()))
    return torch.cat([w, w[1:sz-sz//2].flip((0,))])
    
def hann2d(sz: torch.Tensor, centered = True) -> torch.Tensor:
    """2D cosine window."""
    return hann1d(sz[0].item(), centered).reshape(1, 1, -1, 1) * hann1d(sz[1].item(), centered).reshape(1, 1, 1, -1)    

def sample_target(im, target_bb, search_area_factor, output_sz=None):
   
    if not isinstance(target_bb, list):
        x, y, w, h = target_bb.tolist()
    else:
        x, y, w, h = target_bb
    # Crop image
    crop_sz = math.ceil(math.sqrt(w * h) * search_area_factor)

    if crop_sz < 1:
        raise Exception('Too small bounding box.')

    x1 = round(x + 0.5 * w - crop_sz * 0.5)
    x2 = x1 + crop_sz

    y1 = round(y + 0.5 * h - crop_sz * 0.5)
    y2 = y1 + crop_sz

    x1_pad = max(0, -x1)
    x2_pad = max(x2 - im.shape[1] + 1, 0)

    y1_pad = max(0, -y1)
    y2_pad = max(y2 - im.shape[0] + 1, 0)

    # Crop target
    im_crop = im[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :]

    # Pad
    im_crop_padded = cv2.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad, x2_pad, cv2.BORDER_CONSTANT)
    # deal with attention mask
    H, W, _ = im_crop_padded.shape

    if output_sz is not None:
        resize_factor = output_sz / crop_sz
        im_crop_padded = cv2.resize(im_crop_padded, (output_sz, output_sz))

        return im_crop_padded, resize_factor

    else:
        return im_crop_padded, 1.0
def transform_image_to_crop(box_in: torch.Tensor, box_extract: torch.Tensor, resize_factor: float,
                            crop_sz: torch.Tensor, normalize=False) -> torch.Tensor:
   
    box_extract_center = box_extract[0:2] + 0.5 * box_extract[2:4]

    box_in_center = box_in[0:2] + 0.5 * box_in[2:4]

    box_out_center = (crop_sz - 1) / 2 + (box_in_center - box_extract_center) * resize_factor
    box_out_wh = box_in[2:4] * resize_factor

    box_out = torch.cat((box_out_center - 0.5 * box_out_wh, box_out_wh))
    if normalize:
        return box_out / (crop_sz[0]-1)
    else:
        return box_out
def clip_box(box: list, H, W, margin=0):
    x1, y1, w, h = box
    x2, y2 = x1 + w, y1 + h
    x1 = min(max(0, x1), W-margin)
    x2 = min(max(margin, x2), W)
    y1 = min(max(0, y1), H-margin)
    y2 = min(max(margin, y2), H)
    w = max(margin, x2-x1)
    h = max(margin, y2-y1)
    return [x1, y1, w, h]

class BaseTracker():
    """Base class for all trackers."""

    def __init__(self, params):
        self.params = params
        self.visdom = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def predicts_segmentation_mask(self):
        return False

    def initialize(self, image, info: dict) -> dict:
        """Overload this function in your tracker. This should initialize the model."""
        raise NotImplementedError

    def track(self, image, info: dict = None) -> dict:
        """Overload this function in your tracker. This should track in the frame and update the model."""
        raise NotImplementedError

    def visdom_draw_tracking(self, image, box, segmentation=None):
        # Упрощенная обработка box без OrderedDict
        if isinstance(box, dict):  # Проверяем на обычный dict вместо OrderedDict
            box = list(box.values())  # Берем только значения
        elif not isinstance(box, (list, tuple)):  # Если не коллекция
            box = (box,)  # Превращаем в кортеж
        
        # Визуализация
        if segmentation is None:
            self.visdom.register((image, *box), 'Tracking', 1, 'Tracking')
        else:
            self.visdom.register((image, *box, segmentation), 'Tracking', 1, 'Tracking')

In [3]:
cfg = {}

# MODEL
cfg["MODEL"] = {}

# MODEL.ENCODER
cfg["MODEL"]["ENCODER"] = {
    "TYPE": "dinov2_vitb14",  # encoder model
    "DROP_PATH": 0,
    "PRETRAIN_TYPE": "mae",  # mae, default, or scratch. This parameter is not activated for dinov2.
    "USE_CHECKPOINT": False,  # to save the memory.
    "STRIDE": 14,
    "POS_TYPE": 'interpolate',  # type of loading the positional encoding. "interpolate" or "index".
    "TOKEN_TYPE_INDICATE": False,  # add a token_type_embedding to indicate the search, template_foreground, template_background
    "INTERACTION_INDEXES": [[0, 6], [6, 12], [12, 18], [18, 24]],
    "GRAD_CKPT": False
}

# MODEL.NECK
cfg["MODEL"]["NECK"] = {
    "N_LAYERS": 4,
    "D_MODEL": 512,
    "D_STATE": 16  # MAMABA_HIDDEN_STATE
}

# MODEL.DECODER
cfg["MODEL"]["DECODER"] = {
    "TYPE": "CENTER",  # MLP, CORNER, CENTER
    "NUM_CHANNELS": 256
}

# TRAIN
cfg["TRAIN"] = {
    "LR": 0.0001,
    "WEIGHT_DECAY": 0.0001,
    "EPOCH": 500,
    "LR_DROP_EPOCH": 400,
    "BATCH_SIZE": 8,
    "NUM_WORKER": 8,
    "OPTIMIZER": "ADAMW",
    "ENCODER_MULTIPLIER": 0.1,  # encoder's LR = this factor * LR
    "FREEZE_ENCODER": False,  # for freezing the parameters of encoder
    "ENCODER_OPEN": [],  # only for debug, open some layers of encoder when FREEZE_ENCODER is True
    "CE_WEIGHT": 1.0,  # weight for cross-entropy loss
    "GIOU_WEIGHT": 2.0,
    "L1_WEIGHT": 5.0,
    "PRINT_INTERVAL": 50,  # interval to print the training log
    "GRAD_CLIP_NORM": 0.1,
    "FIX_BN": False,
    "ENCODER_W": "",
    "TYPE": "normal",  # normal, peft or fft
    "PRETRAINED_PATH": None
}

# TRAIN.SCHEDULER
cfg["TRAIN"]["SCHEDULER"] = {
    "TYPE": "step",
    "DECAY_RATE": 0.1
}

# DATA
cfg["DATA"] = {
    "MEAN": [0.485, 0.456, 0.406],
    "STD": [0.229, 0.224, 0.225],
    "MAX_SAMPLE_INTERVAL": 200,
    "SAMPLER_MODE": "order",
    "LOADER": "tracking"
}

# DATA.TRAIN
cfg["DATA"]["TRAIN"] = {
    "DATASETS_NAME": ["LASOT", "GOT10K_vottrain"],
    "DATASETS_RATIO": [1, 1],
    "SAMPLE_PER_EPOCH": 60000
}

# DATA.SEARCH
cfg["DATA"]["SEARCH"] = {
    "NUMBER": 1,  # number of search region, only support 1 for now.
    "SIZE": 256,
    "FACTOR": 4.0,
    "CENTER_JITTER": 3.5,
    "SCALE_JITTER": 0.5
}

# DATA.TEMPLATE
cfg["DATA"]["TEMPLATE"] = {
    "NUMBER": 1,
    "SIZE": 128,
    "FACTOR": 2.0,
    "CENTER_JITTER": 0,
    "SCALE_JITTER": 0
}

# TEST
cfg["TEST"] = {
    "TEMPLATE_FACTOR": 4.0,
    "TEMPLATE_SIZE": 256,
    "SEARCH_FACTOR": 2.0,
    "SEARCH_SIZE": 128,
    "EPOCH": 500,
    "WINDOW": False,  # window penalty
    "NUM_TEMPLATES": 1
}

# TEST.UPT
cfg["TEST"]["UPT"] = {
    "DEFAULT": 1,
    "LASOT": 0,
    "LASOT_EXTENSION_SUBSET": 0,
    "TRACKINGNET": 0,
    "TNL2K": 0,
    "NFS": 0,
    "UAV": 0,
    "VOT20": 0,
    "GOT10K_TEST": 0
}

# TEST.UPH
cfg["TEST"]["UPH"] = {
    "DEFAULT": 1,
    "LASOT": 0,
    "LASOT_EXTENSION_SUBSET": 0,
    "TRACKINGNET": 0,
    "TNL2K": 0,
    "NFS": 0,
    "UAV": 0,
    "VOT20": 0,
    "GOT10K_TEST": 0
}

# TEST.INTER
cfg["TEST"]["INTER"] = {
    "DEFAULT": 999999,
    "LASOT": 0,
    "LASOT_EXTENSION_SUBSET": 0,
    "TRACKINGNET": 0,
    "TNL2K": 0,
    "NFS": 0,
    "UAV": 0,
    "VOT20": 0,
    "GOT10K_TEST": 0
}

# TEST.MB
cfg["TEST"]["MB"] = {
    "DEFAULT": 500,
    "LASOT": 0,
    "LASOT_EXTENSION_SUBSET": 0,
    "TRACKINGNET": 0,
    "TNL2K": 0,
    "NFS": 0,
    "UAV": 0,
    "VOT20": 0,
    "GOT10K_TEST": 0
}

In [4]:
#Params
class TrackerParams:
    """Class for tracker parameters."""
    def set_default_values(self, default_vals: dict):
        for name, val in default_vals.items():
            if not hasattr(self, name):
                setattr(self, name, val)

    def get(self, name: str, *default):
        """Get a parameter value with the given name. If it does not exists, it return the default value given as a
        second argument or returns an error if no default value is given."""
        if len(default) > 1:
            raise ValueError('Can only give one default value.')

        if not default:
            return getattr(self, name)

        return getattr(self, name, default[0])

    def has(self, name: str):
        """Check if there exist a parameter with the given name."""
        return hasattr(self, name)

def _update_config(base_cfg, exp_cfg):
    if isinstance(base_cfg, dict) and isinstance(exp_cfg, dict):
        for k, v in exp_cfg.items():
            if k in base_cfg:
                if not isinstance(v, dict):
                    base_cfg[k] = v
                else:
                    _update_config(base_cfg[k], v)
            else:
                raise ValueError("{} not exist in config.py".format(k))
    else:
        return

def update_config_from_file(filename):
    exp_config = None
    with open(filename) as f:
        exp_config = yaml.safe_load(f)
        _update_config(cfg, exp_config)
    
def parameters(yaml_name: str):
    params = TrackerParams()

    yaml_file = "mcitrack_t224.yaml"
    update_config_from_file(yaml_file)
    params.cfg = cfg
    print("test config: ", cfg)

    params.yaml_name = yaml_name
    # template and search region
    params.template_factor = cfg["TEST"]["TEMPLATE_FACTOR"]
    params.template_size = cfg["TEST"]["TEMPLATE_SIZE"]
    params.search_factor = cfg["TEST"]["SEARCH_FACTOR"]
    params.search_size = cfg["TEST"]["SEARCH_SIZE"]

    # Network checkpoint path
    params.checkpoint = "MCITrack.trt"
    # whether to save boxes from all queries
    params.save_all_boxes = False

    return params

params = parameters("./mcitrack_t224.yaml")

test config:  {'MODEL': {'ENCODER': {'TYPE': 'fastitpnt', 'DROP_PATH': 0.1, 'PRETRAIN_TYPE': './fast_itpn_tiny_1600e_1k.pt', 'USE_CHECKPOINT': False, 'STRIDE': 16, 'POS_TYPE': 'index', 'TOKEN_TYPE_INDICATE': True, 'INTERACTION_INDEXES': [[4, 7], [7, 10], [10, 13], [13, 16]], 'GRAD_CKPT': False}, 'NECK': {'N_LAYERS': 4, 'D_MODEL': 384, 'D_STATE': 16}, 'DECODER': {'TYPE': 'CENTER', 'NUM_CHANNELS': 256}}, 'TRAIN': {'LR': 0.0004, 'WEIGHT_DECAY': 0.0001, 'EPOCH': 300, 'LR_DROP_EPOCH': 240, 'BATCH_SIZE': 64, 'NUM_WORKER': 10, 'OPTIMIZER': 'ADAMW', 'ENCODER_MULTIPLIER': 0.1, 'FREEZE_ENCODER': False, 'ENCODER_OPEN': [], 'CE_WEIGHT': 1.0, 'GIOU_WEIGHT': 2.0, 'L1_WEIGHT': 5.0, 'PRINT_INTERVAL': 50, 'GRAD_CLIP_NORM': 0.1, 'FIX_BN': False, 'ENCODER_W': '', 'TYPE': 'normal', 'PRETRAINED_PATH': None, 'SCHEDULER': {'TYPE': 'step', 'DECAY_RATE': 0.1}}, 'DATA': {'MEAN': [0.485, 0.456, 0.406], 'STD': [0.229, 0.224, 0.225], 'MAX_SAMPLE_INTERVAL': 400, 'SAMPLER_MODE': 'order', 'LOADER': 'tracking', 'TRAIN

In [6]:
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class MCITRACK(BaseTracker):
    def __init__(self, params):
        super(MCITRACK, self).__init__(params)
        self.cfg = params.cfg

        # Загружаем engine
        self.engine = self.load_engine("MCITrack.trt")
        if not self.engine:
            raise RuntimeError("Не удалось загрузить TensorRT engine.")
        
        # Создаем execution context
        self.context = self.engine.create_execution_context()

        # Инициализируем память для ввода/вывода
        self.d_input_z, self.d_input_x, self.d_input_anno, self.d_output_cls = self.initialize_memory()

        self.preprocessor = Preprocessor()
        self.state = None
        self.frame_id = 0
        self.save_all_boxes = params.save_all_boxes
        self.z_dict1 = {}

    def load_engine(self, engine_path):
        with open(engine_path, "rb") as engine_file, trt.Runtime(TRT_LOGGER) as runtime:
            engine = runtime.deserialize_cuda_engine(engine_file.read())
        return engine

    def initialize_memory(self):
        # Выделяем память
        self.d_input_z = cuda.mem_alloc(1 * 3 * 112 * 112 * np.dtype(np.float32).itemsize)
        self.d_input_x = cuda.mem_alloc(1 * 3 * 224 * 224 * np.dtype(np.float32).itemsize)
        self.d_input_anno = cuda.mem_alloc(1 * 4 * np.dtype(np.float32).itemsize)
        self.d_output_cls = cuda.mem_alloc(1 * 4 * np.dtype(np.float32).itemsize)

        print("Allocated memory addresses:")
        print("d_input_z:", int(self.d_input_z))
        print("d_input_x:", int(self.d_input_x))
        print("d_input_anno:", int(self.d_input_anno))
        print("d_output_cls:", int(self.d_output_cls))

        return self.d_input_z, self.d_input_x, self.d_input_anno, self.d_output_cls



        return d_input_z, d_input_x, d_input_anno, d_output_cls

    def predict(self, input_x, input_z, input_anno):
        cuda.memcpy_htod(self.d_input_z, input_z.ravel())
        cuda.memcpy_htod(self.d_input_x, input_x.ravel())
        cuda.memcpy_htod(self.d_input_anno, input_anno.ravel())

        bindings = [int(self.d_input_z), int(self.d_input_x), int(self.d_input_anno), int(self.d_output_cls)]
        print("Bindings:", bindings)
        self.context.execute_v2(bindings=bindings)

        output_data = np.empty([1, 4], dtype=np.float32)
        cuda.memcpy_dtoh(output_data, self.d_output_cls)

        return output_data
    def initialize(self, image, info: dict):
        z_patch_arr, _ = sample_target(image, info['init_bbox'], self.params.template_factor,
                                    output_sz=self.params.template_size)
        self.template = self.preprocessor.process(z_patch_arr)
        self.template_anno = torch.tensor(info['init_bbox'], dtype=torch.float32)  
        self.state = info['init_bbox']
        self.frame_id = 0


    def track(self, image, info: dict = None):
        H, W, _ = image.shape
        self.frame_id += 1

        x_patch_arr, resize_factor = sample_target(image, self.state, self.params.search_factor,
                                                   output_sz=self.params.search_size)
        search = self.preprocessor.process(x_patch_arr)

        search_np = search.cpu().numpy().astype(np.float32)
        template_np = self.template.cpu().numpy().astype(np.float32)
        template_anno_np = self.template_anno.cpu().numpy().astype(np.float32)

        outputs = self.predict(search_np, template_np, template_anno_np)

        pred_boxes = torch.from_numpy(outputs).view(-1, 4)
        pred_box = (pred_boxes.mean(dim=0) * self.params.search_size / resize_factor).tolist()
        
        self.state = clip_box(self.map_box_back(pred_box, resize_factor), H, W, margin=10)

        return {"target_bbox": self.state, "confidence": 0}
    
    def map_box_back(self, pred_box: list, resize_factor: float):
        cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[1] + 0.5 * self.state[3]
        cx, cy, w, h = pred_box
        half_side = 0.5 * self.params.search_size / resize_factor
        cx_real = cx + (cx_prev - half_side)
        cy_real = cy + (cy_prev - half_side)
        return [cx_real - 0.5 * w, cy_real - 0.5 * h, w, h]

    def map_box_back_batch(self, pred_box: torch.Tensor, resize_factor: float):
        cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[1] + 0.5 * self.state[3]
        cx, cy, w, h = pred_box.unbind(-1)
        half_side = 0.5 * self.params.search_size / resize_factor
        cx_real = cx + (cx_prev - half_side)
        cy_real = cy + (cy_prev - half_side)
        return torch.stack([cx_real - 0.5 * w, cy_real - 0.5 * h, w, h], dim=-1)


In [7]:
#Диагностика
import numpy as np
import torch
import cv2

def create_test_image():
    """Создает тестовое изображение размером 640x480 с случайными цветами"""
    image = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
    return image

def create_test_info():
    """Создает тестовый bounding box"""
    init_bbox = [200, 150, 100, 100]  # [x, y, width, height]
    return {'init_bbox': init_bbox}

def diagnose_tracker(tracker):
    print("Запуск диагностики...")

    # Создание тестовых данных
    image = create_test_image()
    info = create_test_info()

    # Проверка инициализации трекера
    print("Проверка инициализации трекера...")
    tracker.initialize(image, info)
    assert tracker.state is not None, "Ошибка: tracker.state не инициализирован!"

    # Проверка входных данных
    print("Проверка входных данных...")
    x_patch_arr, resize_factor = sample_target(image, tracker.state, tracker.params.search_factor,
                                               output_sz=tracker.params.search_size)
    search = tracker.preprocessor.process(x_patch_arr)

    search_np = search.cpu().numpy().astype(np.float32)
    template_np = tracker.template.cpu().numpy().astype(np.float32)
    template_anno_np = tracker.template_anno.cpu().numpy().astype(np.float32)

    assert search_np is not None, "Ошибка: search_np = None!"
    assert template_np is not None, "Ошибка: template_np = None!"
    assert template_anno_np is not None, "Ошибка: template_anno_np = None!"

    print("Диапазон входных данных:")
    print(f"search_np: min={search_np.min()}, max={search_np.max()}")
    print(f"template_np: min={template_np.min()}, max={template_np.max()}")
    print(f"template_anno_np: min={template_anno_np.min()}, max={template_anno_np.max()}")

    # Проверка предсказания
    print("Запуск модели...")
    outputs = tracker.predict(search_np, template_np, template_anno_np)

    print("Диапазон выходных данных:")
    print(f"outputs: min={outputs.min()}, max={outputs.max()}")

    assert outputs is not None, "Ошибка: outputs = None!"
    assert np.any(outputs), "Ошибка: Все значения outputs равны нулю!"

    print("Диагностика завершена успешно!")

# Пример вызова:
# tracker = MCITRACK(params)  # Создай объект трекера перед вызовом диагностики
# diagnose_tracker(tracker)


In [8]:
treacker = MCITRACK(params)

Allocated memory addresses:
d_input_z: 47326617600
d_input_x: 47326768128
d_input_anno: 47327370240
d_output_cls: 47327370752


In [9]:
diagnose_tracker(treacker)

Запуск диагностики...
Проверка инициализации трекера...
Проверка входных данных...
Диапазон входных данных:
search_np: min=-2.0665297508239746, max=2.605142116546631
template_np: min=-2.0494048595428467, max=2.517995834350586
template_anno_np: min=100.0, max=200.0
Запуск модели...
Bindings: [47326617600, 47326768128, 47327370240, 47327370752]
Диапазон выходных данных:
outputs: min=0.0, max=0.0


AssertionError: Ошибка: Все значения outputs равны нулю!

In [None]:
# Трекинг по видео
file = "0516.mp4"
video = cv2.VideoCapture(file)
ok, image = video.read()
if not video.isOpened():
    print("Could not open video")
    sys.exit()
    
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

x, y, w, h = cv2.selectROI( image, fromCenter=False)
init_state = [x, y, w, h]
def _build_init_info(box):
            return {'init_bbox': box}
treacker.initialize(image, _build_init_info(init_state))
counter = 0
while True:
    ok, image = video.read()
    if not ok:
        break

    # Конвертация для трекера
    tracker_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
    # Трекинг
    start_time = time.time()
    out = treacker.track(tracker_image)
    state = [int(s) for s in out['target_bbox']]
    #best_score = out["best_score"]
    best_score = 1
    fps = 1 / (time.time() - start_time + 1e-6)

    # Визуализация
    display_image = image.copy()
    x, y, w, h = state
    
    # Динамический цвет рамки в зависимости от уверенности
    color = (0, 255, 0) if best_score > 0.7 else (0, 255, 255) if best_score > 0.4 else (0, 0, 255)
    thickness = 3 if best_score > 0.7 else 2
    
    # Рисуем bounding box с увеличенными размерами
    cv2.rectangle(display_image, (x, y), (x + w, y + h), color, thickness)
    
    # Добавляем информационный текст
    cv2.putText(display_image, f"Score: {best_score:.2f}", (x, y-10), 
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
    cv2.putText(display_image, f"FPS: {fps:.1f}", (20, 40), 
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
    
    cv2.imshow("tracking", display_image)
    
    # Обработка клавиш
    key = cv2.waitKey(1) & 0xFF
    if key == 32:  # SPACE - переинициализация
        x, y, w, h = cv2.selectROI("Select ROI", image, fromCenter=False)
        if w > 10 and h > 10:  # Минимальный размер ROI
            init_state = [x, y, w, h]
            print("Переинициализация...")
            treacker.initialize(tracker_image, _build_init_info(init_state))
    elif key == 27:  # ESC - выход
        break
cv2.destroyAllWindows()

Bindings: [47413542912, 47413693440, 47414295552, 47414296064]


LogicError: cuMemcpyDtoH failed: an illegal memory access was encountered

: 