In [1]:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # автоматически инициализирует CUDA контекст
import torch
import cv2
import sys
import math
import time
import numpy as np
import yaml
import torch
import numpy as np
import onnx
import onnxruntime as ort


""" TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def load_engine(trt_runtime, engine_path):
    with open(engine_path, "rb") as f:
        engine_data = f.read()
    return trt_runtime.deserialize_cuda_engine(engine_data)

trt_runtime = trt.Runtime(TRT_LOGGER)
engine = load_engine(trt_runtime, "MCITrack.trt")
engine """




In [2]:
def cal_bbox(score_map_ctr, size_map, offset_map, return_score=True):
    
    # 2. Получаем размеры feature map
    feat_h, feat_w = score_map_ctr.shape[-2], score_map_ctr.shape[-1]
    
    # 3. Находим позицию с максимальным score (современный способ)
    max_score, flat_idx = torch.max(score_map_ctr.flatten(1), dim=1)
    idx = flat_idx.unsqueeze(1)
    idx_y = torch.div(flat_idx, feat_w, rounding_mode='floor')
    idx_x = flat_idx % feat_w
    
    # 4. Подготовка индексов для gather
    gather_idx = idx.unsqueeze(1).expand(-1, 2, -1)
    
    # 5. Обработка size_map (расширяем если 1 канал)
    if size_map.size(1) == 1:
        size_map = size_map.expand(-1, 2, -1, -1)
    
    # 6. Получаем размеры и смещения
    try:
        size = size_map.flatten(2).gather(2, gather_idx)
        offset = offset_map.flatten(2).gather(2, gather_idx).squeeze(-1)
    except RuntimeError as e:
        print("Ошибка размерностей:")
        print(f"score_map_ctr: {score_map_ctr.shape}")
        print(f"size_map: {size_map.shape}")
        print(f"offset_map: {offset_map.shape}")
        print(f"gather_idx: {gather_idx.shape}")
        raise
    
    # 7. Формируем bbox (cx, cy, w, h)
    bbox = torch.cat([
        (idx_x.to(torch.float) + offset[:, 0:1]) / feat_w,
        (idx_y.to(torch.float) + offset[:, 1:2]) / feat_h,
        size.squeeze(-1)
    ], dim=1)
    
    return (bbox, max_score) if return_score else bbox

""" def cal_bbox(score_map_ctr, size_map, offset_map, return_score=True):
        feat_sz = 14
        max_score, idx = torch.max(score_map_ctr.flatten(1), dim=1, keepdim=True) # score_map_ctr.flatten(1): torch.Size([32, 256]) idx: torch.Size([32, 1]) max_score: torch.Size([32, 1])
        idx_y = torch.div(idx, feat_sz, rounding_mode='floor')
        idx_x = idx % feat_sz
       
        

        idx = idx.unsqueeze(1).expand(idx.shape[0], 2, 1)
        size = size_map.flatten(2).gather(dim=2, index=idx) # size_map: torch.Size([32, 2, 16, 16])  size_map.flatten(2): torch.Size([32, 2, 256])
        offset = offset_map.flatten(2).gather(dim=2, index=idx).squeeze(-1)

        # bbox = torch.cat([idx_x - size[:, 0] / 2, idx_y - size[:, 1] / 2,
        #                   idx_x + size[:, 0] / 2, idx_y + size[:, 1] / 2], dim=1) / self.feat_sz
        # cx, cy, w, h
        bbox = torch.cat([(idx_x.to(torch.float) + offset[:, :1]) / feat_sz,
                          (idx_y.to(torch.float) + offset[:, 1:]) / feat_sz,
                          size.squeeze(-1)], dim=1)

        if return_score:
            return bbox, max_score
        return (bbox, max_score) if return_score else bbox """
        
class Preprocessor(object):
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view((1, 3, 1, 1)).to(self.device)
        self.std = torch.tensor([0.229, 0.224, 0.225]).view((1, 3, 1, 1)).to(self.device)
        self.mm_mean = torch.tensor([0.485, 0.456, 0.406, 0.485, 0.456, 0.406]).view((1, 6, 1, 1)).to(self.device)
        self.mm_std = torch.tensor([0.229, 0.224, 0.225, 0.229, 0.224, 0.225]).view((1, 6, 1, 1)).to(self.device)

    def process(self, img_arr: np.ndarray):
        if img_arr.shape[-1] == 6:
            mean = self.mm_mean
            std = self.mm_std
        else:
            mean = self.mean
            std = self.std
        # Deal with the image patch
        img_tensor = torch.tensor(img_arr).to(self.device).float().permute((2,0,1)).unsqueeze(dim=0)        
        img_tensor_norm = ((img_tensor / 255.0) - mean) / std  # (1,3,H,W)
        return img_tensor_norm
    
def hann1d(sz: int, centered = True) -> torch.Tensor:
    """1D cosine window."""
    if centered:
        return 0.5 * (1 - torch.cos((2 * math.pi / (sz + 1)) * torch.arange(1, sz + 1).float()))
    w = 0.5 * (1 + torch.cos((2 * math.pi / (sz + 2)) * torch.arange(0, sz//2 + 1).float()))
    return torch.cat([w, w[1:sz-sz//2].flip((0,))])
    
def hann2d(sz: torch.Tensor, centered = True) -> torch.Tensor:
    """2D cosine window."""
    return hann1d(sz[0].item(), centered).reshape(1, 1, -1, 1) * hann1d(sz[1].item(), centered).reshape(1, 1, 1, -1)    

def sample_target(im, target_bb, search_area_factor, output_sz=None):
   
    if not isinstance(target_bb, list):
        x, y, w, h = target_bb.tolist()
    else:
        x, y, w, h = target_bb
    # Crop image
    crop_sz = math.ceil(math.sqrt(w * h) * search_area_factor)

    if crop_sz < 1:
        raise Exception('Too small bounding box.')

    x1 = round(x + 0.5 * w - crop_sz * 0.5)
    x2 = x1 + crop_sz

    y1 = round(y + 0.5 * h - crop_sz * 0.5)
    y2 = y1 + crop_sz

    x1_pad = max(0, -x1)
    x2_pad = max(x2 - im.shape[1] + 1, 0)

    y1_pad = max(0, -y1)
    y2_pad = max(y2 - im.shape[0] + 1, 0)

    # Crop target
    im_crop = im[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :]

    # Pad
    im_crop_padded = cv2.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad, x2_pad, cv2.BORDER_CONSTANT)
    # deal with attention mask
    H, W, _ = im_crop_padded.shape

    if output_sz is not None:
        resize_factor = output_sz / crop_sz
        im_crop_padded = cv2.resize(im_crop_padded, (output_sz, output_sz))

        return im_crop_padded, resize_factor

    else:
        return im_crop_padded, 1.0
def transform_image_to_crop(box_in: torch.Tensor, box_extract: torch.Tensor, resize_factor: float,
                            crop_sz: torch.Tensor, normalize=False) -> torch.Tensor:
   
    box_extract_center = box_extract[0:2] + 0.5 * box_extract[2:4]

    box_in_center = box_in[0:2] + 0.5 * box_in[2:4]

    box_out_center = (crop_sz - 1) / 2 + (box_in_center - box_extract_center) * resize_factor
    box_out_wh = box_in[2:4] * resize_factor

    box_out = torch.cat((box_out_center - 0.5 * box_out_wh, box_out_wh))
    if normalize:
        return box_out / (crop_sz[0]-1)
    else:
        return box_out
def clip_box(box: list, H, W, margin=0):
    x1, y1, w, h = box
    x2, y2 = x1 + w, y1 + h
    x1 = min(max(0, x1), W-margin)
    x2 = min(max(margin, x2), W)
    y1 = min(max(0, y1), H-margin)
    y2 = min(max(margin, y2), H)
    w = max(margin, x2-x1)
    h = max(margin, y2-y1)
    return [x1, y1, w, h]

class BaseTracker():
    """Base class for all trackers."""

    def __init__(self, params):
        self.params = params
        self.visdom = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def predicts_segmentation_mask(self):
        return False

    def initialize(self, image, info: dict) -> dict:
        """Overload this function in your tracker. This should initialize the model."""
        raise NotImplementedError

    def track(self, image, info: dict = None) -> dict:
        """Overload this function in your tracker. This should track in the frame and update the model."""
        raise NotImplementedError

    def visdom_draw_tracking(self, image, box, segmentation=None):
        # Упрощенная обработка box без OrderedDict
        if isinstance(box, dict):  # Проверяем на обычный dict вместо OrderedDict
            box = list(box.values())  # Берем только значения
        elif not isinstance(box, (list, tuple)):  # Если не коллекция
            box = (box,)  # Превращаем в кортеж
        
        # Визуализация
        if segmentation is None:
            self.visdom.register((image, *box), 'Tracking', 1, 'Tracking')
        else:
            self.visdom.register((image, *box, segmentation), 'Tracking', 1, 'Tracking')

In [3]:
cfg = {}

# MODEL
cfg["MODEL"] = {}

# MODEL.ENCODER
cfg["MODEL"]["ENCODER"] = {
    "TYPE": "dinov2_vitb14",  # encoder model
    "DROP_PATH": 0,
    "PRETRAIN_TYPE": "mae",  # mae, default, or scratch. This parameter is not activated for dinov2.
    "USE_CHECKPOINT": False,  # to save the memory.
    "STRIDE": 14,
    "POS_TYPE": 'interpolate',  # type of loading the positional encoding. "interpolate" or "index".
    "TOKEN_TYPE_INDICATE": False,  # add a token_type_embedding to indicate the search, template_foreground, template_background
    "INTERACTION_INDEXES": [[0, 6], [6, 12], [12, 18], [18, 24]],
    "GRAD_CKPT": False
}

# MODEL.NECK
cfg["MODEL"]["NECK"] = {
    "N_LAYERS": 4,
    "D_MODEL": 512,
    "D_STATE": 16  # MAMABA_HIDDEN_STATE
}

# MODEL.DECODER
cfg["MODEL"]["DECODER"] = {
    "TYPE": "CENTER",  # MLP, CORNER, CENTER
    "NUM_CHANNELS": 256
}

# TRAIN
cfg["TRAIN"] = {
    "LR": 0.0001,
    "WEIGHT_DECAY": 0.0001,
    "EPOCH": 500,
    "LR_DROP_EPOCH": 400,
    "BATCH_SIZE": 8,
    "NUM_WORKER": 8,
    "OPTIMIZER": "ADAMW",
    "ENCODER_MULTIPLIER": 0.1,  # encoder's LR = this factor * LR
    "FREEZE_ENCODER": False,  # for freezing the parameters of encoder
    "ENCODER_OPEN": [],  # only for debug, open some layers of encoder when FREEZE_ENCODER is True
    "CE_WEIGHT": 1.0,  # weight for cross-entropy loss
    "GIOU_WEIGHT": 2.0,
    "L1_WEIGHT": 5.0,
    "PRINT_INTERVAL": 50,  # interval to print the training log
    "GRAD_CLIP_NORM": 0.1,
    "FIX_BN": False,
    "ENCODER_W": "",
    "TYPE": "normal",  # normal, peft or fft
    "PRETRAINED_PATH": None
}

# TRAIN.SCHEDULER
cfg["TRAIN"]["SCHEDULER"] = {
    "TYPE": "step",
    "DECAY_RATE": 0.1
}

# DATA
cfg["DATA"] = {
    "MEAN": [0.485, 0.456, 0.406],
    "STD": [0.229, 0.224, 0.225],
    "MAX_SAMPLE_INTERVAL": 200,
    "SAMPLER_MODE": "order",
    "LOADER": "tracking"
}

# DATA.TRAIN
cfg["DATA"]["TRAIN"] = {
    "DATASETS_NAME": ["LASOT", "GOT10K_vottrain"],
    "DATASETS_RATIO": [1, 1],
    "SAMPLE_PER_EPOCH": 60000
}

# DATA.SEARCH
cfg["DATA"]["SEARCH"] = {
    "NUMBER": 1,  # number of search region, only support 1 for now.
    "SIZE": 256,
    "FACTOR": 4.0,
    "CENTER_JITTER": 3.5,
    "SCALE_JITTER": 0.5
}

# DATA.TEMPLATE
cfg["DATA"]["TEMPLATE"] = {
    "NUMBER": 1,
    "SIZE": 128,
    "FACTOR": 2.0,
    "CENTER_JITTER": 0,
    "SCALE_JITTER": 0
}

# TEST
cfg["TEST"] = {
    "TEMPLATE_FACTOR": 4.0,
    "TEMPLATE_SIZE": 256,
    "SEARCH_FACTOR": 2.0,
    "SEARCH_SIZE": 128,
    "EPOCH": 500,
    "WINDOW": False,  # window penalty
    "NUM_TEMPLATES": 1
}

# TEST.UPT
cfg["TEST"]["UPT"] = {
    "DEFAULT": 1,
    "LASOT": 0,
    "LASOT_EXTENSION_SUBSET": 0,
    "TRACKINGNET": 0,
    "TNL2K": 0,
    "NFS": 0,
    "UAV": 0,
    "VOT20": 0,
    "GOT10K_TEST": 0
}

# TEST.UPH
cfg["TEST"]["UPH"] = {
    "DEFAULT": 1,
    "LASOT": 0,
    "LASOT_EXTENSION_SUBSET": 0,
    "TRACKINGNET": 0,
    "TNL2K": 0,
    "NFS": 0,
    "UAV": 0,
    "VOT20": 0,
    "GOT10K_TEST": 0
}

# TEST.INTER
cfg["TEST"]["INTER"] = {
    "DEFAULT": 999999,
    "LASOT": 0,
    "LASOT_EXTENSION_SUBSET": 0,
    "TRACKINGNET": 0,
    "TNL2K": 0,
    "NFS": 0,
    "UAV": 0,
    "VOT20": 0,
    "GOT10K_TEST": 0
}

# TEST.MB
cfg["TEST"]["MB"] = {
    "DEFAULT": 500,
    "LASOT": 0,
    "LASOT_EXTENSION_SUBSET": 0,
    "TRACKINGNET": 0,
    "TNL2K": 0,
    "NFS": 0,
    "UAV": 0,
    "VOT20": 0,
    "GOT10K_TEST": 0
}

In [4]:
#Params
class TrackerParams:
    """Class for tracker parameters."""
    def set_default_values(self, default_vals: dict):
        for name, val in default_vals.items():
            if not hasattr(self, name):
                setattr(self, name, val)

    def get(self, name: str, *default):
        """Get a parameter value with the given name. If it does not exists, it return the default value given as a
        second argument or returns an error if no default value is given."""
        if len(default) > 1:
            raise ValueError('Can only give one default value.')

        if not default:
            return getattr(self, name)

        return getattr(self, name, default[0])

    def has(self, name: str):
        """Check if there exist a parameter with the given name."""
        return hasattr(self, name)

def _update_config(base_cfg, exp_cfg):
    if isinstance(base_cfg, dict) and isinstance(exp_cfg, dict):
        for k, v in exp_cfg.items():
            if k in base_cfg:
                if not isinstance(v, dict):
                    base_cfg[k] = v
                else:
                    _update_config(base_cfg[k], v)
            else:
                raise ValueError("{} not exist in config.py".format(k))
    else:
        return

def update_config_from_file(filename):
    exp_config = None
    with open(filename) as f:
        exp_config = yaml.safe_load(f)
        _update_config(cfg, exp_config)
    
def parameters(yaml_name: str):
    params = TrackerParams()

    yaml_file = "mcitrack_t224.yaml"
    update_config_from_file(yaml_file)
    params.cfg = cfg
    print("test config: ", cfg)

    params.yaml_name = yaml_name
    # template and search region
    params.template_factor = cfg["TEST"]["TEMPLATE_FACTOR"]
    params.template_size = cfg["TEST"]["TEMPLATE_SIZE"]
    params.search_factor = cfg["TEST"]["SEARCH_FACTOR"]
    params.search_size = cfg["TEST"]["SEARCH_SIZE"]

    # Network checkpoint path
    params.checkpoint = "MCITrack.trt"
    # whether to save boxes from all queries
    params.save_all_boxes = False

    return params

params = parameters("./mcitrack_t224.yaml")

test config:  {'MODEL': {'ENCODER': {'TYPE': 'fastitpnt', 'DROP_PATH': 0.1, 'PRETRAIN_TYPE': './fast_itpn_tiny_1600e_1k.pt', 'USE_CHECKPOINT': False, 'STRIDE': 16, 'POS_TYPE': 'index', 'TOKEN_TYPE_INDICATE': True, 'INTERACTION_INDEXES': [[4, 7], [7, 10], [10, 13], [13, 16]], 'GRAD_CKPT': False}, 'NECK': {'N_LAYERS': 4, 'D_MODEL': 384, 'D_STATE': 16}, 'DECODER': {'TYPE': 'CENTER', 'NUM_CHANNELS': 256}}, 'TRAIN': {'LR': 0.0004, 'WEIGHT_DECAY': 0.0001, 'EPOCH': 300, 'LR_DROP_EPOCH': 240, 'BATCH_SIZE': 64, 'NUM_WORKER': 10, 'OPTIMIZER': 'ADAMW', 'ENCODER_MULTIPLIER': 0.1, 'FREEZE_ENCODER': False, 'ENCODER_OPEN': [], 'CE_WEIGHT': 1.0, 'GIOU_WEIGHT': 2.0, 'L1_WEIGHT': 5.0, 'PRINT_INTERVAL': 50, 'GRAD_CLIP_NORM': 0.1, 'FIX_BN': False, 'ENCODER_W': '', 'TYPE': 'normal', 'PRETRAINED_PATH': None, 'SCHEDULER': {'TYPE': 'step', 'DECAY_RATE': 0.1}}, 'DATA': {'MEAN': [0.485, 0.456, 0.406], 'STD': [0.229, 0.224, 0.225], 'MAX_SAMPLE_INTERVAL': 400, 'SAMPLER_MODE': 'order', 'LOADER': 'tracking', 'TRAIN

In [7]:
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
class MCITRACK2(BaseTracker):
    def __init__(self, params):
        super(MCITRACK2, self).__init__(params)

        self.cfg = params.cfg
        
        self.ort_session = self.load_engine("MCITrack.trt")
        print(self.ort_session)
                # Пример использования
        self.context, self.d_input_z, self.d_input_x, self.d_output_cls = self.initialize_trt(self.ort_session)

        self.preprocessor = Preprocessor()
        self.state = None
    
        self.frame_id = 0
        # for save boxes from all queries
        self.save_all_boxes = params.save_all_boxes
        self.z_dict1 = {}
        
    def load_engine(self,engine_path):
        with open(engine_path, "rb") as engine_file, trt.Runtime(TRT_LOGGER) as runtime:
            engine = runtime.deserialize_cuda_engine(engine_file.read())
        return engine
    
    # Инициализация памяти и контекста
    def initialize_trt(self,engine):
        context = engine.create_execution_context()

        # Предположим, что размеры входных данных известны заранее
        input_size_z = np.prod([1,3,112,112]) * np.dtype(np.float32).itemsize
        d_input_z = cuda.mem_alloc(int(input_size_z))

        input_size_x = np.prod([1,3,224,224]) * np.dtype(np.float32).itemsize
        d_input_x = cuda.mem_alloc(int(input_size_x))

        output_size_cls = np.prod([1, 4]) * np.dtype(np.float32).itemsize
        d_output_cls = cuda.mem_alloc(int(output_size_cls))



        return context, d_input_z, d_input_x, d_output_cls
    
    
    
    def predictV2(self,context, input_data_x, input_data_y, d_input_x, d_input_y, d_output_cls):
        # Prepare input
        input_shape_x = input_data_x.shape
        input_size_x = np.prod(input_shape_x) * input_data_x.itemsize
        cuda.memcpy_htod(d_input_x, input_data_x.ravel())

        input_shape_y = input_data_y.shape
        input_size_y = np.prod(input_shape_y) * input_data_y.itemsize
        cuda.memcpy_htod(d_input_y, input_data_y.ravel())

        # Execute model
        context.execute_v2(bindings=[
            int(d_input_x), int(d_input_y),
            int(d_output_cls)
        ])

        # Fetch output data
        output_data_cls = np.empty([1, 4], dtype=np.float32)
        cuda.memcpy_dtoh(output_data_cls, d_output_cls)
        return output_data_cls
    
    def initialize(self, image, info: dict):
        # forward the template once

        z_patch_arr, _ = sample_target(image, info['init_bbox'], self.params.template_factor,
                                     output_sz=self.params.template_size)

        self.template = self.preprocessor.process(z_patch_arr)
        # save states
        self.state = info['init_bbox']
        self.frame_id = 0


    def track(self, image, info: dict = None):
        H, W, _ = image.shape
        self.frame_id += 1
        x_patch_arr, resize_factor = sample_target(image, self.state, self.params.search_factor,
                                                                output_sz=self.params.search_size) # (x1, y1, w, h)
        search = self.preprocessor.process(x_patch_arr)
        
        #print(search.cpu().numpy().astype(np.float32).shape)
        #print(self.template.cpu().numpy().astype(np.float32).shape)
        
        outputs =self.predictV2(self.context,search.cpu().numpy().astype(np.float32), self.template.cpu().numpy().astype(np.float32),self.d_input_z, self.d_input_x, self.d_output_cls)

        pred_boxes = torch.from_numpy(outputs).view(-1, 4)    
        
        #pred_boxes = outputs[0].reshape(-1, 4)        
        # Baseline: Take the mean of all pred boxes as the final result
        pred_box = (pred_boxes.mean(dim=0) * self.params.search_size / resize_factor).tolist() # (cx, cy, w, h) [0,1]
        # get the final box result
        self.state = clip_box(self.map_box_back(pred_box, resize_factor), H, W, margin=10)    

        #np.clip(outputs[1], 0, 1).max()
        return {"target_bbox": self.state, "confidence": 0}

    def map_box_back(self, pred_box: list, resize_factor: float):
        cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[1] + 0.5 * self.state[3]
        cx, cy, w, h = pred_box
        half_side = 0.5 * self.params.search_size / resize_factor
        cx_real = cx + (cx_prev - half_side)
        cy_real = cy + (cy_prev - half_side)
        return [cx_real - 0.5 * w, cy_real - 0.5 * h, w, h]

    def map_box_back_batch(self, pred_box: torch.Tensor, resize_factor: float):
        cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[1] + 0.5 * self.state[3]
        cx, cy, w, h = pred_box.unbind(-1)
        half_side = 0.5 * self.params.search_size / resize_factor
        cx_real = cx + (cx_prev - half_side)
        cy_real = cy + (cy_prev - half_side)
        return torch.stack([cx_real - 0.5 * w, cy_real - 0.5 * h, w, h], dim=-1)


def get_tracker_class():
    return MCITRACK2

In [5]:
import numpy as np
import torch
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class MCITRACK(BaseTracker):
    def __init__(self, params):
        super(MCITRACK, self).__init__(params)
        
        self.cfg = params.cfg
        
        # Инициализация параметров из конфига
        self.num_template = self.cfg["TEST"].get("TEMPLATE_NUMBER", 1)
        self.update_threshold = self.cfg["TEST"].get("UPDATE_THRESHOLD", 0.8)
        self.update_intervals = self.cfg["TEST"].get("UPDATE_INTERVALS", 200)
        self.memory_bank = self.cfg["TEST"].get("MEMORY_BANK", 20)
        self.window = self.cfg["TEST"].get("WINDOW", True)
        
        # Загрузка TensorRT engine
        self.engine = self.load_engine("MCITrack.trt")
        # После загрузки модели
        print("Engine bindings:", [binding for binding in self.engine])        
        self.context, self.d_input_z, self.d_input_x, self.d_input_anno, self.d_output_boxes = self.initialize_trt(self.engine)
        
        self.preprocessor = Preprocessor()
        self.state = None
        self.frame_id = 0
        self.save_all_boxes = params.save_all_boxes
        
        # Инициализация списков шаблонов
        self.template_list = []
        self.template_anno_list = []
        self.memory_template_list = []
        self.memory_template_anno_list = []
        
        # Инициализация окна для penalty
        if self.window:
            self.output_window = self._generate_window()

    def load_engine(self, engine_path):
        with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def initialize_trt(self, engine):
        context = engine.create_execution_context()
        
        # Выделение памяти для входов
        d_input_z = cuda.mem_alloc(1 * 3 * 112 * 112 * np.float32().itemsize)
        d_input_x = cuda.mem_alloc(1 * 3 * 224 * 224 * np.float32().itemsize)
        d_input_anno = cuda.mem_alloc(1 * 4 * np.float32().itemsize)
        
        # Выделение памяти для выходов
        d_output_boxes = cuda.mem_alloc(1 * 4 * np.float32().itemsize)
        
        return context, d_input_z, d_input_x, d_input_anno, d_output_boxes

    def _generate_window(self):
        # Генерация окна для penalty (как в оригинальном коде)
        hann_window = np.outer(np.hanning(16), np.hanning(16))
        return torch.from_numpy(hann_window).float().to('cuda')

    def initialize(self, image, info: dict):
        z_patch_arr, _ = sample_target(image, info['init_bbox'], self.params.template_factor,
                                     output_sz=self.params.template_size)
        template = self.preprocessor.process(z_patch_arr)
        
        # Инициализация шаблонов
        self.template = template
        self.template_list = [template]
        
        # Инициализация аннотаций
        init_bbox = info['init_bbox']
        self.template_anno = torch.tensor(init_bbox).view(1, 4).float()
        self.template_anno_list = [self.template_anno]
        
        # Очистка памяти
        self.memory_template_list = []
        self.memory_template_anno_list = []
        
        # Сохранение состояния
        self.state = init_bbox
        self.frame_id = 0

    def predictV2(self, context, search, template, template_anno, 
              d_input_x, d_input_z, d_input_anno, d_output_boxes):
    # Копирование данных на устройство
        cuda.memcpy_htod(d_input_z, template.ravel())
        cuda.memcpy_htod(d_input_x, search.ravel())
        cuda.memcpy_htod(d_input_anno, template_anno.ravel())

        # Получаем имена биндингов engine
        binding_names = [b for b in self.engine]
        bindings = [0] * len(binding_names)
        name_to_ptr = {
            'template_list': d_input_z,
            'search_list': d_input_x,
            'template_anno_list': d_input_anno,
            'pred_boxes': d_output_boxes
        }
        for name, ptr in name_to_ptr.items():
            if name in binding_names:
                bindings[binding_names.index(name)] = int(ptr)

        # Выполнение модели
        context.execute_v2(bindings=bindings)

        # Получение результатов
        output_boxes = np.empty([1, 4], dtype=np.float32)
        cuda.memcpy_dtoh(output_boxes, d_output_boxes)
        return output_boxes



    def track(self, image, info: dict = None):
        H, W, _ = image.shape
        self.frame_id += 1
        
        # Получаем search патч с текущей позиции
        x_patch_arr, resize_factor = sample_target(image, self.state, self.params.search_factor,
                                                output_sz=self.params.search_size)
        search = self.preprocessor.process(x_patch_arr)
        
        # Подготавливаем входные данные
        template_np = self.template.cpu().numpy().astype(np.float32)
        search_np = search.cpu().numpy().astype(np.float32)
        template_anno_np = self.template_anno.cpu().numpy().astype(np.float32)
        
        # Выполняем модель
        outputs = self.predictV2(
            self.context,
            search_np,
            template_np,
            template_anno_np,
            self.d_input_x,
            self.d_input_z,
            self.d_input_anno,
            self.d_output_boxes
        )
        
        # Преобразуем выходы модели
        pred_boxes = torch.from_numpy(outputs).view(-1, 4).to('cuda')
        
        # ВАЖНО: Проверяем выходные данные модели
        if torch.any(torch.isnan(pred_boxes)):
            print("Warning: Model returned NaN values!")
            return {"target_bbox": self.state, "best_score": 0.0}
        
        # Преобразуем координаты в абсолютные значения
        pred_box = (pred_boxes.mean(dim=0) * self.params.search_size / resize_factor).tolist()
        
        # ВАЖНО: Добавляем отладочную печать
        print(f"Raw model output: {outputs}")
        print(f"Predicted box (relative): {pred_box}")
        
        # Преобразуем в абсолютные координаты изображения
        mapped_box = self.map_box_back(pred_box, resize_factor)
        print(f"Mapped box: {mapped_box}")
        
        # Обновляем состояние с проверкой границ
        new_state = clip_box(mapped_box, H, W, margin=10)
        print(f"New state: {new_state}")
        
        # Проверяем на резкие изменения положения
        if self.state is not None:
            prev_center = np.array([self.state[0] + self.state[2]/2, self.state[1] + self.state[3]/2])
            new_center = np.array([new_state[0] + new_state[2]/2, new_state[1] + new_state[3]/2])
            movement = np.linalg.norm(new_center - prev_center)
            
            if movement > max(H, W) * 0.5:  # Если движение слишком большое
                print(f"Abnormal movement detected: {movement}. Keeping previous state.")
                new_state = self.state
        
        self.state = new_state
        
        return {"target_bbox": self.state, "best_score": 1.0}

    def map_box_back(self, pred_box: list, resize_factor: float):
        """Преобразует относительные координаты в абсолютные координаты изображения."""
        try:
            # Получаем текущий центр bbox
            cx_prev = self.state[0] + self.state[2] * 0.5
            cy_prev = self.state[1] + self.state[3] * 0.5
            
            # pred_box содержит [cx, cy, w, h] в относительных координатах
            cx_rel, cy_rel, w_rel, h_rel = pred_box
            
            # Преобразуем относительные координаты в абсолютные
            search_size = self.params.search_size
            cx = cx_rel * (search_size / resize_factor)
            cy = cy_rel * (search_size / resize_factor)
            w = w_rel * (search_size / resize_factor)
            h = h_rel * (search_size / resize_factor)
            
            # Вычисляем абсолютные координаты на изображении
            half_search = search_size / (2 * resize_factor)
            cx_abs = cx + (cx_prev - half_search)
            cy_abs = cy + (cy_prev - half_search)
            
            # Возвращаем в формате [x, y, w, h]
            return [cx_abs - w/2, cy_abs - h/2, w, h]
        
        except Exception as e:
            print(f"Error in map_box_back: {e}")
            return self.state  # Возвращаем предыдущее состояние при ошибке


def get_tracker_class():
    return MCITRACK

In [None]:
import numpy as np
import torch
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class MCITRACK(BaseTracker):
    def __init__(self, params):
        super(MCITRACK, self).__init__(params)
        self.cfg = params.cfg

        # Инициализация параметров из конфига
        self.num_template = self.cfg["TEST"].get("TEMPLATE_NUMBER", 1)
        self.update_threshold = self.cfg["TEST"].get("UPDATE_THRESHOLD", 0.8)
        self.update_intervals = self.cfg["TEST"].get("UPDATE_INTERVALS", 200)
        self.memory_bank = self.cfg["TEST"].get("MEMORY_BANK", 20)
        self.window = self.cfg["TEST"].get("WINDOW", True)

        # Загрузка TensorRT engine
        self.engine = self.load_engine("MCITrack.trt")
        print("Engine bindings:", [binding for binding in self.engine])
        self.context, self.d_input_z, self.d_input_x, self.d_input_anno, self.d_output_boxes = self.initialize_trt(self.engine)

        self.preprocessor = Preprocessor()
        self.state = None
        self.frame_id = 0
        self.save_all_boxes = params.save_all_boxes

        # Инициализация списков шаблонов
        self.template_list = []
        self.template_anno_list = []
        self.memory_template_list = []
        self.memory_template_anno_list = []

        if self.window:
            self.output_window = self._generate_window()

    def load_engine(self, engine_path):
        with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def initialize_trt(self, engine):
        context = engine.create_execution_context()
        d_input_z = cuda.mem_alloc(1 * 3 * 112 * 112 * np.float32().itemsize)
        d_input_x = cuda.mem_alloc(1 * 3 * 224 * 224 * np.float32().itemsize)
        d_input_anno = cuda.mem_alloc(1 * 4 * np.float32().itemsize)
        d_output_boxes = cuda.mem_alloc(1 * 4 * np.float32().itemsize)
        return context, d_input_z, d_input_x, d_input_anno, d_output_boxes

    def _generate_window(self):
        hann_window = np.outer(np.hanning(16), np.hanning(16))
        return torch.from_numpy(hann_window).float().to('cuda')

    def initialize(self, image, info: dict):
        z_patch_arr, _ = sample_target(image, info['init_bbox'], self.params.template_factor,
                                       output_sz=self.params.template_size)
        template = self.preprocessor.process(z_patch_arr)
        self.template = template
        self.template_list = [template]
        init_bbox = info['init_bbox']
        self.template_anno = torch.tensor(init_bbox).view(1, 4).float()
        self.template_anno_list = [self.template_anno]
        self.memory_template_list = []
        self.memory_template_anno_list = []
        self.state = init_bbox
        self.frame_id = 0

    """ def predictV2(self, context, search, template, template_anno, 
              d_input_search, d_input_template, d_input_anno, d_output_boxes):
        cuda.memcpy_htod(d_input_template, template.ravel())
        cuda.memcpy_htod(d_input_search, search.ravel())
        cuda.memcpy_htod(d_input_anno, template_anno.ravel())

        # Получаем имена биндингов engine
        binding_names = [b for b in self.engine]        
        bindings = [0] * len(binding_names)
        name_to_ptr = {
            'template_list': d_input_template,
            'search_list': d_input_search,
            'template_anno_list': d_input_anno,
            'pred_boxes': d_output_boxes
        }
        for name, ptr in name_to_ptr.items():
            if name in binding_names:
                bindings[binding_names.index(name)] = int(ptr)

        # Выполнение модели
        print("My bindings:", [binding for binding in bindings])
        context.execute_v2(bindings=bindings)

        # Получение результатов
        output_boxes = np.empty([1, 4], dtype=np.float32)
        cuda.memcpy_dtoh(output_boxes, d_output_boxes)
        return output_boxes """
    def predictV2(self, context, search, template, template_anno, 
              d_input_x, d_input_z, d_input_anno, d_output_boxes):
        cuda.memcpy_htod(d_input_z, template.ravel())
        cuda.memcpy_htod(d_input_x, search.ravel())
        cuda.memcpy_htod(d_input_anno, template_anno.ravel())

        # Получаем имена биндингов engine
        if hasattr(self.engine, "num_bindings"):
            binding_names = [self.engine.get_binding_name(i) for i in range(self.engine.num_bindings)]
        else:
            binding_names = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]

        bindings = [0] * len(binding_names)
        name_to_ptr = {
            'template_list': d_input_z,
            'search_list': d_input_x,
            'template_anno_list': d_input_anno,
            'pred_boxes': d_output_boxes
        }
        for name, ptr in name_to_ptr.items():
            if name in binding_names:
                bindings[binding_names.index(name)] = int(ptr)

        print("My bindings:", bindings)
        context.execute_v2(bindings=bindings)

        output_boxes = np.empty([1, 4], dtype=np.float32)
        cuda.memcpy_dtoh(output_boxes, d_output_boxes)
        return output_boxes





    def track(self, image, info: dict = None):
        H, W, _ = image.shape
        self.frame_id += 1

        # Получаем search патч с текущей позиции
        x_patch_arr, resize_factor = sample_target(image, self.state, self.params.search_factor,
                                                   output_sz=self.params.search_size)
        search = self.preprocessor.process(x_patch_arr)

        # Подготавливаем входные данные
        template_np = self.template.cpu().numpy().astype(np.float32)
        search_np = search.cpu().numpy().astype(np.float32)
        template_anno_np = self.template_anno.cpu().numpy().astype(np.float32)

        # Отладочная печать входов
        print("template sum:", np.sum(template_np))
        print("search sum:", np.sum(search_np))
        print("template_anno:", template_anno_np)

        # Выполняем модель
        outputs = self.predictV2(
            self.context,
            search_np,
            template_np,
            template_anno_np,
            self.d_input_x,
            self.d_input_z,
            self.d_input_anno,
            self.d_output_boxes
        )

        pred_boxes = torch.from_numpy(outputs).view(-1, 4).to('cuda')

        # ВАЖНО: Проверяем выходные данные модели
        if torch.any(torch.isnan(pred_boxes)):
            print("Warning: Model returned NaN values!")
            return {"target_bbox": self.state, "best_score": 0.0}

        pred_box = (pred_boxes.mean(dim=0) * self.params.search_size / resize_factor).tolist()

        print(f"Raw model output: {outputs}")
        print(f"Predicted box (relative): {pred_box}")

        mapped_box = self.map_box_back(pred_box, resize_factor)
        print(f"Mapped box: {mapped_box}")

        new_state = clip_box(mapped_box, H, W, margin=10)
        print(f"New state: {new_state}")

        # Проверяем на резкие изменения положения
        if self.state is not None:
            prev_center = np.array([self.state[0] + self.state[2]/2, self.state[1] + self.state[3]/2])
            new_center = np.array([new_state[0] + new_state[2]/2, new_state[1] + new_state[3]/2])
            movement = np.linalg.norm(new_center - prev_center)
            if movement > max(H, W) * 0.5:
                print(f"Abnormal movement detected: {movement}. Keeping previous state.")
                new_state = self.state

        self.state = new_state

        return {"target_bbox": self.state, "best_score": 1.0}

    def map_box_back(self, pred_box: list, resize_factor: float):
        try:
            cx_prev = self.state[0] + self.state[2] * 0.5
            cy_prev = self.state[1] + self.state[3] * 0.5
            cx_rel, cy_rel, w_rel, h_rel = pred_box
            search_size = self.params.search_size
            cx = cx_rel * (search_size / resize_factor)
            cy = cy_rel * (search_size / resize_factor)
            w = w_rel * (search_size / resize_factor)
            h = h_rel * (search_size / resize_factor)
            half_search = search_size / (2 * resize_factor)
            cx_abs = cx + (cx_prev - half_search)
            cy_abs = cy + (cy_prev - half_search)
            return [cx_abs - w/2, cy_abs - h/2, w, h]
        except Exception as e:
            print(f"Error in map_box_back: {e}")
            return self.state


In [12]:
treacker = MCITRACK(params)

Engine bindings: ['template_list', 'search_list', 'template_anno_list', 'onnx::Unsqueeze_3', 'onnx::Unsqueeze_4', 'onnx::Unsqueeze_5', 'onnx::Unsqueeze_6', 'onnx::Unsqueeze_7', 'onnx::Unsqueeze_8', 'onnx::Unsqueeze_9', 'onnx::Unsqueeze_10', 'pred_boxes', 'score_map', 'size_map', 'offset_map']


In [13]:
# Трекинг по видео
file = "0516.mp4"
video = cv2.VideoCapture(file)
ok, image = video.read()
if not video.isOpened():
    print("Could not open video")
    sys.exit()
    
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

x, y, w, h = cv2.selectROI( image, fromCenter=False)
init_state = [x, y, w, h]
def _build_init_info(box):
            return {'init_bbox': box}
treacker.initialize(image, _build_init_info(init_state))
counter = 0
while True:
    ok, image = video.read()
    if not ok:
        break

    # Конвертация для трекера
    tracker_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
    # Трекинг
    start_time = time.time()
    out = treacker.track(tracker_image)
    state = [int(s) for s in out['target_bbox']]
    best_score = out["best_score"]
    fps = 1 / (time.time() - start_time + 1e-6)

    # Визуализация
    display_image = image.copy()
    x, y, w, h = state
    
    # Динамический цвет рамки в зависимости от уверенности
    color = (0, 255, 0) if best_score > 0.7 else (0, 255, 255) if best_score > 0.4 else (0, 0, 255)
    thickness = 3 if best_score > 0.7 else 2
    
    # Рисуем bounding box с увеличенными размерами
    cv2.rectangle(display_image, (x, y), (x + w, y + h), color, thickness)
    
    # Добавляем информационный текст
    cv2.putText(display_image, f"Score: {best_score:.2f}", (x, y-10), 
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
    cv2.putText(display_image, f"FPS: {fps:.1f}", (20, 40), 
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
    
    cv2.imshow("tracking", display_image)
    
    # Обработка клавиш
    key = cv2.waitKey(1) & 0xFF
    if key == 32:  # SPACE - переинициализация
        x, y, w, h = cv2.selectROI("Select ROI", image, fromCenter=False)
        if w > 10 and h > 10:  # Минимальный размер ROI
            init_state = [x, y, w, h]
            print("Переинициализация...")
            treacker.initialize(tracker_image, _build_init_info(init_state))
    elif key == 27:  # ESC - выход
        break
cv2.destroyAllWindows()

template sum: 29689.477
search sum: 141122.58
template_anno: [[885. 215.  47.  53.]]
My bindings: [64585992192, 64586142720, 64586744832, 0, 0, 0, 0, 0, 0, 0, 0, 64586745344, 0, 0, 0]
Raw model output: [[0. 0. 0. 0.]]
Predicted box (relative): [0.0, 0.0, 0.0, 0.0]
Mapped box: [808.5, 141.5, 0.0, 0.0]
New state: [808.5, 141.5, 10, 10]
template sum: 29689.477
search sum: 201707.12
template_anno: [[885. 215.  47.  53.]]
My bindings: [64585992192, 64586142720, 64586744832, 0, 0, 0, 0, 0, 0, 0, 0, 64586745344, 0, 0, 0]
Raw model output: [[0. 0. 0. 0.]]
Predicted box (relative): [0.0, 0.0, 0.0, 0.0]
Mapped box: [793.5, 126.5, 0.0, 0.0]
New state: [793.5, 126.5, 10, 10]
template sum: 29689.477
search sum: 213294.45
template_anno: [[885. 215.  47.  53.]]
My bindings: [64585992192, 64586142720, 64586744832, 0, 0, 0, 0, 0, 0, 0, 0, 64586745344, 0, 0, 0]
Raw model output: [[0. 0. 0. 0.]]
Predicted box (relative): [0.0, 0.0, 0.0, 0.0]
Mapped box: [778.5, 111.5, 0.0, 0.0]
New state: [778.5, 111.5, 