In [2]:
import torch
import cv2
import sys
import math
import time
import numpy as np
import yaml
import pycuda.autoinit
import onnx
import onnxruntime as ort
import tensorrt as trt
import pycuda.driver as cuda


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def sample_target(im, target_bb, search_area_factor, output_sz=None):
   
    if not isinstance(target_bb, list):
        x, y, w, h = target_bb.tolist()
    else:
        x, y, w, h = target_bb
    # Crop image
    crop_sz = math.ceil(math.sqrt(w * h) * search_area_factor)

    if crop_sz < 1:
        raise Exception('Too small bounding box.')

    x1 = round(x + 0.5 * w - crop_sz * 0.5)
    x2 = x1 + crop_sz

    y1 = round(y + 0.5 * h - crop_sz * 0.5)
    y2 = y1 + crop_sz

    x1_pad = max(0, -x1)
    x2_pad = max(x2 - im.shape[1] + 1, 0)

    y1_pad = max(0, -y1)
    y2_pad = max(y2 - im.shape[0] + 1, 0)

    # Crop target
    im_crop = im[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :]

    # Pad
    im_crop_padded = cv2.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad, x2_pad, cv2.BORDER_CONSTANT)
    # deal with attention mask
    H, W, _ = im_crop_padded.shape

    if output_sz is not None:
        resize_factor = output_sz / crop_sz
        im_crop_padded = cv2.resize(im_crop_padded, (output_sz, output_sz))

        return im_crop_padded, resize_factor

    else:
        return im_crop_padded, 1.0
    
    
def hann1d(sz: int, centered = True) -> torch.Tensor:
    """1D cosine window."""
    if centered:
        return 0.5 * (1 - torch.cos((2 * math.pi / (sz + 1)) * torch.arange(1, sz + 1).float()))
    w = 0.5 * (1 + torch.cos((2 * math.pi / (sz + 2)) * torch.arange(0, sz//2 + 1).float()))
    return torch.cat([w, w[1:sz-sz//2].flip((0,))])
    
def hann2d(sz: torch.Tensor, centered = True) -> torch.Tensor:
    """2D cosine window."""
    return hann1d(sz[0].item(), centered).reshape(1, 1, -1, 1) * hann1d(sz[1].item(), centered).reshape(1, 1, 1, -1)  

In [4]:
def transform_image_to_crop(box_in: torch.Tensor, box_extract: torch.Tensor, resize_factor: float,
                            crop_sz: torch.Tensor, normalize=False) -> torch.Tensor:
    """ Transform the box co-ordinates from the original image co-ordinates to the co-ordinates of the cropped image
    args:
        box_in - the box for which the co-ordinates are to be transformed
        box_extract - the box about which the image crop has been extracted.
        resize_factor - the ratio between the original image scale and the scale of the image crop
        crop_sz - size of the cropped image

    returns:
        torch.Tensor - transformed co-ordinates of box_in
    """
    box_extract_center = box_extract[0:2] + 0.5 * box_extract[2:4]

    box_in_center = box_in[0:2] + 0.5 * box_in[2:4]

    box_out_center = (crop_sz - 1) / 2 + (box_in_center - box_extract_center) * resize_factor
    box_out_wh = box_in[2:4] * resize_factor

    box_out = torch.cat((box_out_center - 0.5 * box_out_wh, box_out_wh))
    if normalize:
        return box_out / (crop_sz[0]-1)
    else:
        return box_out
def clip_box(box: list, H, W, margin=0):
    x1, y1, w, h = box
    x2, y2 = x1 + w, y1 + h
    x1 = min(max(0, x1), W-margin)
    x2 = min(max(margin, x2), W)
    y1 = min(max(0, y1), H-margin)
    y2 = min(max(margin, y2), H)
    w = max(margin, x2-x1)
    h = max(margin, y2-y1)
    return [x1, y1, w, h]

def box_xyxy_to_cxcywh(x):
    x0, y0, x1, y1 = x.unbind(-1)
    b = [(x0 + x1) / 2, (y0 + y1) / 2,
         (x1 - x0), (y1 - y0)]
    return torch.stack(b, dim=-1)

In [5]:
class Preprocessor(object):
    def __init__(self):
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view((1, 3, 1, 1)).cuda()
        self.std = torch.tensor([0.229, 0.224, 0.225]).view((1, 3, 1, 1)).cuda()
        self.mm_mean = torch.tensor([0.485, 0.456, 0.406, 0.485, 0.456, 0.406]).view((1, 6, 1, 1)).cuda()
        self.mm_std = torch.tensor([0.229, 0.224, 0.225, 0.229, 0.224, 0.225]).view((1, 6, 1, 1)).cuda()

    def process(self, img_arr: np.ndarray):
        if img_arr.shape[-1] == 6:
            mean = self.mm_mean
            std = self.mm_std
        else:
            mean = self.mean
            std = self.std
        # Deal with the image patch
        img_tensor = torch.tensor(img_arr).cuda().float().permute((2,0,1)).unsqueeze(dim=0)
        # img_tensor = torch.tensor(img_arr).float().permute((2,0,1)).unsqueeze(dim=0)
        img_tensor_norm = ((img_tensor / 255.0) - mean) / std  # (1,3,H,W)
        return img_tensor_norm
    
    


In [6]:
def cal_bbox(score_map_ctr, size_map, offset_map, return_score=True):
        feat_sz = 14
        max_score, idx = torch.max(score_map_ctr.flatten(1), dim=1, keepdim=True) # score_map_ctr.flatten(1): torch.Size([32, 256]) idx: torch.Size([32, 1]) max_score: torch.Size([32, 1])
        idx_y = torch.div(idx, feat_sz, rounding_mode='floor')
        idx_x = idx % feat_sz
       
        

        idx = idx.unsqueeze(1).expand(idx.shape[0], 2, 1)
        size = size_map.flatten(2).gather(dim=2, index=idx) # size_map: torch.Size([32, 2, 16, 16])  size_map.flatten(2): torch.Size([32, 2, 256])
        offset = offset_map.flatten(2).gather(dim=2, index=idx).squeeze(-1)

        # bbox = torch.cat([idx_x - size[:, 0] / 2, idx_y - size[:, 1] / 2,
        #                   idx_x + size[:, 0] / 2, idx_y + size[:, 1] / 2], dim=1) / self.feat_sz
        # cx, cy, w, h
        bbox = torch.cat([(idx_x.to(torch.float) + offset[:, :1]) / feat_sz,
                          (idx_y.to(torch.float) + offset[:, 1:]) / feat_sz,
                          size.squeeze(-1)], dim=1)

        if return_score:
            return bbox, max_score
        return bbox

In [7]:
class BaseTracker:
    """Base class for all trackers."""

    def __init__(self, params):
        self.params = params
        self.visdom = None

    def predicts_segmentation_mask(self):
        return False

    def initialize(self, image, info: dict) -> dict:
        """Overload this function in your tracker. This should initialize the model."""
        raise NotImplementedError

    def track(self, image, info: dict = None) -> dict:
        """Overload this function in your tracker. This should track in the frame and update the model."""
        raise NotImplementedError

    def visdom_draw_tracking(self, image, box, segmentation=None):
        # Упрощенная обработка box без OrderedDict
        if isinstance(box, dict):  # Проверяем на обычный dict вместо OrderedDict
            box = list(box.values())  # Берем только значения
        elif not isinstance(box, (list, tuple)):  # Если не коллекция
            box = (box,)  # Превращаем в кортеж
        
        # Визуализация
        if segmentation is None:
            self.visdom.register((image, *box), 'Tracking', 1, 'Tracking')
        else:
            self.visdom.register((image, *box, segmentation), 'Tracking', 1, 'Tracking')

In [None]:
#OLD
class MCITRACK(BaseTracker):
    def __init__(self, params):
        
        super(MCITRACK, self).__init__(params)
        
        """Загрузка TensorRT модели"""
        with open("MCITrac.trt", "rb") as f:
            engine_data = f.read()
        self.runtime = trt.Runtime(trt.Logger())
        self.engine = self.runtime.deserialize_cuda_engine(engine_data)
        self.context = self.engine.create_execution_context()
        print(f"Context profile #: {self.context.engine.num_optimization_profiles}")

        # self.ort_session = ort.InferenceSession("MCITrack.onnx", providers=[('TensorrtExecutionProvider', {'trt_engine_cache_enable': True,
        #                                'trt_engine_cache_path': './trt',
        #                                "trt_fp16_enable": True,
        #                                'device_id': 0,
        #                                }),('CUDAExecutionProvider')])
        #self.ort_session = ort.InferenceSession("MCITrac.onnx", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
        
        # self.onnx_input_names = [inp.name for inp in self.ort_session.get_inputs()]
        # self.onnx_output_names = [out.name for out in self.ort_session.get_outputs()]

          # Получаем имена тензоров                
        self.tensor_names = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]
        self.input_tensors = self.tensor_names[0:2]  # Исключаем выходные тензоры
        self.input_shapes = [self.context.get_tensor_shape(name) for name in self.input_tensors]
        self.output_tensors = self.tensor_names[2:]  # Исключаем входные тензоры
        self.output_shapes = [self.context.get_tensor_shape(name) for name in self.output_tensors]

         # переменные под GPU память
        self.d_inputs = {}
        self.d_outputs = {}

        self.allocate_memory()

    def allocate_memory(self):
        """Выделение правильного объема памяти в GPU"""

        # Выделяем память под входные тензоры
        for name, shape in zip(self.input_tensors, self.input_shapes):
            size = int(np.prod(shape) * np.dtype(np.float32).itemsize)
            self.d_inputs[name] = cuda.mem_alloc(size)

        # Выделяем память под выходные тензоры
        for name, shape in zip(self.output_tensors, self.output_shapes):
            size = int(np.prod(shape) * np.dtype(np.float32).itemsize)
            self.d_outputs[name] = cuda.mem_alloc(size)
        
    def run_inference(self, z_numpy, x_numpy):
        """Запуск инференса для текущего кадра"""
        
        z_numpy = z_numpy.astype(np.float32).copy(order='C')
        x_numpy = x_numpy.astype(np.float32).copy(order='C')

        cuda.memcpy_htod(self.d_inputs["z"], z_numpy)
        cuda.memcpy_htod(self.d_inputs["x"], x_numpy)

        # Создаем список всех входных и выходных буферов
        bindings = [int(self.d_inputs["z"]), int(self.d_inputs["x"])] + [int(self.d_outputs[name]) for name in self.output_tensors]
        
        # Запускаем инференс
        self.context.execute_v2(bindings)

    def copy_results_to_cpu(self):
        """Извлекаем результаты инференса из GPU"""
        cpu_outputs = {}
        i = 0
        for name in self.output_tensors:
            cpu_outputs[KEYS[i]] = np.empty(self.output_shapes[self.output_tensors.index(name)], dtype=np.float32)
            cuda.memcpy_dtoh(cpu_outputs[KEYS[i]], self.d_outputs[name])
            i +=1
        # print(f"type(cpu_outputs): {type(cpu_outputs)}") 
        return cpu_outputs

        self.cfg = params.cfg

        self.preprocessor = Preprocessor()
        self.state = None

        self.fx_sz = self.cfg["TEST"]["SEARCH_SIZE"] // self.cfg["MODEL"]["ENCODER"]["STRIDE"]
        if self.cfg["TEST"]["WINDOW"] == True:  # for window penalty
            self.output_window = hann2d(torch.tensor([self.fx_sz, self.fx_sz]).long(), centered=True).cuda()

        self.num_template = self.cfg["TEST"]["NUM_TEMPLATES"]

   
        self.frame_id = 0
        # for update
        self.h_state = [None] * self.cfg["MODEL"]["NECK"]["N_LAYERS"]



        self.memory_bank = self.cfg["TEST"]["MB"]["DEFAULT"]
        self.update_h_t = self.cfg["TEST"]["UPH"]["DEFAULT"]
        self.update_threshold = self.cfg["TEST"]["UPT"]["DEFAULT"]
        self.update_intervals = self.cfg["TEST"]["INTER"]["DEFAULT"]
        print("Update threshold is: ", self.memory_bank)

    def initialize(self, image, info: dict):


        # get the initial templates
        z_patch_arr, resize_factor = sample_target(image, info['init_bbox'], self.params.template_factor,
                                                   output_sz=self.params.template_size)
        z_patch_arr = z_patch_arr
        template = self.preprocessor.process(z_patch_arr)
        self.template_list = [template] * self.num_template

        self.state = info['init_bbox']
        prev_box_crop = transform_image_to_crop(torch.tensor(info['init_bbox']),
                                                torch.tensor(info['init_bbox']),
                                                resize_factor,
                                                torch.Tensor([self.params.template_size, self.params.template_size]),
                                                normalize=True)
        self.template_anno_list = [prev_box_crop.to(template.device).unsqueeze(0)] * self.num_template
        self.frame_id = 0
        self.memory_template_list = self.template_list.copy()
        self.memory_template_anno_list = self.template_anno_list.copy()


    def track(self, image, info: dict = None):
        H, W, _ = image.shape
        self.frame_id += 1
        x_patch_arr, resize_factor = sample_target(image, self.state, self.params.search_factor,
                                                   output_sz=self.params.search_size)  # (x1, y1, w, h)
        search = self.preprocessor.process(x_patch_arr)
        search_list = [search]

        # run the encoder
        """
        with torch.no_grad():
            enc_opt = self.network.forward_encoder(self.template_list, search_list, self.template_anno_list)

        # run the time neck
        with torch.no_grad():
            hidden_state = self.h_state.copy()
            encoder_out,out_neck, h = self.network.forward_neck(enc_opt, hidden_state)
        # run the decoder
        with torch.no_grad():
            out_dict = self.network.forward_decoder(feature=out_neck)
        """
        
        
        template_list_np = [t.cpu().numpy() for t in self.template_list]
        search_list_np = [s.cpu().numpy() for s in search_list]
        template_anno_list_np = [ta.cpu().numpy() for ta in self.template_anno_list]

        # 2. Flatten inputs
        all_inputs_np = template_list_np + search_list_np + template_anno_list_np
        
        input_names = [inp.name for inp in self.ort_session.get_inputs()]
        
        input_feed = {name: data for name, data in zip(input_names, all_inputs_np)}

        # 5. Run inference
        outputs = self.ort_session.run(None, input_feed)
        #print("Expected input names:", input_names)
        output_names = [out.name for out in self.ort_session.get_outputs()]
        out_dict_np = {name: arr for name, arr in zip(self.onnx_output_names, outputs)}
        

        # 3. Create the input feed dictionary
        #input_feed = {name: data for name, data in zip(self.onnx_input_names, all_inputs_np)}
        
        #input_dict = {name: data for name, data in zip(self.onnx_input_names, all_inputs_np)}
        #np.savez('onnx_test_inputs.npz', **input_dict)
        # 4. Run ONNX inference
        #onnx_outputs = self.ort_session.run(None, input_feed)

        # 5. Reconstruct the output dictionary
        #out_dict = {name: torch.from_numpy(arr) for name, arr in zip(self.onnx_output_names, onnx_outputs)}
        
        #out_dict_np = {name: arr for name, arr in zip(self.onnx_output_names, onnx_outputs)}

        # If you need the outputs as torch tensors, convert them back.
        # Note: The data will be on the CPU. Move it to a GPU if needed, e.g., .to('cuda').
        out_dict = {
            'pred_boxes': torch.from_numpy(out_dict_np['pred_boxes']).to('cuda'),
            'score_map': torch.from_numpy(out_dict_np['score_map']).to('cuda'),
            'size_map': torch.from_numpy(out_dict_np['size_map']).to('cuda'),
            'offset_map': torch.from_numpy(out_dict_np['offset_map']).to('cuda'),
                }

        
       # with torch.no_grad():
       #     out_dict = self.network.forward(
       #         template_list=self.template_list,
       #         search_list=search_list,
       #         template_anno_list=self.template_anno_list,
       #         
       #         gt_score_map=None
       #     )
            
        """print(len(self.template_list))
        for i in self.template_list:
            print(i.shape)
        print(len(search_list))
        for i in search_list:
            print(i.shape)
        print(len(self.template_anno_list))
        for i in self.template_anno_list:
            print(i.shape)
        print(len(self.h_state))
        
        print("-"*50)
        """
        

        # add hann windows
        pred_score_map = out_dict['score_map']
        if self.cfg["TEST"]["WINDOW"] == True:  # for window penalty
            response = self.output_window * pred_score_map
        else:
            response = pred_score_map
        #if 'size_map' in out_dict.keys():
        pred_boxes, conf_score = cal_bbox(response, out_dict['size_map'],
                                                                   out_dict['offset_map'])
        #else:
        #    pred_boxes, conf_score = cal_bbox(response,
        #                                                           out_dict['offset_map'],
        #                                                           return_score=True)
        pred_boxes = pred_boxes.view(-1, 4)
        # Baseline: Take the mean of all pred boxes as the final result
        pred_box = (pred_boxes.mean(dim=0) * self.params.search_size / resize_factor).tolist()  # (cx, cy, w, h) [0,1]
        # get the final box result
        self.state = clip_box(self.map_box_back(pred_box, resize_factor), H, W, margin=10)
        # update hiden state
        self.h_state = h
        if conf_score.item() < self.update_h_t:
            self.h_state = [None] * self.cfg["MODEL"]["NECK"]["N_LAYERS"]

        # update the template
        if self.num_template > 1:
            if (conf_score > self.update_threshold):
                z_patch_arr, resize_factor = sample_target(image, self.state, self.params.template_factor,
                                                           output_sz=self.params.template_size)
                template = self.preprocessor.process(z_patch_arr)
                self.memory_template_list.append(template)
                prev_box_crop = transform_image_to_crop(torch.tensor(self.state),
                                                        torch.tensor(self.state),
                                                        resize_factor,
                                                        torch.Tensor(
                                                            [self.params.template_size, self.params.template_size]),
                                                        normalize=True)
                self.memory_template_anno_list.append(prev_box_crop.to(template.device).unsqueeze(0))
                if len(self.memory_template_list) > self.memory_bank:
                    self.memory_template_list.pop(0)
                    self.memory_template_anno_list.pop(0)
        if (self.frame_id % self.update_intervals == 0):
            assert len(self.memory_template_anno_list) == len(self.memory_template_list)
            len_list = len(self.memory_template_anno_list)
            interval = len_list // self.num_template
            for i in range(1, self.num_template):
                idx = interval * i
                if idx > len_list:
                    idx = len_list
                self.template_list.append(self.memory_template_list[idx])
                self.template_list.pop(1)
                self.template_anno_list.append(self.memory_template_anno_list[idx])
                self.template_anno_list.pop(1)
        assert len(self.template_list) == self.num_template



        return {"target_bbox": self.state,
                "best_score": conf_score}

    def map_box_back(self, pred_box: list, resize_factor: float):
        cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[1] + 0.5 * self.state[3]
        cx, cy, w, h = pred_box
        half_side = 0.5 * self.params.search_size / resize_factor
        cx_real = cx + (cx_prev - half_side)
        cy_real = cy + (cy_prev - half_side)
        return [cx_real - 0.5 * w, cy_real - 0.5 * h, w, h]

    def map_box_back_batch(self, pred_box: torch.Tensor, resize_factor: float):
        cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[1] + 0.5 * self.state[3]
        cx, cy, w, h = pred_box.unbind(-1)  # (N,4) --> (N,)
        half_side = 0.5 * self.params.search_size / resize_factor
        cx_real = cx + (cx_prev - half_side)
        cy_real = cy + (cy_prev - half_side)
        return torch.stack([cx_real - 0.5 * w, cy_real - 0.5 * h, w, h], dim=-1)

In [23]:
#
class MCITRACK(BaseTracker):
    def __init__(self, params):
        super(MCITRACK, self).__init__(params)
        
        # Проверяем и загружаем ONNX-модель
        self.onnx_model = onnx.load("MCITrack.onnx")
        onnx.checker.check_model(self.onnx_model)
        self.ort_session = ort.InferenceSession(
            "MCITrack.onnx",
            providers=[
                ('TensorrtExecutionProvider', {
                    'trt_engine_cache_enable': True,
                    'trt_engine_cache_path': './trt',
                    "trt_fp16_enable": True,
                    'device_id': 0,
                }),
                ('CUDAExecutionProvider')
            ]
        )

        self.onnx_input_names = [inp.name for inp in self.ort_session.get_inputs()]
        self.onnx_output_names = [out.name for out in self.ort_session.get_outputs()]

        self.cfg = params.cfg
        self.preprocessor = Preprocessor()
        self.state = None

        self.fx_sz = self.cfg["TEST"]["SEARCH_SIZE"] // self.cfg["MODEL"]["ENCODER"]["STRIDE"]
        if self.cfg["TEST"]["WINDOW"]:
            self.output_window = hann2d(torch.tensor([self.fx_sz, self.fx_sz]).long(), centered=True).cuda()
        self.num_template = self.cfg["TEST"]["NUM_TEMPLATES"]

        self.frame_id = 0
        self.h_state = [None] * self.cfg["MODEL"]["NECK"]["N_LAYERS"]
        self.memory_bank = self.cfg["TEST"]["MB"]["DEFAULT"]
        self.update_h_t = self.cfg["TEST"]["UPH"]["DEFAULT"]
        self.update_threshold = self.cfg["TEST"]["UPT"]["DEFAULT"]
        self.update_intervals = self.cfg["TEST"]["INTER"]["DEFAULT"]

    def initialize(self, image, info: dict):
        z_patch_arr, resize_factor = sample_target(image, info['init_bbox'], self.params.template_factor,
                                                   output_sz=self.params.template_size)
        template = self.preprocessor.process(z_patch_arr)
        self.template_list = [template] * self.num_template

        self.state = info['init_bbox']
        prev_box_crop = transform_image_to_crop(torch.tensor(info['init_bbox']),
                                                torch.tensor(info['init_bbox']),
                                                resize_factor,
                                                torch.Tensor([self.params.template_size, self.params.template_size]),
                                                normalize=True)
        self.template_anno_list = [prev_box_crop.to(template.device).unsqueeze(0)] * self.num_template
        self.frame_id = 0
        self.memory_template_list = self.template_list.copy()
        self.memory_template_anno_list = self.template_anno_list.copy()

    def track(self, image, info: dict = None):
        H, W, _ = image.shape
        self.frame_id += 1
        x_patch_arr, resize_factor = sample_target(image, self.state, self.params.search_factor,
                                                   output_sz=self.params.search_size)
        search = self.preprocessor.process(x_patch_arr)
        search_list = [search]

        # Преобразуем входы в numpy
        template_list_np = [t.cpu().numpy() for t in self.template_list]
        search_list_np = [s.cpu().numpy() for s in search_list]
        template_anno_list_np = [ta.cpu().numpy() for ta in self.template_anno_list]
        all_inputs_np = template_list_np + search_list_np + template_anno_list_np

        # Собираем словарь входов
        input_feed = {name: data for name, data in zip(self.onnx_input_names, all_inputs_np)}

        # Запускаем инференс
        outputs = self.ort_session.run(None, input_feed)
        out_dict_np = {name: arr for name, arr in zip(self.onnx_output_names, outputs)}

        # Переводим в torch.Tensor
        out_dict = {
            'pred_boxes': torch.from_numpy(out_dict_np['pred_boxes']).to('cuda'),
            'score_map': torch.from_numpy(out_dict_np['score_map']).to('cuda'),
            'size_map': torch.from_numpy(out_dict_np['size_map']).to('cuda'),
            'offset_map': torch.from_numpy(out_dict_np['offset_map']).to('cuda'),
        }

        # add hann windows
        pred_score_map = out_dict['score_map']
        if self.cfg["TEST"]["WINDOW"]:
            response = self.output_window * pred_score_map
        else:
            response = pred_score_map

        pred_boxes, conf_score = cal_bbox(response, out_dict['size_map'], out_dict['offset_map'])
        pred_boxes = pred_boxes.view(-1, 4)
        pred_box = (pred_boxes.mean(dim=0) * self.params.search_size / resize_factor).tolist()
        self.state = clip_box(self.map_box_back(pred_box, resize_factor), H, W, margin=10)
       
        


        # update hiden state (если используете)
        # self.h_state = ... (если нужно)
        if conf_score.item() < self.update_h_t:
            self.h_state = [None] * self.cfg["MODEL"]["NECK"]["N_LAYERS"]

        return {
            "target_bbox": self.state,
            "best_score": conf_score.item(),
        }

    def map_box_back(self, box, resize_factor):
        cx, cy, w, h = box
        x = cx - w / 2
        y = cy - h / 2
        x /= resize_factor
        y /= resize_factor
        w /= resize_factor
        h /= resize_factor
        return [x, y, w, h]

In [9]:
cfg = {}

# MODEL
cfg["MODEL"] = {}

# MODEL.ENCODER
cfg["MODEL"]["ENCODER"] = {
    "TYPE": "dinov2_vitb14",  # encoder model
    "DROP_PATH": 0,
    "PRETRAIN_TYPE": "mae",  # mae, default, or scratch. This parameter is not activated for dinov2.
    "USE_CHECKPOINT": False,  # to save the memory.
    "STRIDE": 14,
    "POS_TYPE": 'interpolate',  # type of loading the positional encoding. "interpolate" or "index".
    "TOKEN_TYPE_INDICATE": False,  # add a token_type_embedding to indicate the search, template_foreground, template_background
    "INTERACTION_INDEXES": [[0, 6], [6, 12], [12, 18], [18, 24]],
    "GRAD_CKPT": False
}

# MODEL.NECK
cfg["MODEL"]["NECK"] = {
    "N_LAYERS": 4,
    "D_MODEL": 512,
    "D_STATE": 16  # MAMABA_HIDDEN_STATE
}

# MODEL.DECODER
cfg["MODEL"]["DECODER"] = {
    "TYPE": "CENTER",  # MLP, CORNER, CENTER
    "NUM_CHANNELS": 256
}

# TRAIN
cfg["TRAIN"] = {
    "LR": 0.0001,
    "WEIGHT_DECAY": 0.0001,
    "EPOCH": 500,
    "LR_DROP_EPOCH": 400,
    "BATCH_SIZE": 8,
    "NUM_WORKER": 8,
    "OPTIMIZER": "ADAMW",
    "ENCODER_MULTIPLIER": 0.1,  # encoder's LR = this factor * LR
    "FREEZE_ENCODER": False,  # for freezing the parameters of encoder
    "ENCODER_OPEN": [],  # only for debug, open some layers of encoder when FREEZE_ENCODER is True
    "CE_WEIGHT": 1.0,  # weight for cross-entropy loss
    "GIOU_WEIGHT": 2.0,
    "L1_WEIGHT": 5.0,
    "PRINT_INTERVAL": 50,  # interval to print the training log
    "GRAD_CLIP_NORM": 0.1,
    "FIX_BN": False,
    "ENCODER_W": "",
    "TYPE": "normal",  # normal, peft or fft
    "PRETRAINED_PATH": None
}

# TRAIN.SCHEDULER
cfg["TRAIN"]["SCHEDULER"] = {
    "TYPE": "step",
    "DECAY_RATE": 0.1
}

# DATA
cfg["DATA"] = {
    "MEAN": [0.485, 0.456, 0.406],
    "STD": [0.229, 0.224, 0.225],
    "MAX_SAMPLE_INTERVAL": 200,
    "SAMPLER_MODE": "order",
    "LOADER": "tracking"
}

# DATA.TRAIN
cfg["DATA"]["TRAIN"] = {
    "DATASETS_NAME": ["LASOT", "GOT10K_vottrain"],
    "DATASETS_RATIO": [1, 1],
    "SAMPLE_PER_EPOCH": 60000
}

# DATA.SEARCH
cfg["DATA"]["SEARCH"] = {
    "NUMBER": 1,  # number of search region, only support 1 for now.
    "SIZE": 256,
    "FACTOR": 4.0,
    "CENTER_JITTER": 3.5,
    "SCALE_JITTER": 0.5
}

# DATA.TEMPLATE
cfg["DATA"]["TEMPLATE"] = {
    "NUMBER": 1,
    "SIZE": 128,
    "FACTOR": 2.0,
    "CENTER_JITTER": 0,
    "SCALE_JITTER": 0
}

# TEST
cfg["TEST"] = {
    "TEMPLATE_FACTOR": 4.0,
    "TEMPLATE_SIZE": 256,
    "SEARCH_FACTOR": 2.0,
    "SEARCH_SIZE": 128,
    "EPOCH": 500,
    "WINDOW": False,  # window penalty
    "NUM_TEMPLATES": 1
}

# TEST.UPT
cfg["TEST"]["UPT"] = {
    "DEFAULT": 1,
    "LASOT": 0,
    "LASOT_EXTENSION_SUBSET": 0,
    "TRACKINGNET": 0,
    "TNL2K": 0,
    "NFS": 0,
    "UAV": 0,
    "VOT20": 0,
    "GOT10K_TEST": 0
}

# TEST.UPH
cfg["TEST"]["UPH"] = {
    "DEFAULT": 1,
    "LASOT": 0,
    "LASOT_EXTENSION_SUBSET": 0,
    "TRACKINGNET": 0,
    "TNL2K": 0,
    "NFS": 0,
    "UAV": 0,
    "VOT20": 0,
    "GOT10K_TEST": 0
}

# TEST.INTER
cfg["TEST"]["INTER"] = {
    "DEFAULT": 999999,
    "LASOT": 0,
    "LASOT_EXTENSION_SUBSET": 0,
    "TRACKINGNET": 0,
    "TNL2K": 0,
    "NFS": 0,
    "UAV": 0,
    "VOT20": 0,
    "GOT10K_TEST": 0
}

# TEST.MB
cfg["TEST"]["MB"] = {
    "DEFAULT": 500,
    "LASOT": 0,
    "LASOT_EXTENSION_SUBSET": 0,
    "TRACKINGNET": 0,
    "TNL2K": 0,
    "NFS": 0,
    "UAV": 0,
    "VOT20": 0,
    "GOT10K_TEST": 0
}

In [10]:
class TrackerParams:
    """Class for tracker parameters."""
    def set_default_values(self, default_vals: dict):
        for name, val in default_vals.items():
            if not hasattr(self, name):
                setattr(self, name, val)

    def get(self, name: str, *default):
        """Get a parameter value with the given name. If it does not exists, it return the default value given as a
        second argument or returns an error if no default value is given."""
        if len(default) > 1:
            raise ValueError('Can only give one default value.')

        if not default:
            return getattr(self, name)

        return getattr(self, name, default[0])

    def has(self, name: str):
        """Check if there exist a parameter with the given name."""
        return hasattr(self, name)

def _update_config(base_cfg, exp_cfg):
    if isinstance(base_cfg, dict) and isinstance(exp_cfg, dict):
        for k, v in exp_cfg.items():
            if k in base_cfg:
                if not isinstance(v, dict):
                    base_cfg[k] = v
                else:
                    _update_config(base_cfg[k], v)
            else:
                raise ValueError("{} not exist in config.py".format(k))
    else:
        return

def update_config_from_file(filename):
    exp_config = None
    with open(filename) as f:
        exp_config = yaml.safe_load(f)
        _update_config(cfg, exp_config)
    
def parameters(yaml_name: str):
    params = TrackerParams()

    yaml_file = "mcitrack_t224.yaml"
    update_config_from_file(yaml_file)
    params.cfg = cfg
    print("test config: ", cfg)

    params.yaml_name = yaml_name
    # template and search region
    params.template_factor = cfg["TEST"]["TEMPLATE_FACTOR"]
    params.template_size = cfg["TEST"]["TEMPLATE_SIZE"]
    params.search_factor = cfg["TEST"]["SEARCH_FACTOR"]
    params.search_size = cfg["TEST"]["SEARCH_SIZE"]

    # Network checkpoint path
    params.checkpoint = "fast_itpn_tiny_1600e_1k.pt"
    # whether to save boxes from all queries
    params.save_all_boxes = False

    return params

params = parameters("./mcitrack_t224.yaml")

test config:  {'MODEL': {'ENCODER': {'TYPE': 'fastitpnt', 'DROP_PATH': 0.1, 'PRETRAIN_TYPE': './fast_itpn_tiny_1600e_1k.pt', 'USE_CHECKPOINT': False, 'STRIDE': 16, 'POS_TYPE': 'index', 'TOKEN_TYPE_INDICATE': True, 'INTERACTION_INDEXES': [[4, 7], [7, 10], [10, 13], [13, 16]], 'GRAD_CKPT': False}, 'NECK': {'N_LAYERS': 4, 'D_MODEL': 384, 'D_STATE': 16}, 'DECODER': {'TYPE': 'CENTER', 'NUM_CHANNELS': 256}}, 'TRAIN': {'LR': 0.0004, 'WEIGHT_DECAY': 0.0001, 'EPOCH': 300, 'LR_DROP_EPOCH': 240, 'BATCH_SIZE': 64, 'NUM_WORKER': 10, 'OPTIMIZER': 'ADAMW', 'ENCODER_MULTIPLIER': 0.1, 'FREEZE_ENCODER': False, 'ENCODER_OPEN': [], 'CE_WEIGHT': 1.0, 'GIOU_WEIGHT': 2.0, 'L1_WEIGHT': 5.0, 'PRINT_INTERVAL': 50, 'GRAD_CLIP_NORM': 0.1, 'FIX_BN': False, 'ENCODER_W': '', 'TYPE': 'normal', 'PRETRAINED_PATH': None, 'SCHEDULER': {'TYPE': 'step', 'DECAY_RATE': 0.1}}, 'DATA': {'MEAN': [0.485, 0.456, 0.406], 'STD': [0.229, 0.224, 0.225], 'MAX_SAMPLE_INTERVAL': 400, 'SAMPLER_MODE': 'order', 'LOADER': 'tracking', 'TRAIN

In [11]:
treacker = MCITRACK(params)

*************** EP Error ***************
EP Error D:\a\_work\1\s\onnxruntime\python\onnxruntime_pybind_state.cc:505 onnxruntime::python::RegisterTensorRTPluginsAsCustomOps Please install TensorRT libraries as mentioned in the GPU requirements page, make sure they're in the PATH or LD_LIBRARY_PATH, and that your GPU is supported.
 when using [('TensorrtExecutionProvider', {'trt_engine_cache_enable': True, 'trt_engine_cache_path': './trt', 'trt_fp16_enable': True, 'device_id': 0}), 'CUDAExecutionProvider']
Falling back to ['CPUExecutionProvider'] and retrying.
****************************************


In [22]:
#Трекинг по видео
file = "test.mp4"
video = cv2.VideoCapture(file)
#fourcc = cv2.VideoWriter_fourcc(*'XVID')
#fps=video.get(cv2.CAP_PROP_FPS)
#video_vriter = cv2.VideoWriter(file.split('.')[0]+"_"+".avi", fourcc, fps, (1920, 1080))


ok, image = video.read()
if not video.isOpened():
    print("Could not open video")
    sys.exit()
    
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

x, y, w, h = cv2.selectROI( image, fromCenter=False)
init_state = [x, y, w, h]
def _build_init_info(box):
            return {'init_bbox': box}
treacker.initialize(image, _build_init_info(init_state))
counter = 0
while True:
            ok, image = video.read()
            if not ok:
                print("Can't read frame")
                break

            
            out  = treacker.track(image)
            state = [int(s) for s in out['target_bbox']]
            best_score = out["best_score"]
            if isinstance(best_score, torch.Tensor):
                best_score = best_score.item()
         
            x, y, w, h = [int(x) for x in state]

            color = (0, 255, 0)  # Цвет в формате BGR
            cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)


            cv2.imshow("tracking", image)
            #video_vriter.write(image)


            k = cv2.waitKey(1)            
            if k == 32:  # SPACE
                ok, image = video.read()                             
                x, y, w, h = cv2.selectROI( image, fromCenter=False)
                init_state = [x, y, w, h]
                treacker.initialize(image, _build_init_info(init_state))
            if k == 27:  # ESC
                break
        
                
                

cv2.destroyAllWindows()
video.release()
#video_vriter.release()



Can't read frame


In [21]:
import cv2
import sys
import torch

file = "test.mp4"
video = cv2.VideoCapture(file)

if not video.isOpened():
    print("Could not open video")
    sys.exit()

ok, frame = video.read()
if not ok:
    print("Can't read first frame")
    sys.exit()

# Выбор ROI на BGR-изображении
x, y, w, h = cv2.selectROI("Select ROI", frame, fromCenter=False)
cv2.destroyWindow("Select ROI")

def _build_init_info(box):
    return {'init_bbox': box}

frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
treacker.initialize(frame_rgb, _build_init_info([x, y, w, h]))

while True:
    ok, frame = video.read()
    if not ok:
        break

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    out = treacker.track(frame_rgb)

    bbox = [int(v) for v in out['target_bbox']]
    print(f"Tracked bbox: {bbox}")  # Вывод координат для отладки

    # Проверяем координаты рамки
    h_img, w_img, _ = frame.shape
    x, y, bw, bh = bbox

    # Коррекция координат, чтобы рамка была в пределах изображения
    x = max(0, min(x, w_img - 1))
    y = max(0, min(y, h_img - 1))
    bw = max(1, min(bw, w_img - x))
    bh = max(1, min(bh, h_img - y))

    # Проверяем, что рамка не слишком мала
    if bw > 10 and bh > 10:
        cv2.rectangle(frame, (x, y), (x + bw, y + bh), (0, 255, 0), 2)
    else:
        print("Warning: bbox too small or invalid, skipping drawing")

    cv2.imshow("Tracking", frame)

    key = cv2.waitKey(1) & 0xFF
    if key == 27:  # ESC
        break
    elif key == 32:  # SPACE - переинициализация ROI
        x, y, w, h = cv2.selectROI("Select ROI", frame, fromCenter=False)
        cv2.destroyWindow("Select ROI")
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        treacker.initialize(frame_rgb, _build_init_info([x, y, w, h]))

cv2.destroyAllWindows()
video.release()



Tracked bbox: [1270, 710, 10, 10]
Tracked bbox: [1, 0, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 3, 10, 10]
Tracked bbox: [1, 3, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 3, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [1, 2, 10, 10]
Tracked bbox: [2, 3, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 3, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 1, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 1, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 3, 10, 10]
Tracked bbox: [2, 1, 10, 10]
Tracked bbox: [1, 2, 10, 10]
Tracked bbox: [2, 1, 10, 10]
Tracked bbox: [2, 3, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 1, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked bbox: [2, 2, 10, 10]
Tracked b

In [24]:
#Метрики
import numpy as np

def iou(boxA, boxB):
    # boxA, boxB: [x, y, w, h]
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[0] + boxA[2], boxB[0] + boxB[2])
    yB = min(boxA[1] + boxA[3], boxB[1] + boxB[3])

    interW = max(0, xB - xA)
    interH = max(0, yB - yA)
    interArea = interW * interH

    boxAArea = boxA[2] * boxA[3]
    boxBArea = boxB[2] * boxB[3]
    unionArea = boxAArea + boxBArea - interArea

    if unionArea == 0:
        return 0.0
    return interArea / unionArea

def precision(boxA, boxB):
    # центры bbox
    centerA = (boxA[0] + boxA[2]/2, boxA[1] + boxA[3]/2)
    centerB = (boxB[0] + boxB[2]/2, boxB[1] + boxB[3]/2)
    dist = np.sqrt((centerA[0] - centerB[0])**2 + (centerA[1] - centerB[1])**2)
    return dist
sr_thresh = 0.5
prec_thresh = 20

In [None]:
#Трекинг got10k с метриками TRT
import glob
import time
import  os
gt_bboxes = []
pred_bboxes = []
seq_path = "val/GOT-10k_Val_000001"
txt_files = glob.glob(os.path.join(seq_path, '*.txt'))
if not txt_files:
    raise FileNotFoundError(f"No .txt files found in {seq_path}")

img_files = sorted(glob.glob(os.path.join(seq_path, '*.jpg')))
with open(txt_files[0], 'r') as f:
    gt_bboxes = [list(map(float, line.strip().split(','))) for line in f]

# Получаем размер первого изображения
sample_img = cv2.imread(img_files[0])
if sample_img is None:
    raise ValueError(f"Failed to read sample image: {img_files[0]}")

#height, width = sample_img.shape[:2]
#fourcc = cv2.VideoWriter_fourcc(*'XVID')
#output_filename = f"{seq_path.split('/')[-1]}_output.avi"
#video_vriter = cv2.VideoWriter(output_filename, fourcc, 10, (width, height))  

assert len(img_files) == len(gt_bboxes), "Количество кадров и bbox'ов не совпадает"

x, y, w, h = map(int, gt_bboxes[0])
init_state = [x, y, w, h]

def _build_init_info(box):
            return {'init_bbox': box}

counter = 0


#treacker.initialize(sample_img, _build_init_info(init_state))

start_time = time.time()  # Начало замера

for img_file, bbox in zip(img_files, gt_bboxes):
        
        # Читаем изображение
        img = cv2.imread(img_file)
        if img is None:
            print(f"Не удалось загрузить изображение: {img_file}")
            continue
        
        
        out  = treacker.track(img)
        state = [int(s) for s in out['target_bbox']]   
                           
        # Рисуем bounding box        
        x, y, w, h = [int(x) for x in state]

        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 200), 2)
        
        x1, y1, w1, h1 = map(int, bbox)
        cv2.rectangle(img, (x1, y1), (x1+w1, y1+h1), (0, 200, 0), 2)
        bbox_pred = x, y, w, h
        
        gt_bboxes.append(bbox)
        pred_bboxes.append(bbox_pred)

        #cv2.imshow(seq_path, img)
        #video_vriter.write(img)
        counter+=1


        # Выход по нажатию 'q' или ESC
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q') or key == 27:
            break
       
        
                
end_time = time.time()    # Конец замера    
total_frames = counter       # Общее количество обработанных кадров
total_time = end_time - start_time
fps = total_frames / total_time
ious = [iou(gt, pred) for gt, pred in zip(gt_bboxes, pred_bboxes)]
ao = np.mean(ious)
sr = np.mean([1 if val >= sr_thresh else 0 for val in ious])
precisions = [precision(gt, pred) for gt, pred in zip(gt_bboxes, pred_bboxes)]
prec = np.mean([1 if d <= prec_thresh else 0 for d in precisions])

print(f"GOT: {seq_path}")
print(f"FPS: {fps:.2f}")
print(f'Success Rate (SR@0.5): {sr:.2f}')
print(f'Average Overlap (AO): {ao:.2f}')
print(f'Precision @20px: {prec:.2f}')

cv2.destroyAllWindows()
#video_vriter.release()
#print(f"Video saved as: {output_filename}")