C1+C6 mają bardzo czyste nałożenie pola widzenia, ale będzie ciężko przez wielu ludzi

In [None]:
import torch
import torch.nn as nn

class TrackEmbeddingUpdater(nn.Module):
    """Refresh track embeddings from per-camera detection embeddings.

    Every camera owns a dedicated cross-attention layer in which the track
    embeddings are the queries and that camera's detection embeddings are
    the keys and values. The per-camera results are averaged, passed through
    one self-attention layer over the tracks and then through a two-layer
    feed-forward network.

    All attention layers are built with ``batch_first=True``, so batched
    inputs are laid out as ``(batch, sequence, embed_dim)``.

    Args:
        num_cameras: Number of cameras; one cross-attention layer is created
            per camera.
        embed_dim: Embedding width shared by all layers.
        num_heads: Number of heads in every attention layer.
        ffn_dim: Hidden width of the feed-forward network. Defaults to 2048,
            the value that used to be hard-coded.
    """

    def __init__(self, num_cameras, embed_dim=256, num_heads=8, ffn_dim=2048):
        super().__init__()
        self.num_cameras = num_cameras

        # Submodule names and creation order are kept stable so that
        # existing state dicts (and seeded initialisation) stay compatible.
        self.camera_cross_attn = nn.ModuleList([
            nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
            for _ in range(num_cameras)
        ])

        self.self_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)

        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ffn_dim),
            nn.ReLU(),
            nn.Linear(ffn_dim, embed_dim),
        )

    def forward(self, track_embeds, detection_embeds_per_camera):
        """Compute updated track embeddings.

        Args:
            track_embeds: Track embeddings used as attention queries.
            detection_embeds_per_camera: Container indexable by camera index
                ``0 .. num_cameras - 1``; entry ``i`` holds the detection
                embeddings of camera ``i``.

        Returns:
            Tensor of updated track embeddings with the same shape as
            ``track_embeds``.
        """
        # Index by camera (rather than zip) so any indexable container works.
        per_camera = [
            cross_attn(
                query=track_embeds,
                key=detection_embeds_per_camera[cam_idx],
                value=detection_embeds_per_camera[cam_idx],
            )[0]  # keep the attended values, drop the attention weights
            for cam_idx, cross_attn in enumerate(self.camera_cross_attn)
        ]

        # Fuse the views by averaging over the stacked camera axis.
        averaged = torch.stack(per_camera).mean(dim=0)

        # Let the fused track embeddings exchange information with each other.
        self_attn_out, _ = self.self_attn(averaged, averaged, averaged)

        return self.ffn(self_attn_out)

In [None]:
import torch
import torch.nn as nn
from scipy.optimize import linear_sum_assignment

class AssociationModule(nn.Module):
    """Score detection-to-track associations with scaled dot-product attention.

    Detections and tracks are each passed through their own bias-free linear
    projection; the dot products of the projected vectors are turned into a
    soft assignment of every detection over all tracks.

    Args:
        embed_dim: Width of the incoming embeddings and of both projections.
        temp: Temperature that multiplies the ``sqrt(embed_dim)`` scaling
            applied to the similarities before the softmax.
    """

    def __init__(self, embed_dim=256, temp=1.0):
        super().__init__()
        self.temp = temp

        self.detection_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.track_proj = nn.Linear(embed_dim, embed_dim, bias=False)

    def forward(self, detection_embeds, track_embeds):
        """Return the soft assignment matrix of detections over tracks.

        Args:
            detection_embeds: Detection embeddings, ``[B, D, E]``.
            track_embeds: Track embeddings, ``[B, T, E]``.

        Returns:
            Tensor ``[B, D, T]`` whose last dimension sums to one.
        """
        det_q = self.detection_proj(detection_embeds)
        trk_k = self.track_proj(track_embeds)

        # Pairwise dot products between every detection and every track.
        logits = torch.matmul(det_q, trk_k.transpose(1, 2))

        # Temperature-adjusted sqrt(d) scaling, as in dot-product attention.
        scale = self.temp * (det_q.size(-1) ** 0.5)
        return torch.softmax(logits / scale, dim=-1)

    @torch.no_grad()
    def hungarian_assignment(self, assignment_matrix):
        """
        assignment_matrix: [B, D, T]
        Returns: list of matched indices per batch
        """

        def solve(scores):
            # linear_sum_assignment minimises cost, so negate the scores to
            # obtain the maximum-score matching.
            profit = scores.cpu().numpy()
            rows, cols = linear_sum_assignment(-profit)
            return rows, cols

        num_batches = assignment_matrix.size(0)
        return [solve(assignment_matrix[i]) for i in range(num_batches)]


In [None]:
import torch
import torch.nn as nn
from transformers import DetrImageProcessor, DetrForObjectDetection


class MCTR(nn.Module):
    """Multi-camera tracker assembled around a frozen pretrained DETR detector.

    Components created at construction time:

    * ``detr_fod`` - ``facebook/detr-resnet-50`` object detector with all
      parameters excluded from gradient computation.
    * ``detr_ip`` - the matching image processor.
    * ``track_embeddings`` - learnable embedding table with one row per
      track query.
    * ``tracking_module`` - ``TrackEmbeddingUpdater`` shared by all cameras.
    * ``association_module`` - ``AssociationModule`` scoring detections
      against tracks.

    Args:
        num_cameras: Number of cameras handed to the track updater.
        num_queries: Number of rows in the track embedding table.
        embed_dim: Embedding width used by the table and both sub-modules.
        num_heads: Number of attention heads in the track updater.
    """

    def __init__(self, num_cameras=2, num_queries=100, embed_dim=256, num_heads=8):
        super().__init__()
        self.detr_fod = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
        # Freeze the detector: only the tracking components are trainable.
        self.detr_fod.requires_grad_(False)
        # The image processor is a preprocessing helper, not an nn.Module. It
        # holds no parameters and has no requires_grad_ method, so calling it
        # raised AttributeError; there is nothing to freeze here.
        self.detr_ip = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
        self.track_embeddings = nn.Embedding(num_queries, embed_dim)
        self.tracking_module = TrackEmbeddingUpdater(num_cameras, embed_dim, num_heads)
        self.association_module = AssociationModule(embed_dim)

