In [1]:
import torch
import torch.nn.functional as F
from torch import nn
import math



In [None]:
class PositionEmbeddingSine(nn.Module):
    """
    Fixed (non-learned) sinusoidal position embedding for 2-D feature maps.

    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.

    Args:
        num_pos_feats: number of channels produced per spatial axis. The output
            has ``2 * num_pos_feats`` channels (y features first, then x
            features). Must be even, because the features are split into
            interleaved sin/cos halves that are stacked together.
        temperature: base of the geometric progression used for the
            per-channel divisors.
        normalize: if True, the cumulative positions are divided by the last
            value along their axis (plus a small eps) and multiplied by
            ``scale``.
        scale: multiplier applied when ``normalize`` is True. Defaults to
            ``2 * math.pi`` when not given.

    Raises:
        ValueError: if ``scale`` is passed while ``normalize`` is False.
    """
    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    # NestedTensor is neither defined nor imported in this notebook, so the
    # annotation is quoted: an unquoted name is evaluated when the ``def`` runs
    # and would raise NameError on a fresh kernel.
    def forward(self, tensor_list: "NestedTensor"):
        """
        Compute the position embedding for a batch.

        Args:
            tensor_list: object exposing ``.tensors`` (only its device is read
                here) and ``.mask``, a 3-D tensor that supports ``~``.
                Positions where the mask is True do not advance the running
                position count -- presumably these are padding pixels; confirm
                against the code that builds the mask.

        Returns:
            Float tensor with the mask's three dimensions plus a channel axis
            of size ``2 * num_pos_feats`` inserted at dim 1.
        """
        x = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None
        not_mask = ~mask
        # Running count of unmasked entries along dim 1 (y) and dim 2 (x):
        # the first unmasked entry gets position 1, not 0.
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            # eps guards against division by zero when a whole row/column is masked.
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        # Channels 2k and 2k+1 share the divisor temperature ** (2k / num_pos_feats).
        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        # Even channels -> sin, odd channels -> cos; stack + flatten interleaves
        # them back into [sin, cos, sin, cos, ...] order.
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        # Concatenate y then x features on the last axis and move channels to dim 1.
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos

In [3]:
# Q: What is the Backbone class? The backbone consists of:
    # 1. FrozenBatchNorm2D
    # 2. BackboneBase 
    # 3. Backbone 
        # When called, this class returns a Dict of nested Tensors (dict of Feature maps)
    # 4. Joiner
        # This class will call Backbone: self[0]
        # Init two lists: out (output feat maps), pos (pos embeddings for each feat map)
        # BackBone will return a dict of Nested Tensors:
            # Iterate over the dictionary: 
                # Append to out 
                # Append pos embeddings to pos  
        # Return out, pos 

In [None]:
class BackBone(nn.Module):
    """
    Placeholder for the backbone module.

    Skeleton only: no layers are created yet and ``forward`` returns None.
    """
    def __init__(self):
        # ``self`` was missing from the signature, so ``BackBone()`` raised
        # TypeError before reaching ``super().__init__()``.
        super().__init__()

    def forward(self, samples):
        """
        TODO: not implemented yet; currently ignores ``samples`` and returns None.

        Args:
            samples: the batch to process (unused for now).
        """
        # ``self`` was missing here as well, which made ``model(samples)``
        # fail with "takes 1 positional argument but 2 were given".
        pass

In [None]:
# Q: what is nn.Module? 
# A: base class for all neural network modules. All models should subclass this class

class DETR(nn.Module):
    '''
    This class is the DETR module that performs object detection.

    Skeleton only: the constructor arguments are accepted but not stored yet,
    and ``forward`` is a placeholder that returns None. The comments inside
    ``forward`` record the planned implementation.
    '''
    def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False):
        """
        Args:
            backbone: pre-built backbone model (not stored yet).
            transformer: transformer model (not stored yet).
            num_classes: number of object classes (not used yet).
            num_queries: number of object queries (not used yet).
            aux_loss: flag for auxiliary losses (not used yet).
        """
        super().__init__()

    def forward(self, samples):
        """
        TODO: not implemented yet; currently ignores ``samples`` and returns None.

        ``self`` was missing from the original signature, so ``model(samples)``
        raised TypeError ("takes 1 positional argument but 2 were given").
        """
        # Planned implementation:
        # `samples` is a tuple of:
            # Samples: [batch_sz, 3, H, W]
            # Binary mask: [batch_sz, H, W]
        # 1. First, pass this tuple to the backbone, which was pre-built and passed to this module beforehand.
            # .build_backbone() will a) instantiate a Backbone, b) instantiate a Joiner, which takes in a Backbone -> model
        # 2. Then extract the (Samples, Mask) tuple from the backbone's `out` list (not the `pos` list) -- specifically the last entry.
            # Pass the following into the Transformer model:
                # 1. Projection of the Samples
                # 2. Mask
                # 3. The last pos embedding too (the one corresponding to these inputs)
        # 3. Then take the output of the Transformer and:
            # 1. Pass it into an embed class => outputs the class
            # 2. Pass it into a coords class => outputs the coords
        # 4. Init a dictionary for logits and boxes and return it.
        pass