In [1]:
from typing import Dict, List, Tuple

import torch
from torch import nn, Tensor
import numpy as np

In [2]:
@torch.no_grad()
def boxCxcywh2Xyxy(box: Tensor) -> Tensor:
    cx, cy, w, h = box.unbind(-1)

    x1 = cx - w / 2
    y1 = cy - h / 2
    x2 = cx + w / 2
    y2 = cy + h / 2

    return torch.stack([x1, y1, x2, y2], -1)

@torch.no_grad()
def boxIoU(boxes1: Tensor, boxes2: Tensor) -> Tuple[Tensor, Tensor]:
    boxes1Area = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
    boxes2Area = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1])

    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])

    wh = (rb - lt).clamp(min=0)
    intersectArea = wh[:, :, 0] * wh[:, :, 1]

    unionArea = boxes1Area[:, None] + boxes2Area - intersectArea

    iou = intersectArea / unionArea
    return iou, unionArea

In [3]:
def computeMAP(x: Dict[str, Tensor], y: List[Dict[str, Tensor]], numClass):
        """
        :param x: a dictionary containing:
            'class': a tensor of shape [batchSize, numObjects, numClass + 1]
            'bbox': a tensor of shape [batchSize, numObjects, 4]

        :param y: a list of dictionaries containing:
            'labels': a tensor of shape [numObjects] that stores the ground-truth classes of objects
            'boxes': a tensor of shape [numObjects, 4] that stores the ground-truth bounding boxes of objects
            represented as [centerX, centerY, w, h]
        :numClass: number of classes

        :return: mean average precision
        """

        # MARK: - classification loss
        logits = torch.Tensor(x['class'])

        targetClassO = torch.Tensor(np.concatenate([np.asarray(t['labels']).reshape(1, len(t['labels'])) for t in y], 0))

        # MARK: - bbox loss
        # ignore boxes that has no object
        mask = targetClassO != numClass
        boxes = torch.Tensor(x['bbox'][mask])
        targetBoxes = torch.Tensor(np.concatenate([np.expand_dims(t['boxes'], 0) for t  in y], 0)[mask])

        numBoxes = len(targetBoxes) + 1e-6

        # MARK: - compute statistics
        with torch.no_grad():
            predClass = nn.functional.softmax(logits, -1).max(-1)[1]
            classMask = (predClass == targetClassO)[mask]
            iou = torch.diag(boxIoU(boxCxcywh2Xyxy(boxes), boxCxcywh2Xyxy(targetBoxes))[0])
            ap = []
            for threshold in range(50, 100, 5):
                ap.append(((iou >= threshold / 100) * classMask).sum() / numBoxes)

            ap = torch.mean(torch.stack(ap))

        return ap

## Directly Running main() will throw such error

In [4]:
if __name__ == '__main__':
    x = {}
    x['class'] = np.random.randn(4, 4, 3)
    x['bbox'] = np.random.rand(4, 4, 4)
    
    y = []
    for i in range(4):
        t = {}
        t['labels'] = [0, 1, 0, 1]
        t['boxes'] = np.random.rand(4,4)
        y.append(t)
    print(computeMAP(x, y, 2))

RuntimeError: Expected object of scalar type Long but got scalar type Float for argument #2 'other'

## Version Reference 

In [8]:
print('Numpy version: ', np.__version__)
print('Pytorch version: ', torch.__version__)

Numpy version:  1.19.2
Pytorch version:  1.1.0


## Running Line by LIne 

It seems that x['bbox'][mask] will simply take the second 4x4 matrix of x['bbox'] and repeat it 4x4 times to form a 4x4x4x4, which looks strange to me. I don't think it is ignoring boxes without objects as comments indicate. Same to targetBoxes

In [10]:
numClass = 2
# MARK: - classification loss
logits = torch.Tensor(x['class'])

targetClassO = torch.Tensor(np.concatenate([np.asarray(t['labels']).reshape(1, len(t['labels'])) for t in y], 0))

# MARK: - bbox loss
# ignore boxes that has no object
mask = targetClassO != numClass
boxes = torch.Tensor(x['bbox'][mask])
targetBoxes = torch.Tensor(np.concatenate([np.expand_dims(t['boxes'], 0) for t  in y], 0)[mask])

print("\nx['bbox'], shape={}".format(x['bbox'].shape))
print(x['bbox'])
print("\nboxes, shape={}".format(boxes.shape))
print(boxes)
print("\nmask, shape={}".format(mask.shape))
print(mask)


x['bbox'], shape=(4, 4, 4)
[[[0.57554573 0.13842347 0.93021693 0.1780981 ]
  [0.76918678 0.36194517 0.34250077 0.44525331]
  [0.93967581 0.69861593 0.96401425 0.99228171]
  [0.84339975 0.16456694 0.44220892 0.16937428]]

 [[0.26681668 0.53628004 0.2688837  0.02346345]
  [0.09278209 0.02989288 0.90094442 0.05706733]
  [0.71767554 0.06986079 0.19711414 0.34239977]
  [0.72031433 0.72727217 0.54567482 0.5997559 ]]

 [[0.14177796 0.69147427 0.84611442 0.99544954]
  [0.53655877 0.65480848 0.02098628 0.77952664]
  [0.56260845 0.70214682 0.24065189 0.4790804 ]
  [0.4554265  0.48009895 0.6610222  0.25873037]]

 [[0.24571804 0.48273494 0.47073937 0.81817623]
  [0.99321936 0.94752516 0.99490644 0.12420786]
  [0.55982867 0.77226061 0.90827594 0.55274729]
  [0.39546412 0.67409266 0.79862389 0.21357527]]]

boxes, shape=torch.Size([4, 4, 4, 4])
tensor([[[[0.2668, 0.5363, 0.2689, 0.0235],
          [0.0928, 0.0299, 0.9009, 0.0571],
          [0.7177, 0.0699, 0.1971, 0.3424],
          [0.7203, 0.7273

I think num of bbox should be 16 rather than 4 if no bbox is ignored (as indicated from mask)

In [12]:
numBoxes = len(targetBoxes) + 1e-6
print('numBoxes = ', numBoxes)

numBoxes =  4.000001
