In [13]:
import os
import sys
import json
import numpy as np

def iou(a, b):
    """Intersection-over-union of two axis-aligned boxes.

    Both boxes are ``(left, top, right, bottom)`` sequences. Widths and
    heights are computed as ``right - left + 1`` and ``bottom - top + 1``,
    i.e. both edges are treated as inclusive.

    Args:
        a: First box as a 4-item sequence.
        b: Second box as a 4-item sequence.

    Returns:
        ``intersection / union``. When the union is not positive (degenerate
        boxes) ``0.0`` is returned instead of raising ``ZeroDivisionError``.
    """
    ax, ay, ar, ab = a
    bx, by, br, bb = b

    # Overlap rectangle; clamped to 0 when the boxes do not intersect.
    cross_w = max(0, min(ar, br) - max(ax, bx) + 1)
    cross_h = max(0, min(ab, bb) - max(ay, by) + 1)
    cross_area = cross_w * cross_h

    area_a = (ar - ax + 1) * (ab - ay + 1)
    area_b = (br - bx + 1) * (bb - by + 1)
    union = area_a + area_b - cross_area

    # Degenerate boxes can make the union zero; there is no overlap to report.
    if union <= 0:
        return 0.0
    return cross_area / union

def nms(bboxes, threshold, confidence_index=-1):
    """Greedy non-maximum suppression.

    Boxes are visited from highest to lowest confidence. Each box that has
    not been suppressed is kept, and every later box whose IoU with it is
    strictly greater than ``threshold`` is suppressed.

    Args:
        bboxes: Iterable of box records. The first four items of a record
            are passed to ``iou``; ``record[confidence_index]`` is the score.
        threshold: IoU value above which a lower-scored box is dropped.
        confidence_index: Position of the score inside each record.

    Returns:
        A new list with the kept records, highest confidence first. The
        input is not reordered or otherwise modified.
    """
    # sorted() instead of list.sort(): leaves the caller's sequence untouched
    # and accepts any iterable, not only lists.
    ordered = sorted(bboxes, key=lambda x: x[confidence_index], reverse=True)
    suppressed = [False] * len(ordered)
    keep = []
    for i, current in enumerate(ordered):
        if suppressed[i]:
            continue
        keep.append(current)

        for j in range(i + 1, len(ordered)):
            # An already-suppressed box cannot change state, so skip its IoU.
            if not suppressed[j] and iou(current[:4], ordered[j][:4]) > threshold:
                suppressed[j] = True
    return keep

def nms_as_class(bboxes, threshold, class_index=-1, confidence_index=-2):
    """Apply ``nms`` separately to the boxes of each class.

    Records are grouped by ``record[class_index]``; every group is passed to
    ``nms`` with the given ``threshold`` and ``confidence_index``, and the
    per-group results are concatenated in the groups' dict iteration order.

    Returns:
        A list with the records kept across all classes.
    """
    grouped = {}
    for record in bboxes:
        grouped.setdefault(record[class_index], []).append(record)

    survivors = []
    for members in grouped.values():
        survivors += nms(members, threshold, confidence_index)
    return survivors

def xml_value(line):
    """Return the text between the first ``>`` and the following ``</``.

    Both positions come from ``str.find``; when no ``</`` follows, ``find``
    yields -1 and the slice therefore stops one character before the end.
    """
    start = line.find(">") + 1
    return line[start:line.find("</", start)]

def xml_token(line):
    """Return the name of the first tag on the line, without ``<`` and ``>``.

    A closing tag keeps its leading slash (``"/object"``). Positions come
    from ``str.find``; when no ``>`` follows, ``find`` yields -1 and the
    slice stops one character before the end.
    """
    start = line.find("<") + 1
    return line[start:line.find(">", start)]

def load_voc_xml(file):
    """Read object boxes from a VOC-style XML annotation file.

    The file is scanned line by line and only the first tag of each line
    (as returned by ``xml_token``) is inspected, so the parser expects one
    element per line. Only lines between ``<object>`` and ``</object>`` are
    considered, and everything between ``<part>`` and ``</part>`` is skipped.

    Args:
        file: Path of the XML file to read.

    Returns:
        A list with one ``[name, xmin, ymin, xmax, ymax]`` entry per
        ``<bndbox>``; ``name`` is the most recent ``<name>`` value seen and
        the coordinates are floats. Coordinates are stored in that fixed
        order regardless of the order of the tags in the file. A coordinate
        that is absent from the file is left out of the entry.
    """
    coord_order = ("xmin", "ymin", "xmax", "ymax")

    with open(file, "r") as f:
        lines = f.readlines()

    name = None
    box = None
    coords = None
    bboxes = []
    enter_object = False
    enter_part = False
    for line in lines:
        token = xml_token(line)

        if token == "object":
            enter_object = True
        elif token == "/object":
            enter_object = False
        elif enter_object:
            if token == "part":
                enter_part = True
            elif token == "/part":
                enter_part = False

            if not enter_part:
                if token == "name":
                    name = xml_value(line)
                elif token == "bndbox":
                    box = [name]
                    coords = {}
                    bboxes.append(box)
                elif token in coord_order and box is not None:
                    # Place the value by tag name instead of by position in
                    # the file, so a different child order inside <bndbox>
                    # cannot swap coordinates. A coordinate tag seen before
                    # any <bndbox> has no box to belong to and is skipped.
                    coords[token] = float(xml_value(line))
                    box[1:] = [coords[key] for key in coord_order if key in coords]
    return bboxes

def load_ann(root, call):
    """Load every annotation file found directly inside a directory.

    Args:
        root: Directory that holds the annotation files.
        call: Callable that receives a file path and returns the parsed
            annotation for that file.

    Returns:
        A dict mapping each file name without its final extension (as split
        by ``os.path.splitext``) to ``call(path)``. Entries of ``root`` that
        are not regular files are skipped.
    """
    anns = {}
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for file in sorted(os.listdir(root)):
        path = os.path.join(root, file)
        if not os.path.isfile(path):
            # Sub-directories (e.g. notebook checkpoint folders) are not annotations.
            continue
        # splitext leaves extension-less names intact, whereas slicing at
        # rfind(".") would cut off their last character.
        name, _ext = os.path.splitext(file)
        anns[name] = call(path)
    return anns

def load_json_ann(root):
    """Parse every file in ``root`` as JSON.

    Delegates the directory walk and the choice of dict keys to ``load_ann``;
    each value is whatever ``json.load`` returns for that file.
    """
    def read_json(path):
        with open(path, "r") as handle:
            return json.load(handle)

    return load_ann(root, read_json)

def load_xml_ann(root, label_map):
    """Load VOC XML annotations from ``root`` as numeric box rows.

    Each entry produced by ``load_voc_xml`` (name followed by coordinates)
    becomes ``coordinates + [0, class_index]``, where ``class_index`` is the
    position of the name in ``label_map``. ``list.index`` raises
    ``ValueError`` for a name that is not in ``label_map``.

    Returns:
        The dict built by ``load_ann``, whose values are lists of such rows.
    """
    def to_rows(path):
        rows = []
        for parsed in load_voc_xml(path):
            label, coords = parsed[0], parsed[1:]
            rows.append(coords + [0, label_map.index(label)])
        return rows

    return load_ann(root, to_rows)

In [14]:
# pip install pycocotools
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

In [15]:
# Directories holding the predicted boxes (JSON) and the ground truth (VOC XML).
detection_annotation_root = "predict_json"
groundtruth_annotation_root = "groundtruths_xml"

# Class names; a name's position in this list is used as its numeric class id.
label_map = [
    "aeroplane", "bicycle", "bird", "boat", "bottle",
    "bus", "car", "cat", "chair", "cow",
    "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
]

detection_annotation = load_json_ann(detection_annotation_root)
groundtruth_annotation = load_xml_ann(groundtruth_annotation_root, label_map)

# Per-class NMS with IoU threshold 0.5; each image's list is replaced in place.
for image_id in detection_annotation:
    image_base_annotations = nms_as_class(detection_annotation[image_id], 0.5)
    detection_annotation[image_id] = image_base_annotations

In [18]:
def mapCOCO(groundtruth_annotation, detection_annotation, label_map):
    """Evaluate detections against ground truth with pycocotools.

    Args:
        groundtruth_annotation: Dict mapping an image name to rows of
            ``(left, top, right, bottom, score, class_index)``; the score
            column is ignored for ground truth.
        detection_annotation: Dict mapping an image name to rows of
            ``(left, top, right, bottom, score, class)``.
        label_map: Class names; a name's position is used as its category id.

    Image names are converted with ``int()`` to obtain the image id. Boxes
    are converted to ``[left, top, width, height]`` with
    ``width = right - left + 1`` and ``height = bottom - top + 1``.

    Side effects: writes ``gt_coco.json`` to the current working directory
    and prints the evaluation summary. Returns ``None``.
    """
    categories = [
        {"supercategory": label_name, "id": label, "name": label_name}
        for label, label_name in enumerate(label_map)
    ]

    images = []
    annotations = []
    for filename, gt_rows in groundtruth_annotation.items():
        image_id = int(filename)
        images.append({"id": image_id})

        for left, top, right, bottom, _unused_score, classes_index in gt_rows:
            width = right - left + 1
            height = bottom - top + 1
            annotations.append({
                "image_id": image_id,
                # Annotation ids start at 1 and keep counting across images.
                "id": len(annotations) + 1,
                "category_id": classes_index,
                "bbox": [left, top, width, height],
                "iscrowd": 0,
                "area": width * height,
            })

    # COCO() reads its ground truth from a file, so dump it to disk first.
    with open("gt_coco.json", "w") as f:
        json.dump({"images": images, "annotations": annotations, "categories": categories}, f)

    cocoGt = COCO("gt_coco.json")

    # Result entries look like:
    # {"image_id":1,"category_id":2,"bbox":[199.84, 190.46, 77.71, 70.88],"score":0.236}
    ann_dets = []
    for filename, det_rows in detection_annotation.items():
        image_id = int(filename)
        for left, top, right, bottom, score, classes in det_rows:
            ann_dets.append({
                "image_id": image_id,
                "category_id": classes,
                "score": score,
                "bbox": [left, top, right - left + 1, bottom - top + 1],
            })

    cocoDt = cocoGt.loadRes(ann_dets)
    cocoEval = COCOeval(cocoGt, cocoDt, "bbox")
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()

In [19]:
# Run the evaluation on the NMS-filtered detections; mapCOCO prints the
# COCO bbox summary and writes gt_coco.json to the working directory.
mapCOCO(groundtruth_annotation, detection_annotation, label_map)

loading annotations into memory...
Done (t=0.59s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.07s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=12.00s).
Accumulating evaluation results...
DONE (t=2.21s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.271
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.509
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.262
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.025
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.126
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.358
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.265
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.360
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDet