### Remove overlap with old

In [33]:
import numpy as np
import xml.etree.ElementTree as ET
import os
import pprint
import glob

In [34]:
def convert_string_to_int(s):
    if '.' in s:
        return int(float(s))
    else:
        return int(s)

def box_iou(a, b):
    '''
    Helper funciton to calculate the ratio between intersection and the union of
    two boxes a and b
    a[0], a[1], a[2], a[3] <-> left, top, right, bottom
    '''

    w_intsec = np.maximum(0, (np.minimum(a[2], b[2]) - np.maximum(a[0], b[0])))
    h_intsec = np.maximum(0, (np.minimum(a[3], b[3]) - np.maximum(a[1], b[1])))
    s_intsec = w_intsec * h_intsec
    s_a = (a[2] - a[0])*(a[3] - a[1])
    s_b = (b[2] - b[0])*(b[3] - b[1])

    return float(s_intsec)/(s_a + s_b - s_intsec)

def remove_box_overlap(ori_boxes, prc_boxes, threshold=0.3):
    """
    This function will remove all boundingbox overlap with original box 
    in dataset.
    
    Arguments:
        ori_boxes: list original box.
        prc_boxes: list processing box, will remove box in this list.
        threshold: is IOU threshold, will remove the box which overlap with
                   original box over `threshold` percent.
    """
    need_removed = []
    for b in ori_boxes:
        for pb in prc_boxes:
            if box_iou(b, pb) >= threshold:
                need_removed.append(pb)
    return list(set(prc_boxes) - set(need_removed))

def load_person_boxes(xml_path):
    """
    Load person bounding boxes from annotation file
    Arguments:
        xml_path: annotation file path.
    Returns:
        return person bounding boxes in annotation file
    """
    
    tree = ET.parse(xml_path)
    root = tree.getroot()
    
    boxes = []
    
    for element in root:
        if element.tag == 'object':
            n_ele = element.find('name').text

            if n_ele == None or n_ele != 'person': 
                continue

            bndbox = element.find('bndbox')
            xmin = convert_string_to_int(bndbox.find('xmin').text)
            ymin = convert_string_to_int(bndbox.find('ymin').text)
            xmax = convert_string_to_int(bndbox.find('xmax').text)
            ymax = convert_string_to_int(bndbox.find('ymax').text)

            boxes.append((xmin, ymin, xmax, ymax))
        
    return boxes

def remove_objects_in_annotation_file(ori_xml_path, xml_path, output_path, threshold):
    """
    Remove objects in annotation file.
    
    Arguments:
        ori_xml_path: path to origin annotation file.
        xml_path: path to another annotation file to compare with origin annotation.
        output_path: the file path will be saved after remove objects get in `xml_path`.
    Returns:
        None
    """
    ori_boxes = load_person_boxes(ori_xml_path)
    
    tree = ET.parse(xml_path)
    root = tree.getroot()
    
    rm = []
    for element in root:
        if element.tag == 'object':
            n_ele = element.find('name').text

            if n_ele == None or n_ele != 'person': 
                continue

            bndbox = element.find('bndbox')
            xmin = convert_string_to_int(bndbox.find('xmin').text)
            ymin = convert_string_to_int(bndbox.find('ymin').text)
            xmax = convert_string_to_int(bndbox.find('xmax').text)
            ymax = convert_string_to_int(bndbox.find('ymax').text)
            
            for b in ori_boxes:
                iou = box_iou(b, (xmin, ymin, xmax, ymax))
#                 print(iou)
                if iou >= threshold:
                    rm.append(element)
                    
    for e in rm:
        try:
            root.remove(e)
        except:
            print(ori_xml_path)
            print(xml_path)
            print(len(rm))
            print(e.find('name').text)
    
    tree.write(output_path)

In [None]:
ANNOTATION_DIR = 'Priv_personpart/xml'
IOU_THRESHOLD = 0.3

XML_FOLDER_NAME = 'xml'
XML_YOLO_NAME = 'xml_chaos'
OUTPUT_FOLDER_NAME = 'xml_removed'

xmls = glob.glob(os.path.join(ANNOTATION_DIR, '*.xml'))

for xml in xmls:
    prc_xml = xml.replace('/{}/'.format(XML_FOLDER_NAME), '/{}/'.format(XML_YOLO_NAME))
    output_xml_path = xml.replace('/{}/'.format(XML_FOLDER_NAME), '/{}/'.format(OUTPUT_FOLDER_NAME))
    remove_objects_in_annotation_file(xml, prc_xml, output_xml_path, IOU_THRESHOLD)

Priv_personpart/xml/000a902c8674739c97f188157c63d709b45b7595.xml
Priv_personpart/xml_chaos/000a902c8674739c97f188157c63d709b45b7595.xml
3
person
Priv_personpart/xml/000dc9b71496f3c885c8408967165045d9e386ec.xml
Priv_personpart/xml_chaos/000dc9b71496f3c885c8408967165045d9e386ec.xml
9
person
Priv_personpart/xml/000dc9b71496f3c885c8408967165045d9e386ec.xml
Priv_personpart/xml_chaos/000dc9b71496f3c885c8408967165045d9e386ec.xml
9
person
Priv_personpart/xml/000dc9b71496f3c885c8408967165045d9e386ec.xml
Priv_personpart/xml_chaos/000dc9b71496f3c885c8408967165045d9e386ec.xml
9
person
Priv_personpart/xml/000dc9b71496f3c885c8408967165045d9e386ec.xml
Priv_personpart/xml_chaos/000dc9b71496f3c885c8408967165045d9e386ec.xml
9
person
Priv_personpart/xml/000dc9b71496f3c885c8408967165045d9e386ec.xml
Priv_personpart/xml_chaos/000dc9b71496f3c885c8408967165045d9e386ec.xml
9
person
Priv_personpart/xml/00164b19136875dd03f4187b34adf99cb9cae506.xml
Priv_personpart/xml_chaos/00164b19136875dd03f4187b34adf99cb9cae50