Script for fixing the XML files in the dataset I found at https://github.com/Deepknowledge-US/US-Real-time-gun-detection-in-CCTV-An-open-problem-dataset/tree/gh-pages. It changes the class names to fit my model training, separates the images and XML annotations into two different folders (JPGs and annotations), and randomly chooses around 10% of the dataset, since there were too many samples before.

In [None]:
import os
import xml.etree.ElementTree as ET
import random
from shutil import copy2

# Folder holding the downloaded .jpg/.xml files, and the two output folders.
src_dir = '/Users/wesleyliu/Downloads/Images'
dest_images_dir = '/Users/wesleyliu/Downloads/CCTV/Images'
dest_xml_dir = '/Users/wesleyliu/Downloads/CCTV/annotations'

# Create both output folders up front; already-existing folders are fine.
for _output_dir in (dest_images_dir, dest_xml_dir):
    os.makedirs(_output_dir, exist_ok=True)

# Dataset label -> label expected by the model being trained.
class_changes = {
    'Handgun': 'pistol',
    'Short_rifle': 'rifle',
    'Knife': 'knife',
}

def process_and_copy_files(src_dir, dest_images_dir, dest_xml_dir, class_changes, fraction=0.1):
    """Copy a random fraction of annotated images, renaming annotation classes.

    Each selected ``<stem>.xml`` annotation in ``src_dir`` is parsed, the text
    of every ``object/name`` element found in ``class_changes`` is replaced,
    and the result is written to ``dest_xml_dir``. The matching ``<stem>.jpg``
    is copied from ``src_dir`` to ``dest_images_dir``.

    Args:
        src_dir: Directory containing the ``.xml`` files and their ``.jpg`` images.
        dest_images_dir: Existing directory that receives the copied images.
        dest_xml_dir: Existing directory that receives the rewritten XML files.
        class_changes: Mapping of original class name -> replacement class name.
            Names not present in the mapping are left unchanged.
        fraction: Share of the XML files to select; the count is
            ``int(len(xml_files) * fraction)``, i.e. rounded down.

    Raises:
        FileNotFoundError: If a selected annotation has no matching ``.jpg``.
            The image is copied before the annotation is written, so no
            annotation is left behind for the missing image.
        ValueError: Propagated from ``random.sample`` when the computed sample
            size is negative or larger than the number of XML files.
        xml.etree.ElementTree.ParseError: If a selected file is not valid XML.
    """
    xml_suffix = '.xml'

    # os.listdir() order is arbitrary; sorting makes the sample reproducible
    # when the caller seeds the random module.
    xml_files = sorted(f for f in os.listdir(src_dir) if f.endswith(xml_suffix))
    selected_xml_files = random.sample(xml_files, int(len(xml_files) * fraction))

    for xml_file in selected_xml_files:
        tree = ET.parse(os.path.join(src_dir, xml_file))

        # Rename classes; objects without a <name> child are left untouched.
        for object_tag in tree.getroot().findall('object'):
            name_tag = object_tag.find('name')
            if name_tag is not None and name_tag.text in class_changes:
                name_tag.text = class_changes[name_tag.text]

        # Swap only the trailing extension: str.replace() would also rewrite
        # any '.xml' occurring earlier in the file name.
        image_filename = xml_file[:-len(xml_suffix)] + '.jpg'

        # Copy the image first so that a missing image aborts before an
        # annotation without a matching image is written.
        copy2(
            os.path.join(src_dir, image_filename),
            os.path.join(dest_images_dir, image_filename),
        )
        tree.write(os.path.join(dest_xml_dir, xml_file))

# Run the conversion with the paths and class mapping configured above,
# keeping the function's default sampling fraction.
process_and_copy_files(
    src_dir=src_dir,
    dest_images_dir=dest_images_dir,
    dest_xml_dir=dest_xml_dir,
    class_changes=class_changes,
)