In [1]:
# Imports
from xml.dom import minidom
import bs4 as bs
import os
from pathlib import Path
import glob
from tqdm import tqdm
import random
import shutil

In [2]:
def convertPascal2YOLOv8(filePath):
    """Convert one Pascal VOC XML annotation file into a YOLOv8 label file.

    The XML is parsed for the image size and for every ``<object>`` entry.
    Each bounding box is converted from corner form (xmin, ymin, xmax, ymax)
    to ``class x_center y_center width height``, normalized by the image
    width/height and rounded to 4 decimals. Only the classes D00, D10, D20
    and D40 (ids 0-3) are written; every other object is dropped.

    The label file is written to the ``labels`` folder that sits next to the
    XML file's grandparent directory, e.g.
    ``train/annotations/xmls/a.xml`` -> ``train/labels/a.txt``. The folder is
    created when missing, and the file is created (possibly empty) even when
    no object survives the class filter.

    Args:
        filePath: Path (str or path-like) of the Pascal VOC ``.xml`` file.

    Raises:
        IndexError: If ``<size>``, ``<width>``, ``<height>``, ``<name>`` or a
            bounding-box coordinate tag is missing.
        ValueError: If a size or coordinate value is not numeric.
        ZeroDivisionError: If the image width or height is 0 and the file
            contains at least one object.
    """
    class_mapping = {
        "D00": 0,
        "D10": 1,
        "D20": 2,
        "D40": 3,
        "D01": 4,
        "D11": 5,
        "D43": 6,
        "D44": 7,
        "D50": 8
    }

    # Read the whole annotation; the context manager guarantees the handle
    # is closed even if reading fails.
    with open(filePath, "r") as xml_file:
        contents = xml_file.read()

    # Parse the image size
    soup = bs.BeautifulSoup(contents, 'xml')
    image_size = soup.find_all("size")[0]
    image_width = int(image_size.find_all("width")[0].get_text())
    image_height = int(image_size.find_all("height")[0].get_text())

    # One (class_id, [cx, cy, w, h]) entry per <object>, in document order
    annotations = []

    for obj in soup.find_all("object"):

        # Map the class name to its int id; unknown names get the sentinel 10
        class_id = class_mapping.get(obj.find_all("name")[0].get_text(), 10)

        # Object bounding box (corner form, pixels)
        _xmin = float(obj.find_all("xmin")[0].get_text())
        _ymin = float(obj.find_all("ymin")[0].get_text())
        _xmax = float(obj.find_all("xmax")[0].get_text())
        _ymax = float(obj.find_all("ymax")[0].get_text())

        # Convert to center form: x_center y_center width height
        w = (_xmax - _xmin)
        h = (_ymax - _ymin)
        cx = _xmin + (w/2)
        cy = _ymin + (h/2)

        # Normalize by the image size
        w = round((w / image_width), 4)
        h = round((h / image_height), 4)
        cx = round((cx / image_width), 4)
        cy = round((cy / image_height), 4)

        annotations.append((class_id, [cx, cy, w, h]))

    # <...>/train/annotations/xmls/<name>.xml -> <...>/train/labels/<name>.txt
    source_path = Path(filePath)
    outputDir = source_path.parents[2] / "labels"
    os.makedirs(outputDir, exist_ok=True)

    # with_suffix swaps only the final extension, so ".XML" files and names
    # that merely contain ".xml" still end up as "<stem>.txt".
    outputPath = outputDir / source_path.with_suffix(".txt").name

    # Write to .txt file, dropping the unused classes. Kept classes:
    # 0: D00 > Longitudinal Crack
    # 1: D10 > Transverse Crack
    # 2: D20 > Alligator Crack
    # 3: D40 > Potholes
    with open(outputPath, 'w') as f:
        for class_id, (cx, cy, w, h) in annotations:
            if class_id < 4:
                f.write(f"{class_id} {cx} {cy} {w} {h}\n")

In [3]:
# Dataset root directory: <current working directory>/dataset/RDD2022
current_dir = os.getcwd()
ROOTDIR = os.path.join(current_dir, "dataset", "RDD2022")
print(ROOTDIR)

# os.path.isdir is already False for a missing path, so one check suffices.
if os.path.isdir(ROOTDIR):
    print(f"The directory '{ROOTDIR}' is valid and exists.")
else:
    print(f"The directory '{ROOTDIR}' is not valid or does not exist.")

# Pascal VOC annotation folders, relative to ROOTDIR (one per country)
CountryListDir = ["/RDD2022_all_countries/Japan/train/annotations/xmls",
                  "/RDD2022_all_countries/India/train/annotations/xmls",
                  "/RDD2022_all_countries/China_Drone/train/annotations/xmls",
                  "/RDD2022_all_countries/China_MotorBike/train/annotations/xmls",
                  "/RDD2022_all_countries/Czech/train/annotations/xmls",
                  "/RDD2022_all_countries/Norway/Norway/train/annotations/xmls",
                  "/RDD2022_all_countries/United_States/United_States/train/annotations/xmls",
]

for CountryDir in CountryListDir:

    CountryDir = ROOTDIR + CountryDir
    print("CountryDir:",CountryDir)
    fileList = sorted(glob.glob(CountryDir + "/*.xml"))

    # A missing or empty folder used to pass silently with a "0it" progress
    # bar; report it so an incomplete dataset download is noticed.
    if not fileList:
        print(f"WARNING: no XML annotations found in '{CountryDir}', skipping.")
        continue

    # Convert every annotation of this country to a YOLOv8 label file
    for file in tqdm(fileList):
        convertPascal2YOLOv8(file)
        # break

d:\code\CS632_roaddamagedetection\dataset\RDD2022
The directory 'd:\code\CS632_roaddamagedetection\dataset\RDD2022' is valid and exists.
CountryDir: d:\code\CS632_roaddamagedetection\dataset\RDD2022/RDD2022_all_countries/Japan/train/annotations/xmls


100%|██████████| 10506/10506 [00:43<00:00, 241.12it/s]


CountryDir: d:\code\CS632_roaddamagedetection\dataset\RDD2022/RDD2022_all_countries/India/train/annotations/xmls


100%|██████████| 7706/7706 [00:58<00:00, 132.43it/s]


CountryDir: d:\code\CS632_roaddamagedetection\dataset\RDD2022/RDD2022_all_countries/China_Drone/train/annotations/xmls


100%|██████████| 2401/2401 [00:18<00:00, 128.50it/s]


CountryDir: d:\code\CS632_roaddamagedetection\dataset\RDD2022/RDD2022_all_countries/China_MotorBike/train/annotations/xmls


100%|██████████| 1977/1977 [00:17<00:00, 110.49it/s]


CountryDir: d:\code\CS632_roaddamagedetection\dataset\RDD2022/RDD2022_all_countries/Czech/train/annotations/xmls


0it [00:00, ?it/s]


CountryDir: d:\code\CS632_roaddamagedetection\dataset\RDD2022/RDD2022_all_countries/Norway/Norway/train/annotations/xmls


0it [00:00, ?it/s]


CountryDir: d:\code\CS632_roaddamagedetection\dataset\RDD2022/RDD2022_all_countries/United_States/United_States/train/annotations/xmls


0it [00:00, ?it/s]


In [4]:
def CopyDatasetSplit(baseDir, baseOutputDir="D:/code/CS632_roaddamagedetection/dataset/rddJapanIndiaChinaFiltered/"):
    """Split one country's training data into train/val sets and copy them.

    Labels are read from ``<baseDir>/labels`` and matched to the image in
    ``<baseDir>/images`` that has the same file stem. Every pair with a
    non-empty label is kept; pairs with an empty label (background images)
    are kept only until their count reaches 10% of the number of images.
    The kept pairs are shuffled with a fixed seed (1337), split 90/10 and
    copied to::

        <baseOutputDir>/<country>/images/{train,val}/
        <baseOutputDir>/<country>/labels/{train,val}/

    where ``<country>`` is the name of ``baseDir``'s parent folder. The test
    data is not used because it has no annotations.

    Args:
        baseDir: Folder holding the ``images`` and ``labels`` sub-folders,
            e.g. ``.../RDD2022_all_countries/Japan/train/``.
        baseOutputDir: Root folder of the filtered dataset. Defaults to the
            path used by the original notebook; change it to your clone.

    Note:
        Reseeds the global ``random`` generator on every call.
    """
    # Fixed seed so the split is reproducible
    random.seed(1337)

    # <...>/<country>/train -> <country>
    countryName = Path(baseDir).parent.name

    image_list_all = sorted(glob.glob(os.path.join(baseDir, "images", "*")))
    annot_list_all = sorted(glob.glob(os.path.join(baseDir, "labels", "*")))

    # Pair by file stem instead of list position: a single image without a
    # label (or label without an image) would otherwise shift every
    # following pair and attach labels to the wrong images.
    image_by_stem = {Path(image_path).stem: image_path for image_path in image_list_all}

    # Keep at most this many background (empty-label) images
    backgroundImages_Percentage = 0.1
    max_background_image = int(len(image_list_all) * backgroundImages_Percentage)

    image_list = []
    annot_list = []
    background_count = 0
    unmatched_count = 0

    for annot_path in annot_list_all:
        image_path = image_by_stem.get(Path(annot_path).stem)
        if image_path is None:
            unmatched_count += 1
            continue

        with open(annot_path) as f:
            _annot = f.read()

        if _annot:
            # Annotation not empty
            image_list.append(image_path)
            annot_list.append(annot_path)
        elif background_count < max_background_image:
            # Background image, still within the allowed budget
            image_list.append(image_path)
            annot_list.append(annot_path)
            background_count += 1

    if unmatched_count:
        print("Skipped", unmatched_count, "label files without a matching image for", countryName)

    # Dataset length
    dataset_length = len(image_list)

    split_ratio = 0.9
    middle_point = round(split_ratio * dataset_length)

    # Create random list number using seed
    numberList = list(range(0, dataset_length))
    random.shuffle(numberList)
    trainNumberList = numberList[:middle_point]
    validNumberList = numberList[middle_point:]
    print("Training/Validation Samples :", len(trainNumberList), len(validNumberList))

    for split_name, split_label, indices in (
        ("train", "training", trainNumberList),
        ("val", "validation", validNumberList),
    ):
        print("Copying", split_label, "images and labels for", countryName)

        # Create the destination folders once, not on every iteration
        outputImagesDir = os.path.join(baseOutputDir, countryName, "images", split_name)
        outputAnnotDir = os.path.join(baseOutputDir, countryName, "labels", split_name)
        os.makedirs(outputImagesDir, exist_ok=True)
        os.makedirs(outputAnnotDir, exist_ok=True)

        for i in tqdm(indices):
            shutil.copy2(image_list[i], outputImagesDir)
            shutil.copy2(annot_list[i], outputAnnotDir)

# baseDir = "../dataset/RDD2022/RDD2022_all_countries/Japan/train/"
# CopyDatasetSplit(baseDir)

In [6]:
# Dataset root: <current working directory>/dataset/RDD2022
ROOTDIR = os.path.join(current_dir, "dataset", "RDD2022")

# Training folders (relative to ROOTDIR) that go into the filtered dataset:
# Japan, India, China_Drone and China_MotorBike.
# Czech, Norway and United_States are left out.
CountryListDir = [
    "/RDD2022_all_countries/Japan/train/",
    "/RDD2022_all_countries/India/train/",
    "/RDD2022_all_countries/China_Drone/train/",
    "/RDD2022_all_countries/China_MotorBike/train/",
]

# Build the train/val split for each selected country
for CountryDir in CountryListDir:
    CountryDir = f"{ROOTDIR}{CountryDir}"
    CopyDatasetSplit(CountryDir)

Training/Validation Samples : 8055 895
Copying training images and labels for Japan


100%|██████████| 8055/8055 [01:50<00:00, 72.64it/s] 


Copying validation images and labels for Japan


100%|██████████| 895/895 [00:18<00:00, 49.42it/s]


Training/Validation Samples : 3594 399
Copying training images and labels for India


100%|██████████| 3594/3594 [01:25<00:00, 42.20it/s]


Copying validation images and labels for India


100%|██████████| 399/399 [00:09<00:00, 43.48it/s]


Training/Validation Samples : 1943 216
Copying training images and labels for China_Drone


100%|██████████| 1943/1943 [00:39<00:00, 48.77it/s]


Copying validation images and labels for China_Drone


100%|██████████| 216/216 [00:04<00:00, 49.44it/s]


Training/Validation Samples : 1779 198
Copying training images and labels for China_MotorBike


100%|██████████| 1779/1779 [00:37<00:00, 46.87it/s]


Copying validation images and labels for China_MotorBike


100%|██████████| 198/198 [00:03<00:00, 55.39it/s]


In [9]:
# Shell escape: print the folder tree under the current directory to inspect the dataset layout.
# NOTE(review): appears to be the Windows `tree` command; /A presumably selects ASCII line characters — confirm.
!tree . /A

Folder PATH listing for volume D
Volume serial number is 146B-D5EC
D:\CODE\CS632_ROADDAMAGEDETECTION
\---dataset
    +---RDD2022
    |   \---RDD2022_all_countries
    |       +---China_Drone
    |       |   \---train
    |       |       +---annotations
    |       |       |   \---xmls
    |       |       +---images
    |       |       \---labels
    |       +---China_MotorBike
    |       |   +---test
    |       |   |   \---images
    |       |   \---train
    |       |       +---annotations
    |       |       |   \---xmls
    |       |       +---images
    |       |       \---labels
    |       +---India
    |       |   +---test
    |       |   |   \---images
    |       |   \---train
    |       |       +---annotations
    |       |       |   \---xmls
    |       |       +---images
    |       |       \---labels
    |       \---Japan
    |           +---test
    |           |   \---images
    |           \---train
    |               +---annotations
    |               |   \---xmls