In [None]:
import os 
import yaml
import shutil
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import matplotlib.pyplot as plt 
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split

In [None]:
# All dataset locations hang off the directory the notebook is started from.
path = os.getcwd()

# Inputs: the XML annotation files and the images they describe.
ANNOTATIONS_PATH, IMAGES_PATH = (
    os.path.join(path, 'data', folder) for folder in ('annotations', 'images')
)

# Outputs: one directory per dataset split.
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    os.path.join(path, 'data', split) for split in ('train', 'val', 'test')
)

# Data Pre-Processing

## Reading annotations data

In [None]:
# Paths of the annotation files, in the order they are read.
filenames = []

# Image dimensions, one list entry per annotation file.
size_props = {dimension: [] for dimension in ('height', 'width')}

# Bounding-box corners, one list entry per annotation file.
bounding_box_props = {corner: [] for corner in ('xmin', 'ymin', 'xmax', 'ymax')}

In [None]:

# Start from empty accumulators so that re-running this cell does not append
# every annotation a second time.
filenames.clear()
for prop_list in (*size_props.values(), *bounding_box_props.values()):
    prop_list.clear()

# os.listdir() returns entries in arbitrary order and may contain entries that
# are not annotations (checkpoint folders, OS metadata files). Keep only the
# .xml files and sort them so the resulting table is deterministic.
annotation_files = sorted(
    entry for entry in os.listdir(ANNOTATIONS_PATH) if entry.endswith('.xml')
)

for file in tqdm(annotation_files):
    annotation_path = os.path.join(ANNOTATIONS_PATH, file)
    annotation = ET.parse(annotation_path)

    size = annotation.find('size')
    # NOTE: only the first <object> element of each file is read; any further
    # objects in the same annotation are ignored.
    bounding_box = annotation.find('object').find('bndbox')

    # Parse every value before appending anything, so a malformed file cannot
    # leave the accumulator lists with different lengths.
    size_values = {name: int(size.find(name).text) for name in size_props}
    box_values = {
        name: int(bounding_box.find(name).text) for name in bounding_box_props
    }

    filenames.append(annotation_path)
    for name, value in size_values.items():
        size_props[name].append(value)
    for name, value in box_values.items():
        bounding_box_props[name].append(value)

In [None]:
# One row per annotation file: its path, the image size and the box corners.
df = pd.DataFrame({
    'file': filenames,
    **{column: size_props[column] for column in ('width', 'height')},
    **{
        column: bounding_box_props[column]
        for column in ('xmin', 'ymin', 'xmax', 'ymax')
    },
})

## Making annotations compatible with YOLOv5

In [None]:
# Box centre, expressed as a fraction of the image width / height.
df['center_x'] = df['xmax'].add(df['xmin']).div(df['width'].mul(2))
df['center_y'] = df['ymax'].add(df['ymin']).div(df['height'].mul(2))

# Box extent, expressed as a fraction of the image width / height.
df['bb_width'] = df['xmax'].sub(df['xmin']).div(df['width'])
df['bb_height'] = df['ymax'].sub(df['ymin']).div(df['height'])

## Making train/val/test split

In [None]:
# Keeping important columns only
yolo_df = df[['file', 'center_x', 'center_y', 'bb_width', 'bb_height']]

# Fixed seed so the split is reproducible. Without it every run reshuffles the
# rows, and because the split directories are not cleared before files are
# copied into them, a re-run could place the same image in more than one split.
SPLIT_SEED = 42

# Performing 70-15-15 split: the test set and then the validation set each
# take 15% of all rows; whatever remains is the training set.
test_size = int(0.15 * len(df))
df_train, df_test = train_test_split(
    yolo_df, test_size=test_size, random_state=SPLIT_SEED
)
df_train, df_val = train_test_split(
    df_train, test_size=test_size, random_state=SPLIT_SEED
)

## Separating images with annotations

In [None]:
# Create the output directory of every split. The training directory must be
# passed explicitly (os.makedirs() without a path raises TypeError), and
# exist_ok=True keeps the cell safe to re-run.
for split_dir in (TRAIN_PATH, VAL_PATH, TEST_PATH):
    os.makedirs(split_dir, exist_ok=True)

In [None]:
def move_images(df:pd.DataFrame, PATH:str, set:str):
    """Copy each row's image into ``PATH`` and write its YOLO label file.

    Despite the name, images are copied (``shutil.copy2``), not moved; the
    originals stay in ``IMAGES_PATH``.

    Parameters
    ----------
    df : pd.DataFrame
        Rows with the columns ``file`` (annotation path), ``center_x``,
        ``center_y``, ``bb_width`` and ``bb_height``.
    PATH : str
        Destination directory; created if it does not exist yet.
    set : str
        Name of the split, used only in the progress messages.

    For every row, ``<name>.png`` is copied from ``IMAGES_PATH`` and a
    ``<name>.txt`` file is written containing the single line
    ``0 center_x center_y bb_width bb_height`` (class id is always 0).
    """
    print('Moving images for {set} set'.format(set=set))
    # Do not rely on an earlier cell having created the destination.
    os.makedirs(PATH, exist_ok=True)
    # iterrows() is a generator without a length; pass the total so tqdm can
    # show real progress instead of a bare counter.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # splitext removes only the trailing extension, whereas
        # str.replace('.xml', '') would strip every occurrence in the name.
        image_name = os.path.splitext(os.path.basename(row['file']))[0]
        image_file = f'{image_name}.png'
        shutil.copy2(
            os.path.join(IMAGES_PATH, image_file),
            os.path.join(PATH, image_file),
        )
        label_text = f"0 {row['center_x']} {row['center_y']} {row['bb_width']} {row['bb_height']}"
        with open(os.path.join(PATH, f'{image_name}.txt'), 'w') as f:
            f.write(label_text)
    print('Done moving images for {set} set'.format(set=set))

In [None]:
# Fill each split directory with its images and label files, in the order
# train -> valid -> test.
for split_df, split_dir, split_name in (
    (df_train, TRAIN_PATH, 'train'),
    (df_val, VAL_PATH, 'valid'),
    (df_test, TEST_PATH, 'test'),
):
    move_images(split_df, split_dir, split_name)

## Creating the .yaml file used to train YOLO on custom data

In [None]:

# Dataset description written to YAML: class names, class count and the
# absolute location of each split directory.
data = {
    'names': ['License Plate'],
    'nc': 1,
    'train': os.path.abspath(TRAIN_PATH),
    'val': os.path.abspath(VAL_PATH),
    # The test split is built alongside train/val, so expose it as well.
    'test': os.path.abspath(TEST_PATH),
}

yaml_path = './data_yaml/LicensePlate.yaml'
# open(..., 'w') does not create missing parent directories; make sure the
# target folder exists so the cell works on a fresh checkout.
os.makedirs(os.path.dirname(yaml_path), exist_ok=True)
with open(yaml_path, 'w') as f:
    yaml.dump(data, f)